diff --git "a/sft/shared_expert_665k_llava/trainer_state.json" "b/sft/shared_expert_665k_llava/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/shared_expert_665k_llava/trainer_state.json" @@ -0,0 +1,282787 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0592014, + "auxiliary_loss_mlp": 0.02849952, + "balance_loss_clip": 1.9061954, + "balance_loss_mlp": 2.26414394, + "epoch": 6.012325266796934e-05, + "flos": 24455432897280.0, + "grad_norm": 59.88248960013876, + "language_loss": 2.96915507, + "learning_rate": 0.0, + "loss": 2.02837133, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 9.4375, + "router_z_loss_mlp": 36.5, + "step": 1, + "time_per_iteration": 13.95829463005066 + }, + { + "auxiliary_loss_clip": 0.03875387, + "auxiliary_loss_mlp": 0.01847448, + "balance_loss_clip": 1.26494408, + "balance_loss_mlp": 1.51304102, + "epoch": 0.00012024650533593868, + "flos": 20225010188160.0, + "grad_norm": 38.091340254623105, + "language_loss": 1.8701098, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.92733812, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 5.84375, + "router_z_loss_mlp": 23.625, + "step": 2, + "time_per_iteration": 2.3926379680633545 + }, + { + "auxiliary_loss_clip": 0.03867972, + "auxiliary_loss_mlp": 0.01837649, + "balance_loss_clip": 1.24942243, + "balance_loss_mlp": 1.51165771, + "epoch": 0.000180369758003908, + "flos": 22308835996800.0, + "grad_norm": 34.54398734614813, + "language_loss": 1.63752937, + "learning_rate": 7.073439208833112e-07, + "loss": 1.69458556, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 23.5, + "step": 3, + "time_per_iteration": 2.3708834648132324 + }, + { + "auxiliary_loss_clip": 0.03834647, + "auxiliary_loss_mlp": 0.01874712, + "balance_loss_clip": 1.23155427, + "balance_loss_mlp": 1.51126909, + "epoch": 0.00024049301067187735, + "flos": 22413680409600.0, + "grad_norm": 51.58421907872779, + "language_loss": 1.74334764, + "learning_rate": 8.925686513863519e-07, + "loss": 1.80044127, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 23.25, + "step": 4, + "time_per_iteration": 2.3859238624572754 + }, + { + "auxiliary_loss_clip": 0.0384364, + "auxiliary_loss_mlp": 0.01827733, + "balance_loss_clip": 1.2551465, + "balance_loss_mlp": 1.50975645, + "epoch": 0.0003006162633398467, + "flos": 21395927099520.0, + "grad_norm": 55.60498654380099, + "language_loss": 1.97572052, + "learning_rate": 1.0362401141348472e-06, + "loss": 2.03243423, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 23.375, + "step": 5, + "time_per_iteration": 2.3426313400268555 + }, + { + "auxiliary_loss_clip": 0.0384073, + "auxiliary_loss_mlp": 0.01876276, + "balance_loss_clip": 1.24265456, + "balance_loss_mlp": 1.51395893, + "epoch": 0.000360739516007816, + "flos": 21651316761600.0, + "grad_norm": 34.10565653870119, + "language_loss": 1.6289196, + "learning_rate": 1.153628246576487e-06, + "loss": 1.68608975, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 6.3125, + "router_z_loss_mlp": 23.25, + "step": 6, + "time_per_iteration": 2.3758718967437744 + }, + { + "auxiliary_loss_clip": 0.03817913, + "auxiliary_loss_mlp": 0.01907599, + "balance_loss_clip": 1.25719333, + "balance_loss_mlp": 1.51139355, + "epoch": 0.0004208627686757854, + "flos": 27158586312960.0, + "grad_norm": 24.29854221742999, + "language_loss": 1.55889654, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.61615169, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 23.0, + "step": 7, + "time_per_iteration": 2.497342348098755 + }, + { + "auxiliary_loss_clip": 0.03645904, + "auxiliary_loss_mlp": 0.01732322, + "balance_loss_clip": 1.18567562, + "balance_loss_mlp": 1.50970292, + "epoch": 0.0004809860213437547, + "flos": 31317824292480.0, + "grad_norm": 30.418445323895252, + "language_loss": 1.43859577, + "learning_rate": 1.338852977079528e-06, + "loss": 1.492378, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 5.46875, + "router_z_loss_mlp": 21.375, + "step": 8, + "time_per_iteration": 2.58284592628479 + }, + { + "auxiliary_loss_clip": 0.03627739, + "auxiliary_loss_mlp": 0.01695781, + "balance_loss_clip": 1.16935301, + "balance_loss_mlp": 1.50706589, + "epoch": 0.000541109274011724, + "flos": 32159056435200.0, + "grad_norm": 23.488022276991867, + "language_loss": 1.57478893, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.6280241, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 5.25, + "router_z_loss_mlp": 21.25, + "step": 9, + "time_per_iteration": 2.5866003036499023 + }, + { + "auxiliary_loss_clip": 0.03584784, + "auxiliary_loss_mlp": 0.01715115, + "balance_loss_clip": 1.16579831, + "balance_loss_mlp": 1.50366664, + "epoch": 0.0006012325266796934, + "flos": 18915801914880.0, + "grad_norm": 19.568016533137865, + "language_loss": 1.5172112, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.57021022, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 5.5, + "router_z_loss_mlp": 20.75, + "step": 10, + "time_per_iteration": 2.5052900314331055 + }, + { + "auxiliary_loss_clip": 0.0355306, + "auxiliary_loss_mlp": 0.01714114, + "balance_loss_clip": 1.20065534, + "balance_loss_mlp": 1.5034039, + "epoch": 0.0006613557793476627, + "flos": 20773879672320.0, + "grad_norm": 15.82446653794725, + "language_loss": 1.48783445, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.54050612, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 20.5, + "step": 11, + "time_per_iteration": 2.5404746532440186 + }, + { + "auxiliary_loss_clip": 0.03492447, + "auxiliary_loss_mlp": 0.01690133, + "balance_loss_clip": 1.15874553, + "balance_loss_mlp": 1.5042367, + "epoch": 0.000721479032015632, + "flos": 16580740896000.0, + "grad_norm": 12.687790286873813, + "language_loss": 1.47094202, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.5227679, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 19.875, + "step": 12, + "time_per_iteration": 2.496410846710205 + }, + { + "auxiliary_loss_clip": 0.03339392, + "auxiliary_loss_mlp": 0.01550267, + "balance_loss_clip": 1.08887899, + "balance_loss_mlp": 1.51099777, + "epoch": 0.0007816022846836014, + "flos": 23804340618240.0, + "grad_norm": 8.758946386735074, + "language_loss": 1.35767293, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.40656948, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 4.625, + "router_z_loss_mlp": 18.25, + "step": 13, + "time_per_iteration": 2.5658016204833984 + }, + { + "auxiliary_loss_clip": 0.02829984, + "auxiliary_loss_mlp": 0.01373452, + "balance_loss_clip": 1.02517021, + "balance_loss_mlp": 1.52180982, + "epoch": 0.0008417255373515708, + "flos": 19171191576960.0, + "grad_norm": 5.41326447908647, + "language_loss": 1.3153739, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.3574084, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 13.125, + "step": 14, + "time_per_iteration": 2.4941866397857666 + }, + { + "auxiliary_loss_clip": 0.02720327, + "auxiliary_loss_mlp": 0.0137032, + "balance_loss_clip": 1.03348207, + "balance_loss_mlp": 1.52959895, + "epoch": 0.00090184879001954, + "flos": 26394372362880.0, + "grad_norm": 4.764776492194351, + "language_loss": 1.20919251, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.25009894, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 11.875, + "step": 15, + "time_per_iteration": 2.579803466796875 + }, + { + "auxiliary_loss_clip": 0.02641292, + "auxiliary_loss_mlp": 0.01373756, + "balance_loss_clip": 1.03405726, + "balance_loss_mlp": 1.5335772, + "epoch": 0.0009619720426875094, + "flos": 24678391305600.0, + "grad_norm": 3.968460904230528, + "language_loss": 1.19075227, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.23090279, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 11.0625, + "step": 16, + "time_per_iteration": 2.5591747760772705 + }, + { + "auxiliary_loss_clip": 0.02610849, + "auxiliary_loss_mlp": 0.01333515, + "balance_loss_clip": 1.03768528, + "balance_loss_mlp": 1.53663445, + "epoch": 0.0010220952953554788, + "flos": 18623543990400.0, + "grad_norm": 4.419232927622343, + "language_loss": 1.27668214, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.31612563, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 10.75, + "step": 17, + "time_per_iteration": 5.376318693161011 + }, + { + "auxiliary_loss_clip": 0.02503798, + "auxiliary_loss_mlp": 0.0134741, + "balance_loss_clip": 1.03460526, + "balance_loss_mlp": 1.54176152, + "epoch": 0.001082218548023448, + "flos": 26141286850560.0, + "grad_norm": 3.2883009857040406, + "language_loss": 1.14481819, + "learning_rate": 1.860972167459798e-06, + "loss": 1.1833303, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 9.625, + "step": 18, + "time_per_iteration": 2.5738518238067627 + }, + { + "auxiliary_loss_clip": 0.02425537, + "auxiliary_loss_mlp": 0.01364425, + "balance_loss_clip": 1.05944014, + "balance_loss_mlp": 1.53502774, + "epoch": 0.0011423418006914173, + "flos": 19608758046720.0, + "grad_norm": 3.386770393584129, + "language_loss": 1.14091611, + "learning_rate": 1.89578346593066e-06, + "loss": 1.1788156, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 8.9375, + "step": 19, + "time_per_iteration": 2.4791295528411865 + }, + { + "auxiliary_loss_clip": 0.02353366, + "auxiliary_loss_mlp": 0.01330185, + "balance_loss_clip": 1.05724311, + "balance_loss_mlp": 1.5460546, + "epoch": 0.0012024650533593868, + "flos": 17894382912000.0, + "grad_norm": 3.4597418022705453, + "language_loss": 1.2497611, + "learning_rate": 1.928808765521199e-06, + "loss": 1.28659654, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 8.125, + "step": 20, + "time_per_iteration": 2.6001813411712646 + }, + { + "auxiliary_loss_clip": 0.02273422, + "auxiliary_loss_mlp": 0.013496, + "balance_loss_clip": 1.0778029, + "balance_loss_mlp": 1.54572868, + "epoch": 0.001262588306027356, + "flos": 21250967667840.0, + "grad_norm": 3.6037352982033153, + "language_loss": 1.1877749, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.2240051, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 7.28125, + "step": 21, + "time_per_iteration": 2.5836637020111084 + }, + { + "auxiliary_loss_clip": 0.02011067, + "auxiliary_loss_mlp": 0.01442024, + "balance_loss_clip": 1.14924622, + "balance_loss_mlp": 1.55435538, + "epoch": 0.0013227115586953253, + "flos": 26102882488320.0, + "grad_norm": 3.083812497930254, + "language_loss": 1.19608283, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.23061383, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 4.5625, + "step": 22, + "time_per_iteration": 2.539775848388672 + }, + { + "auxiliary_loss_clip": 0.01926986, + "auxiliary_loss_mlp": 0.01505657, + "balance_loss_clip": 1.20963633, + "balance_loss_mlp": 1.56594384, + "epoch": 0.0013828348113632948, + "flos": 23950242656640.0, + "grad_norm": 2.827621860614114, + "language_loss": 1.03031397, + "learning_rate": 2.018794797290208e-06, + "loss": 1.06464052, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 3.609375, + "step": 23, + "time_per_iteration": 2.514194965362549 + }, + { + "auxiliary_loss_clip": 0.01894492, + "auxiliary_loss_mlp": 0.01460161, + "balance_loss_clip": 1.14869106, + "balance_loss_mlp": 1.57090414, + "epoch": 0.001442958064031264, + "flos": 15958972759680.0, + "grad_norm": 2.703874562223705, + "language_loss": 1.15945315, + "learning_rate": 2.046196897962839e-06, + "loss": 1.19299972, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 3.234375, + "step": 24, + "time_per_iteration": 2.4529855251312256 + }, + { + "auxiliary_loss_clip": 0.01871654, + "auxiliary_loss_mlp": 0.01432521, + "balance_loss_clip": 1.16129601, + "balance_loss_mlp": 1.56573081, + "epoch": 0.0015030813166992333, + "flos": 18107527962240.0, + "grad_norm": 3.48604146843946, + "language_loss": 1.17391658, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.20695829, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 3.0625, + "step": 25, + "time_per_iteration": 2.442079782485962 + }, + { + "auxiliary_loss_clip": 0.01863681, + "auxiliary_loss_mlp": 0.01389685, + "balance_loss_clip": 1.12818706, + "balance_loss_mlp": 1.56979775, + "epoch": 0.0015632045693672028, + "flos": 22233528460800.0, + "grad_norm": 2.3852005903269413, + "language_loss": 1.15666127, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.18919492, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 2.9375, + "step": 26, + "time_per_iteration": 2.494749069213867 + }, + { + "auxiliary_loss_clip": 0.01859968, + "auxiliary_loss_mlp": 0.01338449, + "balance_loss_clip": 1.10689712, + "balance_loss_mlp": 1.5667156, + "epoch": 0.001623327822035172, + "flos": 23990706789120.0, + "grad_norm": 2.314721074457863, + "language_loss": 1.03393078, + "learning_rate": 2.122031762649933e-06, + "loss": 1.06591487, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 2.9375, + "step": 27, + "time_per_iteration": 2.513204574584961 + }, + { + "auxiliary_loss_clip": 0.01847053, + "auxiliary_loss_mlp": 0.01306927, + "balance_loss_clip": 1.09387577, + "balance_loss_mlp": 1.56402767, + "epoch": 0.0016834510747031415, + "flos": 19676769108480.0, + "grad_norm": 2.190330751737024, + "language_loss": 1.15545738, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.18699718, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 2.828125, + "step": 28, + "time_per_iteration": 2.471423387527466 + }, + { + "auxiliary_loss_clip": 0.01838201, + "auxiliary_loss_mlp": 0.01268613, + "balance_loss_clip": 1.0743494, + "balance_loss_mlp": 1.55438936, + "epoch": 0.0017435743273711108, + "flos": 20922749176320.0, + "grad_norm": 2.075086891165793, + "language_loss": 1.1381247, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.16919291, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 2.828125, + "step": 29, + "time_per_iteration": 2.4642415046691895 + }, + { + "auxiliary_loss_clip": 0.01820986, + "auxiliary_loss_mlp": 0.01259961, + "balance_loss_clip": 1.07828605, + "balance_loss_mlp": 1.54057741, + "epoch": 0.00180369758003908, + "flos": 19528178895360.0, + "grad_norm": 2.510270472528044, + "language_loss": 1.34787011, + "learning_rate": 2.189868360711334e-06, + "loss": 1.37867963, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 2.8125, + "step": 30, + "time_per_iteration": 2.4827816486358643 + }, + { + "auxiliary_loss_clip": 0.01815259, + "auxiliary_loss_mlp": 0.01236805, + "balance_loss_clip": 1.05951691, + "balance_loss_mlp": 1.53890336, + "epoch": 0.0018638208327070496, + "flos": 27451961400960.0, + "grad_norm": 2.1540247244676842, + "language_loss": 1.15840447, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.18892515, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 2.75, + "step": 31, + "time_per_iteration": 2.514981269836426 + }, + { + "auxiliary_loss_clip": 0.01797245, + "auxiliary_loss_mlp": 0.01202668, + "balance_loss_clip": 1.02967131, + "balance_loss_mlp": 1.52960193, + "epoch": 0.0019239440853750188, + "flos": 13588614489600.0, + "grad_norm": 1.958003866574222, + "language_loss": 1.04776859, + "learning_rate": 2.2314216284658796e-06, + "loss": 1.07776773, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 2.671875, + "step": 32, + "time_per_iteration": 2.4984066486358643 + }, + { + "auxiliary_loss_clip": 0.01792347, + "auxiliary_loss_mlp": 0.01218719, + "balance_loss_clip": 1.03799748, + "balance_loss_mlp": 1.52601659, + "epoch": 0.001984067338042988, + "flos": 11253099623040.0, + "grad_norm": 2.540123101717717, + "language_loss": 1.10896111, + "learning_rate": 2.2512340280885094e-06, + "loss": 1.1390717, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 2.65625, + "step": 33, + "time_per_iteration": 2.44722843170166 + }, + { + "auxiliary_loss_clip": 0.01751008, + "auxiliary_loss_mlp": 0.01202346, + "balance_loss_clip": 1.03211534, + "balance_loss_mlp": 1.50218964, + "epoch": 0.0020441905907109576, + "flos": 22385051228160.0, + "grad_norm": 1.766705137559389, + "language_loss": 0.98252147, + "learning_rate": 2.270454923596497e-06, + "loss": 1.01205504, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 2.484375, + "step": 34, + "time_per_iteration": 2.5171329975128174 + }, + { + "auxiliary_loss_clip": 0.0171905, + "auxiliary_loss_mlp": 0.01198976, + "balance_loss_clip": 1.03961742, + "balance_loss_mlp": 1.47864413, + "epoch": 0.0021043138433789266, + "flos": 49776858489600.0, + "grad_norm": 1.934875159455613, + "language_loss": 0.87226588, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.90144616, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 2.40625, + "step": 35, + "time_per_iteration": 2.731252908706665 + }, + { + "auxiliary_loss_clip": 0.01696442, + "auxiliary_loss_mlp": 0.0120763, + "balance_loss_clip": 1.05141854, + "balance_loss_mlp": 1.46685505, + "epoch": 0.002164437096046896, + "flos": 20556929283840.0, + "grad_norm": 1.852836988720178, + "language_loss": 0.99594855, + "learning_rate": 2.307256493152974e-06, + "loss": 1.02498937, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 2.296875, + "step": 36, + "time_per_iteration": 2.523585557937622 + }, + { + "auxiliary_loss_clip": 0.0166853, + "auxiliary_loss_mlp": 0.01179437, + "balance_loss_clip": 1.0360043, + "balance_loss_mlp": 1.45414805, + "epoch": 0.0022245603487148656, + "flos": 26541077362560.0, + "grad_norm": 1.8982971776811952, + "language_loss": 1.05561364, + "learning_rate": 2.3248973825097614e-06, + "loss": 1.08409333, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 2.140625, + "step": 37, + "time_per_iteration": 2.5188474655151367 + }, + { + "auxiliary_loss_clip": 0.01645951, + "auxiliary_loss_mlp": 0.01172056, + "balance_loss_clip": 1.03701544, + "balance_loss_mlp": 1.43997324, + "epoch": 0.0022846836013828346, + "flos": 20337185986560.0, + "grad_norm": 1.7940956563805204, + "language_loss": 1.10813737, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.13631737, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 2.0625, + "step": 38, + "time_per_iteration": 2.4835011959075928 + }, + { + "auxiliary_loss_clip": 0.01615695, + "auxiliary_loss_mlp": 0.01168341, + "balance_loss_clip": 1.03816485, + "balance_loss_mlp": 1.42254043, + "epoch": 0.002344806854050804, + "flos": 26246445465600.0, + "grad_norm": 1.797232245845728, + "language_loss": 0.94572014, + "learning_rate": 2.358792165262154e-06, + "loss": 0.97356045, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 1.9296875, + "step": 39, + "time_per_iteration": 2.545525550842285 + }, + { + "auxiliary_loss_clip": 0.01589535, + "auxiliary_loss_mlp": 0.01171658, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.40366411, + "epoch": 0.0024049301067187736, + "flos": 11800747209600.0, + "grad_norm": 2.3252134330432153, + "language_loss": 1.05606556, + "learning_rate": 2.3750930912143747e-06, + "loss": 1.08367753, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 1.859375, + "step": 40, + "time_per_iteration": 2.4814131259918213 + }, + { + "auxiliary_loss_clip": 0.0156809, + "auxiliary_loss_mlp": 0.01151629, + "balance_loss_clip": 1.03137076, + "balance_loss_mlp": 1.39235711, + "epoch": 0.0024650533593867426, + "flos": 20630456340480.0, + "grad_norm": 1.9304736304141732, + "language_loss": 1.04341471, + "learning_rate": 2.3909914837471044e-06, + "loss": 1.07061172, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 1.7578125, + "step": 41, + "time_per_iteration": 2.4887442588806152 + }, + { + "auxiliary_loss_clip": 0.01547533, + "auxiliary_loss_mlp": 0.0115471, + "balance_loss_clip": 1.03435612, + "balance_loss_mlp": 1.37903047, + "epoch": 0.002525176612054712, + "flos": 18405127324800.0, + "grad_norm": 1.8610384533041682, + "language_loss": 1.05148911, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.07851148, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 1.6875, + "step": 42, + "time_per_iteration": 2.5541391372680664 + }, + { + "auxiliary_loss_clip": 0.01525354, + "auxiliary_loss_mlp": 0.01151981, + "balance_loss_clip": 1.04001999, + "balance_loss_mlp": 1.36522567, + "epoch": 0.0025852998647226816, + "flos": 28182763313280.0, + "grad_norm": 2.0744596214174855, + "language_loss": 1.10404158, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.13081491, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 1.6015625, + "step": 43, + "time_per_iteration": 2.533997058868408 + }, + { + "auxiliary_loss_clip": 0.01508479, + "auxiliary_loss_mlp": 0.01141589, + "balance_loss_clip": 1.03248906, + "balance_loss_mlp": 1.35540771, + "epoch": 0.0026454231173906506, + "flos": 14282233937280.0, + "grad_norm": 1.7923717148249703, + "language_loss": 1.07973742, + "learning_rate": 2.4364587585915504e-06, + "loss": 1.10623813, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 1.53125, + "step": 44, + "time_per_iteration": 2.4795925617218018 + }, + { + "auxiliary_loss_clip": 0.01497536, + "auxiliary_loss_mlp": 0.01137356, + "balance_loss_clip": 1.03011465, + "balance_loss_mlp": 1.34896302, + "epoch": 0.00270554637005862, + "flos": 22418114152320.0, + "grad_norm": 1.6965701630206476, + "language_loss": 1.09130347, + "learning_rate": 2.450927955901469e-06, + "loss": 1.11765242, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 1.484375, + "step": 45, + "time_per_iteration": 2.570896863937378 + }, + { + "auxiliary_loss_clip": 0.01485914, + "auxiliary_loss_mlp": 0.01129942, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.34306371, + "epoch": 0.0027656696227265896, + "flos": 23984702035200.0, + "grad_norm": 1.5400208136049292, + "language_loss": 1.10024905, + "learning_rate": 2.465079122983384e-06, + "loss": 1.12640762, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 1.4296875, + "step": 46, + "time_per_iteration": 2.5858287811279297 + }, + { + "auxiliary_loss_clip": 0.01480002, + "auxiliary_loss_mlp": 0.01128872, + "balance_loss_clip": 1.02701914, + "balance_loss_mlp": 1.33938384, + "epoch": 0.0028257928753945586, + "flos": 37668001731840.0, + "grad_norm": 1.646375280954016, + "language_loss": 0.98705554, + "learning_rate": 2.4789259401737868e-06, + "loss": 1.01314437, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 1.40625, + "step": 47, + "time_per_iteration": 2.662827253341675 + }, + { + "auxiliary_loss_clip": 0.01477236, + "auxiliary_loss_mlp": 0.01140824, + "balance_loss_clip": 1.04202318, + "balance_loss_mlp": 1.34053612, + "epoch": 0.002885916128062528, + "flos": 22453481226240.0, + "grad_norm": 1.6837192815734772, + "language_loss": 0.94916022, + "learning_rate": 2.492481223656015e-06, + "loss": 0.97534078, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 1.3671875, + "step": 48, + "time_per_iteration": 2.646942377090454 + }, + { + "auxiliary_loss_clip": 0.01472159, + "auxiliary_loss_mlp": 0.01137421, + "balance_loss_clip": 1.0374279, + "balance_loss_mlp": 1.3341831, + "epoch": 0.0029460393807304976, + "flos": 27011671845120.0, + "grad_norm": 1.7167334518358937, + "language_loss": 0.97937459, + "learning_rate": 2.5057569967437924e-06, + "loss": 1.00547028, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 1.375, + "step": 49, + "time_per_iteration": 2.7306461334228516 + }, + { + "auxiliary_loss_clip": 0.01463358, + "auxiliary_loss_mlp": 0.01122727, + "balance_loss_clip": 1.03303349, + "balance_loss_mlp": 1.33046627, + "epoch": 0.0030061626333984666, + "flos": 15850916501760.0, + "grad_norm": 1.7760834902183293, + "language_loss": 0.98691154, + "learning_rate": 2.51876455396287e-06, + "loss": 1.01277232, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 1.328125, + "step": 50, + "time_per_iteration": 2.6136646270751953 + }, + { + "auxiliary_loss_clip": 0.01461312, + "auxiliary_loss_mlp": 0.01126198, + "balance_loss_clip": 1.04046226, + "balance_loss_mlp": 1.32714772, + "epoch": 0.003066285886066436, + "flos": 31825845619200.0, + "grad_norm": 1.8773957611407543, + "language_loss": 0.99911571, + "learning_rate": 2.5315145187866316e-06, + "loss": 1.0249908, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 1.34375, + "step": 51, + "time_per_iteration": 2.694885492324829 + }, + { + "auxiliary_loss_clip": 0.01448632, + "auxiliary_loss_mlp": 0.0112479, + "balance_loss_clip": 1.03767216, + "balance_loss_mlp": 1.32371688, + "epoch": 0.0031264091387344056, + "flos": 41425878188160.0, + "grad_norm": 1.755027267036778, + "language_loss": 1.0240953, + "learning_rate": 2.5440168957651953e-06, + "loss": 1.04982948, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 1.25, + "step": 52, + "time_per_iteration": 2.738954544067383 + }, + { + "auxiliary_loss_clip": 0.01438265, + "auxiliary_loss_mlp": 0.01107397, + "balance_loss_clip": 1.0242846, + "balance_loss_mlp": 1.31720757, + "epoch": 0.0031865323914023747, + "flos": 23439812446080.0, + "grad_norm": 1.6129298069702809, + "language_loss": 1.01391912, + "learning_rate": 2.5562811176888872e-06, + "loss": 1.03937578, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 1.2109375, + "step": 53, + "time_per_iteration": 2.570388078689575 + }, + { + "auxiliary_loss_clip": 0.01428581, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_clip": 1.02097392, + "balance_loss_mlp": 1.30994177, + "epoch": 0.003246655644070344, + "flos": 14428310532480.0, + "grad_norm": 1.8091849462926297, + "language_loss": 0.93399954, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.95925093, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 1.1875, + "step": 54, + "time_per_iteration": 2.6554181575775146 + }, + { + "auxiliary_loss_clip": 0.01420672, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_clip": 1.02031994, + "balance_loss_mlp": 1.30313826, + "epoch": 0.0033067788967383136, + "flos": 35916793246080.0, + "grad_norm": 1.9390605439657922, + "language_loss": 0.92271101, + "learning_rate": 2.580130221340046e-06, + "loss": 0.94790775, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 1.171875, + "step": 55, + "time_per_iteration": 2.6824378967285156 + }, + { + "auxiliary_loss_clip": 0.01416151, + "auxiliary_loss_mlp": 0.01100073, + "balance_loss_clip": 1.02072692, + "balance_loss_mlp": 1.29935884, + "epoch": 0.003366902149406283, + "flos": 22957836860160.0, + "grad_norm": 2.2229776628331526, + "language_loss": 1.02354062, + "learning_rate": 2.5917314754514246e-06, + "loss": 1.04870272, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 1.171875, + "step": 56, + "time_per_iteration": 4.229781866073608 + }, + { + "auxiliary_loss_clip": 0.0141158, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_clip": 1.02756071, + "balance_loss_mlp": 1.29667664, + "epoch": 0.003427025402074252, + "flos": 26581506583680.0, + "grad_norm": 1.6610750254902438, + "language_loss": 1.03736138, + "learning_rate": 2.6031273868139713e-06, + "loss": 1.06246233, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 1.1484375, + "step": 57, + "time_per_iteration": 4.268309831619263 + }, + { + "auxiliary_loss_clip": 0.01403879, + "auxiliary_loss_mlp": 0.01084009, + "balance_loss_clip": 1.0168705, + "balance_loss_mlp": 1.29193509, + "epoch": 0.0034871486547422216, + "flos": 23950068099840.0, + "grad_norm": 1.7818773582281597, + "language_loss": 1.07827854, + "learning_rate": 2.614325098333948e-06, + "loss": 1.1031574, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 1.1171875, + "step": 58, + "time_per_iteration": 2.533069372177124 + }, + { + "auxiliary_loss_clip": 0.01391815, + "auxiliary_loss_mlp": 0.01077123, + "balance_loss_clip": 1.01570654, + "balance_loss_mlp": 1.28348184, + "epoch": 0.003547271907410191, + "flos": 21213924848640.0, + "grad_norm": 1.8801083671758678, + "language_loss": 0.98636395, + "learning_rate": 2.625331386578098e-06, + "loss": 1.01105332, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 1.0859375, + "step": 59, + "time_per_iteration": 2.63613224029541 + }, + { + "auxiliary_loss_clip": 0.01389579, + "auxiliary_loss_mlp": 0.01094339, + "balance_loss_clip": 1.02772522, + "balance_loss_mlp": 1.27919149, + "epoch": 0.00360739516007816, + "flos": 16504071816960.0, + "grad_norm": 1.7572322462792613, + "language_loss": 1.03732479, + "learning_rate": 2.63615268640451e-06, + "loss": 1.06216395, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 1.109375, + "step": 60, + "time_per_iteration": 2.5318427085876465 + }, + { + "auxiliary_loss_clip": 0.01380882, + "auxiliary_loss_mlp": 0.0109855, + "balance_loss_clip": 1.03532171, + "balance_loss_mlp": 1.27369034, + "epoch": 0.0036675184127461296, + "flos": 19463763703680.0, + "grad_norm": 1.96366380895085, + "language_loss": 1.00712299, + "learning_rate": 2.6467951135575943e-06, + "loss": 1.03191733, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 1.078125, + "step": 61, + "time_per_iteration": 2.5204458236694336 + }, + { + "auxiliary_loss_clip": 0.01377498, + "auxiliary_loss_mlp": 0.0109224, + "balance_loss_clip": 1.02743793, + "balance_loss_mlp": 1.27065504, + "epoch": 0.003727641665414099, + "flos": 20956335770880.0, + "grad_norm": 1.6938978530507907, + "language_loss": 0.97925043, + "learning_rate": 2.657264485425803e-06, + "loss": 1.00394773, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 1.0703125, + "step": 62, + "time_per_iteration": 2.580873489379883 + }, + { + "auxiliary_loss_clip": 0.01369197, + "auxiliary_loss_mlp": 0.01073564, + "balance_loss_clip": 1.01529503, + "balance_loss_mlp": 1.26563585, + "epoch": 0.003787764918082068, + "flos": 18405057502080.0, + "grad_norm": 1.6056544984710879, + "language_loss": 1.02707219, + "learning_rate": 2.6675663401385186e-06, + "loss": 1.05149984, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 1.0390625, + "step": 63, + "time_per_iteration": 2.5049960613250732 + }, + { + "auxiliary_loss_clip": 0.01366937, + "auxiliary_loss_mlp": 0.01081452, + "balance_loss_clip": 1.02194309, + "balance_loss_mlp": 1.26473141, + "epoch": 0.0038478881707500376, + "flos": 12458406090240.0, + "grad_norm": 1.9646688865502346, + "language_loss": 1.10748148, + "learning_rate": 2.677705954159056e-06, + "loss": 1.13196528, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 1.0234375, + "step": 64, + "time_per_iteration": 3.22587251663208 + }, + { + "auxiliary_loss_clip": 0.01361245, + "auxiliary_loss_mlp": 0.01080482, + "balance_loss_clip": 1.02288008, + "balance_loss_mlp": 1.25962412, + "epoch": 0.003908011423418007, + "flos": 13552479365760.0, + "grad_norm": 1.925475193310242, + "language_loss": 1.00453138, + "learning_rate": 2.6876883585136904e-06, + "loss": 1.02894866, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 1.015625, + "step": 65, + "time_per_iteration": 4.017646551132202 + }, + { + "auxiliary_loss_clip": 0.01357257, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_clip": 1.02112687, + "balance_loss_mlp": 1.25508583, + "epoch": 0.003968134676085976, + "flos": 18332473052160.0, + "grad_norm": 1.5746626958295853, + "language_loss": 0.98168778, + "learning_rate": 2.697518353781685e-06, + "loss": 1.00606525, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 1.0234375, + "step": 66, + "time_per_iteration": 4.332080364227295 + }, + { + "auxiliary_loss_clip": 0.01348039, + "auxiliary_loss_mlp": 0.01080387, + "balance_loss_clip": 1.02288032, + "balance_loss_mlp": 1.24804461, + "epoch": 0.004028257928753946, + "flos": 20484205188480.0, + "grad_norm": 1.9783854595226316, + "language_loss": 1.10036111, + "learning_rate": 2.7072005239581103e-06, + "loss": 1.12464547, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.99609375, + "step": 67, + "time_per_iteration": 4.574966192245483 + }, + { + "auxiliary_loss_clip": 0.01343347, + "auxiliary_loss_mlp": 0.01079155, + "balance_loss_clip": 1.02431893, + "balance_loss_mlp": 1.24566841, + "epoch": 0.004088381181421915, + "flos": 18842833440000.0, + "grad_norm": 1.707964296993641, + "language_loss": 1.01583052, + "learning_rate": 2.7167392492896727e-06, + "loss": 1.04005563, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.9765625, + "step": 68, + "time_per_iteration": 2.9387080669403076 + }, + { + "auxiliary_loss_clip": 0.01336421, + "auxiliary_loss_mlp": 0.01077488, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.23911297, + "epoch": 0.004148504434089885, + "flos": 19426790707200.0, + "grad_norm": 1.6095956306570647, + "language_loss": 1.05997467, + "learning_rate": 2.7261387181735195e-06, + "loss": 1.0841136, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.97265625, + "step": 69, + "time_per_iteration": 2.608163356781006 + }, + { + "auxiliary_loss_clip": 0.01332264, + "auxiliary_loss_mlp": 0.01069159, + "balance_loss_clip": 1.01677835, + "balance_loss_mlp": 1.23615658, + "epoch": 0.004208627686757853, + "flos": 20810049707520.0, + "grad_norm": 2.0607956530144578, + "language_loss": 1.0951407, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.11915493, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.9609375, + "step": 70, + "time_per_iteration": 2.6302578449249268 + }, + { + "auxiliary_loss_clip": 0.01328541, + "auxiliary_loss_mlp": 0.01075035, + "balance_loss_clip": 1.018291, + "balance_loss_mlp": 1.23221707, + "epoch": 0.004268750939425823, + "flos": 19097629608960.0, + "grad_norm": 2.0416179902127864, + "language_loss": 1.17451215, + "learning_rate": 2.7445357464116983e-06, + "loss": 1.19854784, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.9609375, + "step": 71, + "time_per_iteration": 2.5630364418029785 + }, + { + "auxiliary_loss_clip": 0.01353489, + "auxiliary_loss_mlp": 0.01059675, + "balance_loss_clip": 1.00321722, + "balance_loss_mlp": 1.27533937, + "epoch": 0.004328874192093792, + "flos": 52436889377280.0, + "grad_norm": 2.3851030891106486, + "language_loss": 0.657565, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68169665, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.78125, + "step": 72, + "time_per_iteration": 3.2282726764678955 + }, + { + "auxiliary_loss_clip": 0.01345302, + "auxiliary_loss_mlp": 0.0105828, + "balance_loss_clip": 1.00907028, + "balance_loss_mlp": 1.26883674, + "epoch": 0.004388997444761762, + "flos": 66469459115520.0, + "grad_norm": 2.246094963193842, + "language_loss": 0.63956976, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66360557, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.765625, + "step": 73, + "time_per_iteration": 3.206207036972046 + }, + { + "auxiliary_loss_clip": 0.01312721, + "auxiliary_loss_mlp": 0.01078268, + "balance_loss_clip": 1.02445734, + "balance_loss_mlp": 1.21759152, + "epoch": 0.004449120697429731, + "flos": 18951971950080.0, + "grad_norm": 1.8000698391709369, + "language_loss": 0.98054969, + "learning_rate": 2.771181708202938e-06, + "loss": 1.00445962, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.953125, + "step": 74, + "time_per_iteration": 2.5685036182403564 + }, + { + "auxiliary_loss_clip": 0.01302433, + "auxiliary_loss_mlp": 0.01066214, + "balance_loss_clip": 1.0158602, + "balance_loss_mlp": 1.21137786, + "epoch": 0.004509243950097701, + "flos": 21104437224960.0, + "grad_norm": 1.7946804275055441, + "language_loss": 1.07349372, + "learning_rate": 2.779824149153005e-06, + "loss": 1.09718025, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.9140625, + "step": 75, + "time_per_iteration": 2.5850989818573 + }, + { + "auxiliary_loss_clip": 0.01301294, + "auxiliary_loss_mlp": 0.0107544, + "balance_loss_clip": 1.02284467, + "balance_loss_mlp": 1.20859528, + "epoch": 0.004569367202765669, + "flos": 20697838997760.0, + "grad_norm": 1.8031505198727962, + "language_loss": 0.98880738, + "learning_rate": 2.788352117317012e-06, + "loss": 1.01257467, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.92578125, + "step": 76, + "time_per_iteration": 2.566347360610962 + }, + { + "auxiliary_loss_clip": 0.01297131, + "auxiliary_loss_mlp": 0.01088256, + "balance_loss_clip": 1.03680515, + "balance_loss_mlp": 1.20522153, + "epoch": 0.004629490455433639, + "flos": 28657198045440.0, + "grad_norm": 1.6570994166061204, + "language_loss": 1.02446651, + "learning_rate": 2.796768605577095e-06, + "loss": 1.04832041, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.91796875, + "step": 77, + "time_per_iteration": 2.6131017208099365 + }, + { + "auxiliary_loss_clip": 0.01287556, + "auxiliary_loss_mlp": 0.01072594, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.19914961, + "epoch": 0.004689613708101608, + "flos": 11071621042560.0, + "grad_norm": 1.8950067555859382, + "language_loss": 1.0496515, + "learning_rate": 2.80507649095533e-06, + "loss": 1.07325292, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.8828125, + "step": 78, + "time_per_iteration": 2.61002516746521 + }, + { + "auxiliary_loss_clip": 0.01283501, + "auxiliary_loss_mlp": 0.01062978, + "balance_loss_clip": 1.01422167, + "balance_loss_mlp": 1.19583321, + "epoch": 0.004749736960769578, + "flos": 21798021761280.0, + "grad_norm": 2.079147920979707, + "language_loss": 0.96072763, + "learning_rate": 2.813278540517843e-06, + "loss": 0.98419249, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.875, + "step": 79, + "time_per_iteration": 2.736137628555298 + }, + { + "auxiliary_loss_clip": 0.01282169, + "auxiliary_loss_mlp": 0.01069946, + "balance_loss_clip": 1.01854336, + "balance_loss_mlp": 1.19400644, + "epoch": 0.004809860213437547, + "flos": 19791563258880.0, + "grad_norm": 1.6633581471839818, + "language_loss": 0.99232942, + "learning_rate": 2.8213774169075505e-06, + "loss": 1.01585054, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.8828125, + "step": 80, + "time_per_iteration": 2.6624741554260254 + }, + { + "auxiliary_loss_clip": 0.01275935, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_clip": 1.02072954, + "balance_loss_mlp": 1.19066119, + "epoch": 0.004869983466105517, + "flos": 26573232591360.0, + "grad_norm": 1.8958047428715168, + "language_loss": 1.04964733, + "learning_rate": 2.829375683533245e-06, + "loss": 1.07309175, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.8515625, + "step": 81, + "time_per_iteration": 2.5657668113708496 + }, + { + "auxiliary_loss_clip": 0.01272667, + "auxiliary_loss_mlp": 0.01070946, + "balance_loss_clip": 1.02178407, + "balance_loss_mlp": 1.18862176, + "epoch": 0.004930106718773485, + "flos": 12822550237440.0, + "grad_norm": 2.36093047531353, + "language_loss": 1.13654375, + "learning_rate": 2.8372758094402803e-06, + "loss": 1.15997982, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.83984375, + "step": 82, + "time_per_iteration": 2.5176568031311035 + }, + { + "auxiliary_loss_clip": 0.01269172, + "auxiliary_loss_mlp": 0.0106399, + "balance_loss_clip": 1.01776123, + "balance_loss_mlp": 1.18479371, + "epoch": 0.004990229971441455, + "flos": 25773756301440.0, + "grad_norm": 1.7625678439477694, + "language_loss": 0.94652849, + "learning_rate": 2.84508017388607e-06, + "loss": 0.96986014, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.84375, + "step": 83, + "time_per_iteration": 2.597201108932495 + }, + { + "auxiliary_loss_clip": 0.01267738, + "auxiliary_loss_mlp": 0.01066804, + "balance_loss_clip": 1.01833367, + "balance_loss_mlp": 1.18357682, + "epoch": 0.005050353224109424, + "flos": 17456292771840.0, + "grad_norm": 1.9647619703085009, + "language_loss": 1.03632414, + "learning_rate": 2.852791070641559e-06, + "loss": 1.05966961, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.83984375, + "step": 84, + "time_per_iteration": 2.5145950317382812 + }, + { + "auxiliary_loss_clip": 0.01266584, + "auxiliary_loss_mlp": 0.01095994, + "balance_loss_clip": 1.07234323, + "balance_loss_mlp": 1.20612133, + "epoch": 0.005110476476777394, + "flos": 69802269235200.0, + "grad_norm": 1.6251546937924461, + "language_loss": 0.63005388, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65367967, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.6015625, + "step": 85, + "time_per_iteration": 3.1483633518218994 + }, + { + "auxiliary_loss_clip": 0.01262125, + "auxiliary_loss_mlp": 0.01065209, + "balance_loss_clip": 1.01676273, + "balance_loss_mlp": 1.18012619, + "epoch": 0.005170599729445363, + "flos": 24788961181440.0, + "grad_norm": 1.5118799067699176, + "language_loss": 0.9598431, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.98311651, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.8203125, + "step": 86, + "time_per_iteration": 2.582326889038086 + }, + { + "auxiliary_loss_clip": 0.01260811, + "auxiliary_loss_mlp": 0.01081091, + "balance_loss_clip": 1.03531432, + "balance_loss_mlp": 1.1800735, + "epoch": 0.005230722982113333, + "flos": 23256937411200.0, + "grad_norm": 2.016924833118203, + "language_loss": 0.9565239, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.97994292, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.8046875, + "step": 87, + "time_per_iteration": 2.543916940689087 + }, + { + "auxiliary_loss_clip": 0.01258088, + "auxiliary_loss_mlp": 0.01076995, + "balance_loss_clip": 1.0333643, + "balance_loss_mlp": 1.17947674, + "epoch": 0.005290846234781301, + "flos": 16726957136640.0, + "grad_norm": 1.5902949013477015, + "language_loss": 1.03879786, + "learning_rate": 2.8827430842847267e-06, + "loss": 1.06214881, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.7890625, + "step": 88, + "time_per_iteration": 2.593050956726074 + }, + { + "auxiliary_loss_clip": 0.01253169, + "auxiliary_loss_mlp": 0.01059565, + "balance_loss_clip": 1.01695991, + "balance_loss_mlp": 1.17404974, + "epoch": 0.005350969487449271, + "flos": 20885043041280.0, + "grad_norm": 1.651093208188181, + "language_loss": 0.95645708, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.97958446, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.7890625, + "step": 89, + "time_per_iteration": 2.5296895503997803 + }, + { + "auxiliary_loss_clip": 0.012481, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.02329516, + "balance_loss_mlp": 1.16942477, + "epoch": 0.00541109274011724, + "flos": 26208878976000.0, + "grad_norm": 1.8725408861909332, + "language_loss": 1.0131824, + "learning_rate": 2.8972122815946455e-06, + "loss": 1.03633487, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.78515625, + "step": 90, + "time_per_iteration": 2.578850746154785 + }, + { + "auxiliary_loss_clip": 0.01243275, + "auxiliary_loss_mlp": 0.01080846, + "balance_loss_clip": 1.0369767, + "balance_loss_mlp": 1.16579485, + "epoch": 0.00547121599278521, + "flos": 21177510433920.0, + "grad_norm": 1.8228078622960453, + "language_loss": 0.9514904, + "learning_rate": 2.90432674275074e-06, + "loss": 0.97473168, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.7734375, + "step": 91, + "time_per_iteration": 2.555706024169922 + }, + { + "auxiliary_loss_clip": 0.01241529, + "auxiliary_loss_mlp": 0.01071585, + "balance_loss_clip": 1.02728713, + "balance_loss_mlp": 1.16463041, + "epoch": 0.005531339245453179, + "flos": 19717791822720.0, + "grad_norm": 1.908948796633992, + "language_loss": 1.00433707, + "learning_rate": 2.91136344867656e-06, + "loss": 1.0274682, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.76953125, + "step": 92, + "time_per_iteration": 2.6066746711730957 + }, + { + "auxiliary_loss_clip": 0.01237542, + "auxiliary_loss_mlp": 0.0105625, + "balance_loss_clip": 1.01643443, + "balance_loss_mlp": 1.16334677, + "epoch": 0.005591462498121149, + "flos": 17635222823040.0, + "grad_norm": 2.2342423337120603, + "language_loss": 1.11653519, + "learning_rate": 2.918324080615938e-06, + "loss": 1.13947308, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.7421875, + "step": 93, + "time_per_iteration": 2.499570846557617 + }, + { + "auxiliary_loss_clip": 0.01239023, + "auxiliary_loss_mlp": 0.01060369, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.16351247, + "epoch": 0.005651585750789117, + "flos": 20010189392640.0, + "grad_norm": 1.8136616275795483, + "language_loss": 1.00727367, + "learning_rate": 2.925210265866963e-06, + "loss": 1.0302676, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.75390625, + "step": 94, + "time_per_iteration": 2.5293662548065186 + }, + { + "auxiliary_loss_clip": 0.01242183, + "auxiliary_loss_mlp": 0.01299076, + "balance_loss_clip": 1.27351713, + "balance_loss_mlp": 1.1909281, + "epoch": 0.005711709003457087, + "flos": 59809917185280.0, + "grad_norm": 1.5632806356434423, + "language_loss": 0.68391669, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70932931, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.51171875, + "step": 95, + "time_per_iteration": 2.987596273422241 + }, + { + "auxiliary_loss_clip": 0.01235533, + "auxiliary_loss_mlp": 0.01096045, + "balance_loss_clip": 1.05508494, + "balance_loss_mlp": 1.1599288, + "epoch": 0.005771832256125056, + "flos": 15558693488640.0, + "grad_norm": 1.7943235705518, + "language_loss": 1.02090073, + "learning_rate": 2.9387655493491906e-06, + "loss": 1.04421663, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.7578125, + "step": 96, + "time_per_iteration": 3.9576892852783203 + }, + { + "auxiliary_loss_clip": 0.01237975, + "auxiliary_loss_mlp": 0.01192051, + "balance_loss_clip": 1.14813447, + "balance_loss_mlp": 1.16075039, + "epoch": 0.005831955508793026, + "flos": 22527287573760.0, + "grad_norm": 2.012633007760356, + "language_loss": 1.05167651, + "learning_rate": 2.9454376524092147e-06, + "loss": 1.07597673, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.7734375, + "step": 97, + "time_per_iteration": 3.9907126426696777 + }, + { + "auxiliary_loss_clip": 0.01236739, + "auxiliary_loss_mlp": 0.01164693, + "balance_loss_clip": 1.11698508, + "balance_loss_mlp": 1.15853596, + "epoch": 0.005892078761460995, + "flos": 22048872946560.0, + "grad_norm": 1.8680019172606075, + "language_loss": 0.84544736, + "learning_rate": 2.952041322436969e-06, + "loss": 0.86946172, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.78125, + "step": 98, + "time_per_iteration": 2.5436413288116455 + }, + { + "auxiliary_loss_clip": 0.01228903, + "auxiliary_loss_mlp": 0.01271366, + "balance_loss_clip": 1.24733341, + "balance_loss_mlp": 1.17839527, + "epoch": 0.005952202014128965, + "flos": 68535689598720.0, + "grad_norm": 1.2114521693221167, + "language_loss": 0.6578337, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68283641, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.5078125, + "step": 99, + "time_per_iteration": 3.1738245487213135 + }, + { + "auxiliary_loss_clip": 0.01237051, + "auxiliary_loss_mlp": 0.01095208, + "balance_loss_clip": 1.04056239, + "balance_loss_mlp": 1.15641499, + "epoch": 0.006012325266796933, + "flos": 22959931541760.0, + "grad_norm": 1.7681555320421318, + "language_loss": 1.03088045, + "learning_rate": 2.9650488796560464e-06, + "loss": 1.05420291, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.80859375, + "step": 100, + "time_per_iteration": 2.554457664489746 + }, + { + "auxiliary_loss_clip": 0.01233963, + "auxiliary_loss_mlp": 0.01201699, + "balance_loss_clip": 1.15279961, + "balance_loss_mlp": 1.15491962, + "epoch": 0.006072448519464903, + "flos": 17346979704960.0, + "grad_norm": 1.97824695090203, + "language_loss": 1.01097822, + "learning_rate": 2.971455421902446e-06, + "loss": 1.03533506, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.7890625, + "step": 101, + "time_per_iteration": 2.5085086822509766 + }, + { + "auxiliary_loss_clip": 0.01227307, + "auxiliary_loss_mlp": 0.0116684, + "balance_loss_clip": 1.12227917, + "balance_loss_mlp": 1.15449381, + "epoch": 0.006132571772132872, + "flos": 24679962316800.0, + "grad_norm": 1.7771090996803043, + "language_loss": 1.03684199, + "learning_rate": 2.9777988444798075e-06, + "loss": 1.06078339, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.7265625, + "step": 102, + "time_per_iteration": 2.5612902641296387 + }, + { + "auxiliary_loss_clip": 0.01222353, + "auxiliary_loss_mlp": 0.01137773, + "balance_loss_clip": 1.09419012, + "balance_loss_mlp": 1.15168059, + "epoch": 0.006192695024800842, + "flos": 21464741122560.0, + "grad_norm": 1.970948373339513, + "language_loss": 0.97891676, + "learning_rate": 2.9840803790210285e-06, + "loss": 1.00251794, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.70703125, + "step": 103, + "time_per_iteration": 2.532970666885376 + }, + { + "auxiliary_loss_clip": 0.01217977, + "auxiliary_loss_mlp": 0.0108734, + "balance_loss_clip": 1.04873979, + "balance_loss_mlp": 1.14728475, + "epoch": 0.006252818277468811, + "flos": 17419459420800.0, + "grad_norm": 1.702537147167651, + "language_loss": 1.00995564, + "learning_rate": 2.990301221458371e-06, + "loss": 1.03300893, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.70703125, + "step": 104, + "time_per_iteration": 2.5523431301116943 + }, + { + "auxiliary_loss_clip": 0.01232147, + "auxiliary_loss_mlp": 0.01064679, + "balance_loss_clip": 1.02929747, + "balance_loss_mlp": 1.15771067, + "epoch": 0.006312941530136781, + "flos": 19098537304320.0, + "grad_norm": 1.9100223115199066, + "language_loss": 1.08033586, + "learning_rate": 2.9964625333900544e-06, + "loss": 1.10330415, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.7421875, + "step": 105, + "time_per_iteration": 2.5272581577301025 + }, + { + "auxiliary_loss_clip": 0.01252854, + "auxiliary_loss_mlp": 0.01151895, + "balance_loss_clip": 1.11341465, + "balance_loss_mlp": 1.17395711, + "epoch": 0.006373064782804749, + "flos": 24059695368960.0, + "grad_norm": 2.0496554687033752, + "language_loss": 1.04248106, + "learning_rate": 3.002565443382063e-06, + "loss": 1.06652856, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.7890625, + "step": 106, + "time_per_iteration": 2.574812889099121 + }, + { + "auxiliary_loss_clip": 0.01245715, + "auxiliary_loss_mlp": 0.01190595, + "balance_loss_clip": 1.15201867, + "balance_loss_mlp": 1.16812122, + "epoch": 0.006433188035472719, + "flos": 18331460622720.0, + "grad_norm": 1.8950084325220462, + "language_loss": 0.99633414, + "learning_rate": 3.008611048208843e-06, + "loss": 1.02069736, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.7734375, + "step": 107, + "time_per_iteration": 2.481133222579956 + }, + { + "auxiliary_loss_clip": 0.01245767, + "auxiliary_loss_mlp": 0.01089366, + "balance_loss_clip": 1.06981611, + "balance_loss_mlp": 1.18363643, + "epoch": 0.006493311288140688, + "flos": 62558907816960.0, + "grad_norm": 1.0933082034188868, + "language_loss": 0.65013921, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67349052, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.62109375, + "step": 108, + "time_per_iteration": 3.1186723709106445 + }, + { + "auxiliary_loss_clip": 0.01216437, + "auxiliary_loss_mlp": 0.01141232, + "balance_loss_clip": 1.10146368, + "balance_loss_mlp": 1.14801717, + "epoch": 0.006553434540808658, + "flos": 19499130777600.0, + "grad_norm": 1.8055960807637508, + "language_loss": 1.08323288, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.10680962, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6875, + "step": 109, + "time_per_iteration": 2.4944097995758057 + }, + { + "auxiliary_loss_clip": 0.01207168, + "auxiliary_loss_mlp": 0.01182229, + "balance_loss_clip": 1.14262795, + "balance_loss_mlp": 1.14232838, + "epoch": 0.006613557793476627, + "flos": 21104088111360.0, + "grad_norm": 1.5176005721534047, + "language_loss": 0.94617105, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.970065, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.6484375, + "step": 110, + "time_per_iteration": 2.5456273555755615 + }, + { + "auxiliary_loss_clip": 0.01207781, + "auxiliary_loss_mlp": 0.01078394, + "balance_loss_clip": 1.04039049, + "balance_loss_mlp": 1.14323533, + "epoch": 0.006673681046144597, + "flos": 26029564899840.0, + "grad_norm": 1.7413916722577985, + "language_loss": 0.89089143, + "learning_rate": 3.032241303393073e-06, + "loss": 0.91375327, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.6484375, + "step": 111, + "time_per_iteration": 2.546675443649292 + }, + { + "auxiliary_loss_clip": 0.01212908, + "auxiliary_loss_mlp": 0.01074555, + "balance_loss_clip": 1.03700376, + "balance_loss_mlp": 1.14756036, + "epoch": 0.006733804298812566, + "flos": 23146681737600.0, + "grad_norm": 1.6461578012583646, + "language_loss": 1.02043927, + "learning_rate": 3.0380158011446e-06, + "loss": 1.04331386, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.65625, + "step": 112, + "time_per_iteration": 2.530122995376587 + }, + { + "auxiliary_loss_clip": 0.01208815, + "auxiliary_loss_mlp": 0.01157701, + "balance_loss_clip": 1.12007833, + "balance_loss_mlp": 1.14477301, + "epoch": 0.006793927551480535, + "flos": 11763669479040.0, + "grad_norm": 2.0180373693240323, + "language_loss": 0.93481207, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.9584772, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.640625, + "step": 113, + "time_per_iteration": 2.502267360687256 + }, + { + "auxiliary_loss_clip": 0.01196857, + "auxiliary_loss_mlp": 0.01117322, + "balance_loss_clip": 1.08148742, + "balance_loss_mlp": 1.13487744, + "epoch": 0.006854050804148504, + "flos": 19170947197440.0, + "grad_norm": 1.6503455504244549, + "language_loss": 1.04036093, + "learning_rate": 3.0494117125071475e-06, + "loss": 1.06350279, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.62109375, + "step": 114, + "time_per_iteration": 2.512989044189453 + }, + { + "auxiliary_loss_clip": 0.01189554, + "auxiliary_loss_mlp": 0.01067583, + "balance_loss_clip": 1.03422856, + "balance_loss_mlp": 1.1274718, + "epoch": 0.006914174056816474, + "flos": 21980792062080.0, + "grad_norm": 1.7224638794336582, + "language_loss": 1.05511618, + "learning_rate": 3.055034911425055e-06, + "loss": 1.0776875, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.62109375, + "step": 115, + "time_per_iteration": 2.5234997272491455 + }, + { + "auxiliary_loss_clip": 0.01194388, + "auxiliary_loss_mlp": 0.0109411, + "balance_loss_clip": 1.05694044, + "balance_loss_mlp": 1.12844658, + "epoch": 0.006974297309484443, + "flos": 16288238592000.0, + "grad_norm": 1.7813885375950698, + "language_loss": 0.96385103, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.98673606, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.66015625, + "step": 116, + "time_per_iteration": 2.473341464996338 + }, + { + "auxiliary_loss_clip": 0.01196451, + "auxiliary_loss_mlp": 0.01101771, + "balance_loss_clip": 1.06689072, + "balance_loss_mlp": 1.13037086, + "epoch": 0.007034420562152413, + "flos": 26102812665600.0, + "grad_norm": 1.8036944158860786, + "language_loss": 1.02992558, + "learning_rate": 3.0661360861454656e-06, + "loss": 1.05290771, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.66015625, + "step": 117, + "time_per_iteration": 2.5610122680664062 + }, + { + "auxiliary_loss_clip": 0.01192572, + "auxiliary_loss_mlp": 0.01069941, + "balance_loss_clip": 1.03587055, + "balance_loss_mlp": 1.12674475, + "epoch": 0.007094543814820382, + "flos": 14203889112960.0, + "grad_norm": 1.9399774393714495, + "language_loss": 0.98128647, + "learning_rate": 3.071615712271274e-06, + "loss": 1.00391161, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.65625, + "step": 118, + "time_per_iteration": 2.489553928375244 + }, + { + "auxiliary_loss_clip": 0.01187548, + "auxiliary_loss_mlp": 0.01046178, + "balance_loss_clip": 1.01375341, + "balance_loss_mlp": 1.12392044, + "epoch": 0.007154667067488351, + "flos": 14975120246400.0, + "grad_norm": 1.8259466580912889, + "language_loss": 1.08616149, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.10849881, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.63671875, + "step": 119, + "time_per_iteration": 2.467879056930542 + }, + { + "auxiliary_loss_clip": 0.01186702, + "auxiliary_loss_mlp": 0.01074925, + "balance_loss_clip": 1.03842318, + "balance_loss_mlp": 1.12444115, + "epoch": 0.00721479032015632, + "flos": 20192261466240.0, + "grad_norm": 2.033249640870371, + "language_loss": 1.10876846, + "learning_rate": 3.082437012097686e-06, + "loss": 1.13138461, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.625, + "step": 120, + "time_per_iteration": 2.505545139312744 + }, + { + "auxiliary_loss_clip": 0.01178172, + "auxiliary_loss_mlp": 0.01076028, + "balance_loss_clip": 1.04298306, + "balance_loss_mlp": 1.11830711, + "epoch": 0.00727491357282429, + "flos": 23146158067200.0, + "grad_norm": 1.6564435994101478, + "language_loss": 0.99066359, + "learning_rate": 3.0877802144103967e-06, + "loss": 1.01320553, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.6015625, + "step": 121, + "time_per_iteration": 2.5707788467407227 + }, + { + "auxiliary_loss_clip": 0.01176372, + "auxiliary_loss_mlp": 0.01050122, + "balance_loss_clip": 1.01605237, + "balance_loss_mlp": 1.11646891, + "epoch": 0.007335036825492259, + "flos": 15520812796800.0, + "grad_norm": 2.1572009005213566, + "language_loss": 1.04852247, + "learning_rate": 3.09307943925077e-06, + "loss": 1.07078755, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.59765625, + "step": 122, + "time_per_iteration": 2.4875452518463135 + }, + { + "auxiliary_loss_clip": 0.0117421, + "auxiliary_loss_mlp": 0.01073096, + "balance_loss_clip": 1.03807211, + "balance_loss_mlp": 1.11477709, + "epoch": 0.007395160078160229, + "flos": 24242221290240.0, + "grad_norm": 1.8772083341634107, + "language_loss": 1.04477477, + "learning_rate": 3.0983354046304154e-06, + "loss": 1.06724787, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.59375, + "step": 123, + "time_per_iteration": 2.528247117996216 + }, + { + "auxiliary_loss_clip": 0.01176097, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04555905, + "balance_loss_mlp": 1.11549723, + "epoch": 0.007455283330828198, + "flos": 31758428050560.0, + "grad_norm": 1.6752973589855071, + "language_loss": 0.8493703, + "learning_rate": 3.103548811118979e-06, + "loss": 0.87192088, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.60546875, + "step": 124, + "time_per_iteration": 2.571991443634033 + }, + { + "auxiliary_loss_clip": 0.01170827, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.01230574, + "balance_loss_mlp": 1.11192513, + "epoch": 0.007515406583496167, + "flos": 26613941103360.0, + "grad_norm": 1.8416054911415718, + "language_loss": 1.00744247, + "learning_rate": 3.108720342404542e-06, + "loss": 1.02957344, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.58984375, + "step": 125, + "time_per_iteration": 2.5586955547332764 + }, + { + "auxiliary_loss_clip": 0.01168693, + "auxiliary_loss_mlp": 0.01042229, + "balance_loss_clip": 1.0125463, + "balance_loss_mlp": 1.11198676, + "epoch": 0.007575529836164136, + "flos": 18222706137600.0, + "grad_norm": 3.002509575058275, + "language_loss": 1.01587784, + "learning_rate": 3.1138506658316945e-06, + "loss": 1.03798699, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.56640625, + "step": 126, + "time_per_iteration": 2.4801864624023438 + }, + { + "auxiliary_loss_clip": 0.01170648, + "auxiliary_loss_mlp": 0.01054595, + "balance_loss_clip": 1.02290893, + "balance_loss_mlp": 1.11264396, + "epoch": 0.007635653088832106, + "flos": 21579325804800.0, + "grad_norm": 2.0439053980232926, + "language_loss": 0.84967506, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.87192738, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.578125, + "step": 127, + "time_per_iteration": 2.5059218406677246 + }, + { + "auxiliary_loss_clip": 0.01169939, + "auxiliary_loss_mlp": 0.01065317, + "balance_loss_clip": 1.03293967, + "balance_loss_mlp": 1.11319852, + "epoch": 0.007695776341500075, + "flos": 25373861055360.0, + "grad_norm": 1.776339512470131, + "language_loss": 0.96083999, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.98319256, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.56640625, + "step": 128, + "time_per_iteration": 2.5194311141967773 + }, + { + "auxiliary_loss_clip": 0.01164918, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.0119257, + "balance_loss_mlp": 1.10898721, + "epoch": 0.007755899594168045, + "flos": 22342876439040.0, + "grad_norm": 1.5367217544527998, + "language_loss": 0.91637528, + "learning_rate": 3.129000827968184e-06, + "loss": 0.93846893, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.55859375, + "step": 129, + "time_per_iteration": 2.5285797119140625 + }, + { + "auxiliary_loss_clip": 0.01165619, + "auxiliary_loss_mlp": 0.01053511, + "balance_loss_clip": 1.02192056, + "balance_loss_mlp": 1.10991979, + "epoch": 0.007816022846836013, + "flos": 22637124311040.0, + "grad_norm": 1.7814462279799346, + "language_loss": 1.06674862, + "learning_rate": 3.133972684206866e-06, + "loss": 1.08894002, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.5546875, + "step": 130, + "time_per_iteration": 2.5282108783721924 + }, + { + "auxiliary_loss_clip": 0.01170575, + "auxiliary_loss_mlp": 0.01076454, + "balance_loss_clip": 1.04297984, + "balance_loss_mlp": 1.11391091, + "epoch": 0.007876146099503984, + "flos": 18182032536960.0, + "grad_norm": 1.7820110478159792, + "language_loss": 0.91168344, + "learning_rate": 3.138906441556014e-06, + "loss": 0.93415374, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.56640625, + "step": 131, + "time_per_iteration": 2.503371238708496 + }, + { + "auxiliary_loss_clip": 0.01166018, + "auxiliary_loss_mlp": 0.01065416, + "balance_loss_clip": 1.03554177, + "balance_loss_mlp": 1.10995984, + "epoch": 0.007936269352171952, + "flos": 27118436382720.0, + "grad_norm": 1.807337774587413, + "language_loss": 0.91810304, + "learning_rate": 3.143802679474861e-06, + "loss": 0.94041741, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.5625, + "step": 132, + "time_per_iteration": 2.5488345623016357 + }, + { + "auxiliary_loss_clip": 0.01158313, + "auxiliary_loss_mlp": 0.01041248, + "balance_loss_clip": 1.01266193, + "balance_loss_mlp": 1.10263288, + "epoch": 0.007996392604839923, + "flos": 19025324449920.0, + "grad_norm": 1.8219003459879866, + "language_loss": 1.04806709, + "learning_rate": 3.1486619643025565e-06, + "loss": 1.07006276, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.5546875, + "step": 133, + "time_per_iteration": 2.536303997039795 + }, + { + "auxiliary_loss_clip": 0.01156624, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.01113486, + "balance_loss_mlp": 1.10205114, + "epoch": 0.008056515857507891, + "flos": 25482964654080.0, + "grad_norm": 1.3873604671091446, + "language_loss": 0.80927575, + "learning_rate": 3.153484849651286e-06, + "loss": 0.83123219, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.546875, + "step": 134, + "time_per_iteration": 2.541942596435547 + }, + { + "auxiliary_loss_clip": 0.01159421, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.01704311, + "balance_loss_mlp": 1.10331714, + "epoch": 0.00811663911017586, + "flos": 20556545258880.0, + "grad_norm": 2.385722404614008, + "language_loss": 1.03546047, + "learning_rate": 3.1582718767847806e-06, + "loss": 1.05753994, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.5625, + "step": 135, + "time_per_iteration": 5.4628942012786865 + }, + { + "auxiliary_loss_clip": 0.01154965, + "auxiliary_loss_mlp": 0.01049689, + "balance_loss_clip": 1.01929116, + "balance_loss_mlp": 1.09991395, + "epoch": 0.00817676236284383, + "flos": 18798947994240.0, + "grad_norm": 2.3107108551537663, + "language_loss": 1.03597558, + "learning_rate": 3.1630235749828485e-06, + "loss": 1.05802214, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.55078125, + "step": 136, + "time_per_iteration": 2.5820934772491455 + }, + { + "auxiliary_loss_clip": 0.01153489, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.01446652, + "balance_loss_mlp": 1.09903002, + "epoch": 0.008236885615511799, + "flos": 23872596059520.0, + "grad_norm": 1.8074169417862755, + "language_loss": 0.96794724, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.98992509, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.546875, + "step": 137, + "time_per_iteration": 4.063216924667358 + }, + { + "auxiliary_loss_clip": 0.01147655, + "auxiliary_loss_mlp": 0.01038887, + "balance_loss_clip": 1.00898957, + "balance_loss_mlp": 1.09404254, + "epoch": 0.00829700886817977, + "flos": 24642500561280.0, + "grad_norm": 1.5223026522970442, + "language_loss": 0.97398341, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.99584889, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.53515625, + "step": 138, + "time_per_iteration": 2.5396857261657715 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.01047484, + "balance_loss_clip": 1.01837313, + "balance_loss_mlp": 1.09264004, + "epoch": 0.008357132120847738, + "flos": 25260917207040.0, + "grad_norm": 1.8098998313141175, + "language_loss": 0.98372579, + "learning_rate": 3.177071816289865e-06, + "loss": 1.00565982, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.53125, + "step": 139, + "time_per_iteration": 2.545809030532837 + }, + { + "auxiliary_loss_clip": 0.01147291, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.01779318, + "balance_loss_mlp": 1.09388614, + "epoch": 0.008417255373515706, + "flos": 27343660763520.0, + "grad_norm": 1.8951216184509516, + "language_loss": 1.0093379, + "learning_rate": 3.181687263893095e-06, + "loss": 1.03128684, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.53125, + "step": 140, + "time_per_iteration": 2.5713443756103516 + }, + { + "auxiliary_loss_clip": 0.01148874, + "auxiliary_loss_mlp": 0.01049423, + "balance_loss_clip": 1.02033627, + "balance_loss_mlp": 1.09526396, + "epoch": 0.008477378626183677, + "flos": 17638120465920.0, + "grad_norm": 2.004789896250355, + "language_loss": 0.98197573, + "learning_rate": 3.186269861057098e-06, + "loss": 1.0039587, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.53515625, + "step": 141, + "time_per_iteration": 2.620731830596924 + }, + { + "auxiliary_loss_clip": 0.0114262, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.01462853, + "balance_loss_mlp": 1.09023046, + "epoch": 0.008537501878851645, + "flos": 13880488389120.0, + "grad_norm": 1.852408252827294, + "language_loss": 0.97836852, + "learning_rate": 3.1908200721048745e-06, + "loss": 1.00021958, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.5234375, + "step": 142, + "time_per_iteration": 2.4873030185699463 + }, + { + "auxiliary_loss_clip": 0.01130772, + "auxiliary_loss_mlp": 0.01092867, + "balance_loss_clip": 1.07493758, + "balance_loss_mlp": 1.09406507, + "epoch": 0.008597625131519616, + "flos": 71244320832000.0, + "grad_norm": 1.619044840787843, + "language_loss": 0.67234296, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69457936, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.3671875, + "step": 143, + "time_per_iteration": 3.2432479858398438 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.0108049, + "balance_loss_clip": 1.05173695, + "balance_loss_mlp": 1.08797789, + "epoch": 0.008657748384187584, + "flos": 17601880608000.0, + "grad_norm": 1.8105849728074743, + "language_loss": 0.9700436, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.99224401, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.515625, + "step": 144, + "time_per_iteration": 2.467697858810425 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_clip": 1.06476378, + "balance_loss_mlp": 1.0864017, + "epoch": 0.008717871636855555, + "flos": 19714405420800.0, + "grad_norm": 1.5708548721482403, + "language_loss": 0.99818945, + "learning_rate": 3.204280886775619e-06, + "loss": 1.02052689, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.53125, + "step": 145, + "time_per_iteration": 2.4968101978302 + }, + { + "auxiliary_loss_clip": 0.01139541, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_clip": 1.04696488, + "balance_loss_mlp": 1.08617938, + "epoch": 0.008777994889523523, + "flos": 24716271997440.0, + "grad_norm": 1.5243322166195072, + "language_loss": 0.97406828, + "learning_rate": 3.208706005112005e-06, + "loss": 0.99622923, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.53125, + "step": 146, + "time_per_iteration": 2.558898448944092 + }, + { + "auxiliary_loss_clip": 0.01119786, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.02805781, + "balance_loss_mlp": 1.0857501, + "epoch": 0.008838118142191492, + "flos": 70128916715520.0, + "grad_norm": 0.9323611416996458, + "language_loss": 0.60507274, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62665844, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.33984375, + "step": 147, + "time_per_iteration": 3.1915223598480225 + }, + { + "auxiliary_loss_clip": 0.01138769, + "auxiliary_loss_mlp": 0.01080127, + "balance_loss_clip": 1.05213618, + "balance_loss_mlp": 1.08553386, + "epoch": 0.008898241394859462, + "flos": 20043845809920.0, + "grad_norm": 1.81740094174444, + "language_loss": 0.91860807, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.94079697, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.53125, + "step": 148, + "time_per_iteration": 2.505553960800171 + }, + { + "auxiliary_loss_clip": 0.01140306, + "auxiliary_loss_mlp": 0.0110827, + "balance_loss_clip": 1.07756126, + "balance_loss_mlp": 1.08673239, + "epoch": 0.008958364647527431, + "flos": 10742843969280.0, + "grad_norm": 1.8750336527530238, + "language_loss": 0.97847569, + "learning_rate": 3.2218017552198588e-06, + "loss": 1.00096154, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.53515625, + "step": 149, + "time_per_iteration": 2.4914822578430176 + }, + { + "auxiliary_loss_clip": 0.01140386, + "auxiliary_loss_mlp": 0.01092028, + "balance_loss_clip": 1.05931664, + "balance_loss_mlp": 1.08699977, + "epoch": 0.009018487900195401, + "flos": 29126326250880.0, + "grad_norm": 1.8228253489488129, + "language_loss": 1.04904997, + "learning_rate": 3.226108474846181e-06, + "loss": 1.07137418, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.53515625, + "step": 150, + "time_per_iteration": 2.5454583168029785 + }, + { + "auxiliary_loss_clip": 0.0113672, + "auxiliary_loss_mlp": 0.01063869, + "balance_loss_clip": 1.03273177, + "balance_loss_mlp": 1.08456898, + "epoch": 0.00907861115286337, + "flos": 32962268240640.0, + "grad_norm": 1.6721955613619137, + "language_loss": 0.85564417, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.87765008, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.5234375, + "step": 151, + "time_per_iteration": 2.598466634750366 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.01860094, + "balance_loss_mlp": 1.09065437, + "epoch": 0.009138734405531338, + "flos": 21761362967040.0, + "grad_norm": 1.8065339788154082, + "language_loss": 1.01437366, + "learning_rate": 3.234636443010188e-06, + "loss": 1.03628099, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.515625, + "step": 152, + "time_per_iteration": 2.549598455429077 + }, + { + "auxiliary_loss_clip": 0.01149248, + "auxiliary_loss_mlp": 0.01091286, + "balance_loss_clip": 1.0635221, + "balance_loss_mlp": 1.09889424, + "epoch": 0.009198857658199309, + "flos": 20841681265920.0, + "grad_norm": 2.3865125594860905, + "language_loss": 1.02526176, + "learning_rate": 3.238858439669943e-06, + "loss": 1.04766715, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.50390625, + "step": 153, + "time_per_iteration": 2.5359206199645996 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01135706, + "balance_loss_clip": 1.10788298, + "balance_loss_mlp": 1.10075128, + "epoch": 0.009258980910867277, + "flos": 24826213468800.0, + "grad_norm": 1.6487137201427435, + "language_loss": 0.95505238, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.97792649, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.5078125, + "step": 154, + "time_per_iteration": 2.5430989265441895 + }, + { + "auxiliary_loss_clip": 0.0113804, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_clip": 1.0253154, + "balance_loss_mlp": 1.08814311, + "epoch": 0.009319104163535248, + "flos": 28766511112320.0, + "grad_norm": 1.7505227703301158, + "language_loss": 0.96661776, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.98851645, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.5, + "step": 155, + "time_per_iteration": 2.5747148990631104 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.01144826, + "balance_loss_mlp": 1.08071637, + "epoch": 0.009379227416203216, + "flos": 16581055098240.0, + "grad_norm": 1.8411651345899376, + "language_loss": 0.9960739, + "learning_rate": 3.2513608166485063e-06, + "loss": 1.0177896, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.50390625, + "step": 156, + "time_per_iteration": 2.4934210777282715 + }, + { + "auxiliary_loss_clip": 0.01135313, + "auxiliary_loss_mlp": 0.01070251, + "balance_loss_clip": 1.04013824, + "balance_loss_mlp": 1.08538198, + "epoch": 0.009439350668871187, + "flos": 18329016827520.0, + "grad_norm": 2.0317943904965476, + "language_loss": 1.11902428, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.1410799, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.5, + "step": 157, + "time_per_iteration": 2.487980604171753 + }, + { + "auxiliary_loss_clip": 0.0113337, + "auxiliary_loss_mlp": 0.01066998, + "balance_loss_clip": 1.04098606, + "balance_loss_mlp": 1.08573306, + "epoch": 0.009499473921539155, + "flos": 24348846182400.0, + "grad_norm": 1.7869285150680119, + "language_loss": 0.99281025, + "learning_rate": 3.2595628662110186e-06, + "loss": 1.0148139, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.4765625, + "step": 158, + "time_per_iteration": 2.5409553050994873 + }, + { + "auxiliary_loss_clip": 0.01133501, + "auxiliary_loss_mlp": 0.01048139, + "balance_loss_clip": 1.02364159, + "balance_loss_mlp": 1.08584833, + "epoch": 0.009559597174207124, + "flos": 16398389531520.0, + "grad_norm": 2.016681159532475, + "language_loss": 1.00034785, + "learning_rate": 3.2636250385721982e-06, + "loss": 1.02216423, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.4765625, + "step": 159, + "time_per_iteration": 2.5823416709899902 + }, + { + "auxiliary_loss_clip": 0.01132978, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.01565897, + "balance_loss_mlp": 1.08596921, + "epoch": 0.009619720426875094, + "flos": 22855785356160.0, + "grad_norm": 1.4648504807235845, + "language_loss": 0.93612546, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.95786095, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.47070312, + "step": 160, + "time_per_iteration": 2.5437843799591064 + }, + { + "auxiliary_loss_clip": 0.01133021, + "auxiliary_loss_mlp": 0.01053092, + "balance_loss_clip": 1.02617431, + "balance_loss_mlp": 1.0849607, + "epoch": 0.009679843679543063, + "flos": 19134009112320.0, + "grad_norm": 1.913749567925863, + "language_loss": 1.04869103, + "learning_rate": 3.2716732956621042e-06, + "loss": 1.07055211, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.48046875, + "step": 161, + "time_per_iteration": 2.5234079360961914 + }, + { + "auxiliary_loss_clip": 0.01132437, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_clip": 1.05380058, + "balance_loss_mlp": 1.08474565, + "epoch": 0.009739966932211033, + "flos": 20301958558080.0, + "grad_norm": 1.6130591069013445, + "language_loss": 1.0267837, + "learning_rate": 3.2756600092264203e-06, + "loss": 1.0489192, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4765625, + "step": 162, + "time_per_iteration": 2.4932761192321777 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01132878, + "balance_loss_clip": 1.11695135, + "balance_loss_mlp": 1.08284581, + "epoch": 0.009800090184879002, + "flos": 67031073112320.0, + "grad_norm": 1.261144539344647, + "language_loss": 0.72789085, + "learning_rate": 3.279622189013474e-06, + "loss": 0.75034058, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.29296875, + "step": 163, + "time_per_iteration": 3.085137367248535 + }, + { + "auxiliary_loss_clip": 0.01128606, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.01294446, + "balance_loss_mlp": 1.08287776, + "epoch": 0.00986021343754697, + "flos": 17163755556480.0, + "grad_norm": 1.8763686953702905, + "language_loss": 0.97455072, + "learning_rate": 3.283560135133457e-06, + "loss": 0.99622023, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.45703125, + "step": 164, + "time_per_iteration": 2.4988625049591064 + }, + { + "auxiliary_loss_clip": 0.01125758, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_clip": 1.05584359, + "balance_loss_mlp": 1.07921314, + "epoch": 0.00992033669021494, + "flos": 17748445962240.0, + "grad_norm": 3.409909989672282, + "language_loss": 1.01168895, + "learning_rate": 3.2874741422233565e-06, + "loss": 1.03377426, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.46484375, + "step": 165, + "time_per_iteration": 2.4588077068328857 + }, + { + "auxiliary_loss_clip": 0.01130602, + "auxiliary_loss_mlp": 0.01098895, + "balance_loss_clip": 1.07002246, + "balance_loss_mlp": 1.08373833, + "epoch": 0.00998045994288291, + "flos": 25296109724160.0, + "grad_norm": 1.606780359975797, + "language_loss": 0.90116942, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.92346436, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.46875, + "step": 166, + "time_per_iteration": 2.5571889877319336 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.01583886, + "balance_loss_mlp": 1.08215737, + "epoch": 0.01004058319555088, + "flos": 32297801644800.0, + "grad_norm": 1.884502432506522, + "language_loss": 1.01637363, + "learning_rate": 3.2952314912845914e-06, + "loss": 1.03807402, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.45898438, + "step": 167, + "time_per_iteration": 2.6032469272613525 + }, + { + "auxiliary_loss_clip": 0.01129957, + "auxiliary_loss_mlp": 0.01061388, + "balance_loss_clip": 1.03569889, + "balance_loss_mlp": 1.08292508, + "epoch": 0.010100706448218848, + "flos": 11319365116800.0, + "grad_norm": 1.99882234204152, + "language_loss": 1.06026495, + "learning_rate": 3.299075396334735e-06, + "loss": 1.08217835, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.47070312, + "step": 168, + "time_per_iteration": 2.495055675506592 + }, + { + "auxiliary_loss_clip": 0.01131425, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_clip": 1.0677793, + "balance_loss_mlp": 1.08162379, + "epoch": 0.010160829700886819, + "flos": 29718103662720.0, + "grad_norm": 1.4340699365950984, + "language_loss": 0.94515836, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.96741831, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.49804688, + "step": 169, + "time_per_iteration": 2.6222143173217773 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01047869, + "balance_loss_clip": 1.02018809, + "balance_loss_mlp": 1.08031499, + "epoch": 0.010220952953554787, + "flos": 20411306536320.0, + "grad_norm": 1.5025721976504178, + "language_loss": 0.95044106, + "learning_rate": 3.306695037731344e-06, + "loss": 0.97221047, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.48828125, + "step": 170, + "time_per_iteration": 2.5119643211364746 + }, + { + "auxiliary_loss_clip": 0.01128434, + "auxiliary_loss_mlp": 0.01049745, + "balance_loss_clip": 1.02148008, + "balance_loss_mlp": 1.07969916, + "epoch": 0.010281076206222756, + "flos": 31283783850240.0, + "grad_norm": 1.6022968043148427, + "language_loss": 1.00824785, + "learning_rate": 3.3104713076972827e-06, + "loss": 1.03002965, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.48828125, + "step": 171, + "time_per_iteration": 2.606609582901001 + }, + { + "auxiliary_loss_clip": 0.01122217, + "auxiliary_loss_mlp": 0.01083674, + "balance_loss_clip": 1.05616105, + "balance_loss_mlp": 1.07616639, + "epoch": 0.010341199458890726, + "flos": 21981176087040.0, + "grad_norm": 1.6616138774649498, + "language_loss": 0.98271638, + "learning_rate": 3.314225558471224e-06, + "loss": 1.00477529, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.4609375, + "step": 172, + "time_per_iteration": 2.5501956939697266 + }, + { + "auxiliary_loss_clip": 0.01114095, + "auxiliary_loss_mlp": 0.01053214, + "balance_loss_clip": 1.02593875, + "balance_loss_mlp": 1.07071829, + "epoch": 0.010401322711558695, + "flos": 30809209472640.0, + "grad_norm": 1.4117990446047644, + "language_loss": 0.89141834, + "learning_rate": 3.317958045350308e-06, + "loss": 0.91309136, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.43359375, + "step": 173, + "time_per_iteration": 2.5926830768585205 + }, + { + "auxiliary_loss_clip": 0.01113294, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.01331067, + "balance_loss_mlp": 1.0681175, + "epoch": 0.010461445964226665, + "flos": 24714037670400.0, + "grad_norm": 1.5824782001419004, + "language_loss": 0.92250144, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.94403386, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.453125, + "step": 174, + "time_per_iteration": 2.5231244564056396 + }, + { + "auxiliary_loss_clip": 0.01111766, + "auxiliary_loss_mlp": 0.01073474, + "balance_loss_clip": 1.04599655, + "balance_loss_mlp": 1.06691909, + "epoch": 0.010521569216894634, + "flos": 27709096631040.0, + "grad_norm": 1.7159644708178656, + "language_loss": 0.82094216, + "learning_rate": 3.325358726641591e-06, + "loss": 0.84279454, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.44921875, + "step": 175, + "time_per_iteration": 5.420367240905762 + }, + { + "auxiliary_loss_clip": 0.01108329, + "auxiliary_loss_mlp": 0.01061841, + "balance_loss_clip": 1.03462589, + "balance_loss_mlp": 1.06344867, + "epoch": 0.010581692469562603, + "flos": 12457533306240.0, + "grad_norm": 1.855225605181663, + "language_loss": 1.09006524, + "learning_rate": 3.329027409977902e-06, + "loss": 1.11176705, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.44921875, + "step": 176, + "time_per_iteration": 3.932236909866333 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_clip": 1.01726687, + "balance_loss_mlp": 1.06352115, + "epoch": 0.010641815722230573, + "flos": 19426581239040.0, + "grad_norm": 2.0633428337284245, + "language_loss": 0.90053284, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.92203653, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.4453125, + "step": 177, + "time_per_iteration": 2.5009655952453613 + }, + { + "auxiliary_loss_clip": 0.01108465, + "auxiliary_loss_mlp": 0.01046372, + "balance_loss_clip": 1.01986027, + "balance_loss_mlp": 1.06119859, + "epoch": 0.010701938974898541, + "flos": 18331600268160.0, + "grad_norm": 2.8810073302678676, + "language_loss": 0.94233507, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.96388352, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.47265625, + "step": 178, + "time_per_iteration": 2.4481000900268555 + }, + { + "auxiliary_loss_clip": 0.01108904, + "auxiliary_loss_mlp": 0.01054608, + "balance_loss_clip": 1.02767897, + "balance_loss_mlp": 1.06241047, + "epoch": 0.010762062227566512, + "flos": 19203102426240.0, + "grad_norm": 1.935276908590826, + "language_loss": 0.95267212, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.9743073, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.46484375, + "step": 179, + "time_per_iteration": 2.462662696838379 + }, + { + "auxiliary_loss_clip": 0.01108332, + "auxiliary_loss_mlp": 0.01043485, + "balance_loss_clip": 1.01535177, + "balance_loss_mlp": 1.06018186, + "epoch": 0.01082218548023448, + "flos": 31424239716480.0, + "grad_norm": 1.824050483614449, + "language_loss": 0.97232193, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.99384004, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.48242188, + "step": 180, + "time_per_iteration": 2.625880002975464 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01043804, + "balance_loss_clip": 1.0162189, + "balance_loss_mlp": 1.05589795, + "epoch": 0.01088230873290245, + "flos": 25045258538880.0, + "grad_norm": 1.7445022715107348, + "language_loss": 0.87962955, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.90110552, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.47851562, + "step": 181, + "time_per_iteration": 2.5555903911590576 + }, + { + "auxiliary_loss_clip": 0.01103581, + "auxiliary_loss_mlp": 0.01063185, + "balance_loss_clip": 1.03059316, + "balance_loss_mlp": 1.056005, + "epoch": 0.01094243198557042, + "flos": 22892304504960.0, + "grad_norm": 2.126886827733957, + "language_loss": 0.99448544, + "learning_rate": 3.3506110684439156e-06, + "loss": 1.0161531, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47460938, + "step": 182, + "time_per_iteration": 2.558154344558716 + }, + { + "auxiliary_loss_clip": 0.01103192, + "auxiliary_loss_mlp": 0.01050057, + "balance_loss_clip": 1.01801348, + "balance_loss_mlp": 1.05479455, + "epoch": 0.011002555238238388, + "flos": 17164104670080.0, + "grad_norm": 1.845868485975377, + "language_loss": 1.01241052, + "learning_rate": 3.3541390344409054e-06, + "loss": 1.03394306, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.484375, + "step": 183, + "time_per_iteration": 2.480976104736328 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.00949216, + "balance_loss_mlp": 1.05313444, + "epoch": 0.011062678490906358, + "flos": 22309045464960.0, + "grad_norm": 1.717824545683583, + "language_loss": 0.98364997, + "learning_rate": 3.357647774369736e-06, + "loss": 1.00502455, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.47460938, + "step": 184, + "time_per_iteration": 2.50250506401062 + }, + { + "auxiliary_loss_clip": 0.0109969, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.01297688, + "balance_loss_mlp": 1.05309284, + "epoch": 0.011122801743574327, + "flos": 24387250544640.0, + "grad_norm": 1.5629507167603618, + "language_loss": 0.94785511, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.96924728, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.46484375, + "step": 185, + "time_per_iteration": 2.5196001529693604 + }, + { + "auxiliary_loss_clip": 0.01097175, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.0211935, + "balance_loss_mlp": 1.0516212, + "epoch": 0.011182924996242297, + "flos": 18149283815040.0, + "grad_norm": 1.7713677097629736, + "language_loss": 0.86431944, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.88577902, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.45507812, + "step": 186, + "time_per_iteration": 2.525308847427368 + }, + { + "auxiliary_loss_clip": 0.01096939, + "auxiliary_loss_mlp": 0.0104441, + "balance_loss_clip": 1.01812482, + "balance_loss_mlp": 1.05212104, + "epoch": 0.011243048248910266, + "flos": 15485899570560.0, + "grad_norm": 2.194222615678212, + "language_loss": 1.16813791, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.18955147, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.44726562, + "step": 187, + "time_per_iteration": 2.4834649562835693 + }, + { + "auxiliary_loss_clip": 0.01094342, + "auxiliary_loss_mlp": 0.01042195, + "balance_loss_clip": 1.01674342, + "balance_loss_mlp": 1.05074596, + "epoch": 0.011303171501578235, + "flos": 40915273420800.0, + "grad_norm": 1.4044566971179826, + "language_loss": 0.82456887, + "learning_rate": 3.371494591560139e-06, + "loss": 0.84593427, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.4375, + "step": 188, + "time_per_iteration": 2.7538275718688965 + }, + { + "auxiliary_loss_clip": 0.0107885, + "auxiliary_loss_mlp": 0.01078219, + "balance_loss_clip": 1.06267381, + "balance_loss_mlp": 1.04969454, + "epoch": 0.011363294754246205, + "flos": 66299607884160.0, + "grad_norm": 0.8769107283055859, + "language_loss": 0.56446695, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58603764, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.29101562, + "step": 189, + "time_per_iteration": 3.1571950912475586 + }, + { + "auxiliary_loss_clip": 0.01098987, + "auxiliary_loss_mlp": 0.01191718, + "balance_loss_clip": 1.16007996, + "balance_loss_mlp": 1.05176306, + "epoch": 0.011423418006914174, + "flos": 24899112120960.0, + "grad_norm": 1.8049669485500117, + "language_loss": 1.06696558, + "learning_rate": 3.3783079057586833e-06, + "loss": 1.0898726, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.47265625, + "step": 190, + "time_per_iteration": 2.53869891166687 + }, + { + "auxiliary_loss_clip": 0.01103717, + "auxiliary_loss_mlp": 0.01235216, + "balance_loss_clip": 1.20071638, + "balance_loss_mlp": 1.05140197, + "epoch": 0.011483541259582144, + "flos": 19790865031680.0, + "grad_norm": 2.004491653650201, + "language_loss": 0.98771751, + "learning_rate": 3.3816877150079665e-06, + "loss": 1.01110685, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5234375, + "step": 191, + "time_per_iteration": 2.4821109771728516 + }, + { + "auxiliary_loss_clip": 0.01103148, + "auxiliary_loss_mlp": 0.01225375, + "balance_loss_clip": 1.19154346, + "balance_loss_mlp": 1.05140662, + "epoch": 0.011543664512250112, + "flos": 26175746229120.0, + "grad_norm": 1.691901910399317, + "language_loss": 0.99615991, + "learning_rate": 3.385049875042367e-06, + "loss": 1.01944518, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.515625, + "step": 192, + "time_per_iteration": 2.544553518295288 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.01117862, + "balance_loss_clip": 1.08391166, + "balance_loss_mlp": 1.05200636, + "epoch": 0.011603787764918083, + "flos": 23767856380800.0, + "grad_norm": 2.0334207566118487, + "language_loss": 0.99136633, + "learning_rate": 3.3883945692315938e-06, + "loss": 1.01357186, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 193, + "time_per_iteration": 2.5115790367126465 + }, + { + "auxiliary_loss_clip": 0.01096818, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_clip": 1.01166224, + "balance_loss_mlp": 1.05028796, + "epoch": 0.011663911017586051, + "flos": 25953594048000.0, + "grad_norm": 1.7614406224923902, + "language_loss": 1.03044677, + "learning_rate": 3.3917219781023906e-06, + "loss": 1.05187416, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46484375, + "step": 194, + "time_per_iteration": 2.557671070098877 + }, + { + "auxiliary_loss_clip": 0.01099663, + "auxiliary_loss_mlp": 0.01108333, + "balance_loss_clip": 1.06820667, + "balance_loss_mlp": 1.05228031, + "epoch": 0.01172403427025402, + "flos": 17894173443840.0, + "grad_norm": 1.927176303647054, + "language_loss": 1.06707811, + "learning_rate": 3.3950322793970014e-06, + "loss": 1.08915806, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.47265625, + "step": 195, + "time_per_iteration": 2.495694875717163 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01137273, + "balance_loss_clip": 1.09757662, + "balance_loss_mlp": 1.05124545, + "epoch": 0.01178415752292199, + "flos": 17893579950720.0, + "grad_norm": 2.0585798463572424, + "language_loss": 0.99742883, + "learning_rate": 3.3983256481301445e-06, + "loss": 1.01978254, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.46875, + "step": 196, + "time_per_iteration": 2.521138906478882 + }, + { + "auxiliary_loss_clip": 0.01097922, + "auxiliary_loss_mlp": 0.01108691, + "balance_loss_clip": 1.07516909, + "balance_loss_mlp": 1.0514127, + "epoch": 0.011844280775589959, + "flos": 22892444150400.0, + "grad_norm": 1.9618744077688632, + "language_loss": 1.06366789, + "learning_rate": 3.4016022566445335e-06, + "loss": 1.08573401, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46484375, + "step": 197, + "time_per_iteration": 2.4926061630249023 + }, + { + "auxiliary_loss_clip": 0.0109545, + "auxiliary_loss_mlp": 0.01053314, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.05084753, + "epoch": 0.01190440402825793, + "flos": 26979097680000.0, + "grad_norm": 1.794604503777132, + "language_loss": 0.90271187, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.92419952, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44726562, + "step": 198, + "time_per_iteration": 2.5743231773376465 + }, + { + "auxiliary_loss_clip": 0.01097158, + "auxiliary_loss_mlp": 0.01072629, + "balance_loss_clip": 1.0424211, + "balance_loss_mlp": 1.05109262, + "epoch": 0.011964527280925898, + "flos": 20520549780480.0, + "grad_norm": 1.5716242810570824, + "language_loss": 0.94445264, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.96615058, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4609375, + "step": 199, + "time_per_iteration": 2.49124813079834 + }, + { + "auxiliary_loss_clip": 0.01101112, + "auxiliary_loss_mlp": 0.01105727, + "balance_loss_clip": 1.07194304, + "balance_loss_mlp": 1.05356622, + "epoch": 0.012024650533593867, + "flos": 27744742995840.0, + "grad_norm": 1.6261571136584236, + "language_loss": 0.92443496, + "learning_rate": 3.411333205349222e-06, + "loss": 0.9465034, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.4765625, + "step": 200, + "time_per_iteration": 2.550828695297241 + }, + { + "auxiliary_loss_clip": 0.01093348, + "auxiliary_loss_mlp": 0.01101853, + "balance_loss_clip": 1.0685699, + "balance_loss_mlp": 1.04718041, + "epoch": 0.012084773786261837, + "flos": 10451249360640.0, + "grad_norm": 1.6938137071463262, + "language_loss": 1.01948857, + "learning_rate": 3.4145444448414217e-06, + "loss": 1.04144073, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4609375, + "step": 201, + "time_per_iteration": 2.4822611808776855 + }, + { + "auxiliary_loss_clip": 0.01084611, + "auxiliary_loss_mlp": 0.0106071, + "balance_loss_clip": 1.03052664, + "balance_loss_mlp": 1.04157591, + "epoch": 0.012144897038929806, + "flos": 23104821150720.0, + "grad_norm": 1.6048851073238817, + "language_loss": 0.93243027, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.95388341, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 202, + "time_per_iteration": 2.5191028118133545 + }, + { + "auxiliary_loss_clip": 0.01082212, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.01575935, + "balance_loss_mlp": 1.03899705, + "epoch": 0.012205020291597776, + "flos": 21032132065920.0, + "grad_norm": 1.6181792641688677, + "language_loss": 0.99432361, + "learning_rate": 3.4209192710126685e-06, + "loss": 1.01557732, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.43359375, + "step": 203, + "time_per_iteration": 2.476510524749756 + }, + { + "auxiliary_loss_clip": 0.01062666, + "auxiliary_loss_mlp": 0.01152707, + "balance_loss_clip": 1.14212155, + "balance_loss_mlp": 1.03697503, + "epoch": 0.012265143544265745, + "flos": 68444846507520.0, + "grad_norm": 1.1245228372156577, + "language_loss": 0.61589622, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63804996, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.2578125, + "step": 204, + "time_per_iteration": 3.0146005153656006 + }, + { + "auxiliary_loss_clip": 0.0110672, + "auxiliary_loss_mlp": 0.01243779, + "balance_loss_clip": 1.20835042, + "balance_loss_mlp": 1.05874193, + "epoch": 0.012325266796933715, + "flos": 17018307365760.0, + "grad_norm": 1.9755689809381638, + "language_loss": 1.04080296, + "learning_rate": 3.4272315978819516e-06, + "loss": 1.06430793, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48046875, + "step": 205, + "time_per_iteration": 2.4901349544525146 + }, + { + "auxiliary_loss_clip": 0.011189, + "auxiliary_loss_mlp": 0.01304909, + "balance_loss_clip": 1.26585567, + "balance_loss_mlp": 1.06587601, + "epoch": 0.012385390049601683, + "flos": 20189119443840.0, + "grad_norm": 1.8699489769743696, + "language_loss": 1.01561546, + "learning_rate": 3.4303647047142043e-06, + "loss": 1.03985357, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.53125, + "step": 206, + "time_per_iteration": 2.504542589187622 + }, + { + "auxiliary_loss_clip": 0.01107157, + "auxiliary_loss_mlp": 0.01264451, + "balance_loss_clip": 1.23185897, + "balance_loss_mlp": 1.0563736, + "epoch": 0.012445513302269652, + "flos": 16252208202240.0, + "grad_norm": 1.7505488440962755, + "language_loss": 1.08401203, + "learning_rate": 3.43348263905683e-06, + "loss": 1.10772812, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5078125, + "step": 207, + "time_per_iteration": 2.4713680744171143 + }, + { + "auxiliary_loss_clip": 0.01093147, + "auxiliary_loss_mlp": 0.01239402, + "balance_loss_clip": 1.20707214, + "balance_loss_mlp": 1.04464293, + "epoch": 0.012505636554937622, + "flos": 23768240405760.0, + "grad_norm": 1.7038821994544826, + "language_loss": 0.8575018, + "learning_rate": 3.436585547151547e-06, + "loss": 0.88082731, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.484375, + "step": 208, + "time_per_iteration": 2.566667318344116 + }, + { + "auxiliary_loss_clip": 0.01101981, + "auxiliary_loss_mlp": 0.01113928, + "balance_loss_clip": 1.07597136, + "balance_loss_mlp": 1.05930662, + "epoch": 0.012565759807605591, + "flos": 30590234225280.0, + "grad_norm": 1.918460532501461, + "language_loss": 1.10475111, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.12691021, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.42578125, + "step": 209, + "time_per_iteration": 2.571448564529419 + }, + { + "auxiliary_loss_clip": 0.01146159, + "auxiliary_loss_mlp": 0.01064591, + "balance_loss_clip": 1.01964951, + "balance_loss_mlp": 1.10477829, + "epoch": 0.012625883060273561, + "flos": 40111956881280.0, + "grad_norm": 2.404581684592111, + "language_loss": 0.97718978, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.9992972, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.4140625, + "step": 210, + "time_per_iteration": 2.663059711456299 + }, + { + "auxiliary_loss_clip": 0.01169735, + "auxiliary_loss_mlp": 0.01247272, + "balance_loss_clip": 1.2077657, + "balance_loss_mlp": 1.12621319, + "epoch": 0.01268600631294153, + "flos": 27087956899200.0, + "grad_norm": 1.9748600183347633, + "language_loss": 1.05048847, + "learning_rate": 3.445805545042314e-06, + "loss": 1.07465851, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.43554688, + "step": 211, + "time_per_iteration": 2.576044797897339 + }, + { + "auxiliary_loss_clip": 0.01148725, + "auxiliary_loss_mlp": 0.01269692, + "balance_loss_clip": 1.23457336, + "balance_loss_mlp": 1.10369885, + "epoch": 0.012746129565609499, + "flos": 16981823128320.0, + "grad_norm": 2.0005912457661315, + "language_loss": 1.0922991, + "learning_rate": 3.448849769075239e-06, + "loss": 1.11648321, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.45117188, + "step": 212, + "time_per_iteration": 2.5996060371398926 + }, + { + "auxiliary_loss_clip": 0.01103402, + "auxiliary_loss_mlp": 0.01114701, + "balance_loss_clip": 1.08470774, + "balance_loss_mlp": 1.05919051, + "epoch": 0.012806252818277469, + "flos": 46531786216320.0, + "grad_norm": 1.5769440776846986, + "language_loss": 0.87272328, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.89490432, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44140625, + "step": 213, + "time_per_iteration": 2.699709415435791 + }, + { + "auxiliary_loss_clip": 0.01091771, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.00830221, + "balance_loss_mlp": 1.04605031, + "epoch": 0.012866376070945438, + "flos": 14387846400000.0, + "grad_norm": 2.3194899577081682, + "language_loss": 0.99835563, + "learning_rate": 3.4548953739020187e-06, + "loss": 1.0196439, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.45703125, + "step": 214, + "time_per_iteration": 2.4537081718444824 + }, + { + "auxiliary_loss_clip": 0.01118137, + "auxiliary_loss_mlp": 0.01097442, + "balance_loss_clip": 1.06294298, + "balance_loss_mlp": 1.07102633, + "epoch": 0.012926499323613408, + "flos": 26139611105280.0, + "grad_norm": 1.8247650237274122, + "language_loss": 0.90476429, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.92692012, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47070312, + "step": 215, + "time_per_iteration": 4.073654413223267 + }, + { + "auxiliary_loss_clip": 0.01137959, + "auxiliary_loss_mlp": 0.01154666, + "balance_loss_clip": 1.10900891, + "balance_loss_mlp": 1.08928561, + "epoch": 0.012986622576281377, + "flos": 30115904227200.0, + "grad_norm": 1.9232658366492974, + "language_loss": 1.0574342, + "learning_rate": 3.460884739729461e-06, + "loss": 1.08036041, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.48632812, + "step": 216, + "time_per_iteration": 3.9853289127349854 + }, + { + "auxiliary_loss_clip": 0.0113832, + "auxiliary_loss_mlp": 0.0115294, + "balance_loss_clip": 1.1091187, + "balance_loss_mlp": 1.09031796, + "epoch": 0.013046745828949347, + "flos": 13953177573120.0, + "grad_norm": 2.1747942725460545, + "language_loss": 1.12298131, + "learning_rate": 3.463858658104523e-06, + "loss": 1.14589393, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.48046875, + "step": 217, + "time_per_iteration": 2.463624954223633 + }, + { + "auxiliary_loss_clip": 0.01126002, + "auxiliary_loss_mlp": 0.01138359, + "balance_loss_clip": 1.09792292, + "balance_loss_mlp": 1.0798049, + "epoch": 0.013106869081617315, + "flos": 17346874970880.0, + "grad_norm": 1.6824502379540924, + "language_loss": 1.03864717, + "learning_rate": 3.4668189032433696e-06, + "loss": 1.06129086, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.4609375, + "step": 218, + "time_per_iteration": 2.5329208374023438 + }, + { + "auxiliary_loss_clip": 0.01106929, + "auxiliary_loss_mlp": 0.01080042, + "balance_loss_clip": 1.05047822, + "balance_loss_mlp": 1.06290102, + "epoch": 0.013166992334285284, + "flos": 25883732684160.0, + "grad_norm": 1.7209989260840117, + "language_loss": 0.9726072, + "learning_rate": 3.46976560030214e-06, + "loss": 0.99447691, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44140625, + "step": 219, + "time_per_iteration": 2.524907350540161 + }, + { + "auxiliary_loss_clip": 0.01089994, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.01105499, + "balance_loss_mlp": 1.04625654, + "epoch": 0.013227115586953254, + "flos": 31174610428800.0, + "grad_norm": 1.5088127628256938, + "language_loss": 0.980483, + "learning_rate": 3.4726988727263976e-06, + "loss": 1.00177097, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4375, + "step": 220, + "time_per_iteration": 2.6165785789489746 + }, + { + "auxiliary_loss_clip": 0.01083868, + "auxiliary_loss_mlp": 0.01051757, + "balance_loss_clip": 1.02104831, + "balance_loss_mlp": 1.04074812, + "epoch": 0.013287238839621223, + "flos": 20408513627520.0, + "grad_norm": 1.6444710778523064, + "language_loss": 0.97700977, + "learning_rate": 3.475618842282164e-06, + "loss": 0.99836606, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43164062, + "step": 221, + "time_per_iteration": 2.50985050201416 + }, + { + "auxiliary_loss_clip": 0.01089744, + "auxiliary_loss_mlp": 0.01099199, + "balance_loss_clip": 1.06663144, + "balance_loss_mlp": 1.04638612, + "epoch": 0.013347362092289193, + "flos": 14136262076160.0, + "grad_norm": 1.8308138252476334, + "language_loss": 1.06829596, + "learning_rate": 3.4785256290862486e-06, + "loss": 1.0901854, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.43359375, + "step": 222, + "time_per_iteration": 2.470869779586792 + }, + { + "auxiliary_loss_clip": 0.01097978, + "auxiliary_loss_mlp": 0.01115348, + "balance_loss_clip": 1.07853651, + "balance_loss_mlp": 1.05494082, + "epoch": 0.013407485344957162, + "flos": 21796660218240.0, + "grad_norm": 1.902695554458468, + "language_loss": 1.05840564, + "learning_rate": 3.481419351635897e-06, + "loss": 1.08053899, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.4296875, + "step": 223, + "time_per_iteration": 2.492043972015381 + }, + { + "auxiliary_loss_clip": 0.01098679, + "auxiliary_loss_mlp": 0.01105397, + "balance_loss_clip": 1.06796527, + "balance_loss_mlp": 1.05564547, + "epoch": 0.013467608597625132, + "flos": 18620716170240.0, + "grad_norm": 2.031665178165983, + "language_loss": 1.01738811, + "learning_rate": 3.484300126837776e-06, + "loss": 1.03942895, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.4296875, + "step": 224, + "time_per_iteration": 2.4889323711395264 + }, + { + "auxiliary_loss_clip": 0.01096586, + "auxiliary_loss_mlp": 0.01077493, + "balance_loss_clip": 1.04354191, + "balance_loss_mlp": 1.05369246, + "epoch": 0.013527731850293101, + "flos": 18551308654080.0, + "grad_norm": 1.635028854133641, + "language_loss": 1.02950668, + "learning_rate": 3.487168070036317e-06, + "loss": 1.0512476, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4296875, + "step": 225, + "time_per_iteration": 2.4698359966278076 + }, + { + "auxiliary_loss_clip": 0.01088983, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_clip": 1.0203023, + "balance_loss_mlp": 1.04679775, + "epoch": 0.01358785510296107, + "flos": 19164558418560.0, + "grad_norm": 1.668201941654053, + "language_loss": 1.08488941, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.10627246, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 226, + "time_per_iteration": 2.4657299518585205 + }, + { + "auxiliary_loss_clip": 0.01080702, + "auxiliary_loss_mlp": 0.01043698, + "balance_loss_clip": 1.01372886, + "balance_loss_mlp": 1.03907442, + "epoch": 0.01364797835562904, + "flos": 23328858545280.0, + "grad_norm": 1.8529547712715548, + "language_loss": 1.03664923, + "learning_rate": 3.4928659141555727e-06, + "loss": 1.05789328, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41601562, + "step": 227, + "time_per_iteration": 2.4552206993103027 + }, + { + "auxiliary_loss_clip": 0.01071787, + "auxiliary_loss_mlp": 0.0113349, + "balance_loss_clip": 1.11394, + "balance_loss_mlp": 1.04463649, + "epoch": 0.013708101608297009, + "flos": 70989943599360.0, + "grad_norm": 1.0500160154291867, + "language_loss": 0.57966822, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.60172099, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.27148438, + "step": 228, + "time_per_iteration": 3.1548004150390625 + }, + { + "auxiliary_loss_clip": 0.01075507, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.02358627, + "balance_loss_mlp": 1.03497553, + "epoch": 0.013768224860964979, + "flos": 16324268981760.0, + "grad_norm": 2.0749228903278825, + "language_loss": 1.02698469, + "learning_rate": 3.4985137765422354e-06, + "loss": 1.04823601, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 229, + "time_per_iteration": 2.4137885570526123 + }, + { + "auxiliary_loss_clip": 0.0107582, + "auxiliary_loss_mlp": 0.01096173, + "balance_loss_clip": 1.06734776, + "balance_loss_mlp": 1.03477514, + "epoch": 0.013828348113632948, + "flos": 20192017086720.0, + "grad_norm": 2.0340409472213365, + "language_loss": 0.98320317, + "learning_rate": 3.501319237118231e-06, + "loss": 1.00492311, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41015625, + "step": 230, + "time_per_iteration": 2.4391608238220215 + }, + { + "auxiliary_loss_clip": 0.01078928, + "auxiliary_loss_mlp": 0.01109966, + "balance_loss_clip": 1.08059335, + "balance_loss_mlp": 1.03632855, + "epoch": 0.013888471366300916, + "flos": 20740013786880.0, + "grad_norm": 1.561018678690937, + "language_loss": 1.00255001, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.02443886, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 231, + "time_per_iteration": 2.43204402923584 + }, + { + "auxiliary_loss_clip": 0.01077624, + "auxiliary_loss_mlp": 0.01088841, + "balance_loss_clip": 1.06013572, + "balance_loss_mlp": 1.03567123, + "epoch": 0.013948594618968886, + "flos": 22089546547200.0, + "grad_norm": 1.6758753163861133, + "language_loss": 0.98966426, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.01132882, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41992188, + "step": 232, + "time_per_iteration": 2.457054615020752 + }, + { + "auxiliary_loss_clip": 0.01073614, + "auxiliary_loss_mlp": 0.01048586, + "balance_loss_clip": 1.02159727, + "balance_loss_mlp": 1.03352594, + "epoch": 0.014008717871636855, + "flos": 19062087978240.0, + "grad_norm": 2.371221136412141, + "language_loss": 0.89359134, + "learning_rate": 3.509663010692652e-06, + "loss": 0.9148134, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 233, + "time_per_iteration": 2.431537389755249 + }, + { + "auxiliary_loss_clip": 0.01076656, + "auxiliary_loss_mlp": 0.01048181, + "balance_loss_clip": 1.01859379, + "balance_loss_mlp": 1.03608823, + "epoch": 0.014068841124304825, + "flos": 14530152568320.0, + "grad_norm": 1.8277487307236124, + "language_loss": 0.99442166, + "learning_rate": 3.512420411838642e-06, + "loss": 1.01567006, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40625, + "step": 234, + "time_per_iteration": 2.4785776138305664 + }, + { + "auxiliary_loss_clip": 0.01075426, + "auxiliary_loss_mlp": 0.01059718, + "balance_loss_clip": 1.02707899, + "balance_loss_mlp": 1.03561401, + "epoch": 0.014128964376972794, + "flos": 18076420074240.0, + "grad_norm": 1.965602055508437, + "language_loss": 1.06779504, + "learning_rate": 3.515166054308634e-06, + "loss": 1.08914638, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.3984375, + "step": 235, + "time_per_iteration": 2.450258731842041 + }, + { + "auxiliary_loss_clip": 0.01076743, + "auxiliary_loss_mlp": 0.0106071, + "balance_loss_clip": 1.02840436, + "balance_loss_mlp": 1.03590918, + "epoch": 0.014189087629640764, + "flos": 25333257277440.0, + "grad_norm": 1.967850403399697, + "language_loss": 0.95677412, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.97814864, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.40820312, + "step": 236, + "time_per_iteration": 2.6083762645721436 + }, + { + "auxiliary_loss_clip": 0.01078437, + "auxiliary_loss_mlp": 0.01057042, + "balance_loss_clip": 1.0254519, + "balance_loss_mlp": 1.03690863, + "epoch": 0.014249210882308733, + "flos": 36138212288640.0, + "grad_norm": 1.6101465146016263, + "language_loss": 0.94536513, + "learning_rate": 3.520622461401154e-06, + "loss": 0.96671987, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4140625, + "step": 237, + "time_per_iteration": 2.6023366451263428 + }, + { + "auxiliary_loss_clip": 0.01075778, + "auxiliary_loss_mlp": 0.01044183, + "balance_loss_clip": 1.01414251, + "balance_loss_mlp": 1.03647864, + "epoch": 0.014309334134976702, + "flos": 12932142595200.0, + "grad_norm": 1.7039806804781887, + "language_loss": 0.9228217, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.94402122, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.39257812, + "step": 238, + "time_per_iteration": 2.4322116374969482 + }, + { + "auxiliary_loss_clip": 0.01075948, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.01462257, + "balance_loss_mlp": 1.03708851, + "epoch": 0.014369457387644672, + "flos": 20776463112960.0, + "grad_norm": 1.4593327963501108, + "language_loss": 0.96789944, + "learning_rate": 3.526033015791284e-06, + "loss": 0.98907089, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38867188, + "step": 239, + "time_per_iteration": 2.4896762371063232 + }, + { + "auxiliary_loss_clip": 0.01078415, + "auxiliary_loss_mlp": 0.01036202, + "balance_loss_clip": 1.01070309, + "balance_loss_mlp": 1.0398531, + "epoch": 0.01442958064031264, + "flos": 25847353180800.0, + "grad_norm": 1.8139910450337617, + "language_loss": 1.0199244, + "learning_rate": 3.528721337790862e-06, + "loss": 1.04107058, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 240, + "time_per_iteration": 2.5304315090179443 + }, + { + "auxiliary_loss_clip": 0.01081899, + "auxiliary_loss_mlp": 0.01058553, + "balance_loss_clip": 1.02996683, + "balance_loss_mlp": 1.0421263, + "epoch": 0.014489703892980611, + "flos": 28218479500800.0, + "grad_norm": 1.5811174619170947, + "language_loss": 0.96160829, + "learning_rate": 3.531398481704111e-06, + "loss": 0.9830128, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.3984375, + "step": 241, + "time_per_iteration": 2.51960825920105 + }, + { + "auxiliary_loss_clip": 0.01080413, + "auxiliary_loss_mlp": 0.01063672, + "balance_loss_clip": 1.03785181, + "balance_loss_mlp": 1.04137993, + "epoch": 0.01454982714564858, + "flos": 22489860729600.0, + "grad_norm": 1.5906862979908856, + "language_loss": 0.97562319, + "learning_rate": 3.534064540103573e-06, + "loss": 0.99706411, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 242, + "time_per_iteration": 2.4700756072998047 + }, + { + "auxiliary_loss_clip": 0.01075915, + "auxiliary_loss_mlp": 0.01078374, + "balance_loss_clip": 1.05218327, + "balance_loss_mlp": 1.03710485, + "epoch": 0.014609950398316548, + "flos": 21652119722880.0, + "grad_norm": 1.7859696660501259, + "language_loss": 0.96531606, + "learning_rate": 3.536719604416555e-06, + "loss": 0.98685902, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 243, + "time_per_iteration": 2.461885452270508 + }, + { + "auxiliary_loss_clip": 0.01074115, + "auxiliary_loss_mlp": 0.01070905, + "balance_loss_clip": 1.04452419, + "balance_loss_mlp": 1.03551435, + "epoch": 0.014670073650984519, + "flos": 21868965377280.0, + "grad_norm": 1.54499021698165, + "language_loss": 0.93622619, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.95767641, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38671875, + "step": 244, + "time_per_iteration": 2.4596147537231445 + }, + { + "auxiliary_loss_clip": 0.01076645, + "auxiliary_loss_mlp": 0.01038598, + "balance_loss_clip": 1.0103457, + "balance_loss_mlp": 1.03671432, + "epoch": 0.014730196903652487, + "flos": 23182642304640.0, + "grad_norm": 2.1456827382680492, + "language_loss": 0.9655484, + "learning_rate": 3.54199711087864e-06, + "loss": 0.98670083, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3984375, + "step": 245, + "time_per_iteration": 2.4329512119293213 + }, + { + "auxiliary_loss_clip": 0.01078689, + "auxiliary_loss_mlp": 0.01043874, + "balance_loss_clip": 1.01665831, + "balance_loss_mlp": 1.04003501, + "epoch": 0.014790320156320457, + "flos": 23221465603200.0, + "grad_norm": 1.7582671076352405, + "language_loss": 0.94322646, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.96445209, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 246, + "time_per_iteration": 2.5023117065429688 + }, + { + "auxiliary_loss_clip": 0.01083627, + "auxiliary_loss_mlp": 0.01069723, + "balance_loss_clip": 1.04211402, + "balance_loss_mlp": 1.04438996, + "epoch": 0.014850443408988426, + "flos": 15814571909760.0, + "grad_norm": 1.566293307979433, + "language_loss": 1.00349915, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.02503276, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.390625, + "step": 247, + "time_per_iteration": 2.421661615371704 + }, + { + "auxiliary_loss_clip": 0.01080176, + "auxiliary_loss_mlp": 0.01069717, + "balance_loss_clip": 1.04172671, + "balance_loss_mlp": 1.04135633, + "epoch": 0.014910566661656396, + "flos": 22780617465600.0, + "grad_norm": 1.9211376406260587, + "language_loss": 0.90626585, + "learning_rate": 3.549833136812155e-06, + "loss": 0.92776477, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38671875, + "step": 248, + "time_per_iteration": 2.5011179447174072 + }, + { + "auxiliary_loss_clip": 0.01076011, + "auxiliary_loss_mlp": 0.010757, + "balance_loss_clip": 1.04852009, + "balance_loss_mlp": 1.03747702, + "epoch": 0.014970689914324365, + "flos": 26863954416000.0, + "grad_norm": 1.6565379334890873, + "language_loss": 0.93743962, + "learning_rate": 3.552424094769381e-06, + "loss": 0.95895672, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38476562, + "step": 249, + "time_per_iteration": 2.5460221767425537 + }, + { + "auxiliary_loss_clip": 0.01070466, + "auxiliary_loss_mlp": 0.01076226, + "balance_loss_clip": 1.04912972, + "balance_loss_mlp": 1.03327656, + "epoch": 0.015030813166992334, + "flos": 13984948776960.0, + "grad_norm": 2.0559168560461942, + "language_loss": 1.06404042, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.08550739, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37109375, + "step": 250, + "time_per_iteration": 2.4159696102142334 + }, + { + "auxiliary_loss_clip": 0.01070828, + "auxiliary_loss_mlp": 0.0105464, + "balance_loss_clip": 1.02688777, + "balance_loss_mlp": 1.03454304, + "epoch": 0.015090936419660304, + "flos": 24716656022400.0, + "grad_norm": 1.76409725680884, + "language_loss": 1.09362614, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.1148808, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.36328125, + "step": 251, + "time_per_iteration": 2.5014548301696777 + }, + { + "auxiliary_loss_clip": 0.01072752, + "auxiliary_loss_mlp": 0.01037333, + "balance_loss_clip": 1.0117867, + "balance_loss_mlp": 1.03694236, + "epoch": 0.015151059672328273, + "flos": 25737621177600.0, + "grad_norm": 1.7091590897612239, + "language_loss": 0.9745695, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.99567032, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.35742188, + "step": 252, + "time_per_iteration": 2.493452787399292 + }, + { + "auxiliary_loss_clip": 0.01076742, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.02903247, + "balance_loss_mlp": 1.04086673, + "epoch": 0.015211182924996243, + "flos": 21870152363520.0, + "grad_norm": 1.8688358806116068, + "language_loss": 1.12032104, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.14164662, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.359375, + "step": 253, + "time_per_iteration": 2.4544849395751953 + }, + { + "auxiliary_loss_clip": 0.01083387, + "auxiliary_loss_mlp": 0.01183997, + "balance_loss_clip": 1.16931033, + "balance_loss_mlp": 1.0582068, + "epoch": 0.015271306177664212, + "flos": 66891734409600.0, + "grad_norm": 0.9777478791121624, + "language_loss": 0.55954546, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.5822193, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.25195312, + "step": 254, + "time_per_iteration": 4.458518743515015 + }, + { + "auxiliary_loss_clip": 0.01074842, + "auxiliary_loss_mlp": 0.01068142, + "balance_loss_clip": 1.04085517, + "balance_loss_mlp": 1.03857684, + "epoch": 0.01533142943033218, + "flos": 26832846528000.0, + "grad_norm": 1.6855824759183693, + "language_loss": 1.03326499, + "learning_rate": 3.567754632921479e-06, + "loss": 1.05469477, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.36328125, + "step": 255, + "time_per_iteration": 3.9672932624816895 + }, + { + "auxiliary_loss_clip": 0.01071325, + "auxiliary_loss_mlp": 0.01061342, + "balance_loss_clip": 1.03542614, + "balance_loss_mlp": 1.03577185, + "epoch": 0.01539155268300015, + "flos": 20812702970880.0, + "grad_norm": 1.8813299776665477, + "language_loss": 0.97233367, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.99366027, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.35546875, + "step": 256, + "time_per_iteration": 3.900709629058838 + }, + { + "auxiliary_loss_clip": 0.01075141, + "auxiliary_loss_mlp": 0.01055303, + "balance_loss_clip": 1.02650189, + "balance_loss_mlp": 1.03796566, + "epoch": 0.01545167593566812, + "flos": 15960927795840.0, + "grad_norm": 2.2502576049601557, + "language_loss": 0.92342532, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.94472975, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.37109375, + "step": 257, + "time_per_iteration": 2.4153144359588623 + }, + { + "auxiliary_loss_clip": 0.01078318, + "auxiliary_loss_mlp": 0.01038936, + "balance_loss_clip": 1.01341343, + "balance_loss_mlp": 1.04083276, + "epoch": 0.01551179918833609, + "flos": 22600640073600.0, + "grad_norm": 1.9301110015952734, + "language_loss": 1.05837071, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.07954311, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 2.4784247875213623 + }, + { + "auxiliary_loss_clip": 0.0108204, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.01113629, + "balance_loss_mlp": 1.04383373, + "epoch": 0.015571922441004058, + "flos": 22815705248640.0, + "grad_norm": 1.9274837672864904, + "language_loss": 1.01914859, + "learning_rate": 3.577775880881658e-06, + "loss": 1.04032576, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 259, + "time_per_iteration": 2.5347282886505127 + }, + { + "auxiliary_loss_clip": 0.01088531, + "auxiliary_loss_mlp": 0.01043368, + "balance_loss_clip": 1.01693892, + "balance_loss_mlp": 1.0490309, + "epoch": 0.015632045693672027, + "flos": 18946595600640.0, + "grad_norm": 1.6641482068588285, + "language_loss": 1.02975178, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.05107081, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 260, + "time_per_iteration": 2.504532814025879 + }, + { + "auxiliary_loss_clip": 0.01084408, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_clip": 1.01513672, + "balance_loss_mlp": 1.04483366, + "epoch": 0.015692168946339995, + "flos": 29970421125120.0, + "grad_norm": 1.7862701647716586, + "language_loss": 1.0257014, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.04696822, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39453125, + "step": 261, + "time_per_iteration": 2.5264780521392822 + }, + { + "auxiliary_loss_clip": 0.01083054, + "auxiliary_loss_mlp": 0.01037566, + "balance_loss_clip": 1.01076746, + "balance_loss_mlp": 1.04234612, + "epoch": 0.015752292199007967, + "flos": 19391039608320.0, + "grad_norm": 1.6031367656723223, + "language_loss": 0.76873946, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.78994572, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 262, + "time_per_iteration": 2.48397159576416 + }, + { + "auxiliary_loss_clip": 0.01079737, + "auxiliary_loss_mlp": 0.01041867, + "balance_loss_clip": 1.01423419, + "balance_loss_mlp": 1.03847504, + "epoch": 0.015812415451675936, + "flos": 20338756997760.0, + "grad_norm": 1.7084775626780269, + "language_loss": 0.82158327, + "learning_rate": 3.587643540438383e-06, + "loss": 0.84279919, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41210938, + "step": 263, + "time_per_iteration": 2.4642598628997803 + }, + { + "auxiliary_loss_clip": 0.01077048, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_clip": 1.01613736, + "balance_loss_mlp": 1.03553402, + "epoch": 0.015872538704343905, + "flos": 17524583124480.0, + "grad_norm": 2.17296017702561, + "language_loss": 1.03400373, + "learning_rate": 3.590087005168037e-06, + "loss": 1.05522966, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4140625, + "step": 264, + "time_per_iteration": 2.43809175491333 + }, + { + "auxiliary_loss_clip": 0.01077684, + "auxiliary_loss_mlp": 0.01044866, + "balance_loss_clip": 1.01645851, + "balance_loss_mlp": 1.03632593, + "epoch": 0.015932661957011873, + "flos": 15259802405760.0, + "grad_norm": 1.9415994157280507, + "language_loss": 1.14071894, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.16194439, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 265, + "time_per_iteration": 2.4155664443969727 + }, + { + "auxiliary_loss_clip": 0.01084177, + "auxiliary_loss_mlp": 0.01048072, + "balance_loss_clip": 1.01667213, + "balance_loss_mlp": 1.04137063, + "epoch": 0.015992785209679845, + "flos": 20301504710400.0, + "grad_norm": 1.855790177529862, + "language_loss": 0.91434687, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.9356693, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.42773438, + "step": 266, + "time_per_iteration": 2.503396987915039 + }, + { + "auxiliary_loss_clip": 0.0108016, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.01576996, + "balance_loss_mlp": 1.04015231, + "epoch": 0.016052908462347814, + "flos": 23361397799040.0, + "grad_norm": 1.6726723919324413, + "language_loss": 0.97525406, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.99648565, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 267, + "time_per_iteration": 2.4556307792663574 + }, + { + "auxiliary_loss_clip": 0.0107762, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.0219605, + "balance_loss_mlp": 1.03791082, + "epoch": 0.016113031715015783, + "flos": 21285566691840.0, + "grad_norm": 1.8959558945497696, + "language_loss": 0.99569213, + "learning_rate": 3.599769175344462e-06, + "loss": 1.01696658, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.39648438, + "step": 268, + "time_per_iteration": 2.4850926399230957 + }, + { + "auxiliary_loss_clip": 0.01078513, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_clip": 1.02858627, + "balance_loss_mlp": 1.03886533, + "epoch": 0.01617315496768375, + "flos": 18913742144640.0, + "grad_norm": 1.7734706586745663, + "language_loss": 0.99278879, + "learning_rate": 3.602167137831432e-06, + "loss": 1.01414943, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.39648438, + "step": 269, + "time_per_iteration": 2.4067189693450928 + }, + { + "auxiliary_loss_clip": 0.01076683, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.02846527, + "balance_loss_mlp": 1.03716183, + "epoch": 0.01623327822035172, + "flos": 16545513467520.0, + "grad_norm": 1.787914617446807, + "language_loss": 1.07600427, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.09733081, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39648438, + "step": 270, + "time_per_iteration": 2.4217727184295654 + }, + { + "auxiliary_loss_clip": 0.01075132, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.01967883, + "balance_loss_mlp": 1.03623009, + "epoch": 0.016293401473019692, + "flos": 23512361984640.0, + "grad_norm": 1.7442076079546263, + "language_loss": 0.9867897, + "learning_rate": 3.606936435072361e-06, + "loss": 1.00800729, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38867188, + "step": 271, + "time_per_iteration": 2.448598623275757 + }, + { + "auxiliary_loss_clip": 0.01074273, + "auxiliary_loss_mlp": 0.01037717, + "balance_loss_clip": 1.01104999, + "balance_loss_mlp": 1.03570163, + "epoch": 0.01635352472568766, + "flos": 29014988325120.0, + "grad_norm": 1.9345535097035402, + "language_loss": 0.96648163, + "learning_rate": 3.609307900676025e-06, + "loss": 0.98760152, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 272, + "time_per_iteration": 2.51503586769104 + }, + { + "auxiliary_loss_clip": 0.01074452, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.01177073, + "balance_loss_mlp": 1.03689492, + "epoch": 0.01641364797835563, + "flos": 13369674153600.0, + "grad_norm": 1.8193532645749473, + "language_loss": 0.93630731, + "learning_rate": 3.611670663634051e-06, + "loss": 0.95742595, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 273, + "time_per_iteration": 2.408466339111328 + }, + { + "auxiliary_loss_clip": 0.01076213, + "auxiliary_loss_mlp": 0.01038936, + "balance_loss_clip": 1.01204228, + "balance_loss_mlp": 1.038903, + "epoch": 0.016473771231023598, + "flos": 18877292818560.0, + "grad_norm": 1.8054884306408012, + "language_loss": 1.05533934, + "learning_rate": 3.614024787585744e-06, + "loss": 1.07649088, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37304688, + "step": 274, + "time_per_iteration": 2.462442398071289 + }, + { + "auxiliary_loss_clip": 0.01081764, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_clip": 1.01539207, + "balance_loss_mlp": 1.0430696, + "epoch": 0.016533894483691566, + "flos": 22600535339520.0, + "grad_norm": 1.6466746471753408, + "language_loss": 0.99748546, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.01874733, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.38671875, + "step": 275, + "time_per_iteration": 2.4583442211151123 + }, + { + "auxiliary_loss_clip": 0.01082498, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.01415372, + "balance_loss_mlp": 1.04391456, + "epoch": 0.01659401773635954, + "flos": 21506112950400.0, + "grad_norm": 1.4341877670405285, + "language_loss": 0.90382957, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.92509842, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.38671875, + "step": 276, + "time_per_iteration": 2.487459421157837 + }, + { + "auxiliary_loss_clip": 0.01081887, + "auxiliary_loss_mlp": 0.01043553, + "balance_loss_clip": 1.01432228, + "balance_loss_mlp": 1.04412079, + "epoch": 0.016654140989027507, + "flos": 32849673505920.0, + "grad_norm": 1.5719930585942532, + "language_loss": 0.89018023, + "learning_rate": 3.621035951423551e-06, + "loss": 0.91143465, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.37890625, + "step": 277, + "time_per_iteration": 2.576155662536621 + }, + { + "auxiliary_loss_clip": 0.01082033, + "auxiliary_loss_mlp": 0.01048118, + "balance_loss_clip": 1.01776719, + "balance_loss_mlp": 1.0435276, + "epoch": 0.016714264241695476, + "flos": 12305591602560.0, + "grad_norm": 1.8584143024471391, + "language_loss": 0.90343451, + "learning_rate": 3.623356141983041e-06, + "loss": 0.92473608, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.38476562, + "step": 278, + "time_per_iteration": 2.4338877201080322 + }, + { + "auxiliary_loss_clip": 0.01085335, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_clip": 1.01405632, + "balance_loss_mlp": 1.04590058, + "epoch": 0.016774387494363444, + "flos": 27122625745920.0, + "grad_norm": 1.6622574612475278, + "language_loss": 1.02717173, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.04846501, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.39453125, + "step": 279, + "time_per_iteration": 2.520209789276123 + }, + { + "auxiliary_loss_clip": 0.0108561, + "auxiliary_loss_mlp": 0.01043472, + "balance_loss_clip": 1.01321697, + "balance_loss_mlp": 1.04529035, + "epoch": 0.016834510747031413, + "flos": 20190515898240.0, + "grad_norm": 1.8222456034663919, + "language_loss": 1.07618022, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.097471, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.40234375, + "step": 280, + "time_per_iteration": 2.4726343154907227 + }, + { + "auxiliary_loss_clip": 0.01082315, + "auxiliary_loss_mlp": 0.01042702, + "balance_loss_clip": 1.01587999, + "balance_loss_mlp": 1.04281974, + "epoch": 0.016894633999699385, + "flos": 27272961527040.0, + "grad_norm": 1.5111924662742857, + "language_loss": 0.84199786, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.86324799, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 281, + "time_per_iteration": 2.579535722732544 + }, + { + "auxiliary_loss_clip": 0.0107898, + "auxiliary_loss_mlp": 0.01040723, + "balance_loss_clip": 1.01362658, + "balance_loss_mlp": 1.04010093, + "epoch": 0.016954757252367354, + "flos": 14902081948800.0, + "grad_norm": 2.091330182018933, + "language_loss": 1.00837684, + "learning_rate": 3.632554186750274e-06, + "loss": 1.02957392, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38867188, + "step": 282, + "time_per_iteration": 2.4451372623443604 + }, + { + "auxiliary_loss_clip": 0.0107842, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.00846422, + "balance_loss_mlp": 1.03927469, + "epoch": 0.017014880505035322, + "flos": 21357802028160.0, + "grad_norm": 1.6630350944167145, + "language_loss": 0.90203714, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.92317563, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 283, + "time_per_iteration": 2.4600670337677 + }, + { + "auxiliary_loss_clip": 0.01075972, + "auxiliary_loss_mlp": 0.0103327, + "balance_loss_clip": 1.00730634, + "balance_loss_mlp": 1.03771234, + "epoch": 0.01707500375770329, + "flos": 35331753726720.0, + "grad_norm": 1.7651061365582053, + "language_loss": 0.94349569, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.96458817, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 284, + "time_per_iteration": 2.6087069511413574 + }, + { + "auxiliary_loss_clip": 0.01074357, + "auxiliary_loss_mlp": 0.01032587, + "balance_loss_clip": 1.00675452, + "balance_loss_mlp": 1.03615403, + "epoch": 0.01713512701037126, + "flos": 23581071273600.0, + "grad_norm": 1.8840048980892565, + "language_loss": 1.09616125, + "learning_rate": 3.639367500948819e-06, + "loss": 1.11723065, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 285, + "time_per_iteration": 2.503856897354126 + }, + { + "auxiliary_loss_clip": 0.01071632, + "auxiliary_loss_mlp": 0.01034385, + "balance_loss_clip": 1.00981605, + "balance_loss_mlp": 1.03465748, + "epoch": 0.01719525026303923, + "flos": 27633474892800.0, + "grad_norm": 1.7121594971970922, + "language_loss": 1.05195332, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.07301342, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 286, + "time_per_iteration": 2.5478200912475586 + }, + { + "auxiliary_loss_clip": 0.01070873, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.00747168, + "balance_loss_mlp": 1.03270841, + "epoch": 0.0172553735157072, + "flos": 26978504186880.0, + "grad_norm": 1.4830744707498988, + "language_loss": 0.99835181, + "learning_rate": 3.643869982119001e-06, + "loss": 1.01940107, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38085938, + "step": 287, + "time_per_iteration": 2.4996163845062256 + }, + { + "auxiliary_loss_clip": 0.01069401, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.0102222, + "balance_loss_mlp": 1.03192806, + "epoch": 0.01731549676837517, + "flos": 14055962215680.0, + "grad_norm": 2.1222890079899344, + "language_loss": 1.16811979, + "learning_rate": 3.646109470232502e-06, + "loss": 1.18915868, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.375, + "step": 288, + "time_per_iteration": 2.4296839237213135 + }, + { + "auxiliary_loss_clip": 0.01058296, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02411199, + "balance_loss_mlp": 1.0347544, + "epoch": 0.017375620021043137, + "flos": 66506885959680.0, + "grad_norm": 0.932411853858951, + "language_loss": 0.64186275, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66283655, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.23535156, + "step": 289, + "time_per_iteration": 3.1736297607421875 + }, + { + "auxiliary_loss_clip": 0.01067776, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.01436472, + "balance_loss_mlp": 1.0318439, + "epoch": 0.01743574327371111, + "flos": 15224435331840.0, + "grad_norm": 2.077931203850872, + "language_loss": 1.00649023, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.02755702, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 290, + "time_per_iteration": 2.555510997772217 + }, + { + "auxiliary_loss_clip": 0.01070476, + "auxiliary_loss_mlp": 0.01056412, + "balance_loss_clip": 1.03283286, + "balance_loss_mlp": 1.03435135, + "epoch": 0.017495866526379078, + "flos": 25372708980480.0, + "grad_norm": 1.5804229723503513, + "language_loss": 0.9709962, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.99226511, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 291, + "time_per_iteration": 2.5116424560546875 + }, + { + "auxiliary_loss_clip": 0.0107268, + "auxiliary_loss_mlp": 0.01057612, + "balance_loss_clip": 1.0335201, + "balance_loss_mlp": 1.03733194, + "epoch": 0.017555989779047047, + "flos": 26358272150400.0, + "grad_norm": 1.5396855663765308, + "language_loss": 0.81635725, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.83766013, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35351562, + "step": 292, + "time_per_iteration": 2.5562639236450195 + }, + { + "auxiliary_loss_clip": 0.0106819, + "auxiliary_loss_mlp": 0.01052813, + "balance_loss_clip": 1.02935243, + "balance_loss_mlp": 1.03485465, + "epoch": 0.017616113031715015, + "flos": 22337919025920.0, + "grad_norm": 1.9071288259160022, + "language_loss": 0.98447442, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.00568449, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.33203125, + "step": 293, + "time_per_iteration": 2.485985040664673 + }, + { + "auxiliary_loss_clip": 0.01065642, + "auxiliary_loss_mlp": 0.01044677, + "balance_loss_clip": 1.02115679, + "balance_loss_mlp": 1.03222585, + "epoch": 0.017676236284382984, + "flos": 20155881962880.0, + "grad_norm": 1.5493453494418794, + "language_loss": 0.94352484, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.9646281, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.33398438, + "step": 294, + "time_per_iteration": 6.391979694366455 + }, + { + "auxiliary_loss_clip": 0.01065311, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.00950718, + "balance_loss_mlp": 1.03034544, + "epoch": 0.017736359537050956, + "flos": 25222303376640.0, + "grad_norm": 1.4990014546844366, + "language_loss": 0.93051851, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.95150423, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34960938, + "step": 295, + "time_per_iteration": 5.337981939315796 + }, + { + "auxiliary_loss_clip": 0.01066937, + "auxiliary_loss_mlp": 0.01037031, + "balance_loss_clip": 1.01364255, + "balance_loss_mlp": 1.03156435, + "epoch": 0.017796482789718925, + "flos": 20337779479680.0, + "grad_norm": 1.8545510366592748, + "language_loss": 0.93144822, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.95248789, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 296, + "time_per_iteration": 2.4316158294677734 + }, + { + "auxiliary_loss_clip": 0.01065236, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.01369596, + "balance_loss_mlp": 1.03088462, + "epoch": 0.017856606042386893, + "flos": 22378208601600.0, + "grad_norm": 1.7150698571905323, + "language_loss": 0.98072588, + "learning_rate": 3.665921869855132e-06, + "loss": 1.00173903, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 297, + "time_per_iteration": 2.4523074626922607 + }, + { + "auxiliary_loss_clip": 0.01065319, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.01766133, + "balance_loss_mlp": 1.03130651, + "epoch": 0.017916729295054862, + "flos": 20229024994560.0, + "grad_norm": 1.6845333048798552, + "language_loss": 1.01746178, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.03850627, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 298, + "time_per_iteration": 2.435473680496216 + }, + { + "auxiliary_loss_clip": 0.01066698, + "auxiliary_loss_mlp": 0.01039627, + "balance_loss_clip": 1.01650035, + "balance_loss_mlp": 1.03078902, + "epoch": 0.01797685254772283, + "flos": 19389957356160.0, + "grad_norm": 1.5144232823295136, + "language_loss": 0.97376871, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.99483192, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 299, + "time_per_iteration": 2.472338914871216 + }, + { + "auxiliary_loss_clip": 0.01066046, + "auxiliary_loss_mlp": 0.01042092, + "balance_loss_clip": 1.01832175, + "balance_loss_mlp": 1.02983689, + "epoch": 0.018036975800390802, + "flos": 24424851945600.0, + "grad_norm": 2.0186355357643553, + "language_loss": 0.81916797, + "learning_rate": 3.672392800539357e-06, + "loss": 0.84024936, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 300, + "time_per_iteration": 2.487420082092285 + }, + { + "auxiliary_loss_clip": 0.01063066, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.01330018, + "balance_loss_mlp": 1.02848518, + "epoch": 0.01809709905305877, + "flos": 15778017849600.0, + "grad_norm": 1.770811728359909, + "language_loss": 1.01069212, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.03167975, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 301, + "time_per_iteration": 2.457226276397705 + }, + { + "auxiliary_loss_clip": 0.01048458, + "auxiliary_loss_mlp": 0.01025203, + "balance_loss_clip": 1.01442671, + "balance_loss_mlp": 1.02741838, + "epoch": 0.01815722230572674, + "flos": 67344592055040.0, + "grad_norm": 0.8756461282280742, + "language_loss": 0.62486458, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64560127, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.2109375, + "step": 302, + "time_per_iteration": 3.1609270572662354 + }, + { + "auxiliary_loss_clip": 0.01069446, + "auxiliary_loss_mlp": 0.01071462, + "balance_loss_clip": 1.04638028, + "balance_loss_mlp": 1.0350486, + "epoch": 0.01821734555839471, + "flos": 15484747495680.0, + "grad_norm": 1.8035649950172024, + "language_loss": 1.02879417, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.05020332, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.34375, + "step": 303, + "time_per_iteration": 2.5072736740112305 + }, + { + "auxiliary_loss_clip": 0.01076853, + "auxiliary_loss_mlp": 0.01140822, + "balance_loss_clip": 1.11328518, + "balance_loss_mlp": 1.04200816, + "epoch": 0.018277468811062677, + "flos": 24096284340480.0, + "grad_norm": 1.6162251255892142, + "language_loss": 0.9114241, + "learning_rate": 3.680920768703364e-06, + "loss": 0.93360078, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.34765625, + "step": 304, + "time_per_iteration": 2.4957633018493652 + }, + { + "auxiliary_loss_clip": 0.01071762, + "auxiliary_loss_mlp": 0.01118964, + "balance_loss_clip": 1.09332228, + "balance_loss_mlp": 1.03888655, + "epoch": 0.01833759206373065, + "flos": 20958290807040.0, + "grad_norm": 1.4730770474972743, + "language_loss": 0.88069034, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.90259761, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.328125, + "step": 305, + "time_per_iteration": 2.488272190093994 + }, + { + "auxiliary_loss_clip": 0.01060205, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_clip": 1.04186368, + "balance_loss_mlp": 1.02768803, + "epoch": 0.018397715316398618, + "flos": 19389747888000.0, + "grad_norm": 1.6983109741610847, + "language_loss": 0.99082673, + "learning_rate": 3.685142765363119e-06, + "loss": 1.0120728, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.32421875, + "step": 306, + "time_per_iteration": 2.557779550552368 + }, + { + "auxiliary_loss_clip": 0.01052935, + "auxiliary_loss_mlp": 0.0104256, + "balance_loss_clip": 1.02005291, + "balance_loss_mlp": 1.02125943, + "epoch": 0.018457838569066586, + "flos": 29131248752640.0, + "grad_norm": 1.6367791771873348, + "language_loss": 0.97924662, + "learning_rate": 3.687243426879095e-06, + "loss": 1.00020158, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.31640625, + "step": 307, + "time_per_iteration": 2.5334136486053467 + }, + { + "auxiliary_loss_clip": 0.01053412, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.01318312, + "balance_loss_mlp": 1.02171671, + "epoch": 0.018517961821734555, + "flos": 19207640903040.0, + "grad_norm": 1.7612655894679548, + "language_loss": 0.86022341, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.88110566, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.31640625, + "step": 308, + "time_per_iteration": 2.5304481983184814 + }, + { + "auxiliary_loss_clip": 0.01061589, + "auxiliary_loss_mlp": 0.01038722, + "balance_loss_clip": 1.01480842, + "balance_loss_mlp": 1.02706218, + "epoch": 0.018578085074402523, + "flos": 19862053027200.0, + "grad_norm": 1.6695985682296173, + "language_loss": 0.97483426, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.99583733, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34375, + "step": 309, + "time_per_iteration": 2.467846632003784 + }, + { + "auxiliary_loss_clip": 0.01061935, + "auxiliary_loss_mlp": 0.0104664, + "balance_loss_clip": 1.0232271, + "balance_loss_mlp": 1.02861905, + "epoch": 0.018638208327070496, + "flos": 29605648573440.0, + "grad_norm": 1.8144101277553621, + "language_loss": 0.88109088, + "learning_rate": 3.69350459956065e-06, + "loss": 0.90217662, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.33398438, + "step": 310, + "time_per_iteration": 2.5191166400909424 + }, + { + "auxiliary_loss_clip": 0.01060532, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.01604962, + "balance_loss_mlp": 1.02774298, + "epoch": 0.018698331579738464, + "flos": 45729866131200.0, + "grad_norm": 1.5115441118037374, + "language_loss": 0.8324126, + "learning_rate": 3.695578199367497e-06, + "loss": 0.85341471, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.328125, + "step": 311, + "time_per_iteration": 2.698251485824585 + }, + { + "auxiliary_loss_clip": 0.01056954, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.01408768, + "balance_loss_mlp": 1.02504182, + "epoch": 0.018758454832406433, + "flos": 20482669088640.0, + "grad_norm": 1.9394295833107467, + "language_loss": 1.02466917, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.04559577, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.31835938, + "step": 312, + "time_per_iteration": 2.4265899658203125 + }, + { + "auxiliary_loss_clip": 0.01053329, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.01061344, + "balance_loss_mlp": 1.02112341, + "epoch": 0.0188185780850744, + "flos": 15776900686080.0, + "grad_norm": 1.7456041971512823, + "language_loss": 1.03895962, + "learning_rate": 3.699705471087043e-06, + "loss": 1.05981278, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.32226562, + "step": 313, + "time_per_iteration": 2.451723098754883 + }, + { + "auxiliary_loss_clip": 0.01053627, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.0125401, + "balance_loss_mlp": 1.02035475, + "epoch": 0.018878701337742373, + "flos": 22454633301120.0, + "grad_norm": 1.9561360397861012, + "language_loss": 0.9421671, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.96305895, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.33203125, + "step": 314, + "time_per_iteration": 2.46235728263855 + }, + { + "auxiliary_loss_clip": 0.01047211, + "auxiliary_loss_mlp": 0.01028138, + "balance_loss_clip": 1.00823021, + "balance_loss_mlp": 1.01513386, + "epoch": 0.018938824590410342, + "flos": 30992189241600.0, + "grad_norm": 2.2802747700380714, + "language_loss": 1.04170799, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.0624615, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.3203125, + "step": 315, + "time_per_iteration": 2.525439500808716 + }, + { + "auxiliary_loss_clip": 0.01048698, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.01680362, + "balance_loss_mlp": 1.01567197, + "epoch": 0.01899894784307831, + "flos": 23257775283840.0, + "grad_norm": 1.5642686849792493, + "language_loss": 0.91748989, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.93835378, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33007812, + "step": 316, + "time_per_iteration": 2.4626078605651855 + }, + { + "auxiliary_loss_clip": 0.01048181, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.0103116, + "balance_loss_mlp": 1.0150373, + "epoch": 0.01905907109574628, + "flos": 17456921176320.0, + "grad_norm": 2.001937900836701, + "language_loss": 0.9683249, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.9891153, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33203125, + "step": 317, + "time_per_iteration": 2.449324607849121 + }, + { + "auxiliary_loss_clip": 0.01050135, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.01026332, + "balance_loss_mlp": 1.01637101, + "epoch": 0.019119194348414248, + "flos": 14969499517440.0, + "grad_norm": 1.9113955023332994, + "language_loss": 1.05841649, + "learning_rate": 3.709909364265374e-06, + "loss": 1.07924581, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3359375, + "step": 318, + "time_per_iteration": 2.4522011280059814 + }, + { + "auxiliary_loss_clip": 0.01051392, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.0105865, + "balance_loss_mlp": 1.01729941, + "epoch": 0.01917931760108222, + "flos": 25481672933760.0, + "grad_norm": 1.92243857292623, + "language_loss": 1.05108058, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.07191718, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34179688, + "step": 319, + "time_per_iteration": 2.521923542022705 + }, + { + "auxiliary_loss_clip": 0.01042419, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.02857721, + "balance_loss_mlp": 1.01906168, + "epoch": 0.01923944085375019, + "flos": 71553722100480.0, + "grad_norm": 0.9766901159740592, + "language_loss": 0.60036325, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62117052, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.09716797, + "router_z_loss_mlp": 0.234375, + "step": 320, + "time_per_iteration": 2.970048666000366 + }, + { + "auxiliary_loss_clip": 0.01051188, + "auxiliary_loss_mlp": 0.01036281, + "balance_loss_clip": 1.01553869, + "balance_loss_mlp": 1.01721931, + "epoch": 0.019299564106418157, + "flos": 19681482142080.0, + "grad_norm": 1.7650813310083497, + "language_loss": 1.06671751, + "learning_rate": 3.715954969092154e-06, + "loss": 1.08759224, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33984375, + "step": 321, + "time_per_iteration": 2.4633588790893555 + }, + { + "auxiliary_loss_clip": 0.01051129, + "auxiliary_loss_mlp": 0.01049862, + "balance_loss_clip": 1.0271647, + "balance_loss_mlp": 1.01753688, + "epoch": 0.019359687359086126, + "flos": 24386063558400.0, + "grad_norm": 1.7235707672074083, + "language_loss": 0.96146882, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.98247874, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3359375, + "step": 322, + "time_per_iteration": 2.4831669330596924 + }, + { + "auxiliary_loss_clip": 0.01050158, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.01613092, + "balance_loss_mlp": 1.01716709, + "epoch": 0.019419810611754094, + "flos": 23950242656640.0, + "grad_norm": 1.6842862244031123, + "language_loss": 0.87644994, + "learning_rate": 3.719954063833981e-06, + "loss": 0.8973248, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33007812, + "step": 323, + "time_per_iteration": 2.5170605182647705 + }, + { + "auxiliary_loss_clip": 0.01049933, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.01094961, + "balance_loss_mlp": 1.01710343, + "epoch": 0.019479933864422067, + "flos": 22159233354240.0, + "grad_norm": 1.5670535378546833, + "language_loss": 1.01539159, + "learning_rate": 3.721944334919596e-06, + "loss": 1.03622198, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.328125, + "step": 324, + "time_per_iteration": 2.469804048538208 + }, + { + "auxiliary_loss_clip": 0.0105158, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.01360798, + "balance_loss_mlp": 1.01898265, + "epoch": 0.019540057117090035, + "flos": 22235727876480.0, + "grad_norm": 1.7785446031554615, + "language_loss": 0.83181769, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.85268891, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.32617188, + "step": 325, + "time_per_iteration": 2.6482508182525635 + }, + { + "auxiliary_loss_clip": 0.01052508, + "auxiliary_loss_mlp": 0.01037493, + "balance_loss_clip": 1.01610637, + "balance_loss_mlp": 1.01879108, + "epoch": 0.019600180369758004, + "flos": 23075633387520.0, + "grad_norm": 1.4824096814011942, + "language_loss": 0.86647522, + "learning_rate": 3.72590651470665e-06, + "loss": 0.88737524, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.3359375, + "step": 326, + "time_per_iteration": 2.453738212585449 + }, + { + "auxiliary_loss_clip": 0.01054779, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.0175581, + "balance_loss_mlp": 1.02065158, + "epoch": 0.019660303622425972, + "flos": 25409681976960.0, + "grad_norm": 1.7600522765780668, + "language_loss": 0.89101493, + "learning_rate": 3.727878498433505e-06, + "loss": 0.91198885, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.34179688, + "step": 327, + "time_per_iteration": 2.4836325645446777 + }, + { + "auxiliary_loss_clip": 0.01051374, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.01429844, + "balance_loss_mlp": 1.01822317, + "epoch": 0.01972042687509394, + "flos": 23656448632320.0, + "grad_norm": 1.8354560326831015, + "language_loss": 0.89268327, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.91358674, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.33203125, + "step": 328, + "time_per_iteration": 2.4601492881774902 + }, + { + "auxiliary_loss_clip": 0.01050994, + "auxiliary_loss_mlp": 0.01039396, + "balance_loss_clip": 1.016186, + "balance_loss_mlp": 1.01748824, + "epoch": 0.019780550127761913, + "flos": 18222496669440.0, + "grad_norm": 2.019168521692278, + "language_loss": 1.14488649, + "learning_rate": 3.731804438545683e-06, + "loss": 1.16579032, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.33398438, + "step": 329, + "time_per_iteration": 2.4218249320983887 + }, + { + "auxiliary_loss_clip": 0.01052977, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.0122577, + "balance_loss_mlp": 1.01951194, + "epoch": 0.01984067338042988, + "flos": 22417695216000.0, + "grad_norm": 1.9570761831401466, + "language_loss": 0.86860716, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.88949549, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.33398438, + "step": 330, + "time_per_iteration": 2.474691152572632 + }, + { + "auxiliary_loss_clip": 0.01055725, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.01241541, + "balance_loss_mlp": 1.02247572, + "epoch": 0.01990079663309785, + "flos": 17054267932800.0, + "grad_norm": 1.935387035738896, + "language_loss": 1.11344814, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.13436496, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.33203125, + "step": 331, + "time_per_iteration": 2.433915376663208 + }, + { + "auxiliary_loss_clip": 0.01054813, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_clip": 1.02033257, + "balance_loss_mlp": 1.02226162, + "epoch": 0.01996091988576582, + "flos": 15960857973120.0, + "grad_norm": 1.617532465745625, + "language_loss": 1.03551733, + "learning_rate": 3.737648825272422e-06, + "loss": 1.056499, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.32617188, + "step": 332, + "time_per_iteration": 2.4041998386383057 + }, + { + "auxiliary_loss_clip": 0.01057276, + "auxiliary_loss_mlp": 0.01049653, + "balance_loss_clip": 1.02571607, + "balance_loss_mlp": 1.02257323, + "epoch": 0.02002104313843379, + "flos": 23585330459520.0, + "grad_norm": 2.0657487740342004, + "language_loss": 0.93461829, + "learning_rate": 3.739585224276384e-06, + "loss": 0.95568752, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.34765625, + "step": 333, + "time_per_iteration": 5.939366340637207 + }, + { + "auxiliary_loss_clip": 0.01059113, + "auxiliary_loss_mlp": 0.01064763, + "balance_loss_clip": 1.03975272, + "balance_loss_mlp": 1.02462626, + "epoch": 0.02008116639110176, + "flos": 34093454158080.0, + "grad_norm": 1.6296505044350587, + "language_loss": 0.93251032, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.953749, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.34570312, + "step": 334, + "time_per_iteration": 4.023910760879517 + }, + { + "auxiliary_loss_clip": 0.01058272, + "auxiliary_loss_mlp": 0.01047056, + "balance_loss_clip": 1.0222609, + "balance_loss_mlp": 1.02456069, + "epoch": 0.020141289643769728, + "flos": 19682669128320.0, + "grad_norm": 1.5500544188091645, + "language_loss": 0.92126071, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.94231397, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3359375, + "step": 335, + "time_per_iteration": 2.4827654361724854 + }, + { + "auxiliary_loss_clip": 0.01057233, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.0189054, + "balance_loss_mlp": 1.02497244, + "epoch": 0.020201412896437697, + "flos": 20739525027840.0, + "grad_norm": 1.9435041870189311, + "language_loss": 1.04189467, + "learning_rate": 3.745359722027911e-06, + "loss": 1.06288683, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.32226562, + "step": 336, + "time_per_iteration": 2.4678378105163574 + }, + { + "auxiliary_loss_clip": 0.01057538, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.00890994, + "balance_loss_mlp": 1.02422345, + "epoch": 0.020261536149105665, + "flos": 20265474320640.0, + "grad_norm": 1.5308863184395802, + "language_loss": 0.96066093, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.98159242, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.33203125, + "step": 337, + "time_per_iteration": 2.4676594734191895 + }, + { + "auxiliary_loss_clip": 0.01052949, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.01258981, + "balance_loss_mlp": 1.02134871, + "epoch": 0.020321659401773638, + "flos": 25847562648960.0, + "grad_norm": 1.2770784875378334, + "language_loss": 0.95205188, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.97295225, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.31640625, + "step": 338, + "time_per_iteration": 2.5320656299591064 + }, + { + "auxiliary_loss_clip": 0.01051566, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.01685834, + "balance_loss_mlp": 1.02052879, + "epoch": 0.020381782654441606, + "flos": 17494033818240.0, + "grad_norm": 1.6374697253380042, + "language_loss": 0.95828885, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.97921127, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.30859375, + "step": 339, + "time_per_iteration": 2.4545087814331055 + }, + { + "auxiliary_loss_clip": 0.01054079, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.01008928, + "balance_loss_mlp": 1.02278709, + "epoch": 0.020441905907109575, + "flos": 24242779872000.0, + "grad_norm": 1.5248951722341324, + "language_loss": 0.97586143, + "learning_rate": 3.75297936342452e-06, + "loss": 0.99673748, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3125, + "step": 340, + "time_per_iteration": 2.4710581302642822 + }, + { + "auxiliary_loss_clip": 0.01055147, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_clip": 1.01520276, + "balance_loss_mlp": 1.02346206, + "epoch": 0.020502029159777543, + "flos": 22232306563200.0, + "grad_norm": 1.6479682549609678, + "language_loss": 0.97359681, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.99452817, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.31640625, + "step": 341, + "time_per_iteration": 2.513909339904785 + }, + { + "auxiliary_loss_clip": 0.01053362, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_clip": 1.01914048, + "balance_loss_mlp": 1.02214515, + "epoch": 0.020562152412445512, + "flos": 23986726894080.0, + "grad_norm": 2.442168980302781, + "language_loss": 0.96863955, + "learning_rate": 3.756755633390458e-06, + "loss": 0.98959607, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3125, + "step": 342, + "time_per_iteration": 2.480635166168213 + }, + { + "auxiliary_loss_clip": 0.01053921, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.01089549, + "balance_loss_mlp": 1.0227809, + "epoch": 0.020622275665113484, + "flos": 26974210089600.0, + "grad_norm": 1.4513318263038222, + "language_loss": 0.97555053, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.99641925, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3125, + "step": 343, + "time_per_iteration": 2.5186679363250732 + }, + { + "auxiliary_loss_clip": 0.01052964, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.01142383, + "balance_loss_mlp": 1.02210093, + "epoch": 0.020682398917781453, + "flos": 22599627644160.0, + "grad_norm": 1.5596747561495, + "language_loss": 0.87155473, + "learning_rate": 3.7605098841644e-06, + "loss": 0.89241517, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.30859375, + "step": 344, + "time_per_iteration": 2.4981014728546143 + }, + { + "auxiliary_loss_clip": 0.01052181, + "auxiliary_loss_mlp": 0.0103953, + "balance_loss_clip": 1.01758397, + "balance_loss_mlp": 1.02081704, + "epoch": 0.02074252217044942, + "flos": 15012686736000.0, + "grad_norm": 1.4032766726695793, + "language_loss": 0.88269889, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.90361595, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.31445312, + "step": 345, + "time_per_iteration": 2.511490821838379 + }, + { + "auxiliary_loss_clip": 0.01050155, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_clip": 1.0206883, + "balance_loss_mlp": 1.0178318, + "epoch": 0.02080264542311739, + "flos": 25336783324800.0, + "grad_norm": 1.7472236291153327, + "language_loss": 0.97384208, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.99477637, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.32421875, + "step": 346, + "time_per_iteration": 2.5483906269073486 + }, + { + "auxiliary_loss_clip": 0.01046587, + "auxiliary_loss_mlp": 0.01040461, + "balance_loss_clip": 1.01759708, + "balance_loss_mlp": 1.01515579, + "epoch": 0.02086276867578536, + "flos": 24387669480960.0, + "grad_norm": 1.6794710914870128, + "language_loss": 0.91511512, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.93598562, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.31445312, + "step": 347, + "time_per_iteration": 2.651024341583252 + }, + { + "auxiliary_loss_clip": 0.01045041, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.01579976, + "balance_loss_mlp": 1.0138284, + "epoch": 0.02092289192845333, + "flos": 24461056892160.0, + "grad_norm": 1.5253953557080513, + "language_loss": 0.82996881, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.85079861, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3125, + "step": 348, + "time_per_iteration": 2.568885564804077 + }, + { + "auxiliary_loss_clip": 0.01046997, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.00860095, + "balance_loss_mlp": 1.01523268, + "epoch": 0.0209830151811213, + "flos": 17450392752000.0, + "grad_norm": 1.8800966145467317, + "language_loss": 0.92315567, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.94392145, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.31835938, + "step": 349, + "time_per_iteration": 2.4211666584014893 + }, + { + "auxiliary_loss_clip": 0.01052959, + "auxiliary_loss_mlp": 0.01052669, + "balance_loss_clip": 1.0276711, + "balance_loss_mlp": 1.01985884, + "epoch": 0.021043138433789268, + "flos": 24572778842880.0, + "grad_norm": 1.6042996410551307, + "language_loss": 0.92770702, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.94876331, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.33203125, + "step": 350, + "time_per_iteration": 2.514585018157959 + }, + { + "auxiliary_loss_clip": 0.01056169, + "auxiliary_loss_mlp": 0.01056151, + "balance_loss_clip": 1.03041339, + "balance_loss_mlp": 1.02121615, + "epoch": 0.021103261686457236, + "flos": 24453132013440.0, + "grad_norm": 1.7415094709691268, + "language_loss": 0.89059722, + "learning_rate": 3.773480007028776e-06, + "loss": 0.91172045, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.34960938, + "step": 351, + "time_per_iteration": 2.4895401000976562 + }, + { + "auxiliary_loss_clip": 0.01055467, + "auxiliary_loss_mlp": 0.01055435, + "balance_loss_clip": 1.03000808, + "balance_loss_mlp": 1.02140582, + "epoch": 0.021163384939125205, + "flos": 14682233917440.0, + "grad_norm": 1.6683731560860307, + "language_loss": 0.96391118, + "learning_rate": 3.775311735671078e-06, + "loss": 0.98502004, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.33984375, + "step": 352, + "time_per_iteration": 2.4473307132720947 + }, + { + "auxiliary_loss_clip": 0.01054252, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.01323867, + "balance_loss_mlp": 1.02056229, + "epoch": 0.021223508191793177, + "flos": 24492199691520.0, + "grad_norm": 1.7019599123993663, + "language_loss": 0.91113013, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.932055, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3359375, + "step": 353, + "time_per_iteration": 2.553689956665039 + }, + { + "auxiliary_loss_clip": 0.01053495, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.01581073, + "balance_loss_mlp": 1.02077854, + "epoch": 0.021283631444461146, + "flos": 24126030685440.0, + "grad_norm": 1.7864862226037206, + "language_loss": 0.88779557, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.90873361, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.328125, + "step": 354, + "time_per_iteration": 2.511162757873535 + }, + { + "auxiliary_loss_clip": 0.01056654, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.01117396, + "balance_loss_mlp": 1.0234791, + "epoch": 0.021343754697129114, + "flos": 25191055843200.0, + "grad_norm": 1.751334386513014, + "language_loss": 0.91461623, + "learning_rate": 3.780775860546545e-06, + "loss": 0.93556607, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.33203125, + "step": 355, + "time_per_iteration": 2.551647901535034 + }, + { + "auxiliary_loss_clip": 0.01055938, + "auxiliary_loss_mlp": 0.01037525, + "balance_loss_clip": 1.01176405, + "balance_loss_mlp": 1.02295291, + "epoch": 0.021403877949797083, + "flos": 17273243180160.0, + "grad_norm": 1.8561163947937789, + "language_loss": 1.03026938, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.05120397, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.33007812, + "step": 356, + "time_per_iteration": 2.4424355030059814 + }, + { + "auxiliary_loss_clip": 0.01051904, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.01509404, + "balance_loss_mlp": 1.02103949, + "epoch": 0.021464001202465055, + "flos": 30916183478400.0, + "grad_norm": 1.6393247965911175, + "language_loss": 0.90655422, + "learning_rate": 3.784393017158528e-06, + "loss": 0.927477, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.30859375, + "step": 357, + "time_per_iteration": 2.550257444381714 + }, + { + "auxiliary_loss_clip": 0.01049646, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.00806332, + "balance_loss_mlp": 1.01878524, + "epoch": 0.021524124455133024, + "flos": 18185418938880.0, + "grad_norm": 2.033949581243725, + "language_loss": 0.89859986, + "learning_rate": 3.786194003461506e-06, + "loss": 0.91941202, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.30859375, + "step": 358, + "time_per_iteration": 2.420276641845703 + }, + { + "auxiliary_loss_clip": 0.01052544, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.0091536, + "balance_loss_mlp": 1.01939023, + "epoch": 0.021584247707800992, + "flos": 13805006296320.0, + "grad_norm": 1.7411050690070378, + "language_loss": 1.01038039, + "learning_rate": 3.787989966086264e-06, + "loss": 1.03125739, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.33203125, + "step": 359, + "time_per_iteration": 2.4263339042663574 + }, + { + "auxiliary_loss_clip": 0.0105698, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.0122776, + "balance_loss_mlp": 1.02343392, + "epoch": 0.02164437096046896, + "flos": 23293596205440.0, + "grad_norm": 1.975109024806782, + "language_loss": 0.95856643, + "learning_rate": 3.789780932980997e-06, + "loss": 0.97952104, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3359375, + "step": 360, + "time_per_iteration": 2.4664599895477295 + }, + { + "auxiliary_loss_clip": 0.01040569, + "auxiliary_loss_mlp": 0.01018223, + "balance_loss_clip": 1.00878167, + "balance_loss_mlp": 1.02093506, + "epoch": 0.02170449421313693, + "flos": 68896237875840.0, + "grad_norm": 0.8994916100683901, + "language_loss": 0.65260315, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67319101, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.09423828, + "router_z_loss_mlp": 0.19628906, + "step": 361, + "time_per_iteration": 3.174626588821411 + }, + { + "auxiliary_loss_clip": 0.01062255, + "auxiliary_loss_mlp": 0.01083123, + "balance_loss_clip": 1.05673039, + "balance_loss_mlp": 1.02799392, + "epoch": 0.0217646174658049, + "flos": 25227365523840.0, + "grad_norm": 2.1069719654627637, + "language_loss": 0.9514755, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.9729293, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.34179688, + "step": 362, + "time_per_iteration": 2.489434003829956 + }, + { + "auxiliary_loss_clip": 0.01065318, + "auxiliary_loss_mlp": 0.01135035, + "balance_loss_clip": 1.10979843, + "balance_loss_mlp": 1.02964151, + "epoch": 0.02182474071847287, + "flos": 22892025214080.0, + "grad_norm": 1.7524061382011709, + "language_loss": 1.01685953, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.03886318, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35546875, + "step": 363, + "time_per_iteration": 2.4795219898223877 + }, + { + "auxiliary_loss_clip": 0.01058261, + "auxiliary_loss_mlp": 0.01096419, + "balance_loss_clip": 1.07308948, + "balance_loss_mlp": 1.0255233, + "epoch": 0.02188486397114084, + "flos": 23657879998080.0, + "grad_norm": 1.7047753876417933, + "language_loss": 1.00673032, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.02827704, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.328125, + "step": 364, + "time_per_iteration": 2.5590672492980957 + }, + { + "auxiliary_loss_clip": 0.01055543, + "auxiliary_loss_mlp": 0.01062395, + "balance_loss_clip": 1.04016221, + "balance_loss_mlp": 1.02388453, + "epoch": 0.021944987223808807, + "flos": 21542562276480.0, + "grad_norm": 1.7183442361974162, + "language_loss": 0.93013781, + "learning_rate": 3.798661793553676e-06, + "loss": 0.95131719, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.31640625, + "step": 365, + "time_per_iteration": 2.543931007385254 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.00805211, + "balance_loss_mlp": 1.02955437, + "epoch": 0.022005110476476776, + "flos": 16069961571840.0, + "grad_norm": 1.4965838347684615, + "language_loss": 0.91469157, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.93562675, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.32226562, + "step": 366, + "time_per_iteration": 2.461550712585449 + }, + { + "auxiliary_loss_clip": 0.0107226, + "auxiliary_loss_mlp": 0.01053499, + "balance_loss_clip": 1.02729654, + "balance_loss_mlp": 1.03793919, + "epoch": 0.022065233729144748, + "flos": 21432655716480.0, + "grad_norm": 1.7436805288872164, + "language_loss": 1.00496304, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.02622056, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.34375, + "step": 367, + "time_per_iteration": 2.482743740081787 + }, + { + "auxiliary_loss_clip": 0.0107694, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_clip": 1.05421877, + "balance_loss_mlp": 1.04025459, + "epoch": 0.022125356981812717, + "flos": 21542632099200.0, + "grad_norm": 1.5872538070346651, + "language_loss": 0.96469468, + "learning_rate": 3.803932100062912e-06, + "loss": 0.98628974, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3671875, + "step": 368, + "time_per_iteration": 2.473029136657715 + }, + { + "auxiliary_loss_clip": 0.01070991, + "auxiliary_loss_mlp": 0.01094667, + "balance_loss_clip": 1.06771421, + "balance_loss_mlp": 1.03560328, + "epoch": 0.022185480234480685, + "flos": 20703110613120.0, + "grad_norm": 2.188996590859147, + "language_loss": 0.96622384, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.98788047, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.35351562, + "step": 369, + "time_per_iteration": 2.4747204780578613 + }, + { + "auxiliary_loss_clip": 0.01067443, + "auxiliary_loss_mlp": 0.01107009, + "balance_loss_clip": 1.08313107, + "balance_loss_mlp": 1.03387809, + "epoch": 0.022245603487148654, + "flos": 25191998449920.0, + "grad_norm": 1.7452480599431255, + "language_loss": 0.94708717, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.96883172, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3359375, + "step": 370, + "time_per_iteration": 2.5162887573242188 + }, + { + "auxiliary_loss_clip": 0.01065255, + "auxiliary_loss_mlp": 0.01076542, + "balance_loss_clip": 1.05291522, + "balance_loss_mlp": 1.0340147, + "epoch": 0.022305726739816623, + "flos": 21394914670080.0, + "grad_norm": 1.4209812785347278, + "language_loss": 0.89684153, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.9182595, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3125, + "step": 371, + "time_per_iteration": 2.4882850646972656 + }, + { + "auxiliary_loss_clip": 0.01063969, + "auxiliary_loss_mlp": 0.01051573, + "balance_loss_clip": 1.02742112, + "balance_loss_mlp": 1.03394675, + "epoch": 0.022365849992484595, + "flos": 22491047715840.0, + "grad_norm": 1.993940242841274, + "language_loss": 0.982355, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.00351048, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.30078125, + "step": 372, + "time_per_iteration": 2.459507703781128 + }, + { + "auxiliary_loss_clip": 0.01057762, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.01113582, + "balance_loss_mlp": 1.0273459, + "epoch": 0.022425973245152563, + "flos": 17855664347520.0, + "grad_norm": 2.018832828161093, + "language_loss": 0.94146943, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.96242565, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3046875, + "step": 373, + "time_per_iteration": 7.351617336273193 + }, + { + "auxiliary_loss_clip": 0.01059714, + "auxiliary_loss_mlp": 0.01083468, + "balance_loss_clip": 1.05523908, + "balance_loss_mlp": 1.02653718, + "epoch": 0.022486096497820532, + "flos": 15482233877760.0, + "grad_norm": 1.9791609312613758, + "language_loss": 0.9598487, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.98128051, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.33203125, + "step": 374, + "time_per_iteration": 3.8822717666625977 + }, + { + "auxiliary_loss_clip": 0.01057041, + "auxiliary_loss_mlp": 0.01137261, + "balance_loss_clip": 1.10824502, + "balance_loss_mlp": 1.02234483, + "epoch": 0.0225462197504885, + "flos": 27782868067200.0, + "grad_norm": 1.5808787138599067, + "language_loss": 0.92508572, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.94702882, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.34765625, + "step": 375, + "time_per_iteration": 2.5588536262512207 + }, + { + "auxiliary_loss_clip": 0.01058581, + "auxiliary_loss_mlp": 0.01095103, + "balance_loss_clip": 1.07077229, + "balance_loss_mlp": 1.02340436, + "epoch": 0.02260634300315647, + "flos": 19974438293760.0, + "grad_norm": 1.9567445915504975, + "language_loss": 0.98526859, + "learning_rate": 3.817778917253314e-06, + "loss": 1.00680542, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3515625, + "step": 376, + "time_per_iteration": 2.4672186374664307 + }, + { + "auxiliary_loss_clip": 0.01058407, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_clip": 1.03661895, + "balance_loss_mlp": 1.0230149, + "epoch": 0.02266646625582444, + "flos": 16027437669120.0, + "grad_norm": 2.7299721499892424, + "language_loss": 0.93942362, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.96063268, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.35351562, + "step": 377, + "time_per_iteration": 2.4267687797546387 + }, + { + "auxiliary_loss_clip": 0.01056123, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.01075971, + "balance_loss_mlp": 1.02317297, + "epoch": 0.02272658950849241, + "flos": 20403800593920.0, + "grad_norm": 1.7576562012774108, + "language_loss": 1.07824039, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.09913683, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.33007812, + "step": 378, + "time_per_iteration": 2.473787307739258 + }, + { + "auxiliary_loss_clip": 0.01042357, + "auxiliary_loss_mlp": 0.01039295, + "balance_loss_clip": 1.02851868, + "balance_loss_mlp": 1.02332401, + "epoch": 0.02278671276116038, + "flos": 69843990176640.0, + "grad_norm": 1.0172823467558303, + "language_loss": 0.75652194, + "learning_rate": 3.822895650276492e-06, + "loss": 0.7773385, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.19042969, + "step": 379, + "time_per_iteration": 3.078988790512085 + }, + { + "auxiliary_loss_clip": 0.01050903, + "auxiliary_loss_mlp": 0.01095351, + "balance_loss_clip": 1.06593013, + "balance_loss_mlp": 1.01971483, + "epoch": 0.022846836013828347, + "flos": 38507243927040.0, + "grad_norm": 1.8581431618441673, + "language_loss": 0.92617601, + "learning_rate": 3.824592231451859e-06, + "loss": 0.94763857, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.3125, + "step": 380, + "time_per_iteration": 2.626676559448242 + }, + { + "auxiliary_loss_clip": 0.01049226, + "auxiliary_loss_mlp": 0.01140949, + "balance_loss_clip": 1.10895348, + "balance_loss_mlp": 1.01942563, + "epoch": 0.02290695926649632, + "flos": 20958430452480.0, + "grad_norm": 1.8444911428781403, + "language_loss": 1.07154179, + "learning_rate": 3.826284353801652e-06, + "loss": 1.09344351, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.296875, + "step": 381, + "time_per_iteration": 2.442504405975342 + }, + { + "auxiliary_loss_clip": 0.01046808, + "auxiliary_loss_mlp": 0.01159748, + "balance_loss_clip": 1.12853909, + "balance_loss_mlp": 1.0167253, + "epoch": 0.022967082519164288, + "flos": 24021325918080.0, + "grad_norm": 1.8375417247472114, + "language_loss": 0.98562652, + "learning_rate": 3.827972040701142e-06, + "loss": 1.0076921, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.30078125, + "step": 382, + "time_per_iteration": 2.524433135986328 + }, + { + "auxiliary_loss_clip": 0.01045008, + "auxiliary_loss_mlp": 0.01154577, + "balance_loss_clip": 1.12532306, + "balance_loss_mlp": 1.01346707, + "epoch": 0.023027205771832256, + "flos": 20996066764800.0, + "grad_norm": 1.5887092471825892, + "language_loss": 0.96227187, + "learning_rate": 3.829655315342268e-06, + "loss": 0.98426771, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.31445312, + "step": 383, + "time_per_iteration": 2.5006637573242188 + }, + { + "auxiliary_loss_clip": 0.01049137, + "auxiliary_loss_mlp": 0.01132752, + "balance_loss_clip": 1.10856426, + "balance_loss_mlp": 1.01547778, + "epoch": 0.023087329024500225, + "flos": 21359757064320.0, + "grad_norm": 1.67865832263442, + "language_loss": 0.96721143, + "learning_rate": 3.831334200735543e-06, + "loss": 0.98903036, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3359375, + "step": 384, + "time_per_iteration": 2.4451935291290283 + }, + { + "auxiliary_loss_clip": 0.01054972, + "auxiliary_loss_mlp": 0.01067441, + "balance_loss_clip": 1.04473174, + "balance_loss_mlp": 1.02155185, + "epoch": 0.023147452277168194, + "flos": 21871339349760.0, + "grad_norm": 1.5653887655972567, + "language_loss": 0.96656799, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.98779213, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.33398438, + "step": 385, + "time_per_iteration": 2.4414944648742676 + }, + { + "auxiliary_loss_clip": 0.01060448, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.01226926, + "balance_loss_mlp": 1.02736878, + "epoch": 0.023207575529836166, + "flos": 18915697180800.0, + "grad_norm": 1.530763170308701, + "language_loss": 0.75778854, + "learning_rate": 3.83467889492477e-06, + "loss": 0.77878106, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.33203125, + "step": 386, + "time_per_iteration": 2.4757237434387207 + }, + { + "auxiliary_loss_clip": 0.01067537, + "auxiliary_loss_mlp": 0.01124646, + "balance_loss_clip": 1.09753752, + "balance_loss_mlp": 1.03222609, + "epoch": 0.023267698782504134, + "flos": 25044839602560.0, + "grad_norm": 1.578774471492193, + "language_loss": 0.95899045, + "learning_rate": 3.836344748851495e-06, + "loss": 0.98091227, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.35351562, + "step": 387, + "time_per_iteration": 2.5410001277923584 + }, + { + "auxiliary_loss_clip": 0.0107026, + "auxiliary_loss_mlp": 0.01244227, + "balance_loss_clip": 1.21561658, + "balance_loss_mlp": 1.033319, + "epoch": 0.023327822035172103, + "flos": 28877883949440.0, + "grad_norm": 1.6360884572049952, + "language_loss": 0.94518, + "learning_rate": 3.838006303795566e-06, + "loss": 0.9683249, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.36914062, + "step": 388, + "time_per_iteration": 2.583221197128296 + }, + { + "auxiliary_loss_clip": 0.01061667, + "auxiliary_loss_mlp": 0.01245961, + "balance_loss_clip": 1.21696925, + "balance_loss_mlp": 1.02636814, + "epoch": 0.02338794528784007, + "flos": 27120426330240.0, + "grad_norm": 1.9083147035832377, + "language_loss": 1.06098771, + "learning_rate": 3.839663581888206e-06, + "loss": 1.08406401, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.35351562, + "step": 389, + "time_per_iteration": 2.4842751026153564 + }, + { + "auxiliary_loss_clip": 0.01054832, + "auxiliary_loss_mlp": 0.01149739, + "balance_loss_clip": 1.12221324, + "balance_loss_mlp": 1.02061367, + "epoch": 0.02344806854050804, + "flos": 21321352702080.0, + "grad_norm": 1.719351910528869, + "language_loss": 0.97008938, + "learning_rate": 3.841316605090178e-06, + "loss": 0.99213517, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.34375, + "step": 390, + "time_per_iteration": 2.4864470958709717 + }, + { + "auxiliary_loss_clip": 0.0105106, + "auxiliary_loss_mlp": 0.01061157, + "balance_loss_clip": 1.03557444, + "balance_loss_mlp": 1.01914644, + "epoch": 0.023508191793176012, + "flos": 24788856447360.0, + "grad_norm": 1.939306325924463, + "language_loss": 1.04287207, + "learning_rate": 3.842965395193529e-06, + "loss": 1.06399429, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.31835938, + "step": 391, + "time_per_iteration": 2.491482973098755 + }, + { + "auxiliary_loss_clip": 0.01058708, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.01037192, + "balance_loss_mlp": 1.02537847, + "epoch": 0.02356831504584398, + "flos": 25994162914560.0, + "grad_norm": 1.6564997196821902, + "language_loss": 0.97874367, + "learning_rate": 3.84460997382332e-06, + "loss": 0.9996779, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.33398438, + "step": 392, + "time_per_iteration": 2.5364246368408203 + }, + { + "auxiliary_loss_clip": 0.01071197, + "auxiliary_loss_mlp": 0.01052446, + "balance_loss_clip": 1.0235858, + "balance_loss_mlp": 1.03618598, + "epoch": 0.02362843829851195, + "flos": 19061459573760.0, + "grad_norm": 1.677648731787316, + "language_loss": 0.97780597, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.99904239, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.34960938, + "step": 393, + "time_per_iteration": 2.453315258026123 + }, + { + "auxiliary_loss_clip": 0.01086634, + "auxiliary_loss_mlp": 0.01083953, + "balance_loss_clip": 1.05202913, + "balance_loss_mlp": 1.04925573, + "epoch": 0.023688561551179918, + "flos": 16070101217280.0, + "grad_norm": 1.504603378010179, + "language_loss": 0.89725667, + "learning_rate": 3.84788658233771e-06, + "loss": 0.9189626, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.375, + "step": 394, + "time_per_iteration": 2.4793944358825684 + }, + { + "auxiliary_loss_clip": 0.01083467, + "auxiliary_loss_mlp": 0.01116137, + "balance_loss_clip": 1.08147073, + "balance_loss_mlp": 1.04631972, + "epoch": 0.023748684803847887, + "flos": 21723342629760.0, + "grad_norm": 1.5473601816260492, + "language_loss": 0.93547249, + "learning_rate": 3.84951865465269e-06, + "loss": 0.95746851, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.37109375, + "step": 395, + "time_per_iteration": 2.499711751937866 + }, + { + "auxiliary_loss_clip": 0.01068875, + "auxiliary_loss_mlp": 0.01130705, + "balance_loss_clip": 1.11344326, + "balance_loss_mlp": 1.04772222, + "epoch": 0.02380880805651586, + "flos": 61923175136640.0, + "grad_norm": 1.034173276045498, + "language_loss": 0.64108658, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66308236, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.2109375, + "step": 396, + "time_per_iteration": 2.9469611644744873 + }, + { + "auxiliary_loss_clip": 0.01062368, + "auxiliary_loss_mlp": 0.01176483, + "balance_loss_clip": 1.1380024, + "balance_loss_mlp": 1.02662802, + "epoch": 0.023868931309183827, + "flos": 20265299763840.0, + "grad_norm": 1.9732415497440556, + "language_loss": 0.99557865, + "learning_rate": 3.852770440269372e-06, + "loss": 1.01796722, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.35742188, + "step": 397, + "time_per_iteration": 2.4573190212249756 + }, + { + "auxiliary_loss_clip": 0.01056196, + "auxiliary_loss_mlp": 0.01191142, + "balance_loss_clip": 1.15587997, + "balance_loss_mlp": 1.02041304, + "epoch": 0.023929054561851796, + "flos": 21138128553600.0, + "grad_norm": 1.7634831749766737, + "language_loss": 1.00372171, + "learning_rate": 3.854390195044404e-06, + "loss": 1.02619505, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.35742188, + "step": 398, + "time_per_iteration": 2.5018601417541504 + }, + { + "auxiliary_loss_clip": 0.01055112, + "auxiliary_loss_mlp": 0.01157312, + "balance_loss_clip": 1.12543523, + "balance_loss_mlp": 1.02073753, + "epoch": 0.023989177814519765, + "flos": 13697683176960.0, + "grad_norm": 2.095343063557732, + "language_loss": 1.07544744, + "learning_rate": 3.856005885185868e-06, + "loss": 1.09757161, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.34375, + "step": 399, + "time_per_iteration": 2.452336072921753 + }, + { + "auxiliary_loss_clip": 0.01061664, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_clip": 1.10282016, + "balance_loss_mlp": 1.02799106, + "epoch": 0.024049301067187733, + "flos": 26320845306240.0, + "grad_norm": 1.71131071693994, + "language_loss": 0.93634069, + "learning_rate": 3.857617531042398e-06, + "loss": 0.95824564, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3359375, + "step": 400, + "time_per_iteration": 2.5170485973358154 + }, + { + "auxiliary_loss_clip": 0.01077872, + "auxiliary_loss_mlp": 0.010685, + "balance_loss_clip": 1.04061723, + "balance_loss_mlp": 1.04331744, + "epoch": 0.024109424319855705, + "flos": 24424293363840.0, + "grad_norm": 1.5299244848717946, + "language_loss": 0.88781524, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.90927899, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.34570312, + "step": 401, + "time_per_iteration": 2.51882004737854 + }, + { + "auxiliary_loss_clip": 0.01105542, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.00915468, + "balance_loss_mlp": 1.0690577, + "epoch": 0.024169547572523674, + "flos": 29603169866880.0, + "grad_norm": 1.6525525823408749, + "language_loss": 0.90059721, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.9220134, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.36523438, + "step": 402, + "time_per_iteration": 2.570519208908081 + }, + { + "auxiliary_loss_clip": 0.01135296, + "auxiliary_loss_mlp": 0.01137525, + "balance_loss_clip": 1.10328794, + "balance_loss_mlp": 1.09048748, + "epoch": 0.024229670825191642, + "flos": 22600360782720.0, + "grad_norm": 2.1212160614934357, + "language_loss": 1.08712173, + "learning_rate": 3.86242840411147e-06, + "loss": 1.10984993, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.44726562, + "step": 403, + "time_per_iteration": 2.5086300373077393 + }, + { + "auxiliary_loss_clip": 0.01152256, + "auxiliary_loss_mlp": 0.01310818, + "balance_loss_clip": 1.27186012, + "balance_loss_mlp": 1.09858251, + "epoch": 0.02428979407785961, + "flos": 18149283815040.0, + "grad_norm": 2.012260642036847, + "language_loss": 1.10839891, + "learning_rate": 3.864024073288798e-06, + "loss": 1.1330297, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.53515625, + "step": 404, + "time_per_iteration": 2.5056493282318115 + }, + { + "auxiliary_loss_clip": 0.01141174, + "auxiliary_loss_mlp": 0.01313721, + "balance_loss_clip": 1.26961398, + "balance_loss_mlp": 1.08737814, + "epoch": 0.024349917330527583, + "flos": 15304071876480.0, + "grad_norm": 1.7520327690697088, + "language_loss": 0.99499238, + "learning_rate": 3.865615797668091e-06, + "loss": 1.01954138, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.5390625, + "step": 405, + "time_per_iteration": 2.4998738765716553 + }, + { + "auxiliary_loss_clip": 0.01106758, + "auxiliary_loss_mlp": 0.01188182, + "balance_loss_clip": 1.14018893, + "balance_loss_mlp": 1.05776453, + "epoch": 0.024410040583195552, + "flos": 20772937065600.0, + "grad_norm": 1.7700789791114835, + "language_loss": 1.04371715, + "learning_rate": 3.867203596705844e-06, + "loss": 1.06666672, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.49023438, + "step": 406, + "time_per_iteration": 2.4956769943237305 + }, + { + "auxiliary_loss_clip": 0.0108177, + "auxiliary_loss_mlp": 0.01118397, + "balance_loss_clip": 1.07083273, + "balance_loss_mlp": 1.03623927, + "epoch": 0.02447016383586352, + "flos": 21797777381760.0, + "grad_norm": 1.7172087592256737, + "language_loss": 0.97895312, + "learning_rate": 3.86878748971496e-06, + "loss": 1.00095487, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.45507812, + "step": 407, + "time_per_iteration": 2.485947370529175 + }, + { + "auxiliary_loss_clip": 0.01070793, + "auxiliary_loss_mlp": 0.01068357, + "balance_loss_clip": 1.02053034, + "balance_loss_mlp": 1.02987337, + "epoch": 0.02453028708853149, + "flos": 33946714247040.0, + "grad_norm": 1.4417177831272252, + "language_loss": 0.82290316, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.84429467, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.40820312, + "step": 408, + "time_per_iteration": 2.583381414413452 + }, + { + "auxiliary_loss_clip": 0.01074591, + "auxiliary_loss_mlp": 0.01082551, + "balance_loss_clip": 1.03734636, + "balance_loss_mlp": 1.03423309, + "epoch": 0.024590410341199458, + "flos": 21792086830080.0, + "grad_norm": 2.234857793951266, + "language_loss": 1.06532502, + "learning_rate": 3.871943634189376e-06, + "loss": 1.08689642, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.40429688, + "step": 409, + "time_per_iteration": 2.458040952682495 + }, + { + "auxiliary_loss_clip": 0.010873, + "auxiliary_loss_mlp": 0.01085392, + "balance_loss_clip": 1.03890085, + "balance_loss_mlp": 1.04669189, + "epoch": 0.02465053359386743, + "flos": 35113371972480.0, + "grad_norm": 1.782492068289133, + "language_loss": 0.93966281, + "learning_rate": 3.873515923575128e-06, + "loss": 0.96138972, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.40625, + "step": 410, + "time_per_iteration": 2.591573476791382 + }, + { + "auxiliary_loss_clip": 0.01097663, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.04718947, + "balance_loss_mlp": 1.0561378, + "epoch": 0.0247106568465354, + "flos": 27450250744320.0, + "grad_norm": 2.0100138812826267, + "language_loss": 0.91941869, + "learning_rate": 3.875084382775879e-06, + "loss": 0.94130802, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.4140625, + "step": 411, + "time_per_iteration": 2.520829916000366 + }, + { + "auxiliary_loss_clip": 0.01102946, + "auxiliary_loss_mlp": 0.01104003, + "balance_loss_clip": 1.05677199, + "balance_loss_mlp": 1.06026542, + "epoch": 0.024770780099203367, + "flos": 20702761499520.0, + "grad_norm": 1.7488256910988416, + "language_loss": 0.98558384, + "learning_rate": 3.87664903040738e-06, + "loss": 1.00765324, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.42578125, + "step": 412, + "time_per_iteration": 4.009392499923706 + }, + { + "auxiliary_loss_clip": 0.01067275, + "auxiliary_loss_mlp": 0.01123078, + "balance_loss_clip": 1.09828234, + "balance_loss_mlp": 1.04154396, + "epoch": 0.024830903351871336, + "flos": 69548625141120.0, + "grad_norm": 0.964304275836295, + "language_loss": 0.58828115, + "learning_rate": 3.878209884949994e-06, + "loss": 0.61018467, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.2578125, + "step": 413, + "time_per_iteration": 4.518808841705322 + }, + { + "auxiliary_loss_clip": 0.01080489, + "auxiliary_loss_mlp": 0.01352252, + "balance_loss_clip": 1.29920411, + "balance_loss_mlp": 1.0407443, + "epoch": 0.024891026604539304, + "flos": 32269102640640.0, + "grad_norm": 1.630355473054429, + "language_loss": 0.88304049, + "learning_rate": 3.879766964750006e-06, + "loss": 0.90736794, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.3984375, + "step": 414, + "time_per_iteration": 3.9202213287353516 + }, + { + "auxiliary_loss_clip": 0.01061057, + "auxiliary_loss_mlp": 0.01217555, + "balance_loss_clip": 1.17027617, + "balance_loss_mlp": 1.02325416, + "epoch": 0.024951149857207276, + "flos": 18839377215360.0, + "grad_norm": 1.7733515973750797, + "language_loss": 0.91122979, + "learning_rate": 3.881320288020917e-06, + "loss": 0.93401593, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.37890625, + "step": 415, + "time_per_iteration": 2.4326889514923096 + }, + { + "auxiliary_loss_clip": 0.01069004, + "auxiliary_loss_mlp": 0.01109241, + "balance_loss_clip": 1.06813788, + "balance_loss_mlp": 1.02867365, + "epoch": 0.025011273109875245, + "flos": 15376307212800.0, + "grad_norm": 1.9580423386031647, + "language_loss": 1.17924917, + "learning_rate": 3.882869872844723e-06, + "loss": 1.20103168, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.40234375, + "step": 416, + "time_per_iteration": 2.460941791534424 + }, + { + "auxiliary_loss_clip": 0.01079348, + "auxiliary_loss_mlp": 0.01226701, + "balance_loss_clip": 1.18521583, + "balance_loss_mlp": 1.03785563, + "epoch": 0.025071396362543213, + "flos": 18914545105920.0, + "grad_norm": 1.4849245568933516, + "language_loss": 0.86919314, + "learning_rate": 3.884415737173176e-06, + "loss": 0.89225364, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.4140625, + "step": 417, + "time_per_iteration": 2.480829954147339 + }, + { + "auxiliary_loss_clip": 0.01092358, + "auxiliary_loss_mlp": 0.01151558, + "balance_loss_clip": 1.11403108, + "balance_loss_mlp": 1.04928017, + "epoch": 0.025131519615211182, + "flos": 25336783324800.0, + "grad_norm": 1.5055364611404225, + "language_loss": 0.8663283, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.88876748, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.4296875, + "step": 418, + "time_per_iteration": 2.5229387283325195 + }, + { + "auxiliary_loss_clip": 0.0110301, + "auxiliary_loss_mlp": 0.01208428, + "balance_loss_clip": 1.17230773, + "balance_loss_mlp": 1.05734265, + "epoch": 0.02519164286787915, + "flos": 18952146506880.0, + "grad_norm": 2.0411450314123916, + "language_loss": 1.01694596, + "learning_rate": 3.887496375507294e-06, + "loss": 1.0400604, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.45703125, + "step": 419, + "time_per_iteration": 2.4592461585998535 + }, + { + "auxiliary_loss_clip": 0.01102145, + "auxiliary_loss_mlp": 0.01289402, + "balance_loss_clip": 1.24755907, + "balance_loss_mlp": 1.05516112, + "epoch": 0.025251766120547123, + "flos": 17420122736640.0, + "grad_norm": 1.5802325254694738, + "language_loss": 0.84811753, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.872033, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.47070312, + "step": 420, + "time_per_iteration": 2.489466905593872 + }, + { + "auxiliary_loss_clip": 0.01095744, + "auxiliary_loss_mlp": 0.01359643, + "balance_loss_clip": 1.31355679, + "balance_loss_mlp": 1.04864407, + "epoch": 0.02531188937321509, + "flos": 25044281020800.0, + "grad_norm": 1.5752372439744768, + "language_loss": 0.88626528, + "learning_rate": 3.890562344079484e-06, + "loss": 0.91081917, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.47070312, + "step": 421, + "time_per_iteration": 2.5057857036590576 + }, + { + "auxiliary_loss_clip": 0.0107962, + "auxiliary_loss_mlp": 0.0130183, + "balance_loss_clip": 1.25331223, + "balance_loss_mlp": 1.03527308, + "epoch": 0.02537201262588306, + "flos": 30590897541120.0, + "grad_norm": 1.7501331531352018, + "language_loss": 0.97287649, + "learning_rate": 3.89208987073549e-06, + "loss": 0.99669105, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.44335938, + "step": 422, + "time_per_iteration": 2.546980857849121 + }, + { + "auxiliary_loss_clip": 0.01074358, + "auxiliary_loss_mlp": 0.01267467, + "balance_loss_clip": 1.21546841, + "balance_loss_mlp": 1.03082883, + "epoch": 0.02543213587855103, + "flos": 26064233746560.0, + "grad_norm": 1.4963160418317423, + "language_loss": 0.9285996, + "learning_rate": 3.893613781940409e-06, + "loss": 0.95201784, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.43554688, + "step": 423, + "time_per_iteration": 2.4973714351654053 + }, + { + "auxiliary_loss_clip": 0.010686, + "auxiliary_loss_mlp": 0.01198454, + "balance_loss_clip": 1.14442837, + "balance_loss_mlp": 1.02503848, + "epoch": 0.025492259131218997, + "flos": 36021498013440.0, + "grad_norm": 1.5120011326170943, + "language_loss": 0.84542441, + "learning_rate": 3.895134094768415e-06, + "loss": 0.86809498, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.43554688, + "step": 424, + "time_per_iteration": 2.6108956336975098 + }, + { + "auxiliary_loss_clip": 0.01075044, + "auxiliary_loss_mlp": 0.01102674, + "balance_loss_clip": 1.04340315, + "balance_loss_mlp": 1.02645218, + "epoch": 0.02555238238388697, + "flos": 18587059752960.0, + "grad_norm": 1.747182477324269, + "language_loss": 0.97283477, + "learning_rate": 3.896650826173015e-06, + "loss": 0.99461192, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.484375, + "step": 425, + "time_per_iteration": 2.442579984664917 + }, + { + "auxiliary_loss_clip": 0.01078244, + "auxiliary_loss_mlp": 0.0110228, + "balance_loss_clip": 1.05273676, + "balance_loss_mlp": 1.02673221, + "epoch": 0.025612505636554938, + "flos": 24242046733440.0, + "grad_norm": 1.858634003781625, + "language_loss": 0.98814982, + "learning_rate": 3.898163992988186e-06, + "loss": 1.00995517, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.515625, + "step": 426, + "time_per_iteration": 2.491123914718628 + }, + { + "auxiliary_loss_clip": 0.01035619, + "auxiliary_loss_mlp": 0.01247714, + "balance_loss_clip": 1.22921276, + "balance_loss_mlp": 1.01196575, + "epoch": 0.025672628889222907, + "flos": 60583661936640.0, + "grad_norm": 0.9953989383190229, + "language_loss": 0.57471186, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59754521, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.23632812, + "step": 427, + "time_per_iteration": 3.14373517036438 + }, + { + "auxiliary_loss_clip": 0.01093529, + "auxiliary_loss_mlp": 0.01514068, + "balance_loss_clip": 1.43658161, + "balance_loss_mlp": 1.0367291, + "epoch": 0.025732752141890875, + "flos": 19572238897920.0, + "grad_norm": 1.9609025011286223, + "language_loss": 1.01531613, + "learning_rate": 3.901179699595194e-06, + "loss": 1.04139209, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.5703125, + "step": 428, + "time_per_iteration": 2.4653875827789307 + }, + { + "auxiliary_loss_clip": 0.01102117, + "auxiliary_loss_mlp": 0.01536129, + "balance_loss_clip": 1.45220518, + "balance_loss_mlp": 1.04303503, + "epoch": 0.025792875394558847, + "flos": 31282945977600.0, + "grad_norm": 1.5666351018907405, + "language_loss": 0.92477018, + "learning_rate": 3.902682272467353e-06, + "loss": 0.95115268, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.58984375, + "step": 429, + "time_per_iteration": 2.5686259269714355 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01175055, + "balance_loss_clip": 1.11597538, + "balance_loss_mlp": 1.05489016, + "epoch": 0.025852998647226816, + "flos": 32378241150720.0, + "grad_norm": 1.7194285060258234, + "language_loss": 0.96532792, + "learning_rate": 3.904181346912895e-06, + "loss": 0.98816907, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.54296875, + "step": 430, + "time_per_iteration": 2.5670180320739746 + }, + { + "auxiliary_loss_clip": 0.01104334, + "auxiliary_loss_mlp": 0.01212343, + "balance_loss_clip": 1.16098726, + "balance_loss_mlp": 1.0569948, + "epoch": 0.025913121899894784, + "flos": 20192261466240.0, + "grad_norm": 1.4125027672327777, + "language_loss": 0.92967927, + "learning_rate": 3.905676939184698e-06, + "loss": 0.95284599, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.47265625, + "step": 431, + "time_per_iteration": 2.48945951461792 + }, + { + "auxiliary_loss_clip": 0.01096897, + "auxiliary_loss_mlp": 0.01415654, + "balance_loss_clip": 1.35256827, + "balance_loss_mlp": 1.05194187, + "epoch": 0.025973245152562753, + "flos": 14719556027520.0, + "grad_norm": 1.9494570201501953, + "language_loss": 1.02119184, + "learning_rate": 3.907169065422638e-06, + "loss": 1.04631734, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.44921875, + "step": 432, + "time_per_iteration": 2.441380500793457 + }, + { + "auxiliary_loss_clip": 0.01081184, + "auxiliary_loss_mlp": 0.01464531, + "balance_loss_clip": 1.4023037, + "balance_loss_mlp": 1.04050517, + "epoch": 0.02603336840523072, + "flos": 30991665571200.0, + "grad_norm": 1.7110813320340912, + "language_loss": 0.88180423, + "learning_rate": 3.908657741654636e-06, + "loss": 0.90726137, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.40625, + "step": 433, + "time_per_iteration": 2.611130714416504 + }, + { + "auxiliary_loss_clip": 0.01067657, + "auxiliary_loss_mlp": 0.01241945, + "balance_loss_clip": 1.19564462, + "balance_loss_mlp": 1.02843809, + "epoch": 0.026093491657898694, + "flos": 17673347894400.0, + "grad_norm": 1.8078536597896537, + "language_loss": 1.00427544, + "learning_rate": 3.910142983797699e-06, + "loss": 1.02737153, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.39257812, + "step": 434, + "time_per_iteration": 2.4487080574035645 + }, + { + "auxiliary_loss_clip": 0.01068704, + "auxiliary_loss_mlp": 0.01114978, + "balance_loss_clip": 1.07153773, + "balance_loss_mlp": 1.02613163, + "epoch": 0.026153614910566662, + "flos": 17856921156480.0, + "grad_norm": 1.7393823453049984, + "language_loss": 0.94274235, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.96457911, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.42578125, + "step": 435, + "time_per_iteration": 2.4638381004333496 + }, + { + "auxiliary_loss_clip": 0.01074687, + "auxiliary_loss_mlp": 0.01286979, + "balance_loss_clip": 1.23042583, + "balance_loss_mlp": 1.03001726, + "epoch": 0.02621373816323463, + "flos": 20010084658560.0, + "grad_norm": 1.855561554180425, + "language_loss": 1.00319719, + "learning_rate": 3.913103228936546e-06, + "loss": 1.02681375, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.44726562, + "step": 436, + "time_per_iteration": 2.464911460876465 + }, + { + "auxiliary_loss_clip": 0.01081567, + "auxiliary_loss_mlp": 0.01325425, + "balance_loss_clip": 1.26515269, + "balance_loss_mlp": 1.03569412, + "epoch": 0.0262738614159026, + "flos": 19280190441600.0, + "grad_norm": 1.889354115675155, + "language_loss": 0.90361893, + "learning_rate": 3.914578263220868e-06, + "loss": 0.9276889, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.45703125, + "step": 437, + "time_per_iteration": 2.528285026550293 + }, + { + "auxiliary_loss_clip": 0.01092816, + "auxiliary_loss_mlp": 0.01185407, + "balance_loss_clip": 1.12151098, + "balance_loss_mlp": 1.04545319, + "epoch": 0.026333984668570568, + "flos": 18806209557120.0, + "grad_norm": 1.9311327244759642, + "language_loss": 1.02307796, + "learning_rate": 3.916049925995316e-06, + "loss": 1.04586017, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.47265625, + "step": 438, + "time_per_iteration": 2.4848694801330566 + }, + { + "auxiliary_loss_clip": 0.01061256, + "auxiliary_loss_mlp": 0.01229617, + "balance_loss_clip": 1.18956256, + "balance_loss_mlp": 1.03786755, + "epoch": 0.02639410792123854, + "flos": 64568403607680.0, + "grad_norm": 0.9687376022112427, + "language_loss": 0.62851357, + "learning_rate": 3.917518232637377e-06, + "loss": 0.65142226, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.234375, + "step": 439, + "time_per_iteration": 3.152158737182617 + }, + { + "auxiliary_loss_clip": 0.01111209, + "auxiliary_loss_mlp": 0.01143553, + "balance_loss_clip": 1.07560325, + "balance_loss_mlp": 1.05496895, + "epoch": 0.02645423117390651, + "flos": 28472263240320.0, + "grad_norm": 1.643052309564428, + "language_loss": 0.87152088, + "learning_rate": 3.918983198419573e-06, + "loss": 0.89406848, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.5625, + "step": 440, + "time_per_iteration": 2.5506815910339355 + }, + { + "auxiliary_loss_clip": 0.01111826, + "auxiliary_loss_mlp": 0.01180838, + "balance_loss_clip": 1.11045647, + "balance_loss_mlp": 1.05053329, + "epoch": 0.026514354426574478, + "flos": 18550261313280.0, + "grad_norm": 1.583446230047119, + "language_loss": 0.93723452, + "learning_rate": 3.920444838510415e-06, + "loss": 0.96016109, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.61328125, + "step": 441, + "time_per_iteration": 2.4710886478424072 + }, + { + "auxiliary_loss_clip": 0.01105112, + "auxiliary_loss_mlp": 0.01166443, + "balance_loss_clip": 1.10264266, + "balance_loss_mlp": 1.04616296, + "epoch": 0.026574477679242446, + "flos": 20666766021120.0, + "grad_norm": 1.5838662080562382, + "language_loss": 0.89557087, + "learning_rate": 3.92190316797534e-06, + "loss": 0.91828644, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.58984375, + "step": 442, + "time_per_iteration": 2.4895575046539307 + }, + { + "auxiliary_loss_clip": 0.0104997, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.01291728, + "balance_loss_mlp": 1.02347493, + "epoch": 0.026634600931910415, + "flos": 57953026414080.0, + "grad_norm": 0.9732443204481646, + "language_loss": 0.64726448, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66811079, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.265625, + "step": 443, + "time_per_iteration": 2.9266908168792725 + }, + { + "auxiliary_loss_clip": 0.01090088, + "auxiliary_loss_mlp": 0.01119116, + "balance_loss_clip": 1.05560172, + "balance_loss_mlp": 1.03570509, + "epoch": 0.026694724184578387, + "flos": 15814222796160.0, + "grad_norm": 1.7843176004192642, + "language_loss": 0.99075246, + "learning_rate": 3.924809954779425e-06, + "loss": 1.01284444, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.54296875, + "step": 444, + "time_per_iteration": 2.464545249938965 + }, + { + "auxiliary_loss_clip": 0.0108764, + "auxiliary_loss_mlp": 0.01135656, + "balance_loss_clip": 1.07166433, + "balance_loss_mlp": 1.03387642, + "epoch": 0.026754847437246355, + "flos": 23439149130240.0, + "grad_norm": 1.7650013699669371, + "language_loss": 1.05818057, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.08041358, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.5390625, + "step": 445, + "time_per_iteration": 2.5354983806610107 + }, + { + "auxiliary_loss_clip": 0.01077301, + "auxiliary_loss_mlp": 0.01121267, + "balance_loss_clip": 1.06914878, + "balance_loss_mlp": 1.02889597, + "epoch": 0.026814970689914324, + "flos": 17341009862400.0, + "grad_norm": 1.7453518220001314, + "language_loss": 1.06053925, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.08252501, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.484375, + "step": 446, + "time_per_iteration": 2.4397237300872803 + }, + { + "auxiliary_loss_clip": 0.01073684, + "auxiliary_loss_mlp": 0.01084602, + "balance_loss_clip": 1.03250742, + "balance_loss_mlp": 1.02750671, + "epoch": 0.026875093942582293, + "flos": 17893754507520.0, + "grad_norm": 1.6624574624385358, + "language_loss": 0.90394521, + "learning_rate": 3.92914567610317e-06, + "loss": 0.92552805, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.4609375, + "step": 447, + "time_per_iteration": 2.467102527618408 + }, + { + "auxiliary_loss_clip": 0.01071468, + "auxiliary_loss_mlp": 0.01069429, + "balance_loss_clip": 1.01895559, + "balance_loss_mlp": 1.02658105, + "epoch": 0.026935217195250265, + "flos": 21722958604800.0, + "grad_norm": 1.6072632850483282, + "language_loss": 0.97541744, + "learning_rate": 3.930584452530952e-06, + "loss": 0.99682641, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.44921875, + "step": 448, + "time_per_iteration": 2.45749568939209 + }, + { + "auxiliary_loss_clip": 0.01073408, + "auxiliary_loss_mlp": 0.01077867, + "balance_loss_clip": 1.02865767, + "balance_loss_mlp": 1.02684307, + "epoch": 0.026995340447918233, + "flos": 23621570317440.0, + "grad_norm": 1.9791859654580417, + "language_loss": 0.97788966, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.9994024, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.46484375, + "step": 449, + "time_per_iteration": 2.477748394012451 + }, + { + "auxiliary_loss_clip": 0.01072197, + "auxiliary_loss_mlp": 0.01097728, + "balance_loss_clip": 1.0498302, + "balance_loss_mlp": 1.02680922, + "epoch": 0.027055463700586202, + "flos": 17930308567680.0, + "grad_norm": 1.6624340948183838, + "language_loss": 0.95286608, + "learning_rate": 3.933452395729493e-06, + "loss": 0.97456545, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.453125, + "step": 450, + "time_per_iteration": 2.444392204284668 + }, + { + "auxiliary_loss_clip": 0.01075371, + "auxiliary_loss_mlp": 0.01106679, + "balance_loss_clip": 1.06025934, + "balance_loss_mlp": 1.03107512, + "epoch": 0.02711558695325417, + "flos": 25117738254720.0, + "grad_norm": 1.3830171268415852, + "language_loss": 0.87715119, + "learning_rate": 3.934881590952304e-06, + "loss": 0.89897174, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.44335938, + "step": 451, + "time_per_iteration": 3.945362091064453 + }, + { + "auxiliary_loss_clip": 0.01076533, + "auxiliary_loss_mlp": 0.01093663, + "balance_loss_clip": 1.04814911, + "balance_loss_mlp": 1.03162563, + "epoch": 0.02717571020592214, + "flos": 24238520686080.0, + "grad_norm": 1.4247779251996742, + "language_loss": 0.82774174, + "learning_rate": 3.936307620734599e-06, + "loss": 0.84944367, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.44921875, + "step": 452, + "time_per_iteration": 3.9083104133605957 + }, + { + "auxiliary_loss_clip": 0.0107286, + "auxiliary_loss_mlp": 0.01071755, + "balance_loss_clip": 1.02795792, + "balance_loss_mlp": 1.02930152, + "epoch": 0.02723583345859011, + "flos": 25117773166080.0, + "grad_norm": 1.4037185699395758, + "language_loss": 0.78768903, + "learning_rate": 3.937730499067294e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.43554688, + "step": 453, + "time_per_iteration": 3.910768508911133 + }, + { + "auxiliary_loss_clip": 0.01067028, + "auxiliary_loss_mlp": 0.01114327, + "balance_loss_clip": 1.07057726, + "balance_loss_mlp": 1.02542186, + "epoch": 0.02729595671125808, + "flos": 42739939140480.0, + "grad_norm": 1.5802973028391543, + "language_loss": 0.93361497, + "learning_rate": 3.939150239848748e-06, + "loss": 0.95542854, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.41601562, + "step": 454, + "time_per_iteration": 2.6732449531555176 + }, + { + "auxiliary_loss_clip": 0.01064768, + "auxiliary_loss_mlp": 0.01103791, + "balance_loss_clip": 1.06073308, + "balance_loss_mlp": 1.02387166, + "epoch": 0.02735607996392605, + "flos": 21430002453120.0, + "grad_norm": 1.40996443550794, + "language_loss": 0.83596015, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.85764575, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.41015625, + "step": 455, + "time_per_iteration": 2.504915237426758 + }, + { + "auxiliary_loss_clip": 0.01062349, + "auxiliary_loss_mlp": 0.01074541, + "balance_loss_clip": 1.03508329, + "balance_loss_mlp": 1.02212381, + "epoch": 0.027416203216594017, + "flos": 20850199637760.0, + "grad_norm": 1.60295818078564, + "language_loss": 0.92798924, + "learning_rate": 3.941980363893499e-06, + "loss": 0.94935822, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.40234375, + "step": 456, + "time_per_iteration": 2.490342617034912 + }, + { + "auxiliary_loss_clip": 0.01061423, + "auxiliary_loss_mlp": 0.01082777, + "balance_loss_clip": 1.04377198, + "balance_loss_mlp": 1.02204454, + "epoch": 0.027476326469261986, + "flos": 13223667381120.0, + "grad_norm": 1.5015077123030105, + "language_loss": 0.89764625, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.9190883, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.39453125, + "step": 457, + "time_per_iteration": 2.4761478900909424 + }, + { + "auxiliary_loss_clip": 0.01058649, + "auxiliary_loss_mlp": 0.01081982, + "balance_loss_clip": 1.04440701, + "balance_loss_mlp": 1.01990438, + "epoch": 0.027536449721929958, + "flos": 24023385688320.0, + "grad_norm": 1.869561994849557, + "language_loss": 1.03928685, + "learning_rate": 3.944798102235412e-06, + "loss": 1.06069314, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.38671875, + "step": 458, + "time_per_iteration": 2.5039923191070557 + }, + { + "auxiliary_loss_clip": 0.01057831, + "auxiliary_loss_mlp": 0.01064508, + "balance_loss_clip": 1.02946067, + "balance_loss_mlp": 1.02015853, + "epoch": 0.027596572974597926, + "flos": 13005215804160.0, + "grad_norm": 2.028272082687578, + "language_loss": 0.94367236, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.96489573, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.37695312, + "step": 459, + "time_per_iteration": 2.4560458660125732 + }, + { + "auxiliary_loss_clip": 0.01056359, + "auxiliary_loss_mlp": 0.01051928, + "balance_loss_clip": 1.01802516, + "balance_loss_mlp": 1.01991963, + "epoch": 0.027656696227265895, + "flos": 26141810520960.0, + "grad_norm": 1.4720825176737136, + "language_loss": 0.90831614, + "learning_rate": 3.947603562811407e-06, + "loss": 0.92939901, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.36328125, + "step": 460, + "time_per_iteration": 2.5009100437164307 + }, + { + "auxiliary_loss_clip": 0.01047951, + "auxiliary_loss_mlp": 0.01371652, + "balance_loss_clip": 1.33045363, + "balance_loss_mlp": 1.02201283, + "epoch": 0.027716819479933864, + "flos": 60693917610240.0, + "grad_norm": 1.7228985869413633, + "language_loss": 0.73757339, + "learning_rate": 3.949001722282675e-06, + "loss": 0.76176947, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.25976562, + "step": 461, + "time_per_iteration": 3.033900499343872 + }, + { + "auxiliary_loss_clip": 0.01059002, + "auxiliary_loss_mlp": 0.01072757, + "balance_loss_clip": 1.03918791, + "balance_loss_mlp": 1.02266741, + "epoch": 0.027776942732601832, + "flos": 31210605907200.0, + "grad_norm": 1.9347250579229809, + "language_loss": 0.95866072, + "learning_rate": 3.950396852153582e-06, + "loss": 0.97997832, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.36328125, + "step": 462, + "time_per_iteration": 2.522268295288086 + }, + { + "auxiliary_loss_clip": 0.01060527, + "auxiliary_loss_mlp": 0.01135243, + "balance_loss_clip": 1.09921789, + "balance_loss_mlp": 1.02257156, + "epoch": 0.027837065985269804, + "flos": 22673538725760.0, + "grad_norm": 1.8424971046146295, + "language_loss": 1.04239631, + "learning_rate": 3.951788965525118e-06, + "loss": 1.06435394, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.37890625, + "step": 463, + "time_per_iteration": 2.4867489337921143 + }, + { + "auxiliary_loss_clip": 0.01046778, + "auxiliary_loss_mlp": 0.01021885, + "balance_loss_clip": 1.0062449, + "balance_loss_mlp": 1.02434325, + "epoch": 0.027897189237937773, + "flos": 62179437582720.0, + "grad_norm": 0.8973351139160131, + "language_loss": 0.59326446, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61395109, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.22460938, + "step": 464, + "time_per_iteration": 3.030200481414795 + }, + { + "auxiliary_loss_clip": 0.01062848, + "auxiliary_loss_mlp": 0.01198682, + "balance_loss_clip": 1.15948558, + "balance_loss_mlp": 1.02381897, + "epoch": 0.02795731249060574, + "flos": 24492164780160.0, + "grad_norm": 1.6567602474438679, + "language_loss": 0.96851265, + "learning_rate": 3.954564194750784e-06, + "loss": 0.99112797, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.390625, + "step": 465, + "time_per_iteration": 2.4986178874969482 + }, + { + "auxiliary_loss_clip": 0.01061766, + "auxiliary_loss_mlp": 0.01174991, + "balance_loss_clip": 1.13536596, + "balance_loss_mlp": 1.02291989, + "epoch": 0.02801743574327371, + "flos": 23731860902400.0, + "grad_norm": 1.684077725179182, + "language_loss": 0.87185907, + "learning_rate": 3.955947336385828e-06, + "loss": 0.89422661, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.38867188, + "step": 466, + "time_per_iteration": 2.470954656600952 + }, + { + "auxiliary_loss_clip": 0.01058656, + "auxiliary_loss_mlp": 0.01127028, + "balance_loss_clip": 1.08601975, + "balance_loss_mlp": 1.02179849, + "epoch": 0.02807755899594168, + "flos": 20628117279360.0, + "grad_norm": 1.6112438676202752, + "language_loss": 0.93925571, + "learning_rate": 3.957327513084761e-06, + "loss": 0.96111262, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.3671875, + "step": 467, + "time_per_iteration": 2.482576847076416 + }, + { + "auxiliary_loss_clip": 0.01063363, + "auxiliary_loss_mlp": 0.01079824, + "balance_loss_clip": 1.04058015, + "balance_loss_mlp": 1.02423286, + "epoch": 0.02813768224860965, + "flos": 19243566558720.0, + "grad_norm": 1.7647817159058723, + "language_loss": 0.96234852, + "learning_rate": 3.958704737531818e-06, + "loss": 0.98378038, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.390625, + "step": 468, + "time_per_iteration": 2.527853488922119 + }, + { + "auxiliary_loss_clip": 0.01063887, + "auxiliary_loss_mlp": 0.01062662, + "balance_loss_clip": 1.02275062, + "balance_loss_mlp": 1.02488852, + "epoch": 0.02819780550127762, + "flos": 20812912439040.0, + "grad_norm": 1.8090786064206932, + "language_loss": 1.04676318, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.06802869, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.390625, + "step": 469, + "time_per_iteration": 2.491145372390747 + }, + { + "auxiliary_loss_clip": 0.01063796, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_clip": 1.01684213, + "balance_loss_mlp": 1.02440226, + "epoch": 0.028257928753945588, + "flos": 19973111662080.0, + "grad_norm": 1.8249365771843946, + "language_loss": 0.9798125, + "learning_rate": 3.96145038000181e-06, + "loss": 1.00103283, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.39453125, + "step": 470, + "time_per_iteration": 2.4382712841033936 + }, + { + "auxiliary_loss_clip": 0.01075298, + "auxiliary_loss_mlp": 0.01063437, + "balance_loss_clip": 1.0202117, + "balance_loss_mlp": 1.03350055, + "epoch": 0.028318052006613557, + "flos": 20483472049920.0, + "grad_norm": 1.6536804897011441, + "language_loss": 1.04656124, + "learning_rate": 3.962818822989861e-06, + "loss": 1.06794846, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.41796875, + "step": 471, + "time_per_iteration": 2.486631155014038 + }, + { + "auxiliary_loss_clip": 0.01070454, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_clip": 1.03955543, + "balance_loss_mlp": 1.02979302, + "epoch": 0.02837817525928153, + "flos": 28513495422720.0, + "grad_norm": 1.5594555198240558, + "language_loss": 0.85379982, + "learning_rate": 3.964184363657625e-06, + "loss": 0.87535095, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.40625, + "step": 472, + "time_per_iteration": 2.520949125289917 + }, + { + "auxiliary_loss_clip": 0.01073506, + "auxiliary_loss_mlp": 0.01099429, + "balance_loss_clip": 1.05660939, + "balance_loss_mlp": 1.03324246, + "epoch": 0.028438298511949497, + "flos": 18550680249600.0, + "grad_norm": 1.5507133161459867, + "language_loss": 1.03927302, + "learning_rate": 3.965547014290071e-06, + "loss": 1.06100225, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.40234375, + "step": 473, + "time_per_iteration": 2.4519848823547363 + }, + { + "auxiliary_loss_clip": 0.01068118, + "auxiliary_loss_mlp": 0.01157544, + "balance_loss_clip": 1.10714245, + "balance_loss_mlp": 1.02916074, + "epoch": 0.028498421764617466, + "flos": 16909273589760.0, + "grad_norm": 1.8957181580051394, + "language_loss": 1.04659927, + "learning_rate": 3.96690678709433e-06, + "loss": 1.06885588, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.390625, + "step": 474, + "time_per_iteration": 2.430156707763672 + }, + { + "auxiliary_loss_clip": 0.01067562, + "auxiliary_loss_mlp": 0.01185116, + "balance_loss_clip": 1.12827742, + "balance_loss_mlp": 1.02766824, + "epoch": 0.028558545017285435, + "flos": 27777561540480.0, + "grad_norm": 1.7384403101353583, + "language_loss": 0.89234042, + "learning_rate": 3.968263694200355e-06, + "loss": 0.91486716, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.3984375, + "step": 475, + "time_per_iteration": 2.515697956085205 + }, + { + "auxiliary_loss_clip": 0.01046359, + "auxiliary_loss_mlp": 0.01302395, + "balance_loss_clip": 1.25948, + "balance_loss_mlp": 1.02134895, + "epoch": 0.028618668269953403, + "flos": 65651060868480.0, + "grad_norm": 1.108212568755077, + "language_loss": 0.67332512, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69681269, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.25, + "step": 476, + "time_per_iteration": 2.980910301208496 + }, + { + "auxiliary_loss_clip": 0.01063847, + "auxiliary_loss_mlp": 0.01297294, + "balance_loss_clip": 1.23358881, + "balance_loss_mlp": 1.02015924, + "epoch": 0.028678791522621375, + "flos": 21936208389120.0, + "grad_norm": 1.838459948772356, + "language_loss": 0.98401761, + "learning_rate": 3.970968959455509e-06, + "loss": 1.00762892, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.4375, + "step": 477, + "time_per_iteration": 2.4759461879730225 + }, + { + "auxiliary_loss_clip": 0.01072471, + "auxiliary_loss_mlp": 0.01375246, + "balance_loss_clip": 1.31197023, + "balance_loss_mlp": 1.02801323, + "epoch": 0.028738914775289344, + "flos": 24570963452160.0, + "grad_norm": 2.06680580994056, + "language_loss": 0.9559828, + "learning_rate": 3.97231734148446e-06, + "loss": 0.98045999, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.44335938, + "step": 478, + "time_per_iteration": 2.4877047538757324 + }, + { + "auxiliary_loss_clip": 0.01082779, + "auxiliary_loss_mlp": 0.01245624, + "balance_loss_clip": 1.1939826, + "balance_loss_mlp": 1.03950381, + "epoch": 0.028799038027957313, + "flos": 23256867588480.0, + "grad_norm": 1.5081175216432052, + "language_loss": 0.9190942, + "learning_rate": 3.973662905576082e-06, + "loss": 0.94237828, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.43164062, + "step": 479, + "time_per_iteration": 2.506470203399658 + }, + { + "auxiliary_loss_clip": 0.0109537, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_clip": 1.03453255, + "balance_loss_mlp": 1.05306149, + "epoch": 0.02885916128062528, + "flos": 22163003781120.0, + "grad_norm": 1.7414786868711547, + "language_loss": 0.84525621, + "learning_rate": 3.975005663484038e-06, + "loss": 0.86690164, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.42382812, + "step": 480, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.01114211, + "auxiliary_loss_mlp": 0.01097172, + "balance_loss_clip": 1.05974054, + "balance_loss_mlp": 1.06619561, + "epoch": 0.02891928453329325, + "flos": 22931651473920.0, + "grad_norm": 1.4651378733657758, + "language_loss": 0.95342934, + "learning_rate": 3.976345626888605e-06, + "loss": 0.97554314, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.47851562, + "step": 481, + "time_per_iteration": 2.562629222869873 + }, + { + "auxiliary_loss_clip": 0.01138624, + "auxiliary_loss_mlp": 0.01123518, + "balance_loss_clip": 1.08766019, + "balance_loss_mlp": 1.08891475, + "epoch": 0.028979407785961222, + "flos": 57430202670720.0, + "grad_norm": 0.9052968901909274, + "language_loss": 0.66359693, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68621832, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.49609375, + "step": 482, + "time_per_iteration": 2.8132543563842773 + }, + { + "auxiliary_loss_clip": 0.01140211, + "auxiliary_loss_mlp": 0.01237639, + "balance_loss_clip": 1.18893015, + "balance_loss_mlp": 1.07987344, + "epoch": 0.02903953103862919, + "flos": 16721929900800.0, + "grad_norm": 2.009995415563254, + "language_loss": 0.97913986, + "learning_rate": 3.979017216545415e-06, + "loss": 1.00291824, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.60546875, + "step": 483, + "time_per_iteration": 2.5561890602111816 + }, + { + "auxiliary_loss_clip": 0.01137553, + "auxiliary_loss_mlp": 0.01299003, + "balance_loss_clip": 1.25081873, + "balance_loss_mlp": 1.07552803, + "epoch": 0.02909965429129716, + "flos": 16762708235520.0, + "grad_norm": 1.5692790691675422, + "language_loss": 0.86888838, + "learning_rate": 3.980348865796749e-06, + "loss": 0.89325392, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.62109375, + "step": 484, + "time_per_iteration": 2.441469192504883 + }, + { + "auxiliary_loss_clip": 0.01118154, + "auxiliary_loss_mlp": 0.0129426, + "balance_loss_clip": 1.2446692, + "balance_loss_mlp": 1.06238127, + "epoch": 0.029159777543965128, + "flos": 19784511164160.0, + "grad_norm": 1.8453913858402686, + "language_loss": 0.94359136, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.96771544, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.5546875, + "step": 485, + "time_per_iteration": 2.491818904876709 + }, + { + "auxiliary_loss_clip": 0.01101327, + "auxiliary_loss_mlp": 0.01224207, + "balance_loss_clip": 1.16920376, + "balance_loss_mlp": 1.04975796, + "epoch": 0.029219900796633096, + "flos": 19641751148160.0, + "grad_norm": 1.7592660679479901, + "language_loss": 0.9997077, + "learning_rate": 3.983003930109732e-06, + "loss": 1.02296305, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.515625, + "step": 486, + "time_per_iteration": 2.4576265811920166 + }, + { + "auxiliary_loss_clip": 0.01087293, + "auxiliary_loss_mlp": 0.0122641, + "balance_loss_clip": 1.16675723, + "balance_loss_mlp": 1.03734541, + "epoch": 0.02928002404930107, + "flos": 25884500734080.0, + "grad_norm": 1.5523878104721587, + "language_loss": 0.9864561, + "learning_rate": 3.984327367746315e-06, + "loss": 1.00959301, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.5, + "step": 487, + "time_per_iteration": 2.531792640686035 + }, + { + "auxiliary_loss_clip": 0.01093668, + "auxiliary_loss_mlp": 0.01202614, + "balance_loss_clip": 1.13261414, + "balance_loss_mlp": 1.04040742, + "epoch": 0.029340147301969037, + "flos": 20659399724160.0, + "grad_norm": 1.9972596833023266, + "language_loss": 1.05266643, + "learning_rate": 3.985648090637122e-06, + "loss": 1.07562923, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.53125, + "step": 488, + "time_per_iteration": 2.4513723850250244 + }, + { + "auxiliary_loss_clip": 0.01098791, + "auxiliary_loss_mlp": 0.01178871, + "balance_loss_clip": 1.10624862, + "balance_loss_mlp": 1.04263854, + "epoch": 0.029400270554637006, + "flos": 24426806981760.0, + "grad_norm": 1.612030519160693, + "language_loss": 0.95104742, + "learning_rate": 3.986966109896785e-06, + "loss": 0.97382402, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.5625, + "step": 489, + "time_per_iteration": 2.5253055095672607 + }, + { + "auxiliary_loss_clip": 0.01118395, + "auxiliary_loss_mlp": 0.01124979, + "balance_loss_clip": 1.05269074, + "balance_loss_mlp": 1.05359125, + "epoch": 0.029460393807304974, + "flos": 20119851573120.0, + "grad_norm": 1.5117878485635015, + "language_loss": 0.94966328, + "learning_rate": 3.988281436571815e-06, + "loss": 0.97209704, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.6484375, + "step": 490, + "time_per_iteration": 2.483485460281372 + }, + { + "auxiliary_loss_clip": 0.0113758, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_clip": 1.02602625, + "balance_loss_mlp": 1.06776619, + "epoch": 0.029520517059972943, + "flos": 17674953816960.0, + "grad_norm": 1.8937820445965092, + "language_loss": 1.05084133, + "learning_rate": 3.989594081641164e-06, + "loss": 1.07321072, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.69921875, + "step": 491, + "time_per_iteration": 5.441805362701416 + }, + { + "auxiliary_loss_clip": 0.0114447, + "auxiliary_loss_mlp": 0.01192518, + "balance_loss_clip": 1.113554, + "balance_loss_mlp": 1.07219505, + "epoch": 0.029580640312640915, + "flos": 18952181418240.0, + "grad_norm": 1.6406449666500702, + "language_loss": 0.93240142, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.95577139, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.72265625, + "step": 492, + "time_per_iteration": 3.90789532661438 + }, + { + "auxiliary_loss_clip": 0.01129758, + "auxiliary_loss_mlp": 0.01186463, + "balance_loss_clip": 1.10425639, + "balance_loss_mlp": 1.06729031, + "epoch": 0.029640763565308884, + "flos": 18725351114880.0, + "grad_norm": 2.016961366388103, + "language_loss": 0.99007815, + "learning_rate": 3.992211370544093e-06, + "loss": 1.01324034, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.625, + "step": 493, + "time_per_iteration": 3.875744581222534 + }, + { + "auxiliary_loss_clip": 0.01114203, + "auxiliary_loss_mlp": 0.01147368, + "balance_loss_clip": 1.07064462, + "balance_loss_mlp": 1.05397987, + "epoch": 0.029700886817976852, + "flos": 20594251393920.0, + "grad_norm": 1.4909158347075284, + "language_loss": 0.98815495, + "learning_rate": 3.99351603600268e-06, + "loss": 1.01077056, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.6015625, + "step": 494, + "time_per_iteration": 2.510373592376709 + }, + { + "auxiliary_loss_clip": 0.01104537, + "auxiliary_loss_mlp": 0.01133556, + "balance_loss_clip": 1.05854952, + "balance_loss_mlp": 1.04807806, + "epoch": 0.02976101007064482, + "flos": 22235762787840.0, + "grad_norm": 1.8895961131602586, + "language_loss": 0.97594285, + "learning_rate": 3.994818063106668e-06, + "loss": 0.9983238, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.5625, + "step": 495, + "time_per_iteration": 2.4800100326538086 + }, + { + "auxiliary_loss_clip": 0.01103096, + "auxiliary_loss_mlp": 0.01125985, + "balance_loss_clip": 1.04964375, + "balance_loss_mlp": 1.04405785, + "epoch": 0.029821133323312793, + "flos": 23731511788800.0, + "grad_norm": 1.4766247403847093, + "language_loss": 0.71986866, + "learning_rate": 3.99611746250533e-06, + "loss": 0.74215949, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.58984375, + "step": 496, + "time_per_iteration": 2.5348434448242188 + }, + { + "auxiliary_loss_clip": 0.01107344, + "auxiliary_loss_mlp": 0.01121402, + "balance_loss_clip": 1.05311882, + "balance_loss_mlp": 1.04771471, + "epoch": 0.02988125657598076, + "flos": 22418393443200.0, + "grad_norm": 1.4250591486584456, + "language_loss": 0.96096951, + "learning_rate": 3.997414244783595e-06, + "loss": 0.98325694, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.59375, + "step": 497, + "time_per_iteration": 2.499897003173828 + }, + { + "auxiliary_loss_clip": 0.01108402, + "auxiliary_loss_mlp": 0.01094779, + "balance_loss_clip": 1.02558994, + "balance_loss_mlp": 1.04693675, + "epoch": 0.02994137982864873, + "flos": 13844248531200.0, + "grad_norm": 1.9704547132895474, + "language_loss": 1.01603985, + "learning_rate": 3.998708420462557e-06, + "loss": 1.03807151, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.61328125, + "step": 498, + "time_per_iteration": 2.4617421627044678 + }, + { + "auxiliary_loss_clip": 0.01112543, + "auxiliary_loss_mlp": 0.01106553, + "balance_loss_clip": 1.02334476, + "balance_loss_mlp": 1.04628384, + "epoch": 0.0300015030813167, + "flos": 23907404551680.0, + "grad_norm": 2.0235955988153487, + "language_loss": 0.96577203, + "learning_rate": 4e-06, + "loss": 0.98796308, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.6640625, + "step": 499, + "time_per_iteration": 2.455439329147339 + }, + { + "auxiliary_loss_clip": 0.01113809, + "auxiliary_loss_mlp": 0.01124168, + "balance_loss_clip": 1.04210448, + "balance_loss_mlp": 1.04805195, + "epoch": 0.030061626333984667, + "flos": 22015740199680.0, + "grad_norm": 1.450328724173176, + "language_loss": 0.90556049, + "learning_rate": 3.9999999620799e-06, + "loss": 0.92794025, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.65625, + "step": 500, + "time_per_iteration": 2.4918484687805176 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01138084, + "balance_loss_clip": 1.06417418, + "balance_loss_mlp": 1.04953122, + "epoch": 0.03012174958665264, + "flos": 23038625479680.0, + "grad_norm": 1.9245725410329135, + "language_loss": 1.03260815, + "learning_rate": 3.9999998483196e-06, + "loss": 1.05511522, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.6328125, + "step": 501, + "time_per_iteration": 2.459219217300415 + }, + { + "auxiliary_loss_clip": 0.01107122, + "auxiliary_loss_mlp": 0.01126613, + "balance_loss_clip": 1.05289388, + "balance_loss_mlp": 1.04742968, + "epoch": 0.030181872839320608, + "flos": 18952251240960.0, + "grad_norm": 1.9278549276928703, + "language_loss": 0.98206395, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.00440121, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.59765625, + "step": 502, + "time_per_iteration": 2.488969326019287 + }, + { + "auxiliary_loss_clip": 0.0110702, + "auxiliary_loss_mlp": 0.01140594, + "balance_loss_clip": 1.06453848, + "balance_loss_mlp": 1.04990494, + "epoch": 0.030241996091988577, + "flos": 16727061870720.0, + "grad_norm": 2.0312172336926313, + "language_loss": 0.95668173, + "learning_rate": 3.999999393278425e-06, + "loss": 0.97915787, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.5703125, + "step": 503, + "time_per_iteration": 2.462157726287842 + }, + { + "auxiliary_loss_clip": 0.01103482, + "auxiliary_loss_mlp": 0.01150776, + "balance_loss_clip": 1.05698252, + "balance_loss_mlp": 1.04730058, + "epoch": 0.030302119344656545, + "flos": 28620015580800.0, + "grad_norm": 1.5971093431692582, + "language_loss": 0.97744256, + "learning_rate": 3.999999051997567e-06, + "loss": 0.99998516, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.5625, + "step": 504, + "time_per_iteration": 2.585536479949951 + }, + { + "auxiliary_loss_clip": 0.01102356, + "auxiliary_loss_mlp": 0.01148119, + "balance_loss_clip": 1.05108261, + "balance_loss_mlp": 1.04616618, + "epoch": 0.030362242597324514, + "flos": 15668425491840.0, + "grad_norm": 1.519782239364022, + "language_loss": 0.84274971, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.86525452, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.5625, + "step": 505, + "time_per_iteration": 2.4598066806793213 + }, + { + "auxiliary_loss_clip": 0.0106289, + "auxiliary_loss_mlp": 0.01158018, + "balance_loss_clip": 1.10957122, + "balance_loss_mlp": 1.03374863, + "epoch": 0.030422365849992486, + "flos": 72122107034880.0, + "grad_norm": 0.9083511625743905, + "language_loss": 0.5529021, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57511115, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.29296875, + "step": 506, + "time_per_iteration": 3.2049031257629395 + }, + { + "auxiliary_loss_clip": 0.01099584, + "auxiliary_loss_mlp": 0.01155772, + "balance_loss_clip": 1.07013237, + "balance_loss_mlp": 1.04265046, + "epoch": 0.030482489102660455, + "flos": 19426790707200.0, + "grad_norm": 1.54761936163015, + "language_loss": 0.90868783, + "learning_rate": 3.999997573114069e-06, + "loss": 0.93124139, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.5703125, + "step": 507, + "time_per_iteration": 2.5416057109832764 + }, + { + "auxiliary_loss_clip": 0.01089259, + "auxiliary_loss_mlp": 0.011534, + "balance_loss_clip": 1.07515156, + "balance_loss_mlp": 1.03643203, + "epoch": 0.030542612355328423, + "flos": 20374787387520.0, + "grad_norm": 1.7865720078895793, + "language_loss": 1.00743413, + "learning_rate": 3.999996928472659e-06, + "loss": 1.02986073, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.52734375, + "step": 508, + "time_per_iteration": 2.5111982822418213 + }, + { + "auxiliary_loss_clip": 0.01091777, + "auxiliary_loss_mlp": 0.01157533, + "balance_loss_clip": 1.07761478, + "balance_loss_mlp": 1.03450191, + "epoch": 0.030602735607996392, + "flos": 34675945148160.0, + "grad_norm": 1.5576848542450341, + "language_loss": 0.79518276, + "learning_rate": 3.999996207991165e-06, + "loss": 0.81767583, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.5703125, + "step": 509, + "time_per_iteration": 2.5608036518096924 + }, + { + "auxiliary_loss_clip": 0.01089366, + "auxiliary_loss_mlp": 0.011369, + "balance_loss_clip": 1.0546937, + "balance_loss_mlp": 1.03308797, + "epoch": 0.03066285886066436, + "flos": 23657565795840.0, + "grad_norm": 1.8118574763559583, + "language_loss": 0.92320585, + "learning_rate": 3.999995411669614e-06, + "loss": 0.94546854, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.5625, + "step": 510, + "time_per_iteration": 2.4971930980682373 + }, + { + "auxiliary_loss_clip": 0.01088133, + "auxiliary_loss_mlp": 0.01109449, + "balance_loss_clip": 1.03930604, + "balance_loss_mlp": 1.03266156, + "epoch": 0.030722982113332332, + "flos": 23001861951360.0, + "grad_norm": 1.818955433352456, + "language_loss": 0.96844339, + "learning_rate": 3.999994539508036e-06, + "loss": 0.99041927, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.5546875, + "step": 511, + "time_per_iteration": 2.4615046977996826 + }, + { + "auxiliary_loss_clip": 0.01090055, + "auxiliary_loss_mlp": 0.01094922, + "balance_loss_clip": 1.02244306, + "balance_loss_mlp": 1.03563952, + "epoch": 0.0307831053660003, + "flos": 24749788769280.0, + "grad_norm": 1.5119013925501068, + "language_loss": 0.92847729, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.95032704, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.546875, + "step": 512, + "time_per_iteration": 2.5186767578125 + }, + { + "auxiliary_loss_clip": 0.01099037, + "auxiliary_loss_mlp": 0.01122197, + "balance_loss_clip": 1.0268774, + "balance_loss_mlp": 1.03957152, + "epoch": 0.03084322861866827, + "flos": 26139680928000.0, + "grad_norm": 1.668883507733051, + "language_loss": 0.96336728, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.98557961, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.59375, + "step": 513, + "time_per_iteration": 2.499514579772949 + }, + { + "auxiliary_loss_clip": 0.01098186, + "auxiliary_loss_mlp": 0.01123851, + "balance_loss_clip": 1.04441011, + "balance_loss_mlp": 1.04187703, + "epoch": 0.03090335187133624, + "flos": 18770283901440.0, + "grad_norm": 1.3112978803390427, + "language_loss": 0.90259612, + "learning_rate": 3.999991467983491e-06, + "loss": 0.92481643, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.5625, + "step": 514, + "time_per_iteration": 2.4696667194366455 + }, + { + "auxiliary_loss_clip": 0.01102758, + "auxiliary_loss_mlp": 0.0111079, + "balance_loss_clip": 1.03020465, + "balance_loss_mlp": 1.04537988, + "epoch": 0.030963475124004207, + "flos": 23220767376000.0, + "grad_norm": 2.443021249961568, + "language_loss": 0.90932828, + "learning_rate": 3.999990292462167e-06, + "loss": 0.93146378, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.57421875, + "step": 515, + "time_per_iteration": 2.493640184402466 + }, + { + "auxiliary_loss_clip": 0.01109739, + "auxiliary_loss_mlp": 0.01134942, + "balance_loss_clip": 1.03814387, + "balance_loss_mlp": 1.04790258, + "epoch": 0.03102359837667218, + "flos": 42523861536000.0, + "grad_norm": 1.5017812315555825, + "language_loss": 0.95326561, + "learning_rate": 3.999989041101011e-06, + "loss": 0.97571248, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.6171875, + "step": 516, + "time_per_iteration": 2.6898787021636963 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01147333, + "balance_loss_clip": 1.03622949, + "balance_loss_mlp": 1.04604602, + "epoch": 0.031083721629340148, + "flos": 21175939422720.0, + "grad_norm": 1.5843742027639292, + "language_loss": 0.86425745, + "learning_rate": 3.999987713900071e-06, + "loss": 0.88682175, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.62890625, + "step": 517, + "time_per_iteration": 2.4948623180389404 + }, + { + "auxiliary_loss_clip": 0.01104503, + "auxiliary_loss_mlp": 0.01126735, + "balance_loss_clip": 1.03394306, + "balance_loss_mlp": 1.0430367, + "epoch": 0.031143844882008116, + "flos": 29714891817600.0, + "grad_norm": 1.3503588235869797, + "language_loss": 0.95947772, + "learning_rate": 3.999986310859396e-06, + "loss": 0.98179018, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.61328125, + "step": 518, + "time_per_iteration": 2.5664560794830322 + }, + { + "auxiliary_loss_clip": 0.01108074, + "auxiliary_loss_mlp": 0.0115579, + "balance_loss_clip": 1.04878747, + "balance_loss_mlp": 1.04225373, + "epoch": 0.031203968134676085, + "flos": 23111349575040.0, + "grad_norm": 1.6921325038477357, + "language_loss": 0.9641695, + "learning_rate": 3.999984831979039e-06, + "loss": 0.986808, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.65625, + "step": 519, + "time_per_iteration": 2.568657398223877 + }, + { + "auxiliary_loss_clip": 0.01102498, + "auxiliary_loss_mlp": 0.01182164, + "balance_loss_clip": 1.08245707, + "balance_loss_mlp": 1.04172552, + "epoch": 0.03126409138734405, + "flos": 20953473039360.0, + "grad_norm": 1.6542053395696847, + "language_loss": 0.95543146, + "learning_rate": 3.999983277259057e-06, + "loss": 0.9782781, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.609375, + "step": 520, + "time_per_iteration": 2.5302679538726807 + }, + { + "auxiliary_loss_clip": 0.01091642, + "auxiliary_loss_mlp": 0.01189027, + "balance_loss_clip": 1.09027398, + "balance_loss_mlp": 1.03398323, + "epoch": 0.031324214640012026, + "flos": 21649117345920.0, + "grad_norm": 1.5931616236821584, + "language_loss": 0.9621582, + "learning_rate": 3.999981646699509e-06, + "loss": 0.98496497, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.57421875, + "step": 521, + "time_per_iteration": 2.4701414108276367 + }, + { + "auxiliary_loss_clip": 0.01096876, + "auxiliary_loss_mlp": 0.01205215, + "balance_loss_clip": 1.09301519, + "balance_loss_mlp": 1.03767633, + "epoch": 0.03138433789267999, + "flos": 23440196471040.0, + "grad_norm": 1.6480498504105687, + "language_loss": 0.79155159, + "learning_rate": 3.999979940300456e-06, + "loss": 0.81457245, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.59375, + "step": 522, + "time_per_iteration": 2.5112197399139404 + }, + { + "auxiliary_loss_clip": 0.01097384, + "auxiliary_loss_mlp": 0.01150734, + "balance_loss_clip": 1.05083704, + "balance_loss_mlp": 1.03663826, + "epoch": 0.03144446114534796, + "flos": 18981369181440.0, + "grad_norm": 2.0942728152601306, + "language_loss": 0.99647903, + "learning_rate": 3.999978158061963e-06, + "loss": 1.01896024, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.609375, + "step": 523, + "time_per_iteration": 2.43190598487854 + }, + { + "auxiliary_loss_clip": 0.01099987, + "auxiliary_loss_mlp": 0.0113624, + "balance_loss_clip": 1.04449654, + "balance_loss_mlp": 1.04056573, + "epoch": 0.031504584398015935, + "flos": 22636600640640.0, + "grad_norm": 1.7602689256321742, + "language_loss": 1.03405833, + "learning_rate": 3.999976299984099e-06, + "loss": 1.05642056, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.59375, + "step": 524, + "time_per_iteration": 2.5049948692321777 + }, + { + "auxiliary_loss_clip": 0.0110226, + "auxiliary_loss_mlp": 0.01133099, + "balance_loss_clip": 1.03820872, + "balance_loss_mlp": 1.04332471, + "epoch": 0.0315647076506839, + "flos": 25296004990080.0, + "grad_norm": 1.835632317280912, + "language_loss": 0.92858231, + "learning_rate": 3.999974366066933e-06, + "loss": 0.95093596, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.58984375, + "step": 525, + "time_per_iteration": 2.49580979347229 + }, + { + "auxiliary_loss_clip": 0.01103175, + "auxiliary_loss_mlp": 0.01121206, + "balance_loss_clip": 1.02517056, + "balance_loss_mlp": 1.04288244, + "epoch": 0.03162483090335187, + "flos": 16981892951040.0, + "grad_norm": 1.6643463217271257, + "language_loss": 0.90448618, + "learning_rate": 3.999972356310538e-06, + "loss": 0.92672992, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.6015625, + "step": 526, + "time_per_iteration": 2.502254009246826 + }, + { + "auxiliary_loss_clip": 0.01107912, + "auxiliary_loss_mlp": 0.01124799, + "balance_loss_clip": 1.02771544, + "balance_loss_mlp": 1.0449096, + "epoch": 0.03168495415601984, + "flos": 18733485461760.0, + "grad_norm": 1.6378996589415575, + "language_loss": 0.94918996, + "learning_rate": 3.999970270714991e-06, + "loss": 0.97151715, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.62890625, + "step": 527, + "time_per_iteration": 2.451977014541626 + }, + { + "auxiliary_loss_clip": 0.0111541, + "auxiliary_loss_mlp": 0.01156017, + "balance_loss_clip": 1.05850339, + "balance_loss_mlp": 1.04665124, + "epoch": 0.03174507740868781, + "flos": 21213820114560.0, + "grad_norm": 1.7345732638730849, + "language_loss": 1.07938552, + "learning_rate": 3.999968109280371e-06, + "loss": 1.1020999, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.6875, + "step": 528, + "time_per_iteration": 2.497335910797119 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.0116157, + "balance_loss_clip": 1.06567764, + "balance_loss_mlp": 1.0431422, + "epoch": 0.03180520066135578, + "flos": 24786587208960.0, + "grad_norm": 1.6708697127592953, + "language_loss": 0.93226665, + "learning_rate": 3.99996587200676e-06, + "loss": 0.95499837, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.68359375, + "step": 529, + "time_per_iteration": 2.4965312480926514 + }, + { + "auxiliary_loss_clip": 0.01113208, + "auxiliary_loss_mlp": 0.01190402, + "balance_loss_clip": 1.08983696, + "balance_loss_mlp": 1.04383636, + "epoch": 0.03186532391402375, + "flos": 24863081731200.0, + "grad_norm": 1.5876529627092344, + "language_loss": 0.99495316, + "learning_rate": 3.999963558894243e-06, + "loss": 1.01798928, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.6953125, + "step": 530, + "time_per_iteration": 3.93658709526062 + }, + { + "auxiliary_loss_clip": 0.01110733, + "auxiliary_loss_mlp": 0.01200385, + "balance_loss_clip": 1.09252501, + "balance_loss_mlp": 1.04129124, + "epoch": 0.03192544716669172, + "flos": 21213994671360.0, + "grad_norm": 1.7409412276121312, + "language_loss": 0.89042419, + "learning_rate": 3.999961169942907e-06, + "loss": 0.91353536, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.6953125, + "step": 531, + "time_per_iteration": 3.947244644165039 + }, + { + "auxiliary_loss_clip": 0.01107662, + "auxiliary_loss_mlp": 0.01181013, + "balance_loss_clip": 1.0749166, + "balance_loss_mlp": 1.03721142, + "epoch": 0.03198557041935969, + "flos": 24352058027520.0, + "grad_norm": 1.5922783889218952, + "language_loss": 1.01070762, + "learning_rate": 3.999958705152843e-06, + "loss": 1.03359437, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.703125, + "step": 532, + "time_per_iteration": 3.98683762550354 + }, + { + "auxiliary_loss_clip": 0.01054155, + "auxiliary_loss_mlp": 0.01112228, + "balance_loss_clip": 1.0855248, + "balance_loss_mlp": 1.02263224, + "epoch": 0.032045693672027656, + "flos": 61824056186880.0, + "grad_norm": 0.8177710683862445, + "language_loss": 0.5813067, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60297054, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.31640625, + "step": 533, + "time_per_iteration": 4.461689472198486 + }, + { + "auxiliary_loss_clip": 0.01100474, + "auxiliary_loss_mlp": 0.011704, + "balance_loss_clip": 1.05901051, + "balance_loss_mlp": 1.03504872, + "epoch": 0.03210581692469563, + "flos": 28399958081280.0, + "grad_norm": 1.5133056646533105, + "language_loss": 0.96741486, + "learning_rate": 3.999953548056907e-06, + "loss": 0.99012357, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.65625, + "step": 534, + "time_per_iteration": 2.5278666019439697 + }, + { + "auxiliary_loss_clip": 0.01106248, + "auxiliary_loss_mlp": 0.01194806, + "balance_loss_clip": 1.06319904, + "balance_loss_mlp": 1.03651822, + "epoch": 0.03216594017736359, + "flos": 24716551288320.0, + "grad_norm": 1.7792757982625182, + "language_loss": 0.90488517, + "learning_rate": 3.999950855751232e-06, + "loss": 0.92789572, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.6953125, + "step": 535, + "time_per_iteration": 2.557374954223633 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.01178931, + "balance_loss_clip": 1.06353593, + "balance_loss_mlp": 1.03565001, + "epoch": 0.032226063430031565, + "flos": 31174121669760.0, + "grad_norm": 2.0280163740028647, + "language_loss": 0.92853439, + "learning_rate": 3.999948087607219e-06, + "loss": 0.95136434, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.68359375, + "step": 536, + "time_per_iteration": 2.5691068172454834 + }, + { + "auxiliary_loss_clip": 0.01110484, + "auxiliary_loss_mlp": 0.01188241, + "balance_loss_clip": 1.0478605, + "balance_loss_mlp": 1.03895593, + "epoch": 0.03228618668269954, + "flos": 32196832392960.0, + "grad_norm": 1.5284148689318524, + "language_loss": 0.83409375, + "learning_rate": 3.999945243624975e-06, + "loss": 0.85708106, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.71484375, + "step": 537, + "time_per_iteration": 2.602902889251709 + }, + { + "auxiliary_loss_clip": 0.01125477, + "auxiliary_loss_mlp": 0.01154761, + "balance_loss_clip": 1.02477503, + "balance_loss_mlp": 1.04873323, + "epoch": 0.0323463099353675, + "flos": 22669174805760.0, + "grad_norm": 1.6777262703084497, + "language_loss": 0.9344542, + "learning_rate": 3.999942323804607e-06, + "loss": 0.95725667, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.765625, + "step": 538, + "time_per_iteration": 2.4709980487823486 + }, + { + "auxiliary_loss_clip": 0.01129259, + "auxiliary_loss_mlp": 0.01194416, + "balance_loss_clip": 1.04220963, + "balance_loss_mlp": 1.05134463, + "epoch": 0.032406433188035474, + "flos": 26903999612160.0, + "grad_norm": 1.6959415512883778, + "language_loss": 0.8825298, + "learning_rate": 3.999939328146225e-06, + "loss": 0.90576661, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.77734375, + "step": 539, + "time_per_iteration": 2.5402956008911133 + }, + { + "auxiliary_loss_clip": 0.01130928, + "auxiliary_loss_mlp": 0.01210736, + "balance_loss_clip": 1.06749415, + "balance_loss_mlp": 1.05148911, + "epoch": 0.03246655644070344, + "flos": 31502584540800.0, + "grad_norm": 1.6262645838172556, + "language_loss": 0.87846148, + "learning_rate": 3.999936256649943e-06, + "loss": 0.90187812, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.796875, + "step": 540, + "time_per_iteration": 2.535794734954834 + }, + { + "auxiliary_loss_clip": 0.01133982, + "auxiliary_loss_mlp": 0.01220164, + "balance_loss_clip": 1.08293033, + "balance_loss_mlp": 1.057127, + "epoch": 0.03252667969337141, + "flos": 23217311151360.0, + "grad_norm": 1.7183399200697322, + "language_loss": 0.97069895, + "learning_rate": 3.999933109315878e-06, + "loss": 0.99424052, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.765625, + "step": 541, + "time_per_iteration": 2.5130021572113037 + }, + { + "auxiliary_loss_clip": 0.01133363, + "auxiliary_loss_mlp": 0.01260855, + "balance_loss_clip": 1.10025585, + "balance_loss_mlp": 1.05313075, + "epoch": 0.032586802946039384, + "flos": 14756563935360.0, + "grad_norm": 1.757498706555251, + "language_loss": 0.96902514, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.99296731, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.80078125, + "step": 542, + "time_per_iteration": 2.4482169151306152 + }, + { + "auxiliary_loss_clip": 0.01130744, + "auxiliary_loss_mlp": 0.01273974, + "balance_loss_clip": 1.09029579, + "balance_loss_mlp": 1.0473417, + "epoch": 0.03264692619870735, + "flos": 24279508488960.0, + "grad_norm": 1.5747198311958899, + "language_loss": 0.8122493, + "learning_rate": 3.999926587134879e-06, + "loss": 0.8362965, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.8359375, + "step": 543, + "time_per_iteration": 2.5225167274475098 + }, + { + "auxiliary_loss_clip": 0.01127858, + "auxiliary_loss_mlp": 0.01273549, + "balance_loss_clip": 1.0967381, + "balance_loss_mlp": 1.04508126, + "epoch": 0.03270704945137532, + "flos": 22892060125440.0, + "grad_norm": 1.9400828914122876, + "language_loss": 1.05496657, + "learning_rate": 3.999923212288192e-06, + "loss": 1.07898068, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.828125, + "step": 544, + "time_per_iteration": 2.475820302963257 + }, + { + "auxiliary_loss_clip": 0.0112007, + "auxiliary_loss_mlp": 0.01246125, + "balance_loss_clip": 1.09677923, + "balance_loss_mlp": 1.03863096, + "epoch": 0.032767172704043286, + "flos": 18040040570880.0, + "grad_norm": 2.0253150590157976, + "language_loss": 0.83212256, + "learning_rate": 3.999919761604216e-06, + "loss": 0.85578454, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.8125, + "step": 545, + "time_per_iteration": 2.446779251098633 + }, + { + "auxiliary_loss_clip": 0.01121591, + "auxiliary_loss_mlp": 0.01239558, + "balance_loss_clip": 1.09135747, + "balance_loss_mlp": 1.03943312, + "epoch": 0.03282729595671126, + "flos": 22527636687360.0, + "grad_norm": 1.8455715661743097, + "language_loss": 1.02270365, + "learning_rate": 3.999916235083083e-06, + "loss": 1.04631519, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.8203125, + "step": 546, + "time_per_iteration": 2.4760241508483887 + }, + { + "auxiliary_loss_clip": 0.01116349, + "auxiliary_loss_mlp": 0.01210784, + "balance_loss_clip": 1.0752672, + "balance_loss_mlp": 1.03537107, + "epoch": 0.03288741920937923, + "flos": 20409630791040.0, + "grad_norm": 1.9035107248633822, + "language_loss": 0.99299961, + "learning_rate": 3.999912632724925e-06, + "loss": 1.01627088, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.80859375, + "step": 547, + "time_per_iteration": 2.4931209087371826 + }, + { + "auxiliary_loss_clip": 0.01129005, + "auxiliary_loss_mlp": 0.01232072, + "balance_loss_clip": 1.07462072, + "balance_loss_mlp": 1.04002047, + "epoch": 0.032947542462047195, + "flos": 20776916960640.0, + "grad_norm": 1.5837130300785742, + "language_loss": 0.94542706, + "learning_rate": 3.999908954529881e-06, + "loss": 0.96903789, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.890625, + "step": 548, + "time_per_iteration": 2.4664671421051025 + }, + { + "auxiliary_loss_clip": 0.01136493, + "auxiliary_loss_mlp": 0.01196867, + "balance_loss_clip": 1.04480338, + "balance_loss_mlp": 1.04186535, + "epoch": 0.03300766571471517, + "flos": 19900247921280.0, + "grad_norm": 1.8770912022006996, + "language_loss": 0.86237824, + "learning_rate": 3.999905200498087e-06, + "loss": 0.88571191, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.9453125, + "step": 549, + "time_per_iteration": 2.501680374145508 + }, + { + "auxiliary_loss_clip": 0.01139474, + "auxiliary_loss_mlp": 0.01201756, + "balance_loss_clip": 1.04039443, + "balance_loss_mlp": 1.04646051, + "epoch": 0.03306778896738313, + "flos": 17966792805120.0, + "grad_norm": 1.6231718541416202, + "language_loss": 0.93893534, + "learning_rate": 3.999901370629689e-06, + "loss": 0.96234763, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.9296875, + "step": 550, + "time_per_iteration": 2.4385581016540527 + }, + { + "auxiliary_loss_clip": 0.01147395, + "auxiliary_loss_mlp": 0.01172734, + "balance_loss_clip": 1.03607237, + "balance_loss_mlp": 1.05177951, + "epoch": 0.033127912220051105, + "flos": 21652294279680.0, + "grad_norm": 1.4128984923864125, + "language_loss": 0.86525953, + "learning_rate": 3.99989746492483e-06, + "loss": 0.88846081, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.95703125, + "step": 551, + "time_per_iteration": 2.487537145614624 + }, + { + "auxiliary_loss_clip": 0.01135096, + "auxiliary_loss_mlp": 0.01197826, + "balance_loss_clip": 1.0693661, + "balance_loss_mlp": 1.04413843, + "epoch": 0.03318803547271908, + "flos": 30187127134080.0, + "grad_norm": 3.182602773534329, + "language_loss": 1.06004131, + "learning_rate": 3.999893483383658e-06, + "loss": 1.08337045, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.91015625, + "step": 552, + "time_per_iteration": 2.5393924713134766 + }, + { + "auxiliary_loss_clip": 0.01137627, + "auxiliary_loss_mlp": 0.01204695, + "balance_loss_clip": 1.06808162, + "balance_loss_mlp": 1.04647756, + "epoch": 0.03324815872538704, + "flos": 20374996855680.0, + "grad_norm": 1.9751188836200375, + "language_loss": 1.04130173, + "learning_rate": 3.999889426006326e-06, + "loss": 1.06472492, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.91015625, + "step": 553, + "time_per_iteration": 2.4983417987823486 + }, + { + "auxiliary_loss_clip": 0.01151044, + "auxiliary_loss_mlp": 0.01244142, + "balance_loss_clip": 1.10261655, + "balance_loss_mlp": 1.04873753, + "epoch": 0.033308281978055014, + "flos": 24493526323200.0, + "grad_norm": 1.7742589723238658, + "language_loss": 0.88081658, + "learning_rate": 3.999885292792986e-06, + "loss": 0.90476841, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 1.0234375, + "step": 554, + "time_per_iteration": 2.4930005073547363 + }, + { + "auxiliary_loss_clip": 0.01147339, + "auxiliary_loss_mlp": 0.01247258, + "balance_loss_clip": 1.10792613, + "balance_loss_mlp": 1.04858994, + "epoch": 0.03336840523072298, + "flos": 23399313402240.0, + "grad_norm": 2.4006587351921294, + "language_loss": 0.90614688, + "learning_rate": 3.999881083743795e-06, + "loss": 0.93009281, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.98828125, + "step": 555, + "time_per_iteration": 2.486687660217285 + }, + { + "auxiliary_loss_clip": 0.01140244, + "auxiliary_loss_mlp": 0.01243849, + "balance_loss_clip": 1.11147952, + "balance_loss_mlp": 1.04319668, + "epoch": 0.03342852848339095, + "flos": 30549386067840.0, + "grad_norm": 1.9108890543740997, + "language_loss": 1.01771629, + "learning_rate": 3.999876798858914e-06, + "loss": 1.04155731, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.97265625, + "step": 556, + "time_per_iteration": 2.507275104522705 + }, + { + "auxiliary_loss_clip": 0.01139649, + "auxiliary_loss_mlp": 0.01233307, + "balance_loss_clip": 1.08787167, + "balance_loss_mlp": 1.04195929, + "epoch": 0.03348865173605892, + "flos": 22892199770880.0, + "grad_norm": 1.797722026952748, + "language_loss": 0.94498062, + "learning_rate": 3.999872438138503e-06, + "loss": 0.96871006, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.9765625, + "step": 557, + "time_per_iteration": 2.526979684829712 + }, + { + "auxiliary_loss_clip": 0.0112784, + "auxiliary_loss_mlp": 0.01203832, + "balance_loss_clip": 1.07918715, + "balance_loss_mlp": 1.03668118, + "epoch": 0.03354877498872689, + "flos": 17675058551040.0, + "grad_norm": 2.2127831879194377, + "language_loss": 1.1035949, + "learning_rate": 3.999868001582729e-06, + "loss": 1.12691164, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.91015625, + "step": 558, + "time_per_iteration": 2.4273765087127686 + }, + { + "auxiliary_loss_clip": 0.01129122, + "auxiliary_loss_mlp": 0.01196892, + "balance_loss_clip": 1.05517602, + "balance_loss_mlp": 1.03869724, + "epoch": 0.03360889824139486, + "flos": 21651910254720.0, + "grad_norm": 1.8722720853276784, + "language_loss": 0.92296618, + "learning_rate": 3.99986348919176e-06, + "loss": 0.9462263, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.90234375, + "step": 559, + "time_per_iteration": 2.5072474479675293 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.01177146, + "balance_loss_clip": 1.044204, + "balance_loss_mlp": 1.03072166, + "epoch": 0.033669021494062826, + "flos": 21794740093440.0, + "grad_norm": 1.5855640139217508, + "language_loss": 0.9336009, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.95656562, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.88671875, + "step": 560, + "time_per_iteration": 2.4519424438476562 + }, + { + "auxiliary_loss_clip": 0.0111538, + "auxiliary_loss_mlp": 0.0118726, + "balance_loss_clip": 1.04315996, + "balance_loss_mlp": 1.03105402, + "epoch": 0.0337291447467308, + "flos": 21865299684480.0, + "grad_norm": 1.828883306704858, + "language_loss": 0.90188801, + "learning_rate": 3.999854236904925e-06, + "loss": 0.92491442, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.84375, + "step": 561, + "time_per_iteration": 2.483736515045166 + }, + { + "auxiliary_loss_clip": 0.01116414, + "auxiliary_loss_mlp": 0.01176057, + "balance_loss_clip": 1.04025352, + "balance_loss_mlp": 1.03113675, + "epoch": 0.03378926799939877, + "flos": 24244734908160.0, + "grad_norm": 1.53276061490754, + "language_loss": 0.88441819, + "learning_rate": 3.999849497009409e-06, + "loss": 0.90734291, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.8515625, + "step": 562, + "time_per_iteration": 2.4966485500335693 + }, + { + "auxiliary_loss_clip": 0.0111902, + "auxiliary_loss_mlp": 0.01173969, + "balance_loss_clip": 1.04722595, + "balance_loss_mlp": 1.03180313, + "epoch": 0.033849391252066735, + "flos": 16506899637120.0, + "grad_norm": 1.7433612505801317, + "language_loss": 0.92252147, + "learning_rate": 3.999844681279401e-06, + "loss": 0.94545144, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.87109375, + "step": 563, + "time_per_iteration": 2.455982208251953 + }, + { + "auxiliary_loss_clip": 0.01118452, + "auxiliary_loss_mlp": 0.01153023, + "balance_loss_clip": 1.02990413, + "balance_loss_mlp": 1.0319984, + "epoch": 0.03390951450473471, + "flos": 15668390580480.0, + "grad_norm": 1.8861857747296413, + "language_loss": 0.99484402, + "learning_rate": 3.99983978971508e-06, + "loss": 1.01755869, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.8671875, + "step": 564, + "time_per_iteration": 2.4077341556549072 + }, + { + "auxiliary_loss_clip": 0.01117822, + "auxiliary_loss_mlp": 0.01146901, + "balance_loss_clip": 1.02917027, + "balance_loss_mlp": 1.03201461, + "epoch": 0.03396963775740267, + "flos": 22673678371200.0, + "grad_norm": 1.9683834492964154, + "language_loss": 1.03429043, + "learning_rate": 3.999834822316635e-06, + "loss": 1.05693769, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.859375, + "step": 565, + "time_per_iteration": 2.4581360816955566 + }, + { + "auxiliary_loss_clip": 0.01048328, + "auxiliary_loss_mlp": 0.01045479, + "balance_loss_clip": 1.02535641, + "balance_loss_mlp": 1.02438474, + "epoch": 0.034029761010070644, + "flos": 64388984797440.0, + "grad_norm": 0.9766516319306671, + "language_loss": 0.55027646, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57121456, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.24023438, + "step": 566, + "time_per_iteration": 3.1307201385498047 + }, + { + "auxiliary_loss_clip": 0.0112585, + "auxiliary_loss_mlp": 0.0116581, + "balance_loss_clip": 1.04607654, + "balance_loss_mlp": 1.03932071, + "epoch": 0.034089884262738616, + "flos": 25003188483840.0, + "grad_norm": 1.8891349744273729, + "language_loss": 0.89690077, + "learning_rate": 3.999824660018126e-06, + "loss": 0.91981733, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.8671875, + "step": 567, + "time_per_iteration": 2.4997751712799072 + }, + { + "auxiliary_loss_clip": 0.01119206, + "auxiliary_loss_mlp": 0.01204005, + "balance_loss_clip": 1.07907414, + "balance_loss_mlp": 1.0362916, + "epoch": 0.03415000751540658, + "flos": 28437838773120.0, + "grad_norm": 1.6184569789964445, + "language_loss": 0.88866186, + "learning_rate": 3.999819465118447e-06, + "loss": 0.91189396, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.828125, + "step": 568, + "time_per_iteration": 2.502458095550537 + }, + { + "auxiliary_loss_clip": 0.01127194, + "auxiliary_loss_mlp": 0.01221816, + "balance_loss_clip": 1.09435725, + "balance_loss_mlp": 1.04144526, + "epoch": 0.034210130768074554, + "flos": 21467708588160.0, + "grad_norm": 1.5547682955162685, + "language_loss": 0.94873005, + "learning_rate": 3.999814194385413e-06, + "loss": 0.97222018, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.859375, + "step": 569, + "time_per_iteration": 3.972914457321167 + }, + { + "auxiliary_loss_clip": 0.01133259, + "auxiliary_loss_mlp": 0.01244152, + "balance_loss_clip": 1.1077764, + "balance_loss_mlp": 1.03843021, + "epoch": 0.03427025402074252, + "flos": 18696512465280.0, + "grad_norm": 1.5575524223318786, + "language_loss": 1.0156033, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.03937745, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.94921875, + "step": 570, + "time_per_iteration": 2.4375224113464355 + }, + { + "auxiliary_loss_clip": 0.0112952, + "auxiliary_loss_mlp": 0.01186745, + "balance_loss_clip": 1.06143188, + "balance_loss_mlp": 1.03812456, + "epoch": 0.03433037727341049, + "flos": 20848942828800.0, + "grad_norm": 1.913000886526044, + "language_loss": 0.90328473, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.92644733, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.9140625, + "step": 571, + "time_per_iteration": 2.452617645263672 + }, + { + "auxiliary_loss_clip": 0.01118482, + "auxiliary_loss_mlp": 0.01176553, + "balance_loss_clip": 1.04837918, + "balance_loss_mlp": 1.03448927, + "epoch": 0.03439050052607846, + "flos": 25409123395200.0, + "grad_norm": 1.876046048035974, + "language_loss": 0.91276026, + "learning_rate": 3.999797927188199e-06, + "loss": 0.93571061, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.8359375, + "step": 572, + "time_per_iteration": 6.690124273300171 + }, + { + "auxiliary_loss_clip": 0.01123119, + "auxiliary_loss_mlp": 0.01152595, + "balance_loss_clip": 1.03438687, + "balance_loss_mlp": 1.03611279, + "epoch": 0.03445062377874643, + "flos": 17639167806720.0, + "grad_norm": 1.659871319825706, + "language_loss": 0.92387295, + "learning_rate": 3.999792353123774e-06, + "loss": 0.94663006, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.8671875, + "step": 573, + "time_per_iteration": 2.439091682434082 + }, + { + "auxiliary_loss_clip": 0.01119368, + "auxiliary_loss_mlp": 0.01162519, + "balance_loss_clip": 1.03997183, + "balance_loss_mlp": 1.03149724, + "epoch": 0.0345107470314144, + "flos": 16763546108160.0, + "grad_norm": 1.893990148939269, + "language_loss": 0.91562116, + "learning_rate": 3.999786703227023e-06, + "loss": 0.93844008, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.87890625, + "step": 574, + "time_per_iteration": 2.4105968475341797 + }, + { + "auxiliary_loss_clip": 0.01122081, + "auxiliary_loss_mlp": 0.01211753, + "balance_loss_clip": 1.0810045, + "balance_loss_mlp": 1.03274226, + "epoch": 0.03457087028408237, + "flos": 14683560549120.0, + "grad_norm": 1.9178032909202491, + "language_loss": 0.94081104, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.96414936, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.890625, + "step": 575, + "time_per_iteration": 2.4481594562530518 + }, + { + "auxiliary_loss_clip": 0.0111962, + "auxiliary_loss_mlp": 0.01212054, + "balance_loss_clip": 1.08931613, + "balance_loss_mlp": 1.03160977, + "epoch": 0.03463099353675034, + "flos": 20010259215360.0, + "grad_norm": 1.928200201612274, + "language_loss": 0.89894342, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.92226011, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.8828125, + "step": 576, + "time_per_iteration": 2.4339497089385986 + }, + { + "auxiliary_loss_clip": 0.01124952, + "auxiliary_loss_mlp": 0.01221854, + "balance_loss_clip": 1.08624172, + "balance_loss_mlp": 1.03464127, + "epoch": 0.03469111678941831, + "flos": 25299984885120.0, + "grad_norm": 1.822266975770665, + "language_loss": 0.92172527, + "learning_rate": 3.99976929854497e-06, + "loss": 0.94519341, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.90625, + "step": 577, + "time_per_iteration": 2.5071544647216797 + }, + { + "auxiliary_loss_clip": 0.01129671, + "auxiliary_loss_mlp": 0.01199416, + "balance_loss_clip": 1.05359983, + "balance_loss_mlp": 1.03464389, + "epoch": 0.034751240042086275, + "flos": 23258264042880.0, + "grad_norm": 1.7449173582018862, + "language_loss": 0.79331028, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.81660116, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.94921875, + "step": 578, + "time_per_iteration": 2.464688777923584 + }, + { + "auxiliary_loss_clip": 0.01135186, + "auxiliary_loss_mlp": 0.01175142, + "balance_loss_clip": 1.02875292, + "balance_loss_mlp": 1.03696179, + "epoch": 0.03481136329475425, + "flos": 23768100760320.0, + "grad_norm": 1.6107996176960915, + "language_loss": 0.83964503, + "learning_rate": 3.999757316265973e-06, + "loss": 0.86274827, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.984375, + "step": 579, + "time_per_iteration": 2.4916505813598633 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01144654, + "balance_loss_clip": 1.02067578, + "balance_loss_mlp": 1.0336169, + "epoch": 0.03487148654742222, + "flos": 20156475456000.0, + "grad_norm": 1.7741074525628746, + "language_loss": 0.93820435, + "learning_rate": 3.999751211379863e-06, + "loss": 0.96091962, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.9296875, + "step": 580, + "time_per_iteration": 2.509655237197876 + }, + { + "auxiliary_loss_clip": 0.01130375, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_clip": 1.02726579, + "balance_loss_mlp": 1.03658557, + "epoch": 0.034931609800090184, + "flos": 15668669871360.0, + "grad_norm": 2.069002442033303, + "language_loss": 0.94589162, + "learning_rate": 3.999745030662987e-06, + "loss": 0.96868879, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.9375, + "step": 581, + "time_per_iteration": 2.461562156677246 + }, + { + "auxiliary_loss_clip": 0.01138039, + "auxiliary_loss_mlp": 0.01180326, + "balance_loss_clip": 1.04890943, + "balance_loss_mlp": 1.03759456, + "epoch": 0.034991733052758156, + "flos": 16361451446400.0, + "grad_norm": 1.7015709136785961, + "language_loss": 0.84837043, + "learning_rate": 3.99973877411558e-06, + "loss": 0.87155414, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 1.0078125, + "step": 582, + "time_per_iteration": 2.410998582839966 + }, + { + "auxiliary_loss_clip": 0.01135532, + "auxiliary_loss_mlp": 0.0118301, + "balance_loss_clip": 1.05502641, + "balance_loss_mlp": 1.03843379, + "epoch": 0.03505185630542612, + "flos": 19386396397440.0, + "grad_norm": 1.6203561778939457, + "language_loss": 0.93527448, + "learning_rate": 3.999732441737877e-06, + "loss": 0.95845991, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.96875, + "step": 583, + "time_per_iteration": 2.470585823059082 + }, + { + "auxiliary_loss_clip": 0.01133185, + "auxiliary_loss_mlp": 0.01170752, + "balance_loss_clip": 1.05306816, + "balance_loss_mlp": 1.03542304, + "epoch": 0.03511197955809409, + "flos": 21322784067840.0, + "grad_norm": 1.9805693861181615, + "language_loss": 0.92181969, + "learning_rate": 3.99972603353012e-06, + "loss": 0.94485909, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.9765625, + "step": 584, + "time_per_iteration": 2.529553174972534 + }, + { + "auxiliary_loss_clip": 0.01136966, + "auxiliary_loss_mlp": 0.01184859, + "balance_loss_clip": 1.05573189, + "balance_loss_mlp": 1.03474641, + "epoch": 0.035172102810762065, + "flos": 14135738405760.0, + "grad_norm": 2.4129239320597478, + "language_loss": 1.06750739, + "learning_rate": 3.999719549492551e-06, + "loss": 1.09072566, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 1.0234375, + "step": 585, + "time_per_iteration": 2.4398443698883057 + }, + { + "auxiliary_loss_clip": 0.01135905, + "auxiliary_loss_mlp": 0.01191041, + "balance_loss_clip": 1.05523801, + "balance_loss_mlp": 1.03748393, + "epoch": 0.03523222606343003, + "flos": 20296023626880.0, + "grad_norm": 2.0376822303572695, + "language_loss": 0.96939135, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.99266076, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.984375, + "step": 586, + "time_per_iteration": 2.4211606979370117 + }, + { + "auxiliary_loss_clip": 0.01139563, + "auxiliary_loss_mlp": 0.01180904, + "balance_loss_clip": 1.0544467, + "balance_loss_mlp": 1.03403854, + "epoch": 0.035292349316098, + "flos": 20374787387520.0, + "grad_norm": 1.6885230957616433, + "language_loss": 0.8697657, + "learning_rate": 3.999706353928965e-06, + "loss": 0.89297038, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 1.0546875, + "step": 587, + "time_per_iteration": 2.445727825164795 + }, + { + "auxiliary_loss_clip": 0.01135822, + "auxiliary_loss_mlp": 0.01178345, + "balance_loss_clip": 1.04397202, + "balance_loss_mlp": 1.03322887, + "epoch": 0.03535247256876597, + "flos": 21467848233600.0, + "grad_norm": 1.4798219771882806, + "language_loss": 0.87201285, + "learning_rate": 3.999699642403449e-06, + "loss": 0.89515448, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 1.03125, + "step": 588, + "time_per_iteration": 2.4359219074249268 + }, + { + "auxiliary_loss_clip": 0.01135184, + "auxiliary_loss_mlp": 0.01179778, + "balance_loss_clip": 1.03214931, + "balance_loss_mlp": 1.03024328, + "epoch": 0.03541259582143394, + "flos": 23621919431040.0, + "grad_norm": 1.8870066143249948, + "language_loss": 1.05879104, + "learning_rate": 3.99969285504912e-06, + "loss": 1.08194065, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 1.046875, + "step": 589, + "time_per_iteration": 2.453648090362549 + }, + { + "auxiliary_loss_clip": 0.01131056, + "auxiliary_loss_mlp": 0.01162723, + "balance_loss_clip": 1.02424955, + "balance_loss_mlp": 1.03055692, + "epoch": 0.03547271907410191, + "flos": 33725050824960.0, + "grad_norm": 1.9168849169932591, + "language_loss": 0.92424816, + "learning_rate": 3.99968599186624e-06, + "loss": 0.94718587, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 1.0, + "step": 590, + "time_per_iteration": 2.548443555831909 + }, + { + "auxiliary_loss_clip": 0.01133026, + "auxiliary_loss_mlp": 0.0116205, + "balance_loss_clip": 1.02958465, + "balance_loss_mlp": 1.0304544, + "epoch": 0.03553284232676988, + "flos": 21141619689600.0, + "grad_norm": 1.904244731809144, + "language_loss": 0.95059144, + "learning_rate": 3.999679052855065e-06, + "loss": 0.97354221, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 1.0234375, + "step": 591, + "time_per_iteration": 2.45732045173645 + }, + { + "auxiliary_loss_clip": 0.01135159, + "auxiliary_loss_mlp": 0.01178403, + "balance_loss_clip": 1.03830838, + "balance_loss_mlp": 1.03139913, + "epoch": 0.03559296557943785, + "flos": 20045591377920.0, + "grad_norm": 1.662101568159591, + "language_loss": 0.91738057, + "learning_rate": 3.999672038015861e-06, + "loss": 0.94051623, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 1.0390625, + "step": 592, + "time_per_iteration": 2.433211088180542 + }, + { + "auxiliary_loss_clip": 0.01044255, + "auxiliary_loss_mlp": 0.01103515, + "balance_loss_clip": 1.08234298, + "balance_loss_mlp": 1.02112067, + "epoch": 0.035653088832105814, + "flos": 60331239740160.0, + "grad_norm": 0.9100897362399614, + "language_loss": 0.59922636, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62070405, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.23046875, + "step": 593, + "time_per_iteration": 3.0719518661499023 + }, + { + "auxiliary_loss_clip": 0.01139564, + "auxiliary_loss_mlp": 0.01209852, + "balance_loss_clip": 1.06346262, + "balance_loss_mlp": 1.03209782, + "epoch": 0.035713212084773786, + "flos": 20112310719360.0, + "grad_norm": 1.7407052258697056, + "language_loss": 0.93002135, + "learning_rate": 3.999657780854429e-06, + "loss": 0.95351553, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 1.078125, + "step": 594, + "time_per_iteration": 2.4300620555877686 + }, + { + "auxiliary_loss_clip": 0.01143001, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_clip": 1.08214498, + "balance_loss_mlp": 1.03211153, + "epoch": 0.03577333533744176, + "flos": 26284605448320.0, + "grad_norm": 1.821200500500637, + "language_loss": 0.91260576, + "learning_rate": 3.999650538532742e-06, + "loss": 0.93625814, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 1.109375, + "step": 595, + "time_per_iteration": 2.499069929122925 + }, + { + "auxiliary_loss_clip": 0.01140779, + "auxiliary_loss_mlp": 0.01232191, + "balance_loss_clip": 1.08074725, + "balance_loss_mlp": 1.03046095, + "epoch": 0.035833458590109724, + "flos": 10888955475840.0, + "grad_norm": 2.2216642179293693, + "language_loss": 1.07016993, + "learning_rate": 3.999643220384106e-06, + "loss": 1.09389973, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 1.1015625, + "step": 596, + "time_per_iteration": 2.397531270980835 + }, + { + "auxiliary_loss_clip": 0.01148663, + "auxiliary_loss_mlp": 0.0121052, + "balance_loss_clip": 1.07700562, + "balance_loss_mlp": 1.03398108, + "epoch": 0.035893581842777696, + "flos": 22089127610880.0, + "grad_norm": 2.1432945611476892, + "language_loss": 0.92265987, + "learning_rate": 3.999635826408799e-06, + "loss": 0.94625169, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 1.1484375, + "step": 597, + "time_per_iteration": 2.4649407863616943 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01218642, + "balance_loss_clip": 1.07482839, + "balance_loss_mlp": 1.030936, + "epoch": 0.03595370509544566, + "flos": 23037263936640.0, + "grad_norm": 1.6013077074951683, + "language_loss": 0.858154, + "learning_rate": 3.999628356607101e-06, + "loss": 0.88170791, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 1.0546875, + "step": 598, + "time_per_iteration": 2.4549970626831055 + }, + { + "auxiliary_loss_clip": 0.01131057, + "auxiliary_loss_mlp": 0.01165755, + "balance_loss_clip": 1.0394882, + "balance_loss_mlp": 1.03080678, + "epoch": 0.03601382834811363, + "flos": 20776672581120.0, + "grad_norm": 1.646228355223748, + "language_loss": 0.86395174, + "learning_rate": 3.999620810979295e-06, + "loss": 0.88691986, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 1.0078125, + "step": 599, + "time_per_iteration": 2.4681897163391113 + }, + { + "auxiliary_loss_clip": 0.01152795, + "auxiliary_loss_mlp": 0.01159652, + "balance_loss_clip": 1.03367186, + "balance_loss_mlp": 1.0365355, + "epoch": 0.036073951600781605, + "flos": 23950487036160.0, + "grad_norm": 1.9726730442747542, + "language_loss": 0.96328104, + "learning_rate": 3.999613189525668e-06, + "loss": 0.98640549, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 1.1640625, + "step": 600, + "time_per_iteration": 2.4512691497802734 + }, + { + "auxiliary_loss_clip": 0.01140167, + "auxiliary_loss_mlp": 0.01149151, + "balance_loss_clip": 1.02765274, + "balance_loss_mlp": 1.03376889, + "epoch": 0.03613407485344957, + "flos": 18911403083520.0, + "grad_norm": 1.583812149546343, + "language_loss": 0.88016117, + "learning_rate": 3.999605492246508e-06, + "loss": 0.90305436, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 1.0625, + "step": 601, + "time_per_iteration": 2.4613654613494873 + }, + { + "auxiliary_loss_clip": 0.01137278, + "auxiliary_loss_mlp": 0.01157257, + "balance_loss_clip": 1.03847671, + "balance_loss_mlp": 1.03210258, + "epoch": 0.03619419810611754, + "flos": 23037438493440.0, + "grad_norm": 2.1863642945842585, + "language_loss": 0.84624588, + "learning_rate": 3.999597719142107e-06, + "loss": 0.86919129, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 1.0546875, + "step": 602, + "time_per_iteration": 2.4436893463134766 + }, + { + "auxiliary_loss_clip": 0.01146275, + "auxiliary_loss_mlp": 0.01188324, + "balance_loss_clip": 1.05767083, + "balance_loss_mlp": 1.03426147, + "epoch": 0.03625432135878551, + "flos": 29456569601280.0, + "grad_norm": 1.6668777311786453, + "language_loss": 0.85847187, + "learning_rate": 3.999589870212761e-06, + "loss": 0.88181782, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 1.1171875, + "step": 603, + "time_per_iteration": 2.5141797065734863 + }, + { + "auxiliary_loss_clip": 0.01137624, + "auxiliary_loss_mlp": 0.01174297, + "balance_loss_clip": 1.06252623, + "balance_loss_mlp": 1.03375828, + "epoch": 0.03631444461145348, + "flos": 23507544216960.0, + "grad_norm": 1.7633756512242336, + "language_loss": 0.92174476, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.94486403, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 1.0390625, + "step": 604, + "time_per_iteration": 2.4520962238311768 + }, + { + "auxiliary_loss_clip": 0.01145165, + "auxiliary_loss_mlp": 0.01195868, + "balance_loss_clip": 1.06769419, + "balance_loss_mlp": 1.03581345, + "epoch": 0.03637456786412145, + "flos": 16617190222080.0, + "grad_norm": 1.9060720903789012, + "language_loss": 0.90218383, + "learning_rate": 3.999573944880424e-06, + "loss": 0.92559421, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 1.09375, + "step": 605, + "time_per_iteration": 2.432851552963257 + }, + { + "auxiliary_loss_clip": 0.01145378, + "auxiliary_loss_mlp": 0.01164865, + "balance_loss_clip": 1.05342865, + "balance_loss_mlp": 1.03662348, + "epoch": 0.03643469111678942, + "flos": 15850916501760.0, + "grad_norm": 2.189037031962612, + "language_loss": 0.94489419, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.9679966, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 1.09375, + "step": 606, + "time_per_iteration": 2.4093234539031982 + }, + { + "auxiliary_loss_clip": 0.0114942, + "auxiliary_loss_mlp": 0.01181308, + "balance_loss_clip": 1.04769862, + "balance_loss_mlp": 1.03741181, + "epoch": 0.03649481436945739, + "flos": 23619335990400.0, + "grad_norm": 2.2750634880678406, + "language_loss": 0.88896465, + "learning_rate": 3.999557716251912e-06, + "loss": 0.91227192, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 1.125, + "step": 607, + "time_per_iteration": 2.5047388076782227 + }, + { + "auxiliary_loss_clip": 0.01139677, + "auxiliary_loss_mlp": 0.01156641, + "balance_loss_clip": 1.04191446, + "balance_loss_mlp": 1.03562021, + "epoch": 0.036554937622125354, + "flos": 21754694897280.0, + "grad_norm": 2.3149950976431146, + "language_loss": 0.88564229, + "learning_rate": 3.999549488202358e-06, + "loss": 0.90860552, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 1.046875, + "step": 608, + "time_per_iteration": 2.426285743713379 + }, + { + "auxiliary_loss_clip": 0.01136598, + "auxiliary_loss_mlp": 0.01134869, + "balance_loss_clip": 1.02853394, + "balance_loss_mlp": 1.0341115, + "epoch": 0.036615060874793326, + "flos": 17818865907840.0, + "grad_norm": 1.859383743505045, + "language_loss": 0.88021719, + "learning_rate": 3.999541184329688e-06, + "loss": 0.90293187, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 1.0234375, + "step": 609, + "time_per_iteration": 3.864445686340332 + }, + { + "auxiliary_loss_clip": 0.01143784, + "auxiliary_loss_mlp": 0.01141999, + "balance_loss_clip": 1.03566492, + "balance_loss_mlp": 1.03638268, + "epoch": 0.0366751841274613, + "flos": 26752791047040.0, + "grad_norm": 1.900608640244239, + "language_loss": 0.86938202, + "learning_rate": 3.999532804634215e-06, + "loss": 0.89223981, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 1.078125, + "step": 610, + "time_per_iteration": 2.475282669067383 + }, + { + "auxiliary_loss_clip": 0.01145349, + "auxiliary_loss_mlp": 0.0113481, + "balance_loss_clip": 1.02180004, + "balance_loss_mlp": 1.03796923, + "epoch": 0.03673530738012926, + "flos": 22195961971200.0, + "grad_norm": 1.855841745920249, + "language_loss": 0.94348156, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.9662832, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 1.078125, + "step": 611, + "time_per_iteration": 5.377411603927612 + }, + { + "auxiliary_loss_clip": 0.01141231, + "auxiliary_loss_mlp": 0.01142037, + "balance_loss_clip": 1.02416253, + "balance_loss_mlp": 1.03334546, + "epoch": 0.036795430632797235, + "flos": 24680485987200.0, + "grad_norm": 1.8377996266837493, + "language_loss": 0.80533373, + "learning_rate": 3.999515817776136e-06, + "loss": 0.82816648, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 1.078125, + "step": 612, + "time_per_iteration": 3.8703882694244385 + }, + { + "auxiliary_loss_clip": 0.01144723, + "auxiliary_loss_mlp": 0.01135816, + "balance_loss_clip": 1.02681112, + "balance_loss_mlp": 1.03488398, + "epoch": 0.0368555538854652, + "flos": 17747957203200.0, + "grad_norm": 2.502692943427083, + "language_loss": 0.88265264, + "learning_rate": 3.999507210614175e-06, + "loss": 0.90545809, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 1.09375, + "step": 613, + "time_per_iteration": 2.447810649871826 + }, + { + "auxiliary_loss_clip": 0.01136952, + "auxiliary_loss_mlp": 0.01146764, + "balance_loss_clip": 1.03156042, + "balance_loss_mlp": 1.0319196, + "epoch": 0.03691567713813317, + "flos": 20593518255360.0, + "grad_norm": 1.6746319516059052, + "language_loss": 0.99686742, + "learning_rate": 3.9994985276307e-06, + "loss": 1.01970458, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 1.0546875, + "step": 614, + "time_per_iteration": 2.4973056316375732 + }, + { + "auxiliary_loss_clip": 0.01143539, + "auxiliary_loss_mlp": 0.01138837, + "balance_loss_clip": 1.02945089, + "balance_loss_mlp": 1.03434527, + "epoch": 0.036975800390801145, + "flos": 33649149795840.0, + "grad_norm": 2.3394643251372016, + "language_loss": 0.84657586, + "learning_rate": 3.999489768826041e-06, + "loss": 0.86939967, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 1.09375, + "step": 615, + "time_per_iteration": 2.550053119659424 + }, + { + "auxiliary_loss_clip": 0.01149275, + "auxiliary_loss_mlp": 0.01149109, + "balance_loss_clip": 1.02880275, + "balance_loss_mlp": 1.03408265, + "epoch": 0.03703592364346911, + "flos": 28292425493760.0, + "grad_norm": 1.7307014896371944, + "language_loss": 0.88484526, + "learning_rate": 3.999480934200528e-06, + "loss": 0.90782917, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 1.15625, + "step": 616, + "time_per_iteration": 2.5172057151794434 + }, + { + "auxiliary_loss_clip": 0.01142722, + "auxiliary_loss_mlp": 0.01148217, + "balance_loss_clip": 1.02605176, + "balance_loss_mlp": 1.0331043, + "epoch": 0.03709604689613708, + "flos": 31502863831680.0, + "grad_norm": 2.0226835265390304, + "language_loss": 0.7443614, + "learning_rate": 3.999472023754499e-06, + "loss": 0.76727086, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 1.09375, + "step": 617, + "time_per_iteration": 2.5514779090881348 + }, + { + "auxiliary_loss_clip": 0.01144035, + "auxiliary_loss_mlp": 0.01153274, + "balance_loss_clip": 1.03206182, + "balance_loss_mlp": 1.03471375, + "epoch": 0.03715617014880505, + "flos": 19608374021760.0, + "grad_norm": 1.9091637159595967, + "language_loss": 0.87663281, + "learning_rate": 3.99946303748829e-06, + "loss": 0.89960587, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 1.09375, + "step": 618, + "time_per_iteration": 2.4170682430267334 + }, + { + "auxiliary_loss_clip": 0.01141964, + "auxiliary_loss_mlp": 0.01132541, + "balance_loss_clip": 1.0255388, + "balance_loss_mlp": 1.03270066, + "epoch": 0.03721629340147302, + "flos": 15923291483520.0, + "grad_norm": 2.2025431207740165, + "language_loss": 0.97853041, + "learning_rate": 3.999453975402242e-06, + "loss": 1.00127554, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 1.09375, + "step": 619, + "time_per_iteration": 2.4366865158081055 + }, + { + "auxiliary_loss_clip": 0.01144644, + "auxiliary_loss_mlp": 0.01140787, + "balance_loss_clip": 1.02963603, + "balance_loss_mlp": 1.03562737, + "epoch": 0.03727641665414099, + "flos": 21103075681920.0, + "grad_norm": 2.922265213518912, + "language_loss": 1.00434625, + "learning_rate": 3.9994448374967e-06, + "loss": 1.02720046, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 1.09375, + "step": 620, + "time_per_iteration": 2.423802614212036 + }, + { + "auxiliary_loss_clip": 0.01147724, + "auxiliary_loss_mlp": 0.01143387, + "balance_loss_clip": 1.03047216, + "balance_loss_mlp": 1.03511846, + "epoch": 0.037336539906808956, + "flos": 24130604073600.0, + "grad_norm": 1.7414318203517491, + "language_loss": 0.82710987, + "learning_rate": 3.999435623772008e-06, + "loss": 0.850021, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 1.125, + "step": 621, + "time_per_iteration": 2.4973435401916504 + }, + { + "auxiliary_loss_clip": 0.01146479, + "auxiliary_loss_mlp": 0.01140138, + "balance_loss_clip": 1.03375554, + "balance_loss_mlp": 1.03534997, + "epoch": 0.03739666315947693, + "flos": 22345285322880.0, + "grad_norm": 2.2279396189109626, + "language_loss": 0.92843008, + "learning_rate": 3.999426334228518e-06, + "loss": 0.95129633, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 1.109375, + "step": 622, + "time_per_iteration": 2.432361125946045 + }, + { + "auxiliary_loss_clip": 0.01146456, + "auxiliary_loss_mlp": 0.01143284, + "balance_loss_clip": 1.02626777, + "balance_loss_mlp": 1.03450549, + "epoch": 0.0374567864121449, + "flos": 20448454089600.0, + "grad_norm": 2.032015196544754, + "language_loss": 0.95863605, + "learning_rate": 3.999416968866581e-06, + "loss": 0.98153341, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 1.1171875, + "step": 623, + "time_per_iteration": 2.4633841514587402 + }, + { + "auxiliary_loss_clip": 0.01151717, + "auxiliary_loss_mlp": 0.01162799, + "balance_loss_clip": 1.04292214, + "balance_loss_mlp": 1.03735971, + "epoch": 0.037516909664812866, + "flos": 19207047409920.0, + "grad_norm": 1.7952810784722342, + "language_loss": 0.87904549, + "learning_rate": 3.999407527686551e-06, + "loss": 0.90219063, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 1.140625, + "step": 624, + "time_per_iteration": 2.436494827270508 + }, + { + "auxiliary_loss_clip": 0.01148249, + "auxiliary_loss_mlp": 0.01144412, + "balance_loss_clip": 1.03683746, + "balance_loss_mlp": 1.03573513, + "epoch": 0.03757703291748084, + "flos": 35003814526080.0, + "grad_norm": 2.8860014183493523, + "language_loss": 0.73953319, + "learning_rate": 3.999398010688788e-06, + "loss": 0.76245981, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 1.125, + "step": 625, + "time_per_iteration": 2.6041533946990967 + }, + { + "auxiliary_loss_clip": 0.01150095, + "auxiliary_loss_mlp": 0.01149034, + "balance_loss_clip": 1.03254294, + "balance_loss_mlp": 1.03531814, + "epoch": 0.0376371561701488, + "flos": 25482720274560.0, + "grad_norm": 1.8207282661875475, + "language_loss": 0.82669246, + "learning_rate": 3.999388417873652e-06, + "loss": 0.84968376, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 1.1484375, + "step": 626, + "time_per_iteration": 2.490727186203003 + }, + { + "auxiliary_loss_clip": 0.01150833, + "auxiliary_loss_mlp": 0.01153904, + "balance_loss_clip": 1.03679323, + "balance_loss_mlp": 1.0368886, + "epoch": 0.037697279422816775, + "flos": 18184685800320.0, + "grad_norm": 1.6761634543459292, + "language_loss": 0.85668659, + "learning_rate": 3.999378749241506e-06, + "loss": 0.87973398, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 1.140625, + "step": 627, + "time_per_iteration": 2.4729204177856445 + }, + { + "auxiliary_loss_clip": 0.01145017, + "auxiliary_loss_mlp": 0.01134961, + "balance_loss_clip": 1.02776766, + "balance_loss_mlp": 1.03444088, + "epoch": 0.03775740267548475, + "flos": 24643128965760.0, + "grad_norm": 1.5134426086173418, + "language_loss": 0.92669588, + "learning_rate": 3.999369004792719e-06, + "loss": 0.94949561, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 1.109375, + "step": 628, + "time_per_iteration": 2.4645371437072754 + }, + { + "auxiliary_loss_clip": 0.01150354, + "auxiliary_loss_mlp": 0.01141764, + "balance_loss_clip": 1.02436709, + "balance_loss_mlp": 1.03583467, + "epoch": 0.03781752592815271, + "flos": 21287137703040.0, + "grad_norm": 2.0370317312948325, + "language_loss": 0.8416189, + "learning_rate": 3.999359184527658e-06, + "loss": 0.8645401, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 1.1484375, + "step": 629, + "time_per_iteration": 2.462101459503174 + }, + { + "auxiliary_loss_clip": 0.01146355, + "auxiliary_loss_mlp": 0.011391, + "balance_loss_clip": 1.02294207, + "balance_loss_mlp": 1.03396928, + "epoch": 0.037877649180820684, + "flos": 22088569029120.0, + "grad_norm": 1.6481701926445964, + "language_loss": 0.82757461, + "learning_rate": 3.999349288446696e-06, + "loss": 0.85042918, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 1.125, + "step": 630, + "time_per_iteration": 2.584721803665161 + }, + { + "auxiliary_loss_clip": 0.01151643, + "auxiliary_loss_mlp": 0.01137685, + "balance_loss_clip": 1.02901435, + "balance_loss_mlp": 1.03617048, + "epoch": 0.03793777243348865, + "flos": 14500476046080.0, + "grad_norm": 2.799449965772281, + "language_loss": 0.99718016, + "learning_rate": 3.99933931655021e-06, + "loss": 1.02007341, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 1.15625, + "step": 631, + "time_per_iteration": 2.505333662033081 + }, + { + "auxiliary_loss_clip": 0.01137196, + "auxiliary_loss_mlp": 0.01132671, + "balance_loss_clip": 1.02781439, + "balance_loss_mlp": 1.03332996, + "epoch": 0.03799789568615662, + "flos": 21907334828160.0, + "grad_norm": 1.4941902100665034, + "language_loss": 0.94728148, + "learning_rate": 3.999329268838575e-06, + "loss": 0.96998012, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.0390625, + "step": 632, + "time_per_iteration": 2.4616222381591797 + }, + { + "auxiliary_loss_clip": 0.01137809, + "auxiliary_loss_mlp": 0.01120377, + "balance_loss_clip": 1.02396011, + "balance_loss_mlp": 1.03341722, + "epoch": 0.03805801893882459, + "flos": 24825864355200.0, + "grad_norm": 1.738479481222945, + "language_loss": 0.88349819, + "learning_rate": 3.999319145312175e-06, + "loss": 0.90608013, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 1.046875, + "step": 633, + "time_per_iteration": 2.4975996017456055 + }, + { + "auxiliary_loss_clip": 0.0114388, + "auxiliary_loss_mlp": 0.01133259, + "balance_loss_clip": 1.03317142, + "balance_loss_mlp": 1.03329587, + "epoch": 0.03811814219149256, + "flos": 30481619385600.0, + "grad_norm": 1.673016660576088, + "language_loss": 0.75777364, + "learning_rate": 3.999308945971392e-06, + "loss": 0.78054506, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 1.109375, + "step": 634, + "time_per_iteration": 2.5212392807006836 + }, + { + "auxiliary_loss_clip": 0.01053407, + "auxiliary_loss_mlp": 0.01050714, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.02602744, + "epoch": 0.03817826544416053, + "flos": 66989561639040.0, + "grad_norm": 1.080090002084096, + "language_loss": 0.61783701, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63887829, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.2734375, + "step": 635, + "time_per_iteration": 3.095073938369751 + }, + { + "auxiliary_loss_clip": 0.01144458, + "auxiliary_loss_mlp": 0.01137539, + "balance_loss_clip": 1.03540051, + "balance_loss_mlp": 1.03319693, + "epoch": 0.038238388696828496, + "flos": 20484309922560.0, + "grad_norm": 2.1375099227535066, + "language_loss": 0.90742528, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.93024528, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 1.109375, + "step": 636, + "time_per_iteration": 2.4218077659606934 + }, + { + "auxiliary_loss_clip": 0.01140279, + "auxiliary_loss_mlp": 0.01143451, + "balance_loss_clip": 1.03220439, + "balance_loss_mlp": 1.03110051, + "epoch": 0.03829851194949647, + "flos": 17964977414400.0, + "grad_norm": 2.108904257511354, + "language_loss": 0.87953991, + "learning_rate": 3.999277893066632e-06, + "loss": 0.90237719, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 1.09375, + "step": 637, + "time_per_iteration": 2.4062628746032715 + }, + { + "auxiliary_loss_clip": 0.01149598, + "auxiliary_loss_mlp": 0.01140114, + "balance_loss_clip": 1.03215766, + "balance_loss_mlp": 1.03322482, + "epoch": 0.03835863520216444, + "flos": 22455401351040.0, + "grad_norm": 1.7172727993800396, + "language_loss": 0.89876044, + "learning_rate": 3.999267390472215e-06, + "loss": 0.92165744, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 1.1640625, + "step": 638, + "time_per_iteration": 2.427471160888672 + }, + { + "auxiliary_loss_clip": 0.01146479, + "auxiliary_loss_mlp": 0.01150787, + "balance_loss_clip": 1.04578769, + "balance_loss_mlp": 1.03394055, + "epoch": 0.038418758454832405, + "flos": 22163317983360.0, + "grad_norm": 2.331586236551516, + "language_loss": 0.77012938, + "learning_rate": 3.999256812065381e-06, + "loss": 0.79310209, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.125, + "step": 639, + "time_per_iteration": 2.4867639541625977 + }, + { + "auxiliary_loss_clip": 0.01152412, + "auxiliary_loss_mlp": 0.01154368, + "balance_loss_clip": 1.04593539, + "balance_loss_mlp": 1.03479469, + "epoch": 0.03847888170750038, + "flos": 22746332643840.0, + "grad_norm": 2.0925507270613792, + "language_loss": 0.93403959, + "learning_rate": 3.999246157846526e-06, + "loss": 0.95710731, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 1.171875, + "step": 640, + "time_per_iteration": 2.442833185195923 + }, + { + "auxiliary_loss_clip": 0.01144826, + "auxiliary_loss_mlp": 0.01143048, + "balance_loss_clip": 1.04396117, + "balance_loss_mlp": 1.03502786, + "epoch": 0.03853900496016834, + "flos": 22710092785920.0, + "grad_norm": 2.049803715612257, + "language_loss": 0.89439285, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.91727161, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 1.09375, + "step": 641, + "time_per_iteration": 2.4766299724578857 + }, + { + "auxiliary_loss_clip": 0.01045718, + "auxiliary_loss_mlp": 0.01157494, + "balance_loss_clip": 1.1399461, + "balance_loss_mlp": 1.0203532, + "epoch": 0.038599128212836314, + "flos": 70395652569600.0, + "grad_norm": 0.954832839638245, + "language_loss": 0.65817571, + "learning_rate": 3.999224621974381e-06, + "loss": 0.68020779, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.25390625, + "step": 642, + "time_per_iteration": 3.0364344120025635 + }, + { + "auxiliary_loss_clip": 0.01145578, + "auxiliary_loss_mlp": 0.011214, + "balance_loss_clip": 1.0282737, + "balance_loss_mlp": 1.0339222, + "epoch": 0.03865925146550429, + "flos": 23294015141760.0, + "grad_norm": 1.6105373145386477, + "language_loss": 0.84267819, + "learning_rate": 3.999213740321906e-06, + "loss": 0.86534798, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 1.1171875, + "step": 643, + "time_per_iteration": 2.470949411392212 + }, + { + "auxiliary_loss_clip": 0.01138372, + "auxiliary_loss_mlp": 0.01116481, + "balance_loss_clip": 1.02240109, + "balance_loss_mlp": 1.03272557, + "epoch": 0.03871937471817225, + "flos": 21429478782720.0, + "grad_norm": 1.7282296521325415, + "language_loss": 0.8665489, + "learning_rate": 3.999202782859046e-06, + "loss": 0.88909739, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 1.0546875, + "step": 644, + "time_per_iteration": 2.4602267742156982 + }, + { + "auxiliary_loss_clip": 0.0114193, + "auxiliary_loss_mlp": 0.0113537, + "balance_loss_clip": 1.03051376, + "balance_loss_mlp": 1.03361261, + "epoch": 0.038779497970840224, + "flos": 34275875345280.0, + "grad_norm": 1.9212685796933018, + "language_loss": 0.87557673, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.8983497, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.078125, + "step": 645, + "time_per_iteration": 2.5726985931396484 + }, + { + "auxiliary_loss_clip": 0.01145175, + "auxiliary_loss_mlp": 0.01140511, + "balance_loss_clip": 1.03670359, + "balance_loss_mlp": 1.034168, + "epoch": 0.03883962122350819, + "flos": 22747065782400.0, + "grad_norm": 2.3097887274972857, + "language_loss": 0.87386203, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.89671886, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 1.109375, + "step": 646, + "time_per_iteration": 2.4509987831115723 + }, + { + "auxiliary_loss_clip": 0.01139482, + "auxiliary_loss_mlp": 0.0114241, + "balance_loss_clip": 1.04399121, + "balance_loss_mlp": 1.0324012, + "epoch": 0.03889974447617616, + "flos": 21944726760960.0, + "grad_norm": 1.8819525249691549, + "language_loss": 0.85945958, + "learning_rate": 3.999169455612323e-06, + "loss": 0.8822785, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 1.0703125, + "step": 647, + "time_per_iteration": 2.514894485473633 + }, + { + "auxiliary_loss_clip": 0.01141196, + "auxiliary_loss_mlp": 0.01148608, + "balance_loss_clip": 1.03864908, + "balance_loss_mlp": 1.03167391, + "epoch": 0.03895986772884413, + "flos": 31503457324800.0, + "grad_norm": 1.923486594833773, + "language_loss": 0.90020525, + "learning_rate": 3.999158194912106e-06, + "loss": 0.92310333, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 1.09375, + "step": 648, + "time_per_iteration": 2.5370564460754395 + }, + { + "auxiliary_loss_clip": 0.01135196, + "auxiliary_loss_mlp": 0.01175276, + "balance_loss_clip": 1.06641376, + "balance_loss_mlp": 1.02939034, + "epoch": 0.0390199909815121, + "flos": 19900003541760.0, + "grad_norm": 1.7761805985334822, + "language_loss": 0.89493334, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.91803813, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 1.0625, + "step": 649, + "time_per_iteration": 3.85715389251709 + }, + { + "auxiliary_loss_clip": 0.01144721, + "auxiliary_loss_mlp": 0.01139222, + "balance_loss_clip": 1.03202879, + "balance_loss_mlp": 1.03192854, + "epoch": 0.03908011423418007, + "flos": 21611515944960.0, + "grad_norm": 1.898034747366542, + "language_loss": 0.84001702, + "learning_rate": 3.999135446087263e-06, + "loss": 0.86285645, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 1.125, + "step": 650, + "time_per_iteration": 5.374142408370972 + }, + { + "auxiliary_loss_clip": 0.01138069, + "auxiliary_loss_mlp": 0.01130687, + "balance_loss_clip": 1.03121901, + "balance_loss_mlp": 1.0306164, + "epoch": 0.039140237486848035, + "flos": 18660412252800.0, + "grad_norm": 2.1249664287301133, + "language_loss": 0.82968974, + "learning_rate": 3.9991239579635e-06, + "loss": 0.8523773, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 1.078125, + "step": 651, + "time_per_iteration": 3.827467679977417 + }, + { + "auxiliary_loss_clip": 0.01144146, + "auxiliary_loss_mlp": 0.01143147, + "balance_loss_clip": 1.03681254, + "balance_loss_mlp": 1.03242385, + "epoch": 0.03920036073951601, + "flos": 18660132961920.0, + "grad_norm": 2.3254845210019863, + "language_loss": 0.93467844, + "learning_rate": 3.999112394032757e-06, + "loss": 0.95755136, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 1.1171875, + "step": 652, + "time_per_iteration": 2.397010564804077 + }, + { + "auxiliary_loss_clip": 0.01134054, + "auxiliary_loss_mlp": 0.01132591, + "balance_loss_clip": 1.02840257, + "balance_loss_mlp": 1.03009403, + "epoch": 0.03926048399218398, + "flos": 31353226277760.0, + "grad_norm": 2.292943232036688, + "language_loss": 0.87760836, + "learning_rate": 3.999100754295471e-06, + "loss": 0.90027481, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.0390625, + "step": 653, + "time_per_iteration": 2.5087904930114746 + }, + { + "auxiliary_loss_clip": 0.01149826, + "auxiliary_loss_mlp": 0.01150856, + "balance_loss_clip": 1.03374493, + "balance_loss_mlp": 1.03430355, + "epoch": 0.039320607244851945, + "flos": 29602297082880.0, + "grad_norm": 2.0606702949693037, + "language_loss": 0.92878038, + "learning_rate": 3.999089038752085e-06, + "loss": 0.95178723, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 1.15625, + "step": 654, + "time_per_iteration": 2.4998490810394287 + }, + { + "auxiliary_loss_clip": 0.01040154, + "auxiliary_loss_mlp": 0.01019751, + "balance_loss_clip": 1.00067747, + "balance_loss_mlp": 1.01383352, + "epoch": 0.03938073049751992, + "flos": 66531151221120.0, + "grad_norm": 0.7438771244747499, + "language_loss": 0.50222671, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52282578, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.26367188, + "step": 655, + "time_per_iteration": 3.104891538619995 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01141032, + "balance_loss_clip": 1.03636646, + "balance_loss_mlp": 1.03428555, + "epoch": 0.03944085375018788, + "flos": 23366704325760.0, + "grad_norm": 1.9825798270969837, + "language_loss": 0.84229481, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.86515629, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.109375, + "step": 656, + "time_per_iteration": 2.456942558288574 + }, + { + "auxiliary_loss_clip": 0.011591, + "auxiliary_loss_mlp": 0.01138425, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.03786945, + "epoch": 0.039500977002855854, + "flos": 18547398581760.0, + "grad_norm": 2.993029478405108, + "language_loss": 0.83184087, + "learning_rate": 3.999053437289776e-06, + "loss": 0.8548162, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 1.2109375, + "step": 657, + "time_per_iteration": 2.396207094192505 + }, + { + "auxiliary_loss_clip": 0.01160207, + "auxiliary_loss_mlp": 0.01150377, + "balance_loss_clip": 1.03040516, + "balance_loss_mlp": 1.03921843, + "epoch": 0.039561100255523826, + "flos": 25336992792960.0, + "grad_norm": 2.2994740733883288, + "language_loss": 0.85964495, + "learning_rate": 3.999041418526457e-06, + "loss": 0.88275075, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 1.2109375, + "step": 658, + "time_per_iteration": 2.494626998901367 + }, + { + "auxiliary_loss_clip": 0.01147782, + "auxiliary_loss_mlp": 0.01146308, + "balance_loss_clip": 1.0417856, + "balance_loss_mlp": 1.03554034, + "epoch": 0.03962122350819179, + "flos": 18219005533440.0, + "grad_norm": 1.8461537837749333, + "language_loss": 0.9614377, + "learning_rate": 3.999029323959287e-06, + "loss": 0.9843787, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.125, + "step": 659, + "time_per_iteration": 2.4406585693359375 + }, + { + "auxiliary_loss_clip": 0.01154311, + "auxiliary_loss_mlp": 0.0114721, + "balance_loss_clip": 1.03954029, + "balance_loss_mlp": 1.03571033, + "epoch": 0.03968134676085976, + "flos": 20521178184960.0, + "grad_norm": 2.085971986846673, + "language_loss": 0.85468787, + "learning_rate": 3.999017153588724e-06, + "loss": 0.87770307, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 1.1875, + "step": 660, + "time_per_iteration": 2.431804656982422 + }, + { + "auxiliary_loss_clip": 0.0115663, + "auxiliary_loss_mlp": 0.01155076, + "balance_loss_clip": 1.05126858, + "balance_loss_mlp": 1.03981268, + "epoch": 0.03974147001352773, + "flos": 22421395820160.0, + "grad_norm": 1.6034772804171846, + "language_loss": 0.85248554, + "learning_rate": 3.999004907415231e-06, + "loss": 0.8756026, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 1.171875, + "step": 661, + "time_per_iteration": 2.4936578273773193 + }, + { + "auxiliary_loss_clip": 0.01043831, + "auxiliary_loss_mlp": 0.01065092, + "balance_loss_clip": 1.04973829, + "balance_loss_mlp": 1.01680923, + "epoch": 0.0398015932661957, + "flos": 71125267495680.0, + "grad_norm": 0.946722031743666, + "language_loss": 0.69565392, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71674323, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.26953125, + "step": 662, + "time_per_iteration": 3.176159143447876 + }, + { + "auxiliary_loss_clip": 0.01150752, + "auxiliary_loss_mlp": 0.01142488, + "balance_loss_clip": 1.03515172, + "balance_loss_mlp": 1.03584886, + "epoch": 0.03986171651886367, + "flos": 16799995434240.0, + "grad_norm": 1.7247541789998526, + "language_loss": 0.87811625, + "learning_rate": 3.998980187661314e-06, + "loss": 0.90104866, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 1.140625, + "step": 663, + "time_per_iteration": 2.496194839477539 + }, + { + "auxiliary_loss_clip": 0.01153436, + "auxiliary_loss_mlp": 0.01141784, + "balance_loss_clip": 1.0307765, + "balance_loss_mlp": 1.03455997, + "epoch": 0.03992183977153164, + "flos": 24533920632960.0, + "grad_norm": 2.29137104944411, + "language_loss": 0.92828369, + "learning_rate": 3.998967714081826e-06, + "loss": 0.95123589, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 1.1875, + "step": 664, + "time_per_iteration": 2.492635726928711 + }, + { + "auxiliary_loss_clip": 0.01141531, + "auxiliary_loss_mlp": 0.01129279, + "balance_loss_clip": 1.02928603, + "balance_loss_mlp": 1.03249192, + "epoch": 0.03998196302419961, + "flos": 15595003169280.0, + "grad_norm": 1.9046914955046066, + "language_loss": 0.89442289, + "learning_rate": 3.998955164701281e-06, + "loss": 0.91713107, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 1.09375, + "step": 665, + "time_per_iteration": 2.403488874435425 + }, + { + "auxiliary_loss_clip": 0.01146659, + "auxiliary_loss_mlp": 0.01129273, + "balance_loss_clip": 1.02632403, + "balance_loss_mlp": 1.0332005, + "epoch": 0.04004208627686758, + "flos": 25303790223360.0, + "grad_norm": 2.040123180001308, + "language_loss": 0.84858418, + "learning_rate": 3.998942539520158e-06, + "loss": 0.87134349, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 1.1328125, + "step": 666, + "time_per_iteration": 2.533057689666748 + }, + { + "auxiliary_loss_clip": 0.01142127, + "auxiliary_loss_mlp": 0.01121325, + "balance_loss_clip": 1.01961589, + "balance_loss_mlp": 1.03244996, + "epoch": 0.04010220952953555, + "flos": 23474760583680.0, + "grad_norm": 1.8487774465223867, + "language_loss": 0.91057909, + "learning_rate": 3.998929838538932e-06, + "loss": 0.93321365, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 1.09375, + "step": 667, + "time_per_iteration": 2.475883960723877 + }, + { + "auxiliary_loss_clip": 0.01143159, + "auxiliary_loss_mlp": 0.0112864, + "balance_loss_clip": 1.0239743, + "balance_loss_mlp": 1.03338122, + "epoch": 0.04016233278220352, + "flos": 18616247516160.0, + "grad_norm": 2.1902831871626067, + "language_loss": 0.85681808, + "learning_rate": 3.998917061758087e-06, + "loss": 0.87953603, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 1.09375, + "step": 668, + "time_per_iteration": 2.403777599334717 + }, + { + "auxiliary_loss_clip": 0.01045396, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.01937294, + "balance_loss_mlp": 1.01646304, + "epoch": 0.040222456034871484, + "flos": 70902801112320.0, + "grad_norm": 0.8154666165093641, + "language_loss": 0.60316014, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62400234, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.2890625, + "step": 669, + "time_per_iteration": 3.1351687908172607 + }, + { + "auxiliary_loss_clip": 0.01148064, + "auxiliary_loss_mlp": 0.01135891, + "balance_loss_clip": 1.02946138, + "balance_loss_mlp": 1.03482938, + "epoch": 0.040282579287539456, + "flos": 23763701928960.0, + "grad_norm": 1.662387864976036, + "language_loss": 0.90439564, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.92723525, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 1.1328125, + "step": 670, + "time_per_iteration": 2.486539363861084 + }, + { + "auxiliary_loss_clip": 0.01139737, + "auxiliary_loss_mlp": 0.011163, + "balance_loss_clip": 1.02188587, + "balance_loss_mlp": 1.03324819, + "epoch": 0.04034270254020743, + "flos": 18477537217920.0, + "grad_norm": 1.8615681522588239, + "language_loss": 0.79142499, + "learning_rate": 3.998878276622692e-06, + "loss": 0.81398535, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 1.0625, + "step": 671, + "time_per_iteration": 2.4270260334014893 + }, + { + "auxiliary_loss_clip": 0.0115208, + "auxiliary_loss_mlp": 0.01131061, + "balance_loss_clip": 1.02973342, + "balance_loss_mlp": 1.03835058, + "epoch": 0.040402825792875394, + "flos": 17200903109760.0, + "grad_norm": 1.8956433288339951, + "language_loss": 0.97127789, + "learning_rate": 3.998865196648242e-06, + "loss": 0.99410939, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 1.1328125, + "step": 672, + "time_per_iteration": 2.4225685596466064 + }, + { + "auxiliary_loss_clip": 0.01151095, + "auxiliary_loss_mlp": 0.01135538, + "balance_loss_clip": 1.03330457, + "balance_loss_mlp": 1.03608048, + "epoch": 0.040462949045543366, + "flos": 19171156665600.0, + "grad_norm": 1.8477654372546233, + "language_loss": 0.93808079, + "learning_rate": 3.998852040876622e-06, + "loss": 0.96094704, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 1.1484375, + "step": 673, + "time_per_iteration": 2.4184083938598633 + }, + { + "auxiliary_loss_clip": 0.01145377, + "auxiliary_loss_mlp": 0.01129033, + "balance_loss_clip": 1.03099537, + "balance_loss_mlp": 1.03231931, + "epoch": 0.04052307229821133, + "flos": 24018812300160.0, + "grad_norm": 1.9040232467614557, + "language_loss": 0.79716676, + "learning_rate": 3.998838809308334e-06, + "loss": 0.81991088, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 1.1328125, + "step": 674, + "time_per_iteration": 2.477980136871338 + }, + { + "auxiliary_loss_clip": 0.01160325, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_clip": 1.03071928, + "balance_loss_mlp": 1.03637743, + "epoch": 0.0405831955508793, + "flos": 16435641818880.0, + "grad_norm": 2.215652634157601, + "language_loss": 0.85125887, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.87428176, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 1.234375, + "step": 675, + "time_per_iteration": 2.422987222671509 + }, + { + "auxiliary_loss_clip": 0.01146998, + "auxiliary_loss_mlp": 0.01114603, + "balance_loss_clip": 1.01785231, + "balance_loss_mlp": 1.03223252, + "epoch": 0.040643318803547275, + "flos": 24278775350400.0, + "grad_norm": 1.6483841310272274, + "language_loss": 0.80701369, + "learning_rate": 3.998812118783757e-06, + "loss": 0.82962966, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 1.1484375, + "step": 676, + "time_per_iteration": 2.481201410293579 + }, + { + "auxiliary_loss_clip": 0.01148228, + "auxiliary_loss_mlp": 0.0111558, + "balance_loss_clip": 1.01959252, + "balance_loss_mlp": 1.03390324, + "epoch": 0.04070344205621524, + "flos": 17711123852160.0, + "grad_norm": 2.1627593240706826, + "language_loss": 0.89751595, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.92015398, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 1.140625, + "step": 677, + "time_per_iteration": 2.414358139038086 + }, + { + "auxiliary_loss_clip": 0.01143863, + "auxiliary_loss_mlp": 0.01112875, + "balance_loss_clip": 1.01984406, + "balance_loss_mlp": 1.03332329, + "epoch": 0.04076356530888321, + "flos": 26176444456320.0, + "grad_norm": 2.1219640420479324, + "language_loss": 0.80295086, + "learning_rate": 3.998785125078559e-06, + "loss": 0.82551825, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 1.109375, + "step": 678, + "time_per_iteration": 2.5327956676483154 + }, + { + "auxiliary_loss_clip": 0.01146948, + "auxiliary_loss_mlp": 0.01124406, + "balance_loss_clip": 1.02498579, + "balance_loss_mlp": 1.03048611, + "epoch": 0.04082368856155118, + "flos": 35771973459840.0, + "grad_norm": 1.5857690280125762, + "language_loss": 0.85437536, + "learning_rate": 3.998771514534505e-06, + "loss": 0.87708896, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 1.1640625, + "step": 679, + "time_per_iteration": 2.5361883640289307 + }, + { + "auxiliary_loss_clip": 0.01143064, + "auxiliary_loss_mlp": 0.01110961, + "balance_loss_clip": 1.01702392, + "balance_loss_mlp": 1.03150892, + "epoch": 0.04088381181421915, + "flos": 28145406291840.0, + "grad_norm": 1.8122299387912468, + "language_loss": 0.81136459, + "learning_rate": 3.998757828196835e-06, + "loss": 0.8339048, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 1.109375, + "step": 680, + "time_per_iteration": 2.537060260772705 + }, + { + "auxiliary_loss_clip": 0.01147984, + "auxiliary_loss_mlp": 0.01118835, + "balance_loss_clip": 1.01784062, + "balance_loss_mlp": 1.03201079, + "epoch": 0.04094393506688712, + "flos": 27596501896320.0, + "grad_norm": 1.7487901662373744, + "language_loss": 0.86967373, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.89234191, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 1.15625, + "step": 681, + "time_per_iteration": 2.4821507930755615 + }, + { + "auxiliary_loss_clip": 0.01149803, + "auxiliary_loss_mlp": 0.01123164, + "balance_loss_clip": 1.02140665, + "balance_loss_mlp": 1.03132033, + "epoch": 0.04100405831955509, + "flos": 23110930638720.0, + "grad_norm": 1.633952853797651, + "language_loss": 0.7617293, + "learning_rate": 3.998730228142726e-06, + "loss": 0.78445894, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 1.1875, + "step": 682, + "time_per_iteration": 2.4639172554016113 + }, + { + "auxiliary_loss_clip": 0.01148086, + "auxiliary_loss_mlp": 0.01117165, + "balance_loss_clip": 1.01741064, + "balance_loss_mlp": 1.03163362, + "epoch": 0.04106418157222306, + "flos": 20155707406080.0, + "grad_norm": 1.7166485726616771, + "language_loss": 0.76514614, + "learning_rate": 3.998716314427333e-06, + "loss": 0.78779864, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 1.1640625, + "step": 683, + "time_per_iteration": 2.4202606678009033 + }, + { + "auxiliary_loss_clip": 0.01144307, + "auxiliary_loss_mlp": 0.01109018, + "balance_loss_clip": 1.02261484, + "balance_loss_mlp": 1.02928138, + "epoch": 0.041124304824891024, + "flos": 17419738711680.0, + "grad_norm": 2.695420921015649, + "language_loss": 0.85175836, + "learning_rate": 3.998702324920417e-06, + "loss": 0.87429166, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 1.1484375, + "step": 684, + "time_per_iteration": 2.473734140396118 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01114277, + "balance_loss_clip": 1.01933861, + "balance_loss_mlp": 1.02941537, + "epoch": 0.041184428077558996, + "flos": 25778853360000.0, + "grad_norm": 1.4670487658655595, + "language_loss": 0.93872792, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.96128601, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 1.125, + "step": 685, + "time_per_iteration": 2.506664514541626 + }, + { + "auxiliary_loss_clip": 0.01143432, + "auxiliary_loss_mlp": 0.01104638, + "balance_loss_clip": 1.01551676, + "balance_loss_mlp": 1.02931964, + "epoch": 0.04124455133022697, + "flos": 22963701968640.0, + "grad_norm": 2.2013557400519232, + "language_loss": 0.93457246, + "learning_rate": 3.998674118534141e-06, + "loss": 0.95705312, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 1.140625, + "step": 686, + "time_per_iteration": 2.479328155517578 + }, + { + "auxiliary_loss_clip": 0.01145387, + "auxiliary_loss_mlp": 0.01113393, + "balance_loss_clip": 1.01964641, + "balance_loss_mlp": 1.02947474, + "epoch": 0.04130467458289493, + "flos": 21287975575680.0, + "grad_norm": 1.8003524058405944, + "language_loss": 0.75618255, + "learning_rate": 3.998659901655851e-06, + "loss": 0.77877033, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 1.15625, + "step": 687, + "time_per_iteration": 2.440997838973999 + }, + { + "auxiliary_loss_clip": 0.01130864, + "auxiliary_loss_mlp": 0.01101142, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.02755916, + "epoch": 0.041364797835562905, + "flos": 19973216396160.0, + "grad_norm": 1.4241847846972275, + "language_loss": 0.89603168, + "learning_rate": 3.998645608988177e-06, + "loss": 0.91835177, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 1.03125, + "step": 688, + "time_per_iteration": 3.9174094200134277 + }, + { + "auxiliary_loss_clip": 0.01135141, + "auxiliary_loss_mlp": 0.0110081, + "balance_loss_clip": 1.02022409, + "balance_loss_mlp": 1.02849007, + "epoch": 0.04142492108823087, + "flos": 21905205235200.0, + "grad_norm": 1.753687891361087, + "language_loss": 0.87174642, + "learning_rate": 3.998631240531661e-06, + "loss": 0.89410603, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 1.0625, + "step": 689, + "time_per_iteration": 2.437152624130249 + }, + { + "auxiliary_loss_clip": 0.01138413, + "auxiliary_loss_mlp": 0.01107427, + "balance_loss_clip": 1.0261265, + "balance_loss_mlp": 1.02782011, + "epoch": 0.04148504434089884, + "flos": 27638292660480.0, + "grad_norm": 1.9561709728017773, + "language_loss": 0.72685653, + "learning_rate": 3.998616796286848e-06, + "loss": 0.7493149, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 1.109375, + "step": 690, + "time_per_iteration": 3.929933547973633 + }, + { + "auxiliary_loss_clip": 0.01138071, + "auxiliary_loss_mlp": 0.01109711, + "balance_loss_clip": 1.02449965, + "balance_loss_mlp": 1.0274477, + "epoch": 0.041545167593566815, + "flos": 20517442669440.0, + "grad_norm": 1.6165250474725026, + "language_loss": 0.78118956, + "learning_rate": 3.998602276254286e-06, + "loss": 0.80366731, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 1.109375, + "step": 691, + "time_per_iteration": 3.800966739654541 + }, + { + "auxiliary_loss_clip": 0.01140213, + "auxiliary_loss_mlp": 0.01110365, + "balance_loss_clip": 1.02334201, + "balance_loss_mlp": 1.02730799, + "epoch": 0.04160529084623478, + "flos": 11868269512320.0, + "grad_norm": 1.9734362678836048, + "language_loss": 0.8775323, + "learning_rate": 3.998587680434526e-06, + "loss": 0.90003806, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 1.125, + "step": 692, + "time_per_iteration": 2.3723509311676025 + }, + { + "auxiliary_loss_clip": 0.01143444, + "auxiliary_loss_mlp": 0.01106273, + "balance_loss_clip": 1.0180105, + "balance_loss_mlp": 1.02847564, + "epoch": 0.04166541409890275, + "flos": 14827472640000.0, + "grad_norm": 2.1374499543792664, + "language_loss": 0.95044053, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9729377, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 1.1484375, + "step": 693, + "time_per_iteration": 2.399405002593994 + }, + { + "auxiliary_loss_clip": 0.01145031, + "auxiliary_loss_mlp": 0.0110449, + "balance_loss_clip": 1.0169431, + "balance_loss_mlp": 1.03120589, + "epoch": 0.04172553735157072, + "flos": 25807063605120.0, + "grad_norm": 2.0002385278993358, + "language_loss": 0.86368775, + "learning_rate": 3.998558261435626e-06, + "loss": 0.88618302, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 1.140625, + "step": 694, + "time_per_iteration": 2.4540154933929443 + }, + { + "auxiliary_loss_clip": 0.01148437, + "auxiliary_loss_mlp": 0.0111921, + "balance_loss_clip": 1.01916981, + "balance_loss_mlp": 1.03047276, + "epoch": 0.04178566060423869, + "flos": 24278670616320.0, + "grad_norm": 1.8546510084979992, + "language_loss": 0.89156288, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.91423929, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 1.171875, + "step": 695, + "time_per_iteration": 2.4542396068573 + }, + { + "auxiliary_loss_clip": 0.01141274, + "auxiliary_loss_mlp": 0.01111994, + "balance_loss_clip": 1.0255909, + "balance_loss_mlp": 1.02842283, + "epoch": 0.04184578385690666, + "flos": 18221065303680.0, + "grad_norm": 2.457914240917448, + "language_loss": 0.90388429, + "learning_rate": 3.99852853929461e-06, + "loss": 0.92641699, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 1.125, + "step": 696, + "time_per_iteration": 2.380408763885498 + }, + { + "auxiliary_loss_clip": 0.01143746, + "auxiliary_loss_mlp": 0.01116715, + "balance_loss_clip": 1.02454209, + "balance_loss_mlp": 1.0283854, + "epoch": 0.041905907109574626, + "flos": 22775450584320.0, + "grad_norm": 2.330216831203628, + "language_loss": 0.9735322, + "learning_rate": 3.998513564547216e-06, + "loss": 0.99613678, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 1.15625, + "step": 697, + "time_per_iteration": 2.4298970699310303 + }, + { + "auxiliary_loss_clip": 0.0113553, + "auxiliary_loss_mlp": 0.01107672, + "balance_loss_clip": 1.02141225, + "balance_loss_mlp": 1.02829766, + "epoch": 0.0419660303622426, + "flos": 20155916874240.0, + "grad_norm": 2.117415220399418, + "language_loss": 0.89289582, + "learning_rate": 3.998498514015987e-06, + "loss": 0.91532779, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 1.078125, + "step": 698, + "time_per_iteration": 2.406831979751587 + }, + { + "auxiliary_loss_clip": 0.0114367, + "auxiliary_loss_mlp": 0.011163, + "balance_loss_clip": 1.02226782, + "balance_loss_mlp": 1.02739084, + "epoch": 0.042026153614910564, + "flos": 23075249362560.0, + "grad_norm": 1.8068258593235993, + "language_loss": 0.94687164, + "learning_rate": 3.998483387701495e-06, + "loss": 0.96947134, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 1.1640625, + "step": 699, + "time_per_iteration": 2.4555959701538086 + }, + { + "auxiliary_loss_clip": 0.01059863, + "auxiliary_loss_mlp": 0.01034091, + "balance_loss_clip": 1.0172112, + "balance_loss_mlp": 1.0290184, + "epoch": 0.042086276867578536, + "flos": 64491734528640.0, + "grad_norm": 0.967139958296333, + "language_loss": 0.68031961, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70125914, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.30859375, + "step": 700, + "time_per_iteration": 3.0838208198547363 + }, + { + "auxiliary_loss_clip": 0.01144112, + "auxiliary_loss_mlp": 0.01118248, + "balance_loss_clip": 1.02278471, + "balance_loss_mlp": 1.02894843, + "epoch": 0.04214640012024651, + "flos": 15486109038720.0, + "grad_norm": 2.2284658935471056, + "language_loss": 0.9273811, + "learning_rate": 3.998452907725016e-06, + "loss": 0.9500047, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 1.15625, + "step": 701, + "time_per_iteration": 2.4092633724212646 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01110361, + "balance_loss_clip": 1.0225755, + "balance_loss_mlp": 1.02932513, + "epoch": 0.04220652337291447, + "flos": 23875947550080.0, + "grad_norm": 1.7732673197037274, + "language_loss": 0.73061061, + "learning_rate": 3.998437554064184e-06, + "loss": 0.75313759, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 1.1328125, + "step": 702, + "time_per_iteration": 2.4450647830963135 + }, + { + "auxiliary_loss_clip": 0.01052494, + "auxiliary_loss_mlp": 0.01023664, + "balance_loss_clip": 1.00850046, + "balance_loss_mlp": 1.02324557, + "epoch": 0.042266646625582445, + "flos": 63792145238400.0, + "grad_norm": 0.8712367276957804, + "language_loss": 0.61116236, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.63192391, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.29296875, + "step": 703, + "time_per_iteration": 3.116760492324829 + }, + { + "auxiliary_loss_clip": 0.01051436, + "auxiliary_loss_mlp": 0.01013246, + "balance_loss_clip": 0.99875015, + "balance_loss_mlp": 1.02073598, + "epoch": 0.04232676987825041, + "flos": 50015521877760.0, + "grad_norm": 1.0704243783074525, + "language_loss": 0.5788312, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59947801, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.30664062, + "step": 704, + "time_per_iteration": 2.9471654891967773 + }, + { + "auxiliary_loss_clip": 0.01147149, + "auxiliary_loss_mlp": 0.01121852, + "balance_loss_clip": 1.03625989, + "balance_loss_mlp": 1.03329575, + "epoch": 0.04238689313091838, + "flos": 21615041992320.0, + "grad_norm": 2.2101748565416046, + "language_loss": 0.91898108, + "learning_rate": 3.998391038398319e-06, + "loss": 0.94167113, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 1.140625, + "step": 705, + "time_per_iteration": 2.438861608505249 + }, + { + "auxiliary_loss_clip": 0.01141899, + "auxiliary_loss_mlp": 0.01129015, + "balance_loss_clip": 1.04466295, + "balance_loss_mlp": 1.0315094, + "epoch": 0.042447016383586354, + "flos": 19134113846400.0, + "grad_norm": 2.4709033476251605, + "language_loss": 0.75250739, + "learning_rate": 3.998375381617201e-06, + "loss": 0.77521658, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 1.109375, + "step": 706, + "time_per_iteration": 2.4444239139556885 + }, + { + "auxiliary_loss_clip": 0.01150915, + "auxiliary_loss_mlp": 0.01141375, + "balance_loss_clip": 1.05072856, + "balance_loss_mlp": 1.03545868, + "epoch": 0.04250713963625432, + "flos": 24424851945600.0, + "grad_norm": 2.7454612573551307, + "language_loss": 0.96787024, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.99079311, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 1.15625, + "step": 707, + "time_per_iteration": 2.448821783065796 + }, + { + "auxiliary_loss_clip": 0.01152416, + "auxiliary_loss_mlp": 0.01135119, + "balance_loss_clip": 1.04576015, + "balance_loss_mlp": 1.03550673, + "epoch": 0.04256726288892229, + "flos": 30366231742080.0, + "grad_norm": 1.6581984549677886, + "language_loss": 0.85144758, + "learning_rate": 3.998343840719776e-06, + "loss": 0.87432295, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 1.171875, + "step": 708, + "time_per_iteration": 2.516350507736206 + }, + { + "auxiliary_loss_clip": 0.01154444, + "auxiliary_loss_mlp": 0.01155713, + "balance_loss_clip": 1.05867708, + "balance_loss_mlp": 1.03431833, + "epoch": 0.04262738614159026, + "flos": 16361730737280.0, + "grad_norm": 1.976981271756763, + "language_loss": 0.87505841, + "learning_rate": 3.998327956604666e-06, + "loss": 0.89815998, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 1.203125, + "step": 709, + "time_per_iteration": 2.3963935375213623 + }, + { + "auxiliary_loss_clip": 0.0114777, + "auxiliary_loss_mlp": 0.01146419, + "balance_loss_clip": 1.05605829, + "balance_loss_mlp": 1.03262353, + "epoch": 0.04268750939425823, + "flos": 20411341447680.0, + "grad_norm": 3.1010226778975363, + "language_loss": 0.91445822, + "learning_rate": 3.99831199671276e-06, + "loss": 0.9374001, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 1.1484375, + "step": 710, + "time_per_iteration": 2.4525625705718994 + }, + { + "auxiliary_loss_clip": 0.01155516, + "auxiliary_loss_mlp": 0.01147531, + "balance_loss_clip": 1.05345154, + "balance_loss_mlp": 1.03446317, + "epoch": 0.0427476326469262, + "flos": 20301923646720.0, + "grad_norm": 1.8841525420833416, + "language_loss": 0.88040853, + "learning_rate": 3.998295961044662e-06, + "loss": 0.90343893, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 1.2109375, + "step": 711, + "time_per_iteration": 2.417354106903076 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01128539, + "balance_loss_clip": 1.04232681, + "balance_loss_mlp": 1.03117323, + "epoch": 0.042807755899594166, + "flos": 21649780661760.0, + "grad_norm": 1.64860752759726, + "language_loss": 0.88556767, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.9082976, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 1.140625, + "step": 712, + "time_per_iteration": 2.4566781520843506 + }, + { + "auxiliary_loss_clip": 0.01152712, + "auxiliary_loss_mlp": 0.01138287, + "balance_loss_clip": 1.04291964, + "balance_loss_mlp": 1.03289008, + "epoch": 0.04286787915226214, + "flos": 21433912525440.0, + "grad_norm": 2.3930566971177276, + "language_loss": 0.96439731, + "learning_rate": 3.998263662382328e-06, + "loss": 0.98730725, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 1.1953125, + "step": 713, + "time_per_iteration": 2.473478078842163 + }, + { + "auxiliary_loss_clip": 0.01038652, + "auxiliary_loss_mlp": 0.01092257, + "balance_loss_clip": 1.07928669, + "balance_loss_mlp": 1.01230037, + "epoch": 0.04292800240493011, + "flos": 66394256313600.0, + "grad_norm": 0.9014068670757661, + "language_loss": 0.63926554, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66057467, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.26367188, + "step": 714, + "time_per_iteration": 3.1448700428009033 + }, + { + "auxiliary_loss_clip": 0.01141506, + "auxiliary_loss_mlp": 0.01115124, + "balance_loss_clip": 1.02972221, + "balance_loss_mlp": 1.03291035, + "epoch": 0.042988125657598075, + "flos": 31648905515520.0, + "grad_norm": 1.6810060794282315, + "language_loss": 0.78302395, + "learning_rate": 3.998231060622563e-06, + "loss": 0.80559027, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 1.0859375, + "step": 715, + "time_per_iteration": 2.524284839630127 + }, + { + "auxiliary_loss_clip": 0.01145289, + "auxiliary_loss_mlp": 0.01119189, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.03274786, + "epoch": 0.04304824891026605, + "flos": 33247264602240.0, + "grad_norm": 1.7573901763432593, + "language_loss": 0.76250398, + "learning_rate": 3.998214646082688e-06, + "loss": 0.78514874, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 1.125, + "step": 716, + "time_per_iteration": 2.547414779663086 + }, + { + "auxiliary_loss_clip": 0.01037703, + "auxiliary_loss_mlp": 0.01013978, + "balance_loss_clip": 0.99995852, + "balance_loss_mlp": 1.01001048, + "epoch": 0.04310837216293401, + "flos": 64061080508160.0, + "grad_norm": 0.9041181067697918, + "language_loss": 0.65790331, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67842013, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.27734375, + "step": 717, + "time_per_iteration": 3.149366617202759 + }, + { + "auxiliary_loss_clip": 0.01043218, + "auxiliary_loss_mlp": 0.01017201, + "balance_loss_clip": 1.00442147, + "balance_loss_mlp": 1.01441026, + "epoch": 0.043168495415601985, + "flos": 61340719057920.0, + "grad_norm": 0.9950550243141916, + "language_loss": 0.59116203, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61176616, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.2890625, + "step": 718, + "time_per_iteration": 2.9409382343292236 + }, + { + "auxiliary_loss_clip": 0.0115902, + "auxiliary_loss_mlp": 0.01130951, + "balance_loss_clip": 1.03610849, + "balance_loss_mlp": 1.04256403, + "epoch": 0.04322861866826996, + "flos": 20703215347200.0, + "grad_norm": 1.8148965741547145, + "language_loss": 0.9526943, + "learning_rate": 3.99816494783057e-06, + "loss": 0.97559398, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 1.1640625, + "step": 719, + "time_per_iteration": 2.45314359664917 + }, + { + "auxiliary_loss_clip": 0.01161045, + "auxiliary_loss_mlp": 0.01126274, + "balance_loss_clip": 1.03615177, + "balance_loss_mlp": 1.04308903, + "epoch": 0.04328874192093792, + "flos": 30372027027840.0, + "grad_norm": 1.4297366166222838, + "language_loss": 0.68987429, + "learning_rate": 3.99814823020446e-06, + "loss": 0.71274751, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 1.1796875, + "step": 720, + "time_per_iteration": 2.5416030883789062 + }, + { + "auxiliary_loss_clip": 0.01157038, + "auxiliary_loss_mlp": 0.01128176, + "balance_loss_clip": 1.04096293, + "balance_loss_mlp": 1.04451132, + "epoch": 0.043348865173605894, + "flos": 21943714331520.0, + "grad_norm": 1.9215818591168672, + "language_loss": 0.80529994, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.82815206, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 1.125, + "step": 721, + "time_per_iteration": 2.465538740158081 + }, + { + "auxiliary_loss_clip": 0.01162103, + "auxiliary_loss_mlp": 0.01125861, + "balance_loss_clip": 1.03693056, + "balance_loss_mlp": 1.04607975, + "epoch": 0.04340898842627386, + "flos": 15263433187200.0, + "grad_norm": 2.551966770720661, + "language_loss": 0.93448949, + "learning_rate": 3.998114567642933e-06, + "loss": 0.95736915, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 1.1640625, + "step": 722, + "time_per_iteration": 2.441356658935547 + }, + { + "auxiliary_loss_clip": 0.01160653, + "auxiliary_loss_mlp": 0.01121257, + "balance_loss_clip": 1.03604567, + "balance_loss_mlp": 1.04438806, + "epoch": 0.04346911167894183, + "flos": 27964172090880.0, + "grad_norm": 1.7760254459726104, + "language_loss": 0.89578849, + "learning_rate": 3.998097622708792e-06, + "loss": 0.91860759, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 1.1640625, + "step": 723, + "time_per_iteration": 2.5088438987731934 + }, + { + "auxiliary_loss_clip": 0.01154337, + "auxiliary_loss_mlp": 0.01118766, + "balance_loss_clip": 1.03336465, + "balance_loss_mlp": 1.04160857, + "epoch": 0.0435292349316098, + "flos": 29240910933120.0, + "grad_norm": 1.6843211822434485, + "language_loss": 0.86046994, + "learning_rate": 3.99808060200659e-06, + "loss": 0.883201, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 1.125, + "step": 724, + "time_per_iteration": 2.572150230407715 + }, + { + "auxiliary_loss_clip": 0.01149898, + "auxiliary_loss_mlp": 0.01125642, + "balance_loss_clip": 1.03885722, + "balance_loss_mlp": 1.04166746, + "epoch": 0.04358935818427777, + "flos": 20557313308800.0, + "grad_norm": 1.7737384397129539, + "language_loss": 0.83929199, + "learning_rate": 3.998063505536971e-06, + "loss": 0.86204737, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 1.0859375, + "step": 725, + "time_per_iteration": 2.460517406463623 + }, + { + "auxiliary_loss_clip": 0.01160372, + "auxiliary_loss_mlp": 0.01120347, + "balance_loss_clip": 1.03623319, + "balance_loss_mlp": 1.04292965, + "epoch": 0.04364948143694574, + "flos": 14464061631360.0, + "grad_norm": 1.9363191651098222, + "language_loss": 0.9248907, + "learning_rate": 3.998046333300584e-06, + "loss": 0.94769788, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 1.171875, + "step": 726, + "time_per_iteration": 2.4446539878845215 + }, + { + "auxiliary_loss_clip": 0.01040842, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.02023053, + "balance_loss_mlp": 1.01525593, + "epoch": 0.043709604689613706, + "flos": 50064610982400.0, + "grad_norm": 0.9196165543991929, + "language_loss": 0.56134468, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58207601, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.25585938, + "step": 727, + "time_per_iteration": 4.6004369258880615 + }, + { + "auxiliary_loss_clip": 0.01143707, + "auxiliary_loss_mlp": 0.0110247, + "balance_loss_clip": 1.02107394, + "balance_loss_mlp": 1.03138566, + "epoch": 0.04376972794228168, + "flos": 13990709151360.0, + "grad_norm": 2.1946081947748133, + "language_loss": 0.86729062, + "learning_rate": 3.998011761530112e-06, + "loss": 0.88975233, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 1.125, + "step": 728, + "time_per_iteration": 2.424870491027832 + }, + { + "auxiliary_loss_clip": 0.01139082, + "auxiliary_loss_mlp": 0.01099467, + "balance_loss_clip": 1.02021623, + "balance_loss_mlp": 1.03237259, + "epoch": 0.04382985119494965, + "flos": 22009037218560.0, + "grad_norm": 2.1532141704767183, + "language_loss": 0.79052758, + "learning_rate": 3.997994361997338e-06, + "loss": 0.81291306, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 1.0625, + "step": 729, + "time_per_iteration": 3.9228484630584717 + }, + { + "auxiliary_loss_clip": 0.01148438, + "auxiliary_loss_mlp": 0.01102248, + "balance_loss_clip": 1.01842046, + "balance_loss_mlp": 1.03554714, + "epoch": 0.043889974447617615, + "flos": 24205387939200.0, + "grad_norm": 1.8937880481048945, + "language_loss": 0.9912231, + "learning_rate": 3.997976886700417e-06, + "loss": 1.01372993, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 1.125, + "step": 730, + "time_per_iteration": 3.9357101917266846 + }, + { + "auxiliary_loss_clip": 0.01144626, + "auxiliary_loss_mlp": 0.01096822, + "balance_loss_clip": 1.01637924, + "balance_loss_mlp": 1.03328729, + "epoch": 0.04395009770028559, + "flos": 17273592293760.0, + "grad_norm": 2.4081699568166632, + "language_loss": 0.93713468, + "learning_rate": 3.997959335640013e-06, + "loss": 0.95954919, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 1.109375, + "step": 731, + "time_per_iteration": 2.4546773433685303 + }, + { + "auxiliary_loss_clip": 0.01147668, + "auxiliary_loss_mlp": 0.01110516, + "balance_loss_clip": 1.02196741, + "balance_loss_mlp": 1.03637171, + "epoch": 0.04401022095295355, + "flos": 12309536586240.0, + "grad_norm": 2.8717169498735915, + "language_loss": 0.95605797, + "learning_rate": 3.997941708816791e-06, + "loss": 0.97863984, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 1.1171875, + "step": 732, + "time_per_iteration": 2.451756477355957 + }, + { + "auxiliary_loss_clip": 0.01152254, + "auxiliary_loss_mlp": 0.01111422, + "balance_loss_clip": 1.02783227, + "balance_loss_mlp": 1.0397439, + "epoch": 0.044070344205621524, + "flos": 20958605009280.0, + "grad_norm": 2.04184133923239, + "language_loss": 0.90404558, + "learning_rate": 3.997924006231419e-06, + "loss": 0.92668235, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 1.125, + "step": 733, + "time_per_iteration": 2.453716993331909 + }, + { + "auxiliary_loss_clip": 0.01155701, + "auxiliary_loss_mlp": 0.01120224, + "balance_loss_clip": 1.03124654, + "balance_loss_mlp": 1.0405035, + "epoch": 0.044130467458289496, + "flos": 13844423088000.0, + "grad_norm": 2.2516441753610117, + "language_loss": 0.94911051, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.97186971, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 1.1484375, + "step": 734, + "time_per_iteration": 2.43974232673645 + }, + { + "auxiliary_loss_clip": 0.01153703, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.0334053, + "balance_loss_mlp": 1.04253638, + "epoch": 0.04419059071095746, + "flos": 28653881466240.0, + "grad_norm": 1.7596240525812477, + "language_loss": 0.82477832, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.84749579, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 1.109375, + "step": 735, + "time_per_iteration": 2.5013484954833984 + }, + { + "auxiliary_loss_clip": 0.0115145, + "auxiliary_loss_mlp": 0.01113023, + "balance_loss_clip": 1.03262782, + "balance_loss_mlp": 1.04059815, + "epoch": 0.04425071396362543, + "flos": 28182065086080.0, + "grad_norm": 1.958572002443253, + "language_loss": 0.92400038, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.94664514, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 1.109375, + "step": 736, + "time_per_iteration": 2.520101547241211 + }, + { + "auxiliary_loss_clip": 0.01153039, + "auxiliary_loss_mlp": 0.01122166, + "balance_loss_clip": 1.03328347, + "balance_loss_mlp": 1.04079342, + "epoch": 0.0443108372162934, + "flos": 23657356327680.0, + "grad_norm": 1.6545055352494313, + "language_loss": 0.88095534, + "learning_rate": 3.997852438281901e-06, + "loss": 0.90370739, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 1.125, + "step": 737, + "time_per_iteration": 2.5043704509735107 + }, + { + "auxiliary_loss_clip": 0.01153469, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_clip": 1.0332005, + "balance_loss_mlp": 1.04126489, + "epoch": 0.04437096046896137, + "flos": 33978590184960.0, + "grad_norm": 1.8760504001142357, + "language_loss": 0.88670981, + "learning_rate": 3.997834356895906e-06, + "loss": 0.90945005, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 1.125, + "step": 738, + "time_per_iteration": 2.580198287963867 + }, + { + "auxiliary_loss_clip": 0.0104627, + "auxiliary_loss_mlp": 0.01020977, + "balance_loss_clip": 1.00695777, + "balance_loss_mlp": 1.0171442, + "epoch": 0.04443108372162934, + "flos": 67394379386880.0, + "grad_norm": 0.8809992072563103, + "language_loss": 0.59327638, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61394882, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.29101562, + "step": 739, + "time_per_iteration": 3.019779682159424 + }, + { + "auxiliary_loss_clip": 0.01147755, + "auxiliary_loss_mlp": 0.01117163, + "balance_loss_clip": 1.03323972, + "balance_loss_mlp": 1.03867161, + "epoch": 0.04449120697429731, + "flos": 29751376055040.0, + "grad_norm": 2.210708701997334, + "language_loss": 0.97377348, + "learning_rate": 3.997797966850369e-06, + "loss": 0.99642277, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 1.09375, + "step": 740, + "time_per_iteration": 2.5146284103393555 + }, + { + "auxiliary_loss_clip": 0.01146668, + "auxiliary_loss_mlp": 0.01116899, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.03367734, + "epoch": 0.04455133022696528, + "flos": 36500645779200.0, + "grad_norm": 1.822140058037994, + "language_loss": 0.76034653, + "learning_rate": 3.997779658192205e-06, + "loss": 0.78298223, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 1.1328125, + "step": 741, + "time_per_iteration": 2.5871119499206543 + }, + { + "auxiliary_loss_clip": 0.01135098, + "auxiliary_loss_mlp": 0.01106665, + "balance_loss_clip": 1.02054882, + "balance_loss_mlp": 1.02734685, + "epoch": 0.044611453479633245, + "flos": 28802401856640.0, + "grad_norm": 1.6392228533763398, + "language_loss": 0.9079752, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9303928, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 1.078125, + "step": 742, + "time_per_iteration": 2.496699333190918 + }, + { + "auxiliary_loss_clip": 0.01138247, + "auxiliary_loss_mlp": 0.01120232, + "balance_loss_clip": 1.02443576, + "balance_loss_mlp": 1.02893972, + "epoch": 0.04467157673230122, + "flos": 20009945013120.0, + "grad_norm": 1.8766350539023289, + "language_loss": 0.87973946, + "learning_rate": 3.997742813608561e-06, + "loss": 0.90232432, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 1.09375, + "step": 743, + "time_per_iteration": 2.4465954303741455 + }, + { + "auxiliary_loss_clip": 0.01140816, + "auxiliary_loss_mlp": 0.01113915, + "balance_loss_clip": 1.02670169, + "balance_loss_mlp": 1.02928197, + "epoch": 0.04473169998496919, + "flos": 18003975269760.0, + "grad_norm": 2.2816699584426328, + "language_loss": 0.85252416, + "learning_rate": 3.997724277684479e-06, + "loss": 0.87507147, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 1.109375, + "step": 744, + "time_per_iteration": 2.4268076419830322 + }, + { + "auxiliary_loss_clip": 0.01137097, + "auxiliary_loss_mlp": 0.01116476, + "balance_loss_clip": 1.02730727, + "balance_loss_mlp": 1.02858961, + "epoch": 0.044791823237637154, + "flos": 20630665808640.0, + "grad_norm": 2.2616490585746267, + "language_loss": 0.88658237, + "learning_rate": 3.99770566600649e-06, + "loss": 0.90911818, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 1.0859375, + "step": 745, + "time_per_iteration": 2.4623963832855225 + }, + { + "auxiliary_loss_clip": 0.0113805, + "auxiliary_loss_mlp": 0.01113571, + "balance_loss_clip": 1.02235186, + "balance_loss_mlp": 1.02790534, + "epoch": 0.04485194649030513, + "flos": 31174819896960.0, + "grad_norm": 1.681727306137203, + "language_loss": 0.72507191, + "learning_rate": 3.997686978575302e-06, + "loss": 0.74758816, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 1.1015625, + "step": 746, + "time_per_iteration": 2.5230627059936523 + }, + { + "auxiliary_loss_clip": 0.01135766, + "auxiliary_loss_mlp": 0.01108748, + "balance_loss_clip": 1.02582586, + "balance_loss_mlp": 1.02934778, + "epoch": 0.04491206974297309, + "flos": 26142019989120.0, + "grad_norm": 1.7980778513982356, + "language_loss": 0.73686814, + "learning_rate": 3.997668215391625e-06, + "loss": 0.75931334, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 1.0625, + "step": 747, + "time_per_iteration": 2.4970059394836426 + }, + { + "auxiliary_loss_clip": 0.01132827, + "auxiliary_loss_mlp": 0.01109177, + "balance_loss_clip": 1.0300225, + "balance_loss_mlp": 1.02718925, + "epoch": 0.044972192995641064, + "flos": 20666626375680.0, + "grad_norm": 1.6733235936810396, + "language_loss": 0.705854, + "learning_rate": 3.997649376456168e-06, + "loss": 0.72827411, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 1.0546875, + "step": 748, + "time_per_iteration": 2.4578843116760254 + }, + { + "auxiliary_loss_clip": 0.01133152, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_clip": 1.02506781, + "balance_loss_mlp": 1.02805603, + "epoch": 0.045032316248309036, + "flos": 16105922138880.0, + "grad_norm": 2.0145462783121304, + "language_loss": 0.81297636, + "learning_rate": 3.997630461769647e-06, + "loss": 0.83539069, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 1.046875, + "step": 749, + "time_per_iteration": 2.4110422134399414 + }, + { + "auxiliary_loss_clip": 0.01133525, + "auxiliary_loss_mlp": 0.01097532, + "balance_loss_clip": 1.02266872, + "balance_loss_mlp": 1.02717578, + "epoch": 0.045092439500977, + "flos": 17857863763200.0, + "grad_norm": 1.9740615004970081, + "language_loss": 0.9338429, + "learning_rate": 3.997611471332778e-06, + "loss": 0.95615345, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 1.0625, + "step": 750, + "time_per_iteration": 2.403404951095581 + }, + { + "auxiliary_loss_clip": 0.01133673, + "auxiliary_loss_mlp": 0.01110672, + "balance_loss_clip": 1.02107453, + "balance_loss_mlp": 1.02723479, + "epoch": 0.04515256275364497, + "flos": 24461650385280.0, + "grad_norm": 1.8150889031213246, + "language_loss": 0.78918195, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.81162548, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 1.0625, + "step": 751, + "time_per_iteration": 2.4733352661132812 + }, + { + "auxiliary_loss_clip": 0.01129511, + "auxiliary_loss_mlp": 0.0110942, + "balance_loss_clip": 1.02683175, + "balance_loss_mlp": 1.02687716, + "epoch": 0.04521268600631294, + "flos": 20915522524800.0, + "grad_norm": 2.1476393589833953, + "language_loss": 0.73663074, + "learning_rate": 3.997573263210883e-06, + "loss": 0.75902009, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 1.0234375, + "step": 752, + "time_per_iteration": 2.4177374839782715 + }, + { + "auxiliary_loss_clip": 0.01125974, + "auxiliary_loss_mlp": 0.01095165, + "balance_loss_clip": 1.0210644, + "balance_loss_mlp": 1.02512336, + "epoch": 0.04527280925898091, + "flos": 13370512026240.0, + "grad_norm": 2.4592828739466985, + "language_loss": 0.98795986, + "learning_rate": 3.997554045527305e-06, + "loss": 1.01017118, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 1.0078125, + "step": 753, + "time_per_iteration": 2.3938474655151367 + }, + { + "auxiliary_loss_clip": 0.01132138, + "auxiliary_loss_mlp": 0.01102924, + "balance_loss_clip": 1.01881051, + "balance_loss_mlp": 1.02710354, + "epoch": 0.04533293251164888, + "flos": 23253551009280.0, + "grad_norm": 1.7829900085674473, + "language_loss": 0.94655603, + "learning_rate": 3.997534752096277e-06, + "loss": 0.96890664, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 1.046875, + "step": 754, + "time_per_iteration": 2.4401543140411377 + }, + { + "auxiliary_loss_clip": 0.01119839, + "auxiliary_loss_mlp": 0.01100307, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.02705443, + "epoch": 0.04539305576431685, + "flos": 12421188714240.0, + "grad_norm": 2.2233125280831234, + "language_loss": 0.8334623, + "learning_rate": 3.997515382918531e-06, + "loss": 0.85566378, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.9296875, + "step": 755, + "time_per_iteration": 2.423874855041504 + }, + { + "auxiliary_loss_clip": 0.01126963, + "auxiliary_loss_mlp": 0.01094351, + "balance_loss_clip": 1.01767564, + "balance_loss_mlp": 1.02748966, + "epoch": 0.04545317901698482, + "flos": 16070066305920.0, + "grad_norm": 2.7768742739392476, + "language_loss": 0.83761609, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.85982925, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.9921875, + "step": 756, + "time_per_iteration": 2.423295021057129 + }, + { + "auxiliary_loss_clip": 0.01038475, + "auxiliary_loss_mlp": 0.01020994, + "balance_loss_clip": 1.00993145, + "balance_loss_mlp": 1.01612711, + "epoch": 0.045513302269652785, + "flos": 66392475834240.0, + "grad_norm": 0.8147330362799062, + "language_loss": 0.62927663, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64987135, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.22363281, + "step": 757, + "time_per_iteration": 3.1076056957244873 + }, + { + "auxiliary_loss_clip": 0.01123728, + "auxiliary_loss_mlp": 0.01089517, + "balance_loss_clip": 1.01756227, + "balance_loss_mlp": 1.02637184, + "epoch": 0.04557342552232076, + "flos": 21470082560640.0, + "grad_norm": 1.4475162366835992, + "language_loss": 0.86870086, + "learning_rate": 3.997456820912346e-06, + "loss": 0.89083326, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.97265625, + "step": 758, + "time_per_iteration": 2.4468162059783936 + }, + { + "auxiliary_loss_clip": 0.01122527, + "auxiliary_loss_mlp": 0.01096152, + "balance_loss_clip": 1.02457893, + "balance_loss_mlp": 1.02613902, + "epoch": 0.04563354877498873, + "flos": 23731546700160.0, + "grad_norm": 1.6421709814566503, + "language_loss": 0.92035711, + "learning_rate": 3.997437148755101e-06, + "loss": 0.94254386, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.96484375, + "step": 759, + "time_per_iteration": 2.4394891262054443 + }, + { + "auxiliary_loss_clip": 0.01127605, + "auxiliary_loss_mlp": 0.01089758, + "balance_loss_clip": 1.01952028, + "balance_loss_mlp": 1.02720857, + "epoch": 0.045693672027656694, + "flos": 25734653712000.0, + "grad_norm": 1.9414847033246145, + "language_loss": 0.78815544, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.81032914, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.0, + "step": 760, + "time_per_iteration": 2.4857101440429688 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01086036, + "balance_loss_clip": 1.0194695, + "balance_loss_mlp": 1.02658474, + "epoch": 0.045753795280324666, + "flos": 19718001290880.0, + "grad_norm": 1.8907724451366423, + "language_loss": 0.86819911, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.89029527, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.97265625, + "step": 761, + "time_per_iteration": 2.426213026046753 + }, + { + "auxiliary_loss_clip": 0.01124746, + "auxiliary_loss_mlp": 0.0109556, + "balance_loss_clip": 1.02222252, + "balance_loss_mlp": 1.02707672, + "epoch": 0.04581391853299264, + "flos": 23254737995520.0, + "grad_norm": 1.5763461964755348, + "language_loss": 0.81917208, + "learning_rate": 3.997377677828266e-06, + "loss": 0.84137511, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.9765625, + "step": 762, + "time_per_iteration": 2.4767606258392334 + }, + { + "auxiliary_loss_clip": 0.0103563, + "auxiliary_loss_mlp": 0.01017786, + "balance_loss_clip": 1.00557923, + "balance_loss_mlp": 1.01358962, + "epoch": 0.0458740417856606, + "flos": 64227896317440.0, + "grad_norm": 1.0099804210804244, + "language_loss": 0.58912963, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60966378, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.22070312, + "step": 763, + "time_per_iteration": 3.0731773376464844 + }, + { + "auxiliary_loss_clip": 0.01121011, + "auxiliary_loss_mlp": 0.01086608, + "balance_loss_clip": 1.01884973, + "balance_loss_mlp": 1.02573705, + "epoch": 0.045934165038328575, + "flos": 20769271372800.0, + "grad_norm": 2.0696924660042004, + "language_loss": 0.9309442, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.95302039, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.953125, + "step": 764, + "time_per_iteration": 2.44710111618042 + }, + { + "auxiliary_loss_clip": 0.01128728, + "auxiliary_loss_mlp": 0.01092947, + "balance_loss_clip": 1.02146888, + "balance_loss_mlp": 1.02825689, + "epoch": 0.04599428829099654, + "flos": 30261596797440.0, + "grad_norm": 1.8970109995098774, + "language_loss": 0.91690052, + "learning_rate": 3.997317525234592e-06, + "loss": 0.93911725, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 1.0078125, + "step": 765, + "time_per_iteration": 2.4943723678588867 + }, + { + "auxiliary_loss_clip": 0.01127246, + "auxiliary_loss_mlp": 0.01095254, + "balance_loss_clip": 1.02358532, + "balance_loss_mlp": 1.02612019, + "epoch": 0.04605441154366451, + "flos": 23037822518400.0, + "grad_norm": 2.3386482294694195, + "language_loss": 0.94150591, + "learning_rate": 3.997297322892056e-06, + "loss": 0.96373093, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 1.0078125, + "step": 766, + "time_per_iteration": 3.94807767868042 + }, + { + "auxiliary_loss_clip": 0.0112332, + "auxiliary_loss_mlp": 0.01088985, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.02624428, + "epoch": 0.046114534796332485, + "flos": 22016333692800.0, + "grad_norm": 2.01040258366671, + "language_loss": 0.87956429, + "learning_rate": 3.997277044811806e-06, + "loss": 0.90168738, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.96875, + "step": 767, + "time_per_iteration": 2.4641506671905518 + }, + { + "auxiliary_loss_clip": 0.01126415, + "auxiliary_loss_mlp": 0.01092933, + "balance_loss_clip": 1.0230763, + "balance_loss_mlp": 1.02637029, + "epoch": 0.04617465804900045, + "flos": 29861073146880.0, + "grad_norm": 1.8448964343350567, + "language_loss": 0.90976912, + "learning_rate": 3.99725669099461e-06, + "loss": 0.93196261, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 1.0, + "step": 768, + "time_per_iteration": 2.4838666915893555 + }, + { + "auxiliary_loss_clip": 0.01129397, + "auxiliary_loss_mlp": 0.01089388, + "balance_loss_clip": 1.01905465, + "balance_loss_mlp": 1.02606571, + "epoch": 0.04623478130166842, + "flos": 25628866692480.0, + "grad_norm": 1.9684060631435032, + "language_loss": 0.78897661, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.81116444, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.03125, + "step": 769, + "time_per_iteration": 5.420944452285767 + }, + { + "auxiliary_loss_clip": 0.0112043, + "auxiliary_loss_mlp": 0.0109677, + "balance_loss_clip": 1.02567363, + "balance_loss_mlp": 1.02694619, + "epoch": 0.04629490455433639, + "flos": 20448035153280.0, + "grad_norm": 1.724662341571376, + "language_loss": 0.89795363, + "learning_rate": 3.997215756152471e-06, + "loss": 0.92012566, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.9375, + "step": 770, + "time_per_iteration": 3.9440112113952637 + }, + { + "auxiliary_loss_clip": 0.01128884, + "auxiliary_loss_mlp": 0.01094301, + "balance_loss_clip": 1.02482605, + "balance_loss_mlp": 1.02562356, + "epoch": 0.04635502780700436, + "flos": 23147624344320.0, + "grad_norm": 2.0470585001970005, + "language_loss": 0.91511911, + "learning_rate": 3.99719517512908e-06, + "loss": 0.93735099, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 1.03125, + "step": 771, + "time_per_iteration": 2.4450881481170654 + }, + { + "auxiliary_loss_clip": 0.01136375, + "auxiliary_loss_mlp": 0.01100635, + "balance_loss_clip": 1.0188576, + "balance_loss_mlp": 1.02673697, + "epoch": 0.04641515105967233, + "flos": 23290977853440.0, + "grad_norm": 1.9674777994919002, + "language_loss": 0.87871599, + "learning_rate": 3.997174518371848e-06, + "loss": 0.90108621, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 1.09375, + "step": 772, + "time_per_iteration": 2.435906171798706 + }, + { + "auxiliary_loss_clip": 0.01128413, + "auxiliary_loss_mlp": 0.01088221, + "balance_loss_clip": 1.02098715, + "balance_loss_mlp": 1.02830124, + "epoch": 0.046475274312340296, + "flos": 25114142384640.0, + "grad_norm": 1.7385662105911095, + "language_loss": 0.78276026, + "learning_rate": 3.997153785881557e-06, + "loss": 0.80492663, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 1.0, + "step": 773, + "time_per_iteration": 2.4614150524139404 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01081742, + "balance_loss_clip": 1.01784563, + "balance_loss_mlp": 1.02816975, + "epoch": 0.04653539756500827, + "flos": 25263745027200.0, + "grad_norm": 1.8042543341097, + "language_loss": 0.81806386, + "learning_rate": 3.997132977658996e-06, + "loss": 0.84011382, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.953125, + "step": 774, + "time_per_iteration": 2.4512312412261963 + }, + { + "auxiliary_loss_clip": 0.01130333, + "auxiliary_loss_mlp": 0.01101352, + "balance_loss_clip": 1.02782345, + "balance_loss_mlp": 1.02870679, + "epoch": 0.046595520817676234, + "flos": 35402802076800.0, + "grad_norm": 2.5106479490493854, + "language_loss": 0.76514322, + "learning_rate": 3.997112093704952e-06, + "loss": 0.78746003, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 1.015625, + "step": 775, + "time_per_iteration": 2.5525665283203125 + }, + { + "auxiliary_loss_clip": 0.01129184, + "auxiliary_loss_mlp": 0.01092556, + "balance_loss_clip": 1.01878905, + "balance_loss_mlp": 1.02747607, + "epoch": 0.046655644070344206, + "flos": 18111577680000.0, + "grad_norm": 1.612297059918002, + "language_loss": 0.81280386, + "learning_rate": 3.997091134020217e-06, + "loss": 0.83502126, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 1.015625, + "step": 776, + "time_per_iteration": 2.4103944301605225 + }, + { + "auxiliary_loss_clip": 0.01127989, + "auxiliary_loss_mlp": 0.01089115, + "balance_loss_clip": 1.01549149, + "balance_loss_mlp": 1.02790773, + "epoch": 0.04671576732301218, + "flos": 29204007759360.0, + "grad_norm": 1.7187579355062241, + "language_loss": 0.76234835, + "learning_rate": 3.997070098605585e-06, + "loss": 0.78451943, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 1.0, + "step": 777, + "time_per_iteration": 2.4932217597961426 + }, + { + "auxiliary_loss_clip": 0.01127021, + "auxiliary_loss_mlp": 0.01092988, + "balance_loss_clip": 1.01812434, + "balance_loss_mlp": 1.02743447, + "epoch": 0.04677589057568014, + "flos": 30477115820160.0, + "grad_norm": 1.7224883389611743, + "language_loss": 0.79182601, + "learning_rate": 3.997048987461856e-06, + "loss": 0.81402612, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.99609375, + "step": 778, + "time_per_iteration": 2.5414860248565674 + }, + { + "auxiliary_loss_clip": 0.0113126, + "auxiliary_loss_mlp": 0.0110191, + "balance_loss_clip": 1.02594984, + "balance_loss_mlp": 1.02887797, + "epoch": 0.046836013828348115, + "flos": 20556649992960.0, + "grad_norm": 1.8248308589518587, + "language_loss": 0.82879174, + "learning_rate": 3.997027800589829e-06, + "loss": 0.85112345, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 1.0234375, + "step": 779, + "time_per_iteration": 2.4907591342926025 + }, + { + "auxiliary_loss_clip": 0.01124461, + "auxiliary_loss_mlp": 0.01090545, + "balance_loss_clip": 1.02126074, + "balance_loss_mlp": 1.02609038, + "epoch": 0.04689613708101608, + "flos": 25446201125760.0, + "grad_norm": 1.9988860237486408, + "language_loss": 0.79348707, + "learning_rate": 3.997006537990308e-06, + "loss": 0.81563711, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.984375, + "step": 780, + "time_per_iteration": 2.5092263221740723 + }, + { + "auxiliary_loss_clip": 0.01124685, + "auxiliary_loss_mlp": 0.01087951, + "balance_loss_clip": 1.01766503, + "balance_loss_mlp": 1.0277046, + "epoch": 0.04695626033368405, + "flos": 23000325851520.0, + "grad_norm": 1.5564591161405106, + "language_loss": 0.78871608, + "learning_rate": 3.996985199664099e-06, + "loss": 0.81084239, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.96875, + "step": 781, + "time_per_iteration": 2.4851717948913574 + }, + { + "auxiliary_loss_clip": 0.01134739, + "auxiliary_loss_mlp": 0.01101609, + "balance_loss_clip": 1.02035594, + "balance_loss_mlp": 1.02886438, + "epoch": 0.047016383586352024, + "flos": 29132051713920.0, + "grad_norm": 1.8514347251301078, + "language_loss": 0.79326612, + "learning_rate": 3.99696378561201e-06, + "loss": 0.8156296, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 1.0625, + "step": 782, + "time_per_iteration": 2.5721004009246826 + }, + { + "auxiliary_loss_clip": 0.01133448, + "auxiliary_loss_mlp": 0.01093575, + "balance_loss_clip": 1.02033305, + "balance_loss_mlp": 1.03003383, + "epoch": 0.04707650683901999, + "flos": 14975434448640.0, + "grad_norm": 1.8153019745488344, + "language_loss": 0.83836341, + "learning_rate": 3.996942295834855e-06, + "loss": 0.86063361, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 1.03125, + "step": 783, + "time_per_iteration": 2.442685842514038 + }, + { + "auxiliary_loss_clip": 0.01126048, + "auxiliary_loss_mlp": 0.01096142, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.02859151, + "epoch": 0.04713663009168796, + "flos": 21650094864000.0, + "grad_norm": 1.6397611669243433, + "language_loss": 0.84214222, + "learning_rate": 3.996920730333448e-06, + "loss": 0.86436403, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.97265625, + "step": 784, + "time_per_iteration": 2.445434093475342 + }, + { + "auxiliary_loss_clip": 0.01132832, + "auxiliary_loss_mlp": 0.01094157, + "balance_loss_clip": 1.02019954, + "balance_loss_mlp": 1.02982461, + "epoch": 0.04719675334435593, + "flos": 21324320167680.0, + "grad_norm": 1.9516384574821146, + "language_loss": 0.83121675, + "learning_rate": 3.996899089108607e-06, + "loss": 0.85348666, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0234375, + "step": 785, + "time_per_iteration": 2.462984085083008 + }, + { + "auxiliary_loss_clip": 0.01128416, + "auxiliary_loss_mlp": 0.01093777, + "balance_loss_clip": 1.02163196, + "balance_loss_mlp": 1.02940941, + "epoch": 0.0472568765970239, + "flos": 17930413301760.0, + "grad_norm": 1.8294102202585873, + "language_loss": 0.94905436, + "learning_rate": 3.996877372161152e-06, + "loss": 0.97127628, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.98828125, + "step": 786, + "time_per_iteration": 2.3854053020477295 + }, + { + "auxiliary_loss_clip": 0.01136259, + "auxiliary_loss_mlp": 0.01107894, + "balance_loss_clip": 1.02282584, + "balance_loss_mlp": 1.03083968, + "epoch": 0.04731699984969187, + "flos": 18076350251520.0, + "grad_norm": 2.0750957617473507, + "language_loss": 0.83329582, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.85573733, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 1.0546875, + "step": 787, + "time_per_iteration": 2.4238786697387695 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01096024, + "balance_loss_clip": 1.02135134, + "balance_loss_mlp": 1.03283858, + "epoch": 0.047377123102359836, + "flos": 23183968936320.0, + "grad_norm": 2.171102484013912, + "language_loss": 0.86099792, + "learning_rate": 3.996833711101698e-06, + "loss": 0.88331574, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.03125, + "step": 788, + "time_per_iteration": 2.429232358932495 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.0109877, + "balance_loss_clip": 1.02266729, + "balance_loss_mlp": 1.03172731, + "epoch": 0.04743724635502781, + "flos": 22746681757440.0, + "grad_norm": 1.780541112028689, + "language_loss": 0.87988985, + "learning_rate": 3.996811766991355e-06, + "loss": 0.90218329, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.9921875, + "step": 789, + "time_per_iteration": 2.422647476196289 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01096613, + "balance_loss_clip": 1.02465832, + "balance_loss_mlp": 1.02794719, + "epoch": 0.04749736960769577, + "flos": 17237736460800.0, + "grad_norm": 2.3503811297891493, + "language_loss": 0.86072397, + "learning_rate": 3.996789747161709e-06, + "loss": 0.88298362, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.015625, + "step": 790, + "time_per_iteration": 2.4177567958831787 + }, + { + "auxiliary_loss_clip": 0.01128664, + "auxiliary_loss_mlp": 0.01090908, + "balance_loss_clip": 1.01942992, + "balance_loss_mlp": 1.02669954, + "epoch": 0.047557492860363745, + "flos": 40477672039680.0, + "grad_norm": 1.812654727552757, + "language_loss": 0.91726911, + "learning_rate": 3.996767651613597e-06, + "loss": 0.93946481, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 1.015625, + "step": 791, + "time_per_iteration": 2.5801892280578613 + }, + { + "auxiliary_loss_clip": 0.01130046, + "auxiliary_loss_mlp": 0.01089521, + "balance_loss_clip": 1.02147579, + "balance_loss_mlp": 1.02863991, + "epoch": 0.04761761611303172, + "flos": 18697001224320.0, + "grad_norm": 1.8184137166363679, + "language_loss": 0.93475795, + "learning_rate": 3.996745480347854e-06, + "loss": 0.95695364, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 1.015625, + "step": 792, + "time_per_iteration": 2.4260759353637695 + }, + { + "auxiliary_loss_clip": 0.01126688, + "auxiliary_loss_mlp": 0.0109598, + "balance_loss_clip": 1.02164102, + "balance_loss_mlp": 1.02642488, + "epoch": 0.04767773936569968, + "flos": 20920968696960.0, + "grad_norm": 2.114892430515852, + "language_loss": 0.76931417, + "learning_rate": 3.996723233365324e-06, + "loss": 0.79154086, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0, + "step": 793, + "time_per_iteration": 2.4195919036865234 + }, + { + "auxiliary_loss_clip": 0.0112963, + "auxiliary_loss_mlp": 0.01093968, + "balance_loss_clip": 1.01862741, + "balance_loss_mlp": 1.02825058, + "epoch": 0.047737862618367655, + "flos": 23731546700160.0, + "grad_norm": 1.8922723863531363, + "language_loss": 0.91048574, + "learning_rate": 3.996700910666847e-06, + "loss": 0.93272173, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 1.015625, + "step": 794, + "time_per_iteration": 2.4750466346740723 + }, + { + "auxiliary_loss_clip": 0.01129898, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_clip": 1.01584744, + "balance_loss_mlp": 1.02604842, + "epoch": 0.04779798587103562, + "flos": 23694643526400.0, + "grad_norm": 2.576090113652724, + "language_loss": 0.75514168, + "learning_rate": 3.996678512253272e-06, + "loss": 0.77730578, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 1.0390625, + "step": 795, + "time_per_iteration": 2.440300226211548 + }, + { + "auxiliary_loss_clip": 0.01121473, + "auxiliary_loss_mlp": 0.0109148, + "balance_loss_clip": 1.01919115, + "balance_loss_mlp": 1.02514851, + "epoch": 0.04785810912370359, + "flos": 23182572481920.0, + "grad_norm": 1.6778087643319475, + "language_loss": 0.83677101, + "learning_rate": 3.996656038125449e-06, + "loss": 0.85890055, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.9609375, + "step": 796, + "time_per_iteration": 2.438460111618042 + }, + { + "auxiliary_loss_clip": 0.01126284, + "auxiliary_loss_mlp": 0.01085635, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.02805769, + "epoch": 0.047918232376371564, + "flos": 18039656545920.0, + "grad_norm": 1.8707317942598143, + "language_loss": 0.86171526, + "learning_rate": 3.996633488284228e-06, + "loss": 0.88383442, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.984375, + "step": 797, + "time_per_iteration": 2.394491672515869 + }, + { + "auxiliary_loss_clip": 0.01038824, + "auxiliary_loss_mlp": 0.01018545, + "balance_loss_clip": 1.00700581, + "balance_loss_mlp": 1.01667762, + "epoch": 0.04797835562903953, + "flos": 62439400632960.0, + "grad_norm": 0.9351228017063494, + "language_loss": 0.64602757, + "learning_rate": 3.996610862730465e-06, + "loss": 0.6666013, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.22070312, + "step": 798, + "time_per_iteration": 2.948089361190796 + }, + { + "auxiliary_loss_clip": 0.01135993, + "auxiliary_loss_mlp": 0.01099678, + "balance_loss_clip": 1.02243054, + "balance_loss_mlp": 1.0282104, + "epoch": 0.0480384788817075, + "flos": 21506217684480.0, + "grad_norm": 1.8207314241628734, + "language_loss": 0.94829297, + "learning_rate": 3.996588161465018e-06, + "loss": 0.97064972, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 1.078125, + "step": 799, + "time_per_iteration": 2.4190673828125 + }, + { + "auxiliary_loss_clip": 0.01119699, + "auxiliary_loss_mlp": 0.01097683, + "balance_loss_clip": 1.02401185, + "balance_loss_mlp": 1.02583659, + "epoch": 0.048098602134375466, + "flos": 21725611868160.0, + "grad_norm": 2.010004037721749, + "language_loss": 0.9065969, + "learning_rate": 3.996565384488748e-06, + "loss": 0.92877072, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.9375, + "step": 800, + "time_per_iteration": 2.4437150955200195 + }, + { + "auxiliary_loss_clip": 0.0112911, + "auxiliary_loss_mlp": 0.01105532, + "balance_loss_clip": 1.03367305, + "balance_loss_mlp": 1.02780533, + "epoch": 0.04815872538704344, + "flos": 22929940817280.0, + "grad_norm": 1.919406211562986, + "language_loss": 0.88225609, + "learning_rate": 3.996542531802518e-06, + "loss": 0.90460253, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.015625, + "step": 801, + "time_per_iteration": 2.4226744174957275 + }, + { + "auxiliary_loss_clip": 0.0112781, + "auxiliary_loss_mlp": 0.0110204, + "balance_loss_clip": 1.02932215, + "balance_loss_mlp": 1.02885723, + "epoch": 0.04821884863971141, + "flos": 43173176601600.0, + "grad_norm": 1.8101202191822818, + "language_loss": 0.83623838, + "learning_rate": 3.996519603407196e-06, + "loss": 0.8585369, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.98828125, + "step": 802, + "time_per_iteration": 2.6308212280273438 + }, + { + "auxiliary_loss_clip": 0.01125116, + "auxiliary_loss_mlp": 0.01109157, + "balance_loss_clip": 1.03739333, + "balance_loss_mlp": 1.02919221, + "epoch": 0.048278971892379376, + "flos": 18619145159040.0, + "grad_norm": 1.7674656946439435, + "language_loss": 0.89733648, + "learning_rate": 3.996496599303649e-06, + "loss": 0.91967928, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.95703125, + "step": 803, + "time_per_iteration": 2.4383859634399414 + }, + { + "auxiliary_loss_clip": 0.01124105, + "auxiliary_loss_mlp": 0.01102861, + "balance_loss_clip": 1.03109694, + "balance_loss_mlp": 1.02629101, + "epoch": 0.04833909514504735, + "flos": 20229024994560.0, + "grad_norm": 1.993864127437173, + "language_loss": 0.89977777, + "learning_rate": 3.996473519492753e-06, + "loss": 0.92204738, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.9765625, + "step": 804, + "time_per_iteration": 2.434561014175415 + }, + { + "auxiliary_loss_clip": 0.01127648, + "auxiliary_loss_mlp": 0.0110738, + "balance_loss_clip": 1.03914428, + "balance_loss_mlp": 1.02809668, + "epoch": 0.04839921839771532, + "flos": 24644001749760.0, + "grad_norm": 1.9710814926157487, + "language_loss": 0.89173484, + "learning_rate": 3.99645036397538e-06, + "loss": 0.91408515, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.9921875, + "step": 805, + "time_per_iteration": 3.940985679626465 + }, + { + "auxiliary_loss_clip": 0.01126997, + "auxiliary_loss_mlp": 0.01099673, + "balance_loss_clip": 1.02795672, + "balance_loss_mlp": 1.02706385, + "epoch": 0.048459341650383285, + "flos": 24826283291520.0, + "grad_norm": 2.0533771923172317, + "language_loss": 0.70781481, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.73008156, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.0, + "step": 806, + "time_per_iteration": 2.4611496925354004 + }, + { + "auxiliary_loss_clip": 0.0112498, + "auxiliary_loss_mlp": 0.01105411, + "balance_loss_clip": 1.03173971, + "balance_loss_mlp": 1.02855229, + "epoch": 0.04851946490305126, + "flos": 22162130997120.0, + "grad_norm": 2.016635483805228, + "language_loss": 0.80266404, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.82496798, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.96484375, + "step": 807, + "time_per_iteration": 2.550133228302002 + }, + { + "auxiliary_loss_clip": 0.0112265, + "auxiliary_loss_mlp": 0.01094868, + "balance_loss_clip": 1.02462971, + "balance_loss_mlp": 1.02706468, + "epoch": 0.04857958815571922, + "flos": 19791004677120.0, + "grad_norm": 1.8971304209440776, + "language_loss": 0.8991456, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.92132086, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.95703125, + "step": 808, + "time_per_iteration": 2.4279072284698486 + }, + { + "auxiliary_loss_clip": 0.01129648, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_clip": 1.02708781, + "balance_loss_mlp": 1.02667928, + "epoch": 0.048639711408387194, + "flos": 18696966312960.0, + "grad_norm": 1.5979004793448004, + "language_loss": 0.93540597, + "learning_rate": 3.996356984858732e-06, + "loss": 0.95777398, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 1.03125, + "step": 809, + "time_per_iteration": 5.257675886154175 + }, + { + "auxiliary_loss_clip": 0.01126257, + "auxiliary_loss_mlp": 0.01090004, + "balance_loss_clip": 1.01709557, + "balance_loss_mlp": 1.0290637, + "epoch": 0.048699834661055166, + "flos": 24862348592640.0, + "grad_norm": 1.8828613243198968, + "language_loss": 0.88903427, + "learning_rate": 3.996333450822208e-06, + "loss": 0.91119689, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.97265625, + "step": 810, + "time_per_iteration": 3.866572380065918 + }, + { + "auxiliary_loss_clip": 0.01130846, + "auxiliary_loss_mlp": 0.01084792, + "balance_loss_clip": 1.01522171, + "balance_loss_mlp": 1.02959275, + "epoch": 0.04875995791372313, + "flos": 20702970967680.0, + "grad_norm": 1.682357340063889, + "language_loss": 0.83875442, + "learning_rate": 3.99630984108452e-06, + "loss": 0.86091083, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 1.015625, + "step": 811, + "time_per_iteration": 2.45694637298584 + }, + { + "auxiliary_loss_clip": 0.0112501, + "auxiliary_loss_mlp": 0.01097166, + "balance_loss_clip": 1.02735698, + "balance_loss_mlp": 1.02807593, + "epoch": 0.048820081166391104, + "flos": 18587304132480.0, + "grad_norm": 1.7785973269450905, + "language_loss": 0.77735823, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.79957998, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.96875, + "step": 812, + "time_per_iteration": 2.3841586112976074 + }, + { + "auxiliary_loss_clip": 0.01118452, + "auxiliary_loss_mlp": 0.01094484, + "balance_loss_clip": 1.02753603, + "balance_loss_mlp": 1.02711451, + "epoch": 0.04888020441905907, + "flos": 22706322359040.0, + "grad_norm": 2.009119264157794, + "language_loss": 0.92772406, + "learning_rate": 3.996262394509233e-06, + "loss": 0.94985342, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.9140625, + "step": 813, + "time_per_iteration": 2.4460930824279785 + }, + { + "auxiliary_loss_clip": 0.01122485, + "auxiliary_loss_mlp": 0.01090506, + "balance_loss_clip": 1.02441633, + "balance_loss_mlp": 1.02621555, + "epoch": 0.04894032767172704, + "flos": 22783235817600.0, + "grad_norm": 2.068441860286538, + "language_loss": 0.78264749, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.8047775, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.9609375, + "step": 814, + "time_per_iteration": 2.420793056488037 + }, + { + "auxiliary_loss_clip": 0.011234, + "auxiliary_loss_mlp": 0.011012, + "balance_loss_clip": 1.02838731, + "balance_loss_mlp": 1.02567816, + "epoch": 0.04900045092439501, + "flos": 25515084971520.0, + "grad_norm": 1.8166781578221358, + "language_loss": 0.8717823, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.89402831, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.9765625, + "step": 815, + "time_per_iteration": 2.4551053047180176 + }, + { + "auxiliary_loss_clip": 0.01128477, + "auxiliary_loss_mlp": 0.01097516, + "balance_loss_clip": 1.02265215, + "balance_loss_mlp": 1.02689028, + "epoch": 0.04906057417706298, + "flos": 25956945538560.0, + "grad_norm": 2.0875320817834893, + "language_loss": 0.96493769, + "learning_rate": 3.996190656910043e-06, + "loss": 0.98719758, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 1.015625, + "step": 816, + "time_per_iteration": 2.4652719497680664 + }, + { + "auxiliary_loss_clip": 0.01124956, + "auxiliary_loss_mlp": 0.01086402, + "balance_loss_clip": 1.01482892, + "balance_loss_mlp": 1.02478743, + "epoch": 0.04912069742973095, + "flos": 18623648724480.0, + "grad_norm": 2.0971487976054997, + "language_loss": 0.84503907, + "learning_rate": 3.996166592984268e-06, + "loss": 0.86715263, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 1.0, + "step": 817, + "time_per_iteration": 2.400782346725464 + }, + { + "auxiliary_loss_clip": 0.01124415, + "auxiliary_loss_mlp": 0.01108134, + "balance_loss_clip": 1.03141093, + "balance_loss_mlp": 1.02671444, + "epoch": 0.049180820682398915, + "flos": 23698553598720.0, + "grad_norm": 1.5808824118039877, + "language_loss": 0.87186432, + "learning_rate": 3.996142453363656e-06, + "loss": 0.89418983, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.9765625, + "step": 818, + "time_per_iteration": 2.4802234172821045 + }, + { + "auxiliary_loss_clip": 0.01130484, + "auxiliary_loss_mlp": 0.01102548, + "balance_loss_clip": 1.02468061, + "balance_loss_mlp": 1.02723527, + "epoch": 0.04924094393506689, + "flos": 22419266227200.0, + "grad_norm": 2.367986260900332, + "language_loss": 0.82921702, + "learning_rate": 3.996118238049124e-06, + "loss": 0.85154736, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 1.03125, + "step": 819, + "time_per_iteration": 2.4196581840515137 + }, + { + "auxiliary_loss_clip": 0.01126639, + "auxiliary_loss_mlp": 0.01094052, + "balance_loss_clip": 1.02686572, + "balance_loss_mlp": 1.02769327, + "epoch": 0.04930106718773486, + "flos": 15737448983040.0, + "grad_norm": 2.340973671253265, + "language_loss": 0.87657368, + "learning_rate": 3.996093947041586e-06, + "loss": 0.89878058, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.98828125, + "step": 820, + "time_per_iteration": 2.4172985553741455 + }, + { + "auxiliary_loss_clip": 0.01130159, + "auxiliary_loss_mlp": 0.01090882, + "balance_loss_clip": 1.02045345, + "balance_loss_mlp": 1.02736664, + "epoch": 0.049361190440402825, + "flos": 26249412931200.0, + "grad_norm": 1.7740636162689962, + "language_loss": 0.93433547, + "learning_rate": 3.996069580341966e-06, + "loss": 0.95654583, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.03125, + "step": 821, + "time_per_iteration": 2.441948652267456 + }, + { + "auxiliary_loss_clip": 0.01130009, + "auxiliary_loss_mlp": 0.01103275, + "balance_loss_clip": 1.02407241, + "balance_loss_mlp": 1.02945018, + "epoch": 0.0494213136930708, + "flos": 21251281870080.0, + "grad_norm": 2.2523934561042753, + "language_loss": 0.92241263, + "learning_rate": 3.996045137951188e-06, + "loss": 0.94474542, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 1.0078125, + "step": 822, + "time_per_iteration": 2.447622060775757 + }, + { + "auxiliary_loss_clip": 0.01130032, + "auxiliary_loss_mlp": 0.01091187, + "balance_loss_clip": 1.01684785, + "balance_loss_mlp": 1.02943933, + "epoch": 0.04948143694573876, + "flos": 27964241913600.0, + "grad_norm": 2.048651541065716, + "language_loss": 0.70931351, + "learning_rate": 3.996020619870178e-06, + "loss": 0.73152566, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0078125, + "step": 823, + "time_per_iteration": 2.4577138423919678 + }, + { + "auxiliary_loss_clip": 0.01051613, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.02955818, + "balance_loss_mlp": 1.02661741, + "epoch": 0.049541560198406734, + "flos": 66178250398080.0, + "grad_norm": 1.3714006389048436, + "language_loss": 0.62486339, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64576805, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.09277344, + "router_z_loss_mlp": 0.25, + "step": 824, + "time_per_iteration": 3.1256790161132812 + }, + { + "auxiliary_loss_clip": 0.01136179, + "auxiliary_loss_mlp": 0.01108715, + "balance_loss_clip": 1.02784312, + "balance_loss_mlp": 1.03218424, + "epoch": 0.049601683451074706, + "flos": 22891606277760.0, + "grad_norm": 1.950089683384892, + "language_loss": 0.94378054, + "learning_rate": 3.995971356641185e-06, + "loss": 0.96622944, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 1.0390625, + "step": 825, + "time_per_iteration": 2.447047472000122 + }, + { + "auxiliary_loss_clip": 0.011298, + "auxiliary_loss_mlp": 0.0110263, + "balance_loss_clip": 1.02838635, + "balance_loss_mlp": 1.02891612, + "epoch": 0.04966180670374267, + "flos": 21432585893760.0, + "grad_norm": 2.4839616786413083, + "language_loss": 0.71979761, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.74212193, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0078125, + "step": 826, + "time_per_iteration": 2.4005424976348877 + }, + { + "auxiliary_loss_clip": 0.01134133, + "auxiliary_loss_mlp": 0.01104253, + "balance_loss_clip": 1.0245254, + "balance_loss_mlp": 1.02904248, + "epoch": 0.04972192995641064, + "flos": 23106392161920.0, + "grad_norm": 1.752457603278906, + "language_loss": 0.82754141, + "learning_rate": 3.995921790662459e-06, + "loss": 0.84992528, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 1.046875, + "step": 827, + "time_per_iteration": 2.464085578918457 + }, + { + "auxiliary_loss_clip": 0.01133215, + "auxiliary_loss_mlp": 0.0111074, + "balance_loss_clip": 1.02691174, + "balance_loss_mlp": 1.02864671, + "epoch": 0.04978205320907861, + "flos": 40404563919360.0, + "grad_norm": 2.0308059899730937, + "language_loss": 0.82910037, + "learning_rate": 3.995896894144294e-06, + "loss": 0.85153997, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 1.046875, + "step": 828, + "time_per_iteration": 2.762592077255249 + }, + { + "auxiliary_loss_clip": 0.01128613, + "auxiliary_loss_mlp": 0.01094514, + "balance_loss_clip": 1.02084279, + "balance_loss_mlp": 1.02726007, + "epoch": 0.04984217646174658, + "flos": 25227365523840.0, + "grad_norm": 1.835989445728411, + "language_loss": 0.872908, + "learning_rate": 3.995871921941519e-06, + "loss": 0.89513928, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 1.015625, + "step": 829, + "time_per_iteration": 2.5734376907348633 + }, + { + "auxiliary_loss_clip": 0.01133377, + "auxiliary_loss_mlp": 0.01109738, + "balance_loss_clip": 1.02848506, + "balance_loss_mlp": 1.02867651, + "epoch": 0.04990229971441455, + "flos": 15958763291520.0, + "grad_norm": 1.873953114359596, + "language_loss": 0.78572136, + "learning_rate": 3.99584687405508e-06, + "loss": 0.80815256, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 1.046875, + "step": 830, + "time_per_iteration": 2.4663054943084717 + }, + { + "auxiliary_loss_clip": 0.01132258, + "auxiliary_loss_mlp": 0.01114485, + "balance_loss_clip": 1.02255106, + "balance_loss_mlp": 1.02574801, + "epoch": 0.04996242296708252, + "flos": 18404149806720.0, + "grad_norm": 1.7680049350202567, + "language_loss": 0.8146385, + "learning_rate": 3.995821750485929e-06, + "loss": 0.83710599, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 1.0625, + "step": 831, + "time_per_iteration": 2.462139844894409 + }, + { + "auxiliary_loss_clip": 0.0113452, + "auxiliary_loss_mlp": 0.0110743, + "balance_loss_clip": 1.02708244, + "balance_loss_mlp": 1.02893734, + "epoch": 0.05002254621975049, + "flos": 17857095713280.0, + "grad_norm": 2.3540284908975644, + "language_loss": 0.95906782, + "learning_rate": 3.995796551235016e-06, + "loss": 0.98148727, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 1.0546875, + "step": 832, + "time_per_iteration": 2.4712274074554443 + }, + { + "auxiliary_loss_clip": 0.0112218, + "auxiliary_loss_mlp": 0.0109016, + "balance_loss_clip": 1.02206767, + "balance_loss_mlp": 1.02536106, + "epoch": 0.050082669472418455, + "flos": 45658538490240.0, + "grad_norm": 1.7717772174750868, + "language_loss": 0.85488385, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.87700725, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.96875, + "step": 833, + "time_per_iteration": 2.7135560512542725 + }, + { + "auxiliary_loss_clip": 0.01129931, + "auxiliary_loss_mlp": 0.01103536, + "balance_loss_clip": 1.02485824, + "balance_loss_mlp": 1.02585721, + "epoch": 0.05014279272508643, + "flos": 37960538947200.0, + "grad_norm": 1.8868049448423627, + "language_loss": 0.85936546, + "learning_rate": 3.995745925691733e-06, + "loss": 0.88170016, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 1.046875, + "step": 834, + "time_per_iteration": 2.6495156288146973 + }, + { + "auxiliary_loss_clip": 0.0113259, + "auxiliary_loss_mlp": 0.01090849, + "balance_loss_clip": 1.01736856, + "balance_loss_mlp": 1.02794921, + "epoch": 0.0502029159777544, + "flos": 20995124158080.0, + "grad_norm": 1.9846737858935943, + "language_loss": 0.9617635, + "learning_rate": 3.995720499401282e-06, + "loss": 0.98399782, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 1.046875, + "step": 835, + "time_per_iteration": 2.5073297023773193 + }, + { + "auxiliary_loss_clip": 0.0113215, + "auxiliary_loss_mlp": 0.01102783, + "balance_loss_clip": 1.02558303, + "balance_loss_mlp": 1.02677274, + "epoch": 0.050263039230422364, + "flos": 15887156359680.0, + "grad_norm": 1.7716826333032227, + "language_loss": 0.81808454, + "learning_rate": 3.995694997432911e-06, + "loss": 0.8404339, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 1.046875, + "step": 836, + "time_per_iteration": 2.4563841819763184 + }, + { + "auxiliary_loss_clip": 0.0112006, + "auxiliary_loss_mlp": 0.01085969, + "balance_loss_clip": 1.01968837, + "balance_loss_mlp": 1.02622104, + "epoch": 0.050323162483090336, + "flos": 23731616522880.0, + "grad_norm": 2.3205625918079082, + "language_loss": 0.87536287, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.89742315, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.9375, + "step": 837, + "time_per_iteration": 2.5013794898986816 + }, + { + "auxiliary_loss_clip": 0.01122412, + "auxiliary_loss_mlp": 0.01099207, + "balance_loss_clip": 1.02973211, + "balance_loss_mlp": 1.02733886, + "epoch": 0.0503832857357583, + "flos": 20265195029760.0, + "grad_norm": 2.56780608115119, + "language_loss": 0.77931446, + "learning_rate": 3.995643766466275e-06, + "loss": 0.8015306, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.953125, + "step": 838, + "time_per_iteration": 2.4540815353393555 + }, + { + "auxiliary_loss_clip": 0.01125337, + "auxiliary_loss_mlp": 0.01095978, + "balance_loss_clip": 1.02717018, + "balance_loss_mlp": 1.02619982, + "epoch": 0.05044340898842627, + "flos": 17784057415680.0, + "grad_norm": 1.7458688580060588, + "language_loss": 0.86139184, + "learning_rate": 3.995618037469953e-06, + "loss": 0.883605, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.9921875, + "step": 839, + "time_per_iteration": 2.4471378326416016 + }, + { + "auxiliary_loss_clip": 0.01128292, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_clip": 1.03093207, + "balance_loss_mlp": 1.02837932, + "epoch": 0.050503532241094246, + "flos": 22965412625280.0, + "grad_norm": 2.1940416372116553, + "language_loss": 0.88223338, + "learning_rate": 3.995592232799595e-06, + "loss": 0.90452462, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.99609375, + "step": 840, + "time_per_iteration": 2.459482431411743 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01099091, + "balance_loss_clip": 1.02441835, + "balance_loss_mlp": 1.02688992, + "epoch": 0.05056365549376221, + "flos": 22776078988800.0, + "grad_norm": 1.8051545138055862, + "language_loss": 0.97429597, + "learning_rate": 3.99556635245618e-06, + "loss": 0.99657631, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.015625, + "step": 841, + "time_per_iteration": 2.4701220989227295 + }, + { + "auxiliary_loss_clip": 0.01125666, + "auxiliary_loss_mlp": 0.01090117, + "balance_loss_clip": 1.02541065, + "balance_loss_mlp": 1.02667594, + "epoch": 0.05062377874643018, + "flos": 30915729630720.0, + "grad_norm": 2.226784535004877, + "language_loss": 0.80644429, + "learning_rate": 3.995540396440688e-06, + "loss": 0.82860214, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.9921875, + "step": 842, + "time_per_iteration": 2.581712245941162 + }, + { + "auxiliary_loss_clip": 0.01131404, + "auxiliary_loss_mlp": 0.01102603, + "balance_loss_clip": 1.03064871, + "balance_loss_mlp": 1.02810645, + "epoch": 0.05068390199909815, + "flos": 19646115068160.0, + "grad_norm": 2.379461286570494, + "language_loss": 0.81725568, + "learning_rate": 3.995514364754105e-06, + "loss": 0.83959579, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.03125, + "step": 843, + "time_per_iteration": 2.479529857635498 + }, + { + "auxiliary_loss_clip": 0.01132361, + "auxiliary_loss_mlp": 0.01091869, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.02786183, + "epoch": 0.05074402525176612, + "flos": 37960573858560.0, + "grad_norm": 1.8797006763876272, + "language_loss": 0.85929048, + "learning_rate": 3.995488257397417e-06, + "loss": 0.88153279, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.046875, + "step": 844, + "time_per_iteration": 2.627268075942993 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01091194, + "balance_loss_clip": 1.02157545, + "balance_loss_mlp": 1.02420568, + "epoch": 0.05080414850443409, + "flos": 22053516157440.0, + "grad_norm": 1.9509915749054254, + "language_loss": 0.78852189, + "learning_rate": 3.995462074371614e-06, + "loss": 0.81068623, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 1.015625, + "step": 845, + "time_per_iteration": 3.9310946464538574 + }, + { + "auxiliary_loss_clip": 0.01124335, + "auxiliary_loss_mlp": 0.01088083, + "balance_loss_clip": 1.01994276, + "balance_loss_mlp": 1.02573383, + "epoch": 0.05086427175710206, + "flos": 20224870542720.0, + "grad_norm": 1.6835496432971293, + "language_loss": 0.90483904, + "learning_rate": 3.99543581567769e-06, + "loss": 0.92696327, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.984375, + "step": 846, + "time_per_iteration": 2.42999529838562 + }, + { + "auxiliary_loss_clip": 0.01127, + "auxiliary_loss_mlp": 0.01089823, + "balance_loss_clip": 1.02151585, + "balance_loss_mlp": 1.02541673, + "epoch": 0.05092439500977003, + "flos": 15158309483520.0, + "grad_norm": 1.6830884935417534, + "language_loss": 0.90893376, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.93110198, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 1.015625, + "step": 847, + "time_per_iteration": 2.437636613845825 + }, + { + "auxiliary_loss_clip": 0.01124722, + "auxiliary_loss_mlp": 0.01096013, + "balance_loss_clip": 1.0289216, + "balance_loss_mlp": 1.02545738, + "epoch": 0.050984518262437994, + "flos": 22054039827840.0, + "grad_norm": 2.799328696583664, + "language_loss": 0.86242688, + "learning_rate": 3.995383071289462e-06, + "loss": 0.8846342, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.99609375, + "step": 848, + "time_per_iteration": 3.9412245750427246 + }, + { + "auxiliary_loss_clip": 0.01131091, + "auxiliary_loss_mlp": 0.01088485, + "balance_loss_clip": 1.02110744, + "balance_loss_mlp": 1.02829468, + "epoch": 0.05104464151510597, + "flos": 30224065219200.0, + "grad_norm": 1.9426385944931062, + "language_loss": 0.90142244, + "learning_rate": 3.995356585597158e-06, + "loss": 0.9236182, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 1.03125, + "step": 849, + "time_per_iteration": 5.342249155044556 + }, + { + "auxiliary_loss_clip": 0.01126329, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_clip": 1.0230968, + "balance_loss_mlp": 1.02584982, + "epoch": 0.05110476476777394, + "flos": 18331914470400.0, + "grad_norm": 1.7760995981333365, + "language_loss": 0.8730967, + "learning_rate": 3.995330024240732e-06, + "loss": 0.89527285, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 1.0078125, + "step": 850, + "time_per_iteration": 2.4158575534820557 + }, + { + "auxiliary_loss_clip": 0.01127017, + "auxiliary_loss_mlp": 0.01086713, + "balance_loss_clip": 1.01905036, + "balance_loss_mlp": 1.02708244, + "epoch": 0.051164888020441904, + "flos": 37997197741440.0, + "grad_norm": 2.556591989838514, + "language_loss": 0.70103139, + "learning_rate": 3.995303387221192e-06, + "loss": 0.72316873, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 1.0, + "step": 851, + "time_per_iteration": 2.567460775375366 + }, + { + "auxiliary_loss_clip": 0.01129788, + "auxiliary_loss_mlp": 0.01087196, + "balance_loss_clip": 1.01991379, + "balance_loss_mlp": 1.02702141, + "epoch": 0.051225011273109876, + "flos": 23037543227520.0, + "grad_norm": 2.285884356837248, + "language_loss": 0.86837685, + "learning_rate": 3.995276674539547e-06, + "loss": 0.89054674, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 1.03125, + "step": 852, + "time_per_iteration": 2.423408031463623 + }, + { + "auxiliary_loss_clip": 0.01127191, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_clip": 1.01375628, + "balance_loss_mlp": 1.02675819, + "epoch": 0.05128513452577785, + "flos": 18258841261440.0, + "grad_norm": 2.229355998138035, + "language_loss": 0.83204579, + "learning_rate": 3.995249886196811e-06, + "loss": 0.85413671, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 1.0078125, + "step": 853, + "time_per_iteration": 2.4253132343292236 + }, + { + "auxiliary_loss_clip": 0.01127914, + "auxiliary_loss_mlp": 0.01091916, + "balance_loss_clip": 1.02139187, + "balance_loss_mlp": 1.02738333, + "epoch": 0.05134525777844581, + "flos": 27197723813760.0, + "grad_norm": 2.1471528847269576, + "language_loss": 0.81310225, + "learning_rate": 3.995223022193999e-06, + "loss": 0.83530045, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.0078125, + "step": 854, + "time_per_iteration": 2.4777135848999023 + }, + { + "auxiliary_loss_clip": 0.01126009, + "auxiliary_loss_mlp": 0.01089657, + "balance_loss_clip": 1.02351987, + "balance_loss_mlp": 1.02583981, + "epoch": 0.051405381031113785, + "flos": 28361099871360.0, + "grad_norm": 2.1740625440374193, + "language_loss": 0.85286057, + "learning_rate": 3.99519608253213e-06, + "loss": 0.87501729, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 1.0, + "step": 855, + "time_per_iteration": 2.4987337589263916 + }, + { + "auxiliary_loss_clip": 0.01044204, + "auxiliary_loss_mlp": 0.01015265, + "balance_loss_clip": 1.00415492, + "balance_loss_mlp": 1.01840544, + "epoch": 0.05146550428378175, + "flos": 65614855921920.0, + "grad_norm": 0.9861385312864569, + "language_loss": 0.65863013, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67922485, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.2578125, + "step": 856, + "time_per_iteration": 3.0258331298828125 + }, + { + "auxiliary_loss_clip": 0.01120249, + "auxiliary_loss_mlp": 0.01080859, + "balance_loss_clip": 1.01720107, + "balance_loss_mlp": 1.0247848, + "epoch": 0.05152562753644972, + "flos": 22053760536960.0, + "grad_norm": 1.9686873824562292, + "language_loss": 0.792606, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.81461704, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.95703125, + "step": 857, + "time_per_iteration": 2.464237689971924 + }, + { + "auxiliary_loss_clip": 0.01125959, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_clip": 1.02304995, + "balance_loss_mlp": 1.02511215, + "epoch": 0.051585750789117694, + "flos": 18508714928640.0, + "grad_norm": 2.0338238251209204, + "language_loss": 0.91288197, + "learning_rate": 3.995114809602412e-06, + "loss": 0.93506396, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 1.0078125, + "step": 858, + "time_per_iteration": 2.388768434524536 + }, + { + "auxiliary_loss_clip": 0.01123081, + "auxiliary_loss_mlp": 0.01087648, + "balance_loss_clip": 1.02430058, + "balance_loss_mlp": 1.02513862, + "epoch": 0.05164587404178566, + "flos": 23729172727680.0, + "grad_norm": 1.878235479469795, + "language_loss": 0.80082572, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.82293296, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.98046875, + "step": 859, + "time_per_iteration": 2.5003578662872314 + }, + { + "auxiliary_loss_clip": 0.01128921, + "auxiliary_loss_mlp": 0.01105744, + "balance_loss_clip": 1.03660238, + "balance_loss_mlp": 1.02623785, + "epoch": 0.05170599729445363, + "flos": 16251963822720.0, + "grad_norm": 2.110308659642544, + "language_loss": 0.94403362, + "learning_rate": 3.995060249372788e-06, + "loss": 0.96638024, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 1.03125, + "step": 860, + "time_per_iteration": 2.42598295211792 + }, + { + "auxiliary_loss_clip": 0.01122266, + "auxiliary_loss_mlp": 0.01087832, + "balance_loss_clip": 1.02493739, + "balance_loss_mlp": 1.0246284, + "epoch": 0.0517661205471216, + "flos": 23984841680640.0, + "grad_norm": 1.8724142506380805, + "language_loss": 0.8434664, + "learning_rate": 3.99503285577813e-06, + "loss": 0.86556733, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.9765625, + "step": 861, + "time_per_iteration": 2.4750988483428955 + }, + { + "auxiliary_loss_clip": 0.01127942, + "auxiliary_loss_mlp": 0.01095504, + "balance_loss_clip": 1.0302726, + "balance_loss_mlp": 1.02795827, + "epoch": 0.05182624379978957, + "flos": 29276452563840.0, + "grad_norm": 2.0169936540183597, + "language_loss": 0.81250715, + "learning_rate": 3.995005386531627e-06, + "loss": 0.83474159, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.99609375, + "step": 862, + "time_per_iteration": 2.5066378116607666 + }, + { + "auxiliary_loss_clip": 0.0111986, + "auxiliary_loss_mlp": 0.0108717, + "balance_loss_clip": 1.02828062, + "balance_loss_mlp": 1.02662253, + "epoch": 0.05188636705245754, + "flos": 24169671751680.0, + "grad_norm": 3.4825332529018818, + "language_loss": 0.92340934, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.94547963, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.93359375, + "step": 863, + "time_per_iteration": 2.502131938934326 + }, + { + "auxiliary_loss_clip": 0.01125404, + "auxiliary_loss_mlp": 0.01094033, + "balance_loss_clip": 1.02207804, + "balance_loss_mlp": 1.02691793, + "epoch": 0.051946490305125506, + "flos": 26759494028160.0, + "grad_norm": 2.0709947360471945, + "language_loss": 0.79882979, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.82102424, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.984375, + "step": 864, + "time_per_iteration": 2.4973390102386475 + }, + { + "auxiliary_loss_clip": 0.01128601, + "auxiliary_loss_mlp": 0.01086955, + "balance_loss_clip": 1.02115154, + "balance_loss_mlp": 1.02599359, + "epoch": 0.05200661355779348, + "flos": 21501574473600.0, + "grad_norm": 2.3385804449765324, + "language_loss": 0.8155117, + "learning_rate": 3.994922524891474e-06, + "loss": 0.83766729, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 1.03125, + "step": 865, + "time_per_iteration": 2.4627766609191895 + }, + { + "auxiliary_loss_clip": 0.0112672, + "auxiliary_loss_mlp": 0.01089402, + "balance_loss_clip": 1.02030826, + "balance_loss_mlp": 1.02646124, + "epoch": 0.05206673681046144, + "flos": 18113497804800.0, + "grad_norm": 2.260164986095091, + "language_loss": 0.90035486, + "learning_rate": 3.994894753048032e-06, + "loss": 0.92251617, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 1.0, + "step": 866, + "time_per_iteration": 2.3968911170959473 + }, + { + "auxiliary_loss_clip": 0.01125059, + "auxiliary_loss_mlp": 0.01093193, + "balance_loss_clip": 1.02896333, + "balance_loss_mlp": 1.02739251, + "epoch": 0.052126860063129415, + "flos": 17523396138240.0, + "grad_norm": 2.146389669745918, + "language_loss": 0.91947442, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.94165695, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.9765625, + "step": 867, + "time_per_iteration": 2.414842128753662 + }, + { + "auxiliary_loss_clip": 0.01117931, + "auxiliary_loss_mlp": 0.01076522, + "balance_loss_clip": 1.01891971, + "balance_loss_mlp": 1.025738, + "epoch": 0.05218698331579739, + "flos": 32596692727680.0, + "grad_norm": 1.4538185033286133, + "language_loss": 0.65122497, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.67316949, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.921875, + "step": 868, + "time_per_iteration": 2.5167908668518066 + }, + { + "auxiliary_loss_clip": 0.01128349, + "auxiliary_loss_mlp": 0.01091046, + "balance_loss_clip": 1.01851892, + "balance_loss_mlp": 1.02672362, + "epoch": 0.05224710656846535, + "flos": 22126205341440.0, + "grad_norm": 1.9551923609882471, + "language_loss": 0.87578464, + "learning_rate": 3.994810983642281e-06, + "loss": 0.8979786, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 1.015625, + "step": 869, + "time_per_iteration": 2.455733060836792 + }, + { + "auxiliary_loss_clip": 0.01127832, + "auxiliary_loss_mlp": 0.01091027, + "balance_loss_clip": 1.02202857, + "balance_loss_mlp": 1.02732813, + "epoch": 0.052307229821133325, + "flos": 11144310226560.0, + "grad_norm": 2.004737040523193, + "language_loss": 0.91783607, + "learning_rate": 3.994782909218751e-06, + "loss": 0.94002467, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 1.0078125, + "step": 870, + "time_per_iteration": 2.392753839492798 + }, + { + "auxiliary_loss_clip": 0.01124768, + "auxiliary_loss_mlp": 0.01082604, + "balance_loss_clip": 1.01770639, + "balance_loss_mlp": 1.02529752, + "epoch": 0.05236735307380129, + "flos": 19127271219840.0, + "grad_norm": 2.223060039598315, + "language_loss": 0.83374041, + "learning_rate": 3.994754759152854e-06, + "loss": 0.8558141, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.99609375, + "step": 871, + "time_per_iteration": 2.405123472213745 + }, + { + "auxiliary_loss_clip": 0.011215, + "auxiliary_loss_mlp": 0.01080171, + "balance_loss_clip": 1.01889741, + "balance_loss_mlp": 1.02528739, + "epoch": 0.05242747632646926, + "flos": 20959582527360.0, + "grad_norm": 3.3363116458221524, + "language_loss": 0.8311621, + "learning_rate": 3.994726533445656e-06, + "loss": 0.85317874, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.9609375, + "step": 872, + "time_per_iteration": 2.418765068054199 + }, + { + "auxiliary_loss_clip": 0.01036752, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.01968074, + "balance_loss_mlp": 1.00802565, + "epoch": 0.052487599579137234, + "flos": 65017632337920.0, + "grad_norm": 0.9114903374827709, + "language_loss": 0.61834103, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63901788, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.28710938, + "step": 873, + "time_per_iteration": 2.936467409133911 + }, + { + "auxiliary_loss_clip": 0.01129797, + "auxiliary_loss_mlp": 0.01084549, + "balance_loss_clip": 1.01750553, + "balance_loss_mlp": 1.02730465, + "epoch": 0.0525477228318052, + "flos": 23287905653760.0, + "grad_norm": 1.7195783521390156, + "language_loss": 0.9251098, + "learning_rate": 3.994669855111643e-06, + "loss": 0.94725329, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 1.0234375, + "step": 874, + "time_per_iteration": 2.5425920486450195 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01088692, + "balance_loss_clip": 1.01835823, + "balance_loss_mlp": 1.02709103, + "epoch": 0.05260784608447317, + "flos": 32228952710400.0, + "grad_norm": 2.210023215168436, + "language_loss": 0.77426052, + "learning_rate": 3.994641402486977e-06, + "loss": 0.79643846, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.015625, + "step": 875, + "time_per_iteration": 2.546679973602295 + }, + { + "auxiliary_loss_clip": 0.01120347, + "auxiliary_loss_mlp": 0.01082298, + "balance_loss_clip": 1.0160656, + "balance_loss_mlp": 1.02403259, + "epoch": 0.052667969337141136, + "flos": 24462034410240.0, + "grad_norm": 1.6378506458604296, + "language_loss": 0.950863, + "learning_rate": 3.99461287422531e-06, + "loss": 0.97288948, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.96484375, + "step": 876, + "time_per_iteration": 2.442812442779541 + }, + { + "auxiliary_loss_clip": 0.01033385, + "auxiliary_loss_mlp": 0.01011917, + "balance_loss_clip": 1.00095022, + "balance_loss_mlp": 1.00505638, + "epoch": 0.05272809258980911, + "flos": 57780938989440.0, + "grad_norm": 0.8214649011612507, + "language_loss": 0.63142455, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65187758, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.28320312, + "step": 877, + "time_per_iteration": 3.0710692405700684 + }, + { + "auxiliary_loss_clip": 0.01130417, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_clip": 1.01890695, + "balance_loss_mlp": 1.02706611, + "epoch": 0.05278821584247708, + "flos": 17419843445760.0, + "grad_norm": 6.678741295999182, + "language_loss": 0.89217949, + "learning_rate": 3.994555590795299e-06, + "loss": 0.91440421, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 1.03125, + "step": 878, + "time_per_iteration": 2.412493944168091 + }, + { + "auxiliary_loss_clip": 0.0113142, + "auxiliary_loss_mlp": 0.01090617, + "balance_loss_clip": 1.02343011, + "balance_loss_mlp": 1.0286057, + "epoch": 0.052848339095145046, + "flos": 26136154880640.0, + "grad_norm": 1.8732732135628374, + "language_loss": 0.88152182, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.90374219, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 1.03125, + "step": 879, + "time_per_iteration": 2.4723246097564697 + }, + { + "auxiliary_loss_clip": 0.01128562, + "auxiliary_loss_mlp": 0.0110982, + "balance_loss_clip": 1.03562427, + "balance_loss_mlp": 1.02803516, + "epoch": 0.05290846234781302, + "flos": 16471148538240.0, + "grad_norm": 2.037612015787909, + "language_loss": 0.8841238, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.90650761, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0078125, + "step": 880, + "time_per_iteration": 2.396439790725708 + }, + { + "auxiliary_loss_clip": 0.0113371, + "auxiliary_loss_mlp": 0.01103912, + "balance_loss_clip": 1.03448415, + "balance_loss_mlp": 1.02936578, + "epoch": 0.05296858560048098, + "flos": 19864147708800.0, + "grad_norm": 1.8321939524643087, + "language_loss": 0.90907663, + "learning_rate": 3.994469098399906e-06, + "loss": 0.93145287, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 1.046875, + "step": 881, + "time_per_iteration": 2.418199062347412 + }, + { + "auxiliary_loss_clip": 0.01133214, + "auxiliary_loss_mlp": 0.01100032, + "balance_loss_clip": 1.01982844, + "balance_loss_mlp": 1.02751732, + "epoch": 0.053028708853148955, + "flos": 24387460012800.0, + "grad_norm": 2.030668744442987, + "language_loss": 0.9086293, + "learning_rate": 3.994440116339046e-06, + "loss": 0.93096173, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 1.0625, + "step": 882, + "time_per_iteration": 2.4479787349700928 + }, + { + "auxiliary_loss_clip": 0.01133334, + "auxiliary_loss_mlp": 0.01100334, + "balance_loss_clip": 1.02070212, + "balance_loss_mlp": 1.02888429, + "epoch": 0.05308883210581693, + "flos": 36391681825920.0, + "grad_norm": 2.2494771315648867, + "language_loss": 0.74744678, + "learning_rate": 3.994411058648816e-06, + "loss": 0.76978344, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 1.046875, + "step": 883, + "time_per_iteration": 2.54573655128479 + }, + { + "auxiliary_loss_clip": 0.01124566, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_clip": 1.01697147, + "balance_loss_mlp": 1.02643287, + "epoch": 0.05314895535848489, + "flos": 22854039788160.0, + "grad_norm": 1.9281765646791185, + "language_loss": 0.79975688, + "learning_rate": 3.994381925330319e-06, + "loss": 0.82189417, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.98046875, + "step": 884, + "time_per_iteration": 2.4227118492126465 + }, + { + "auxiliary_loss_clip": 0.01122629, + "auxiliary_loss_mlp": 0.01082049, + "balance_loss_clip": 1.01638901, + "balance_loss_mlp": 1.02468765, + "epoch": 0.053209078611152864, + "flos": 12859488322560.0, + "grad_norm": 1.9290258101334734, + "language_loss": 0.88879204, + "learning_rate": 3.994352716384659e-06, + "loss": 0.91083884, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.98046875, + "step": 885, + "time_per_iteration": 3.84979248046875 + }, + { + "auxiliary_loss_clip": 0.0113127, + "auxiliary_loss_mlp": 0.0109572, + "balance_loss_clip": 1.02033186, + "balance_loss_mlp": 1.0271107, + "epoch": 0.05326920186382083, + "flos": 12163844016000.0, + "grad_norm": 2.364942727061361, + "language_loss": 0.91269279, + "learning_rate": 3.994323431812945e-06, + "loss": 0.93496263, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 1.0390625, + "step": 886, + "time_per_iteration": 2.3995745182037354 + }, + { + "auxiliary_loss_clip": 0.01127869, + "auxiliary_loss_mlp": 0.01086933, + "balance_loss_clip": 1.01926994, + "balance_loss_mlp": 1.02735138, + "epoch": 0.0533293251164888, + "flos": 22703564361600.0, + "grad_norm": 1.8117351758222557, + "language_loss": 0.91718292, + "learning_rate": 3.994294071616286e-06, + "loss": 0.93933094, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 1.0078125, + "step": 887, + "time_per_iteration": 2.417062520980835 + }, + { + "auxiliary_loss_clip": 0.01126988, + "auxiliary_loss_mlp": 0.01089025, + "balance_loss_clip": 1.0215044, + "balance_loss_mlp": 1.02687526, + "epoch": 0.053389448369156774, + "flos": 26939785622400.0, + "grad_norm": 1.9364517538200954, + "language_loss": 0.78306127, + "learning_rate": 3.994264635795796e-06, + "loss": 0.80522138, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 1.0, + "step": 888, + "time_per_iteration": 3.940000295639038 + }, + { + "auxiliary_loss_clip": 0.01128296, + "auxiliary_loss_mlp": 0.01091383, + "balance_loss_clip": 1.02290893, + "balance_loss_mlp": 1.02751827, + "epoch": 0.05344957162182474, + "flos": 25555165079040.0, + "grad_norm": 1.8613417505828236, + "language_loss": 0.91851896, + "learning_rate": 3.994235124352592e-06, + "loss": 0.94071573, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 1.0078125, + "step": 889, + "time_per_iteration": 3.9605112075805664 + }, + { + "auxiliary_loss_clip": 0.011257, + "auxiliary_loss_mlp": 0.01085746, + "balance_loss_clip": 1.02213621, + "balance_loss_mlp": 1.02664995, + "epoch": 0.05350969487449271, + "flos": 19718559872640.0, + "grad_norm": 1.8244271556415097, + "language_loss": 0.90625685, + "learning_rate": 3.994205537287791e-06, + "loss": 0.92837131, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.9921875, + "step": 890, + "time_per_iteration": 2.401724100112915 + }, + { + "auxiliary_loss_clip": 0.01127688, + "auxiliary_loss_mlp": 0.01086174, + "balance_loss_clip": 1.02079964, + "balance_loss_mlp": 1.0270673, + "epoch": 0.053569818127160676, + "flos": 27015128069760.0, + "grad_norm": 2.0732640865031944, + "language_loss": 0.96809208, + "learning_rate": 3.994175874602517e-06, + "loss": 0.99023068, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 1.0078125, + "step": 891, + "time_per_iteration": 2.462536334991455 + }, + { + "auxiliary_loss_clip": 0.0112356, + "auxiliary_loss_mlp": 0.01091786, + "balance_loss_clip": 1.02798498, + "balance_loss_mlp": 1.02535248, + "epoch": 0.05362994137982865, + "flos": 13187497345920.0, + "grad_norm": 1.8540104871664411, + "language_loss": 0.74823928, + "learning_rate": 3.994146136297893e-06, + "loss": 0.77039266, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.98046875, + "step": 892, + "time_per_iteration": 2.3760876655578613 + }, + { + "auxiliary_loss_clip": 0.01134497, + "auxiliary_loss_mlp": 0.0110204, + "balance_loss_clip": 1.02717626, + "balance_loss_mlp": 1.02947116, + "epoch": 0.05369006463249662, + "flos": 28656744197760.0, + "grad_norm": 1.6247873890802804, + "language_loss": 0.85717261, + "learning_rate": 3.994116322375049e-06, + "loss": 0.87953794, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 1.046875, + "step": 893, + "time_per_iteration": 2.475463628768921 + }, + { + "auxiliary_loss_clip": 0.01129285, + "auxiliary_loss_mlp": 0.01093756, + "balance_loss_clip": 1.02385163, + "balance_loss_mlp": 1.02812362, + "epoch": 0.053750187885164585, + "flos": 28911889480320.0, + "grad_norm": 2.8570813573311096, + "language_loss": 0.84611565, + "learning_rate": 3.994086432835114e-06, + "loss": 0.86834604, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 1.0078125, + "step": 894, + "time_per_iteration": 2.5759589672088623 + }, + { + "auxiliary_loss_clip": 0.01125738, + "auxiliary_loss_mlp": 0.01081075, + "balance_loss_clip": 1.02056432, + "balance_loss_mlp": 1.02675855, + "epoch": 0.05381031113783256, + "flos": 15157925458560.0, + "grad_norm": 2.1791027684585287, + "language_loss": 0.7903257, + "learning_rate": 3.994056467679221e-06, + "loss": 0.81239378, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.98828125, + "step": 895, + "time_per_iteration": 2.3923048973083496 + }, + { + "auxiliary_loss_clip": 0.01133077, + "auxiliary_loss_mlp": 0.01088739, + "balance_loss_clip": 1.0233649, + "balance_loss_mlp": 1.02901173, + "epoch": 0.05387043439050053, + "flos": 21834156885120.0, + "grad_norm": 2.064382236376656, + "language_loss": 0.90006709, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.92228526, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 1.0390625, + "step": 896, + "time_per_iteration": 2.428237199783325 + }, + { + "auxiliary_loss_clip": 0.01127722, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_clip": 1.01558506, + "balance_loss_mlp": 1.02597606, + "epoch": 0.053930557643168495, + "flos": 17309378304000.0, + "grad_norm": 2.1055391679896536, + "language_loss": 0.9142698, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.93632179, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 1.015625, + "step": 897, + "time_per_iteration": 2.427826166152954 + }, + { + "auxiliary_loss_clip": 0.01120636, + "auxiliary_loss_mlp": 0.01085708, + "balance_loss_clip": 1.01713932, + "balance_loss_mlp": 1.02409196, + "epoch": 0.05399068089583647, + "flos": 17347503375360.0, + "grad_norm": 1.7610670172856944, + "language_loss": 0.92936295, + "learning_rate": 3.993966118527175e-06, + "loss": 0.95142639, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.96484375, + "step": 898, + "time_per_iteration": 2.383394479751587 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01080409, + "balance_loss_clip": 1.02001762, + "balance_loss_mlp": 1.02590048, + "epoch": 0.05405080414850443, + "flos": 17486178762240.0, + "grad_norm": 2.6895572538912713, + "language_loss": 0.95975626, + "learning_rate": 3.993935850918845e-06, + "loss": 0.9818157, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.99609375, + "step": 899, + "time_per_iteration": 2.437697172164917 + }, + { + "auxiliary_loss_clip": 0.01120608, + "auxiliary_loss_mlp": 0.01077013, + "balance_loss_clip": 1.01693189, + "balance_loss_mlp": 1.02492046, + "epoch": 0.054110927401172404, + "flos": 24495690827520.0, + "grad_norm": 2.402747554203343, + "language_loss": 0.78818405, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.81016028, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.95703125, + "step": 900, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.01122646, + "auxiliary_loss_mlp": 0.01081948, + "balance_loss_clip": 1.01790905, + "balance_loss_mlp": 1.02480602, + "epoch": 0.054171050653840376, + "flos": 22928928387840.0, + "grad_norm": 2.4125771537949126, + "language_loss": 0.7967546, + "learning_rate": 3.993875088872592e-06, + "loss": 0.81880057, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.9765625, + "step": 901, + "time_per_iteration": 2.435633420944214 + }, + { + "auxiliary_loss_clip": 0.01113055, + "auxiliary_loss_mlp": 0.01075452, + "balance_loss_clip": 1.02087796, + "balance_loss_mlp": 1.0233562, + "epoch": 0.05423117390650834, + "flos": 12932352063360.0, + "grad_norm": 2.018364363717726, + "language_loss": 0.88925475, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.91113985, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8984375, + "step": 902, + "time_per_iteration": 2.399024724960327 + }, + { + "auxiliary_loss_clip": 0.01119928, + "auxiliary_loss_mlp": 0.01081189, + "balance_loss_clip": 1.01853251, + "balance_loss_mlp": 1.02378201, + "epoch": 0.05429129715917631, + "flos": 19900317744000.0, + "grad_norm": 2.117994284166333, + "language_loss": 0.89757168, + "learning_rate": 3.993814024394569e-06, + "loss": 0.9195829, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.9609375, + "step": 903, + "time_per_iteration": 2.4367218017578125 + }, + { + "auxiliary_loss_clip": 0.01122128, + "auxiliary_loss_mlp": 0.01087548, + "balance_loss_clip": 1.02517772, + "balance_loss_mlp": 1.0242548, + "epoch": 0.05435142041184428, + "flos": 16907702578560.0, + "grad_norm": 2.419387024230688, + "language_loss": 0.79452693, + "learning_rate": 3.993783378746537e-06, + "loss": 0.81662375, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.9765625, + "step": 904, + "time_per_iteration": 2.404392957687378 + }, + { + "auxiliary_loss_clip": 0.01121452, + "auxiliary_loss_mlp": 0.01089645, + "balance_loss_clip": 1.02496159, + "balance_loss_mlp": 1.02453566, + "epoch": 0.05441154366451225, + "flos": 23947275191040.0, + "grad_norm": 2.2481546222984004, + "language_loss": 0.89883184, + "learning_rate": 3.993752657494039e-06, + "loss": 0.9209429, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.96875, + "step": 905, + "time_per_iteration": 2.4161629676818848 + }, + { + "auxiliary_loss_clip": 0.01120703, + "auxiliary_loss_mlp": 0.01077425, + "balance_loss_clip": 1.01915503, + "balance_loss_mlp": 1.02700603, + "epoch": 0.05447166691718022, + "flos": 19974333559680.0, + "grad_norm": 2.0231286971312854, + "language_loss": 0.77343166, + "learning_rate": 3.993721860638241e-06, + "loss": 0.79541296, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.9375, + "step": 906, + "time_per_iteration": 2.421051502227783 + }, + { + "auxiliary_loss_clip": 0.01123828, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_clip": 1.01782787, + "balance_loss_mlp": 1.0250771, + "epoch": 0.05453179016984819, + "flos": 24935351978880.0, + "grad_norm": 1.951739030132688, + "language_loss": 0.91334414, + "learning_rate": 3.993690988180309e-06, + "loss": 0.93541729, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.98828125, + "step": 907, + "time_per_iteration": 2.440842866897583 + }, + { + "auxiliary_loss_clip": 0.01124339, + "auxiliary_loss_mlp": 0.01081378, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.02663255, + "epoch": 0.05459191342251616, + "flos": 18114091297920.0, + "grad_norm": 1.6651852626431427, + "language_loss": 0.90196919, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.92402637, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.9765625, + "step": 908, + "time_per_iteration": 2.424984931945801 + }, + { + "auxiliary_loss_clip": 0.01121081, + "auxiliary_loss_mlp": 0.01081023, + "balance_loss_clip": 1.02170455, + "balance_loss_mlp": 1.0249052, + "epoch": 0.054652036675184125, + "flos": 19207291789440.0, + "grad_norm": 2.1868488728281776, + "language_loss": 0.9354558, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.95747685, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.9609375, + "step": 909, + "time_per_iteration": 2.4045894145965576 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_clip": 1.02259696, + "balance_loss_mlp": 1.02507257, + "epoch": 0.0547121599278521, + "flos": 16324827563520.0, + "grad_norm": 2.2424132423421317, + "language_loss": 0.74963367, + "learning_rate": 3.99359791720544e-06, + "loss": 0.77176797, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 1.0, + "step": 910, + "time_per_iteration": 2.4162583351135254 + }, + { + "auxiliary_loss_clip": 0.01119444, + "auxiliary_loss_mlp": 0.01083427, + "balance_loss_clip": 1.02289271, + "balance_loss_mlp": 1.02491653, + "epoch": 0.05477228318052007, + "flos": 20337988947840.0, + "grad_norm": 1.5909643428255626, + "language_loss": 0.86067891, + "learning_rate": 3.993566742350714e-06, + "loss": 0.8827076, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.9453125, + "step": 911, + "time_per_iteration": 2.4164538383483887 + }, + { + "auxiliary_loss_clip": 0.01122473, + "auxiliary_loss_mlp": 0.01075329, + "balance_loss_clip": 1.01372194, + "balance_loss_mlp": 1.02588105, + "epoch": 0.054832406433188034, + "flos": 21972238778880.0, + "grad_norm": 2.3361856754714774, + "language_loss": 0.79422903, + "learning_rate": 3.993535491899736e-06, + "loss": 0.81620705, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.96484375, + "step": 912, + "time_per_iteration": 2.4688615798950195 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01081524, + "balance_loss_clip": 1.02373123, + "balance_loss_mlp": 1.02335024, + "epoch": 0.054892529685856006, + "flos": 16398005506560.0, + "grad_norm": 2.465567818491581, + "language_loss": 0.86453819, + "learning_rate": 3.993504165853694e-06, + "loss": 0.886509, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.921875, + "step": 913, + "time_per_iteration": 2.459686040878296 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01073888, + "balance_loss_clip": 1.01804996, + "balance_loss_mlp": 1.02544069, + "epoch": 0.05495265293852397, + "flos": 23911279712640.0, + "grad_norm": 1.8023726948470293, + "language_loss": 0.86560982, + "learning_rate": 3.993472764213772e-06, + "loss": 0.88752878, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.92578125, + "step": 914, + "time_per_iteration": 2.5117075443267822 + }, + { + "auxiliary_loss_clip": 0.01117667, + "auxiliary_loss_mlp": 0.01082086, + "balance_loss_clip": 1.02147996, + "balance_loss_mlp": 1.0246532, + "epoch": 0.055012776191191944, + "flos": 23585819218560.0, + "grad_norm": 2.158671037614002, + "language_loss": 0.95001459, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.9720121, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.9296875, + "step": 915, + "time_per_iteration": 2.469024896621704 + }, + { + "auxiliary_loss_clip": 0.01119895, + "auxiliary_loss_mlp": 0.01072452, + "balance_loss_clip": 1.0150888, + "balance_loss_mlp": 1.02534914, + "epoch": 0.055072899443859916, + "flos": 17527585501440.0, + "grad_norm": 1.793766418693055, + "language_loss": 0.91853392, + "learning_rate": 3.993409734157064e-06, + "loss": 0.94045734, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.9453125, + "step": 916, + "time_per_iteration": 2.480485677719116 + }, + { + "auxiliary_loss_clip": 0.0111952, + "auxiliary_loss_mlp": 0.01079747, + "balance_loss_clip": 1.02157331, + "balance_loss_mlp": 1.0245626, + "epoch": 0.05513302269652788, + "flos": 21686160165120.0, + "grad_norm": 1.7354715677036034, + "language_loss": 0.83472085, + "learning_rate": 3.993378105742666e-06, + "loss": 0.85671353, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.94921875, + "step": 917, + "time_per_iteration": 2.5028650760650635 + }, + { + "auxiliary_loss_clip": 0.01123176, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_clip": 1.01391625, + "balance_loss_mlp": 1.02447474, + "epoch": 0.05519314594919585, + "flos": 21612353817600.0, + "grad_norm": 1.8069606342306848, + "language_loss": 0.83700562, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.8589704, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.984375, + "step": 918, + "time_per_iteration": 2.536362886428833 + }, + { + "auxiliary_loss_clip": 0.01123927, + "auxiliary_loss_mlp": 0.01079284, + "balance_loss_clip": 1.02077639, + "balance_loss_mlp": 1.02532208, + "epoch": 0.05525326920186382, + "flos": 21797498090880.0, + "grad_norm": 2.1422499519691276, + "language_loss": 0.92001951, + "learning_rate": 3.99331462214778e-06, + "loss": 0.94205165, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.984375, + "step": 919, + "time_per_iteration": 2.5601465702056885 + }, + { + "auxiliary_loss_clip": 0.01117876, + "auxiliary_loss_mlp": 0.01078159, + "balance_loss_clip": 1.01860189, + "balance_loss_mlp": 1.02267587, + "epoch": 0.05531339245453179, + "flos": 28438362443520.0, + "grad_norm": 2.0839333675131035, + "language_loss": 0.91133678, + "learning_rate": 3.993282766969699e-06, + "loss": 0.93329704, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.953125, + "step": 920, + "time_per_iteration": 2.5991275310516357 + }, + { + "auxiliary_loss_clip": 0.01119383, + "auxiliary_loss_mlp": 0.01083533, + "balance_loss_clip": 1.01915991, + "balance_loss_mlp": 1.02339458, + "epoch": 0.05537351570719976, + "flos": 37373718948480.0, + "grad_norm": 1.9122346867378892, + "language_loss": 0.69504476, + "learning_rate": 3.993250836206136e-06, + "loss": 0.71707392, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.9609375, + "step": 921, + "time_per_iteration": 2.5270113945007324 + }, + { + "auxiliary_loss_clip": 0.01122078, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_clip": 1.01989269, + "balance_loss_mlp": 1.02504683, + "epoch": 0.05543363895986773, + "flos": 20083437158400.0, + "grad_norm": 2.253623353244792, + "language_loss": 0.76224709, + "learning_rate": 3.993218829858301e-06, + "loss": 0.7842738, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.96875, + "step": 922, + "time_per_iteration": 2.421757459640503 + }, + { + "auxiliary_loss_clip": 0.01120593, + "auxiliary_loss_mlp": 0.01088235, + "balance_loss_clip": 1.02348006, + "balance_loss_mlp": 1.02319288, + "epoch": 0.0554937622125357, + "flos": 24532105242240.0, + "grad_norm": 4.12314418254269, + "language_loss": 0.86530393, + "learning_rate": 3.993186747927408e-06, + "loss": 0.88739222, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.97265625, + "step": 923, + "time_per_iteration": 2.426964521408081 + }, + { + "auxiliary_loss_clip": 0.01121567, + "auxiliary_loss_mlp": 0.01083764, + "balance_loss_clip": 1.01724517, + "balance_loss_mlp": 1.02369547, + "epoch": 0.055553885465203665, + "flos": 14319172022400.0, + "grad_norm": 2.012388989202911, + "language_loss": 0.82161134, + "learning_rate": 3.993154590414675e-06, + "loss": 0.84366465, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.9765625, + "step": 924, + "time_per_iteration": 2.4108219146728516 + }, + { + "auxiliary_loss_clip": 0.01113395, + "auxiliary_loss_mlp": 0.0107746, + "balance_loss_clip": 1.019238, + "balance_loss_mlp": 1.02218843, + "epoch": 0.05561400871787164, + "flos": 27379900621440.0, + "grad_norm": 1.841892424677068, + "language_loss": 1.04974174, + "learning_rate": 3.993122357321319e-06, + "loss": 1.07165027, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.9140625, + "step": 925, + "time_per_iteration": 3.9872593879699707 + }, + { + "auxiliary_loss_clip": 0.01116847, + "auxiliary_loss_mlp": 0.01082197, + "balance_loss_clip": 1.02097142, + "balance_loss_mlp": 1.02292633, + "epoch": 0.05567413197053961, + "flos": 23219999326080.0, + "grad_norm": 1.8957289546810663, + "language_loss": 0.83292049, + "learning_rate": 3.993090048648564e-06, + "loss": 0.85491097, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.9375, + "step": 926, + "time_per_iteration": 2.43587064743042 + }, + { + "auxiliary_loss_clip": 0.01127628, + "auxiliary_loss_mlp": 0.01089114, + "balance_loss_clip": 1.01992488, + "balance_loss_mlp": 1.0257163, + "epoch": 0.055734255223207574, + "flos": 25263779938560.0, + "grad_norm": 2.513368132909658, + "language_loss": 0.79492259, + "learning_rate": 3.993057664397634e-06, + "loss": 0.81709003, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 1.015625, + "step": 927, + "time_per_iteration": 3.8844234943389893 + }, + { + "auxiliary_loss_clip": 0.01041675, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.01735342, + "balance_loss_mlp": 1.0134151, + "epoch": 0.055794378475875546, + "flos": 66499519662720.0, + "grad_norm": 0.7896404882410542, + "language_loss": 0.6007551, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62145698, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.28125, + "step": 928, + "time_per_iteration": 5.964838743209839 + }, + { + "auxiliary_loss_clip": 0.01120305, + "auxiliary_loss_mlp": 0.01082399, + "balance_loss_clip": 1.01850319, + "balance_loss_mlp": 1.02474117, + "epoch": 0.05585450172854351, + "flos": 25336469122560.0, + "grad_norm": 1.9746621175107022, + "language_loss": 0.98213017, + "learning_rate": 3.992992669166168e-06, + "loss": 1.00415719, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.95703125, + "step": 929, + "time_per_iteration": 2.4845173358917236 + }, + { + "auxiliary_loss_clip": 0.01123352, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_clip": 1.02657819, + "balance_loss_mlp": 1.02726007, + "epoch": 0.05591462498121148, + "flos": 33910334743680.0, + "grad_norm": 2.3685247279258586, + "language_loss": 0.75268775, + "learning_rate": 3.992960058188094e-06, + "loss": 0.77487659, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.9609375, + "step": 930, + "time_per_iteration": 2.5293941497802734 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.01086576, + "balance_loss_clip": 1.01915073, + "balance_loss_mlp": 1.02594137, + "epoch": 0.055974748233879455, + "flos": 17929924542720.0, + "grad_norm": 2.0989753990332525, + "language_loss": 0.8866356, + "learning_rate": 3.992927371636776e-06, + "loss": 0.90874773, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.984375, + "step": 931, + "time_per_iteration": 2.434673547744751 + }, + { + "auxiliary_loss_clip": 0.01124577, + "auxiliary_loss_mlp": 0.01092734, + "balance_loss_clip": 1.0255481, + "balance_loss_mlp": 1.02567458, + "epoch": 0.05603487148654742, + "flos": 24020906981760.0, + "grad_norm": 1.6080111837647388, + "language_loss": 0.86948848, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.89166164, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.9921875, + "step": 932, + "time_per_iteration": 2.445666551589966 + }, + { + "auxiliary_loss_clip": 0.01127364, + "auxiliary_loss_mlp": 0.01095551, + "balance_loss_clip": 1.02769756, + "balance_loss_mlp": 1.0281142, + "epoch": 0.05609499473921539, + "flos": 17306899597440.0, + "grad_norm": 1.95286087742154, + "language_loss": 0.78279328, + "learning_rate": 3.992861771819365e-06, + "loss": 0.80502248, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.9921875, + "step": 933, + "time_per_iteration": 2.416311502456665 + }, + { + "auxiliary_loss_clip": 0.01125054, + "auxiliary_loss_mlp": 0.01089712, + "balance_loss_clip": 1.02214432, + "balance_loss_mlp": 1.02645922, + "epoch": 0.05615511799188336, + "flos": 20993727703680.0, + "grad_norm": 2.246288009154978, + "language_loss": 0.89897525, + "learning_rate": 3.99282885855576e-06, + "loss": 0.92112285, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.984375, + "step": 934, + "time_per_iteration": 2.397213935852051 + }, + { + "auxiliary_loss_clip": 0.01122841, + "auxiliary_loss_mlp": 0.01083441, + "balance_loss_clip": 1.02054572, + "balance_loss_mlp": 1.02791095, + "epoch": 0.05621524124455133, + "flos": 17272614775680.0, + "grad_norm": 3.5601917310719227, + "language_loss": 0.83352959, + "learning_rate": 3.992795869723885e-06, + "loss": 0.85559237, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.94921875, + "step": 935, + "time_per_iteration": 2.397948980331421 + }, + { + "auxiliary_loss_clip": 0.01041933, + "auxiliary_loss_mlp": 0.01014674, + "balance_loss_clip": 1.00303924, + "balance_loss_mlp": 1.01508749, + "epoch": 0.0562753644972193, + "flos": 58716332668800.0, + "grad_norm": 0.8231284325586253, + "language_loss": 0.69276774, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71333373, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.26953125, + "step": 936, + "time_per_iteration": 2.9382803440093994 + }, + { + "auxiliary_loss_clip": 0.01127981, + "auxiliary_loss_mlp": 0.01098956, + "balance_loss_clip": 1.02437842, + "balance_loss_mlp": 1.02713513, + "epoch": 0.05633548774988727, + "flos": 17456083303680.0, + "grad_norm": 2.6184879557465806, + "language_loss": 0.80036873, + "learning_rate": 3.992729665360331e-06, + "loss": 0.82263803, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.0078125, + "step": 937, + "time_per_iteration": 2.430989980697632 + }, + { + "auxiliary_loss_clip": 0.01039498, + "auxiliary_loss_mlp": 0.01022038, + "balance_loss_clip": 1.00706565, + "balance_loss_mlp": 1.01346707, + "epoch": 0.05639561100255524, + "flos": 70651426256640.0, + "grad_norm": 0.8843280779576959, + "language_loss": 0.6462326, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66684794, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.25976562, + "step": 938, + "time_per_iteration": 3.003065824508667 + }, + { + "auxiliary_loss_clip": 0.01136436, + "auxiliary_loss_mlp": 0.01100261, + "balance_loss_clip": 1.02286994, + "balance_loss_mlp": 1.0268991, + "epoch": 0.056455734255223204, + "flos": 20484938327040.0, + "grad_norm": 4.53488490516945, + "language_loss": 0.84203786, + "learning_rate": 3.992663158738745e-06, + "loss": 0.8644048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 1.09375, + "step": 939, + "time_per_iteration": 2.4258644580841064 + }, + { + "auxiliary_loss_clip": 0.01125301, + "auxiliary_loss_mlp": 0.01102785, + "balance_loss_clip": 1.03249943, + "balance_loss_mlp": 1.02609599, + "epoch": 0.056515857507891176, + "flos": 22052503728000.0, + "grad_norm": 1.6001810504644367, + "language_loss": 0.765172, + "learning_rate": 3.992629792084341e-06, + "loss": 0.78745288, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.9921875, + "step": 940, + "time_per_iteration": 2.4411253929138184 + }, + { + "auxiliary_loss_clip": 0.01123712, + "auxiliary_loss_mlp": 0.01096962, + "balance_loss_clip": 1.02758241, + "balance_loss_mlp": 1.02566171, + "epoch": 0.05657598076055915, + "flos": 24024153738240.0, + "grad_norm": 1.7973713209528892, + "language_loss": 0.74662936, + "learning_rate": 3.992596349869216e-06, + "loss": 0.76883602, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.98046875, + "step": 941, + "time_per_iteration": 2.4572362899780273 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01092796, + "balance_loss_clip": 1.02122307, + "balance_loss_mlp": 1.02705717, + "epoch": 0.05663610401322711, + "flos": 20479701623040.0, + "grad_norm": 1.9433687439825826, + "language_loss": 0.83690298, + "learning_rate": 3.992562832094637e-06, + "loss": 0.85907871, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.9765625, + "step": 942, + "time_per_iteration": 2.4054033756256104 + }, + { + "auxiliary_loss_clip": 0.01130259, + "auxiliary_loss_mlp": 0.01095179, + "balance_loss_clip": 1.02093577, + "balance_loss_mlp": 1.02776945, + "epoch": 0.056696227265895086, + "flos": 21067987898880.0, + "grad_norm": 1.8091049806021222, + "language_loss": 0.91744149, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.93969584, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0234375, + "step": 943, + "time_per_iteration": 2.570345878601074 + }, + { + "auxiliary_loss_clip": 0.01127175, + "auxiliary_loss_mlp": 0.01097243, + "balance_loss_clip": 1.02347648, + "balance_loss_mlp": 1.02733016, + "epoch": 0.05675635051856306, + "flos": 17820367096320.0, + "grad_norm": 2.0768882442487433, + "language_loss": 0.80012816, + "learning_rate": 3.992495569872206e-06, + "loss": 0.82237238, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 1.0, + "step": 944, + "time_per_iteration": 2.394005298614502 + }, + { + "auxiliary_loss_clip": 0.01126464, + "auxiliary_loss_mlp": 0.01090083, + "balance_loss_clip": 1.02079844, + "balance_loss_mlp": 1.02629352, + "epoch": 0.05681647377123102, + "flos": 23113758458880.0, + "grad_norm": 1.943433641906643, + "language_loss": 0.81938994, + "learning_rate": 3.992461825426906e-06, + "loss": 0.84155536, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 1.0, + "step": 945, + "time_per_iteration": 2.454765558242798 + }, + { + "auxiliary_loss_clip": 0.01131934, + "auxiliary_loss_mlp": 0.01093296, + "balance_loss_clip": 1.0210073, + "balance_loss_mlp": 1.02966034, + "epoch": 0.056876597023898995, + "flos": 16069612458240.0, + "grad_norm": 2.077289481158631, + "language_loss": 0.85102606, + "learning_rate": 3.992428005427252e-06, + "loss": 0.87327838, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 1.0234375, + "step": 946, + "time_per_iteration": 2.4211196899414062 + }, + { + "auxiliary_loss_clip": 0.01135463, + "auxiliary_loss_mlp": 0.01104019, + "balance_loss_clip": 1.02557981, + "balance_loss_mlp": 1.03040671, + "epoch": 0.05693672027656696, + "flos": 16834734103680.0, + "grad_norm": 1.8605156197164077, + "language_loss": 0.83768058, + "learning_rate": 3.992394109874529e-06, + "loss": 0.86007535, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 1.046875, + "step": 947, + "time_per_iteration": 2.4302000999450684 + }, + { + "auxiliary_loss_clip": 0.01134947, + "auxiliary_loss_mlp": 0.01099227, + "balance_loss_clip": 1.02751064, + "balance_loss_mlp": 1.03228641, + "epoch": 0.05699684352923493, + "flos": 21388281511680.0, + "grad_norm": 5.617782041802679, + "language_loss": 0.90811485, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.93045652, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.0234375, + "step": 948, + "time_per_iteration": 2.4176712036132812 + }, + { + "auxiliary_loss_clip": 0.01131975, + "auxiliary_loss_mlp": 0.01094095, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.0299089, + "epoch": 0.057056966781902904, + "flos": 15559391715840.0, + "grad_norm": 1.673510645622403, + "language_loss": 0.90460217, + "learning_rate": 3.992326092115019e-06, + "loss": 0.92686284, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 1.0234375, + "step": 949, + "time_per_iteration": 2.4255564212799072 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_clip": 1.01985145, + "balance_loss_mlp": 1.02921772, + "epoch": 0.05711709003457087, + "flos": 19936836892800.0, + "grad_norm": 2.008457109782295, + "language_loss": 0.82122171, + "learning_rate": 3.992291969910811e-06, + "loss": 0.84336311, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.984375, + "step": 950, + "time_per_iteration": 2.4229536056518555 + }, + { + "auxiliary_loss_clip": 0.01132337, + "auxiliary_loss_mlp": 0.0110328, + "balance_loss_clip": 1.02469742, + "balance_loss_mlp": 1.02949297, + "epoch": 0.05717721328723884, + "flos": 30331493072640.0, + "grad_norm": 1.8936392398058866, + "language_loss": 0.85878342, + "learning_rate": 3.992257772158691e-06, + "loss": 0.88113964, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 1.03125, + "step": 951, + "time_per_iteration": 2.513164758682251 + }, + { + "auxiliary_loss_clip": 0.01128298, + "auxiliary_loss_mlp": 0.01093425, + "balance_loss_clip": 1.02161348, + "balance_loss_mlp": 1.02677917, + "epoch": 0.05723733653990681, + "flos": 23653376432640.0, + "grad_norm": 4.040509312164673, + "language_loss": 0.89580774, + "learning_rate": 3.992223498859958e-06, + "loss": 0.91802502, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.015625, + "step": 952, + "time_per_iteration": 2.429227352142334 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.01103819, + "balance_loss_clip": 1.02456903, + "balance_loss_mlp": 1.02878428, + "epoch": 0.05729745979257478, + "flos": 22054633320960.0, + "grad_norm": 1.8116323273873673, + "language_loss": 0.82249457, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.84489, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 1.0703125, + "step": 953, + "time_per_iteration": 2.460824489593506 + }, + { + "auxiliary_loss_clip": 0.01126616, + "auxiliary_loss_mlp": 0.01091546, + "balance_loss_clip": 1.02483654, + "balance_loss_mlp": 1.02781487, + "epoch": 0.05735758304524275, + "flos": 19603486431360.0, + "grad_norm": 1.8456119833675457, + "language_loss": 0.90118545, + "learning_rate": 3.992154725627848e-06, + "loss": 0.92336714, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.98828125, + "step": 954, + "time_per_iteration": 2.40238356590271 + }, + { + "auxiliary_loss_clip": 0.01131357, + "auxiliary_loss_mlp": 0.01092334, + "balance_loss_clip": 1.0194732, + "balance_loss_mlp": 1.02684367, + "epoch": 0.057417706297910716, + "flos": 19098013633920.0, + "grad_norm": 3.5739281035110317, + "language_loss": 0.93333107, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.95556796, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 1.046875, + "step": 955, + "time_per_iteration": 2.400515556335449 + }, + { + "auxiliary_loss_clip": 0.01127711, + "auxiliary_loss_mlp": 0.01091233, + "balance_loss_clip": 1.02523839, + "balance_loss_mlp": 1.02722752, + "epoch": 0.05747782955057869, + "flos": 16653569725440.0, + "grad_norm": 2.4589728774130917, + "language_loss": 0.93558955, + "learning_rate": 3.992085650224914e-06, + "loss": 0.95777893, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 1.0, + "step": 956, + "time_per_iteration": 2.428107500076294 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_clip": 1.01993442, + "balance_loss_mlp": 1.02665424, + "epoch": 0.05753795280324665, + "flos": 14501174273280.0, + "grad_norm": 2.0580916057709144, + "language_loss": 0.78504288, + "learning_rate": 3.99205099921266e-06, + "loss": 0.80712855, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.96484375, + "step": 957, + "time_per_iteration": 2.3925962448120117 + }, + { + "auxiliary_loss_clip": 0.0112932, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_clip": 1.02901018, + "balance_loss_mlp": 1.02769423, + "epoch": 0.057598076055914625, + "flos": 18075372733440.0, + "grad_norm": 1.8558662791525078, + "language_loss": 0.82246041, + "learning_rate": 3.992016272661633e-06, + "loss": 0.84478706, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.015625, + "step": 958, + "time_per_iteration": 2.418532609939575 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01092919, + "balance_loss_clip": 1.02387261, + "balance_loss_mlp": 1.02585578, + "epoch": 0.0576581993085826, + "flos": 22123586989440.0, + "grad_norm": 2.0437774758620795, + "language_loss": 0.90743577, + "learning_rate": 3.99198147057315e-06, + "loss": 0.9296174, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.9921875, + "step": 959, + "time_per_iteration": 2.4289016723632812 + }, + { + "auxiliary_loss_clip": 0.01118484, + "auxiliary_loss_mlp": 0.01087763, + "balance_loss_clip": 1.02224541, + "balance_loss_mlp": 1.02510524, + "epoch": 0.05771832256125056, + "flos": 33180370704000.0, + "grad_norm": 3.1681646693218983, + "language_loss": 0.82189268, + "learning_rate": 3.991946592948529e-06, + "loss": 0.84395516, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.93359375, + "step": 960, + "time_per_iteration": 2.5346662998199463 + }, + { + "auxiliary_loss_clip": 0.01126532, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_clip": 1.01797545, + "balance_loss_mlp": 1.02592087, + "epoch": 0.057778445813918534, + "flos": 24169008435840.0, + "grad_norm": 1.8925108964751758, + "language_loss": 0.97161686, + "learning_rate": 3.991911639789094e-06, + "loss": 0.9937824, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.0078125, + "step": 961, + "time_per_iteration": 2.4395880699157715 + }, + { + "auxiliary_loss_clip": 0.01130132, + "auxiliary_loss_mlp": 0.01099052, + "balance_loss_clip": 1.02099371, + "balance_loss_mlp": 1.02804279, + "epoch": 0.0578385690665865, + "flos": 29641748785920.0, + "grad_norm": 2.5390266776576005, + "language_loss": 0.72748172, + "learning_rate": 3.991876611096169e-06, + "loss": 0.74977356, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 1.0234375, + "step": 962, + "time_per_iteration": 2.505279302597046 + }, + { + "auxiliary_loss_clip": 0.01124621, + "auxiliary_loss_mlp": 0.01091321, + "balance_loss_clip": 1.01731586, + "balance_loss_mlp": 1.02808583, + "epoch": 0.05789869231925447, + "flos": 20884414636800.0, + "grad_norm": 2.58501559503198, + "language_loss": 0.91616815, + "learning_rate": 3.991841506871084e-06, + "loss": 0.93832755, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.96484375, + "step": 963, + "time_per_iteration": 2.408115863800049 + }, + { + "auxiliary_loss_clip": 0.01132555, + "auxiliary_loss_mlp": 0.01094141, + "balance_loss_clip": 1.01818109, + "balance_loss_mlp": 1.02800548, + "epoch": 0.057958815571922444, + "flos": 26029914013440.0, + "grad_norm": 2.4241300721880426, + "language_loss": 0.89892358, + "learning_rate": 3.99180632711517e-06, + "loss": 0.92119056, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 1.046875, + "step": 964, + "time_per_iteration": 3.908592700958252 + }, + { + "auxiliary_loss_clip": 0.01126985, + "auxiliary_loss_mlp": 0.01103138, + "balance_loss_clip": 1.02775037, + "balance_loss_mlp": 1.02587056, + "epoch": 0.05801893882459041, + "flos": 18076699365120.0, + "grad_norm": 2.697802868720696, + "language_loss": 0.80888444, + "learning_rate": 3.99177107182976e-06, + "loss": 0.8311857, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 1.0078125, + "step": 965, + "time_per_iteration": 2.447664260864258 + }, + { + "auxiliary_loss_clip": 0.01125824, + "auxiliary_loss_mlp": 0.0109215, + "balance_loss_clip": 1.02172065, + "balance_loss_mlp": 1.02556205, + "epoch": 0.05807906207725838, + "flos": 17747922291840.0, + "grad_norm": 1.9660934455239516, + "language_loss": 0.85969114, + "learning_rate": 3.99173574101619e-06, + "loss": 0.88187087, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.0, + "step": 966, + "time_per_iteration": 2.408773422241211 + }, + { + "auxiliary_loss_clip": 0.01125602, + "auxiliary_loss_mlp": 0.0109356, + "balance_loss_clip": 1.02136683, + "balance_loss_mlp": 1.02658224, + "epoch": 0.058139185329926346, + "flos": 18039412166400.0, + "grad_norm": 1.956085797783424, + "language_loss": 0.80072606, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.82291764, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.9921875, + "step": 967, + "time_per_iteration": 5.366400957107544 + }, + { + "auxiliary_loss_clip": 0.01044823, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.02139413, + "balance_loss_mlp": 1.01678514, + "epoch": 0.05819930858259432, + "flos": 62360287758720.0, + "grad_norm": 0.8031589151860283, + "language_loss": 0.57549012, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59629917, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.28125, + "step": 968, + "time_per_iteration": 3.0211095809936523 + }, + { + "auxiliary_loss_clip": 0.01129065, + "auxiliary_loss_mlp": 0.0109852, + "balance_loss_clip": 1.02241695, + "balance_loss_mlp": 1.02706146, + "epoch": 0.05825943183526229, + "flos": 19134358225920.0, + "grad_norm": 2.135629676756832, + "language_loss": 0.85367376, + "learning_rate": 3.991629295419945e-06, + "loss": 0.87594962, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 1.0234375, + "step": 969, + "time_per_iteration": 2.3907978534698486 + }, + { + "auxiliary_loss_clip": 0.01132043, + "auxiliary_loss_mlp": 0.01097581, + "balance_loss_clip": 1.02100074, + "balance_loss_mlp": 1.02741516, + "epoch": 0.058319555087930255, + "flos": 29021202547200.0, + "grad_norm": 2.17073942942579, + "language_loss": 0.81194687, + "learning_rate": 3.991593662507167e-06, + "loss": 0.83424312, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 1.046875, + "step": 970, + "time_per_iteration": 2.4756600856781006 + }, + { + "auxiliary_loss_clip": 0.0113399, + "auxiliary_loss_mlp": 0.01100688, + "balance_loss_clip": 1.02262926, + "balance_loss_mlp": 1.02932334, + "epoch": 0.05837967834059823, + "flos": 18879003475200.0, + "grad_norm": 2.349614030792172, + "language_loss": 0.95256007, + "learning_rate": 3.991557954072958e-06, + "loss": 0.97490686, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 1.046875, + "step": 971, + "time_per_iteration": 2.4527878761291504 + }, + { + "auxiliary_loss_clip": 0.01131574, + "auxiliary_loss_mlp": 0.01095685, + "balance_loss_clip": 1.02287197, + "balance_loss_mlp": 1.0280596, + "epoch": 0.05843980159326619, + "flos": 25701870078720.0, + "grad_norm": 1.648915557947185, + "language_loss": 0.88691032, + "learning_rate": 3.991522170118673e-06, + "loss": 0.90918291, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 1.0390625, + "step": 972, + "time_per_iteration": 2.467953681945801 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01103289, + "balance_loss_clip": 1.03300285, + "balance_loss_mlp": 1.02864647, + "epoch": 0.058499924845934165, + "flos": 25551080449920.0, + "grad_norm": 1.974001269919108, + "language_loss": 0.89990461, + "learning_rate": 3.991486310645667e-06, + "loss": 0.92223477, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.015625, + "step": 973, + "time_per_iteration": 2.4539103507995605 + }, + { + "auxiliary_loss_clip": 0.01127564, + "auxiliary_loss_mlp": 0.0111272, + "balance_loss_clip": 1.03642631, + "balance_loss_mlp": 1.02655494, + "epoch": 0.05856004809860214, + "flos": 16435222882560.0, + "grad_norm": 1.7522613854206017, + "language_loss": 0.77836841, + "learning_rate": 3.991450375655301e-06, + "loss": 0.80077124, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 1.0078125, + "step": 974, + "time_per_iteration": 2.40959095954895 + }, + { + "auxiliary_loss_clip": 0.01125559, + "auxiliary_loss_mlp": 0.01109093, + "balance_loss_clip": 1.03065324, + "balance_loss_mlp": 1.02649522, + "epoch": 0.0586201713512701, + "flos": 39457230554880.0, + "grad_norm": 1.4281097460993528, + "language_loss": 0.78482264, + "learning_rate": 3.991414365148936e-06, + "loss": 0.80716914, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.9921875, + "step": 975, + "time_per_iteration": 2.5659992694854736 + }, + { + "auxiliary_loss_clip": 0.01129565, + "auxiliary_loss_mlp": 0.01104298, + "balance_loss_clip": 1.02843356, + "balance_loss_mlp": 1.02630043, + "epoch": 0.058680294603938074, + "flos": 23364120885120.0, + "grad_norm": 2.036632895548704, + "language_loss": 0.80002582, + "learning_rate": 3.99137827912794e-06, + "loss": 0.82236445, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 1.03125, + "step": 976, + "time_per_iteration": 2.4385452270507812 + }, + { + "auxiliary_loss_clip": 0.01125149, + "auxiliary_loss_mlp": 0.01098623, + "balance_loss_clip": 1.02552378, + "balance_loss_mlp": 1.02643919, + "epoch": 0.05874041785660604, + "flos": 32230698278400.0, + "grad_norm": 1.6919907319595964, + "language_loss": 0.89246011, + "learning_rate": 3.991342117593679e-06, + "loss": 0.91469783, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.984375, + "step": 977, + "time_per_iteration": 2.501647710800171 + }, + { + "auxiliary_loss_clip": 0.01127052, + "auxiliary_loss_mlp": 0.01091713, + "balance_loss_clip": 1.02095032, + "balance_loss_mlp": 1.02556419, + "epoch": 0.05880054110927401, + "flos": 22308940730880.0, + "grad_norm": 1.5605598580318714, + "language_loss": 0.8205657, + "learning_rate": 3.991305880547527e-06, + "loss": 0.84275341, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 1.015625, + "step": 978, + "time_per_iteration": 2.4952917098999023 + }, + { + "auxiliary_loss_clip": 0.01130386, + "auxiliary_loss_mlp": 0.01100412, + "balance_loss_clip": 1.02507186, + "balance_loss_mlp": 1.02712965, + "epoch": 0.05886066436194198, + "flos": 27379237305600.0, + "grad_norm": 1.8530220101614145, + "language_loss": 0.83436739, + "learning_rate": 3.991269567990855e-06, + "loss": 0.85667533, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 1.03125, + "step": 979, + "time_per_iteration": 2.544856548309326 + }, + { + "auxiliary_loss_clip": 0.0104495, + "auxiliary_loss_mlp": 0.0101797, + "balance_loss_clip": 1.00185323, + "balance_loss_mlp": 1.0160768, + "epoch": 0.05892078761460995, + "flos": 59581725338880.0, + "grad_norm": 0.9455884025861344, + "language_loss": 0.59177703, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61240625, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.2890625, + "step": 980, + "time_per_iteration": 2.947077989578247 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01091589, + "balance_loss_clip": 1.02364016, + "balance_loss_mlp": 1.0260489, + "epoch": 0.05898091086727792, + "flos": 15413175475200.0, + "grad_norm": 2.1947772391637574, + "language_loss": 0.90017039, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.92231882, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.97265625, + "step": 981, + "time_per_iteration": 2.385772943496704 + }, + { + "auxiliary_loss_clip": 0.01127395, + "auxiliary_loss_mlp": 0.01084126, + "balance_loss_clip": 1.01775062, + "balance_loss_mlp": 1.02742434, + "epoch": 0.059041034119945886, + "flos": 23654319039360.0, + "grad_norm": 1.9181546500720876, + "language_loss": 0.81091762, + "learning_rate": 3.991160177271513e-06, + "loss": 0.83303285, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 1.0, + "step": 982, + "time_per_iteration": 2.4679689407348633 + }, + { + "auxiliary_loss_clip": 0.01136352, + "auxiliary_loss_mlp": 0.01096767, + "balance_loss_clip": 1.02362049, + "balance_loss_mlp": 1.02906358, + "epoch": 0.05910115737261386, + "flos": 24752930791680.0, + "grad_norm": 1.9883143890633186, + "language_loss": 0.88401842, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.9063496, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 1.0703125, + "step": 983, + "time_per_iteration": 2.475595235824585 + }, + { + "auxiliary_loss_clip": 0.01127514, + "auxiliary_loss_mlp": 0.01096753, + "balance_loss_clip": 1.02093637, + "balance_loss_mlp": 1.0264163, + "epoch": 0.05916128062528183, + "flos": 11727953291520.0, + "grad_norm": 1.8307139798266348, + "language_loss": 0.87248522, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.89472783, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 1.0078125, + "step": 984, + "time_per_iteration": 2.4062790870666504 + }, + { + "auxiliary_loss_clip": 0.01124834, + "auxiliary_loss_mlp": 0.01087978, + "balance_loss_clip": 1.02112556, + "balance_loss_mlp": 1.02881384, + "epoch": 0.059221403877949795, + "flos": 21902063212800.0, + "grad_norm": 2.085748640143121, + "language_loss": 0.7996968, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.82182491, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.9609375, + "step": 985, + "time_per_iteration": 2.4216248989105225 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01095754, + "balance_loss_clip": 1.02880573, + "balance_loss_mlp": 1.02739823, + "epoch": 0.05928152713061777, + "flos": 20513742065280.0, + "grad_norm": 1.824564772438856, + "language_loss": 0.92660165, + "learning_rate": 3.991013265915661e-06, + "loss": 0.94885588, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 1.0234375, + "step": 986, + "time_per_iteration": 2.42228364944458 + }, + { + "auxiliary_loss_clip": 0.01130297, + "auxiliary_loss_mlp": 0.0109569, + "balance_loss_clip": 1.02115989, + "balance_loss_mlp": 1.02724624, + "epoch": 0.05934165038328574, + "flos": 24494084904960.0, + "grad_norm": 1.9736339688601048, + "language_loss": 0.78733784, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.80959773, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.03125, + "step": 987, + "time_per_iteration": 2.449246644973755 + }, + { + "auxiliary_loss_clip": 0.01132334, + "auxiliary_loss_mlp": 0.01091562, + "balance_loss_clip": 1.0168891, + "balance_loss_mlp": 1.02811456, + "epoch": 0.059401773635953704, + "flos": 38726498465280.0, + "grad_norm": 1.8510213112989038, + "language_loss": 0.75301909, + "learning_rate": 3.990939357235621e-06, + "loss": 0.77525806, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.046875, + "step": 988, + "time_per_iteration": 2.591006278991699 + }, + { + "auxiliary_loss_clip": 0.01040563, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.00676167, + "balance_loss_mlp": 1.01165795, + "epoch": 0.059461896888621676, + "flos": 58020618539520.0, + "grad_norm": 0.9645505870189149, + "language_loss": 0.71348822, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73412079, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.2890625, + "step": 989, + "time_per_iteration": 2.898162841796875 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01102607, + "balance_loss_clip": 1.02593184, + "balance_loss_mlp": 1.03012383, + "epoch": 0.05952202014128964, + "flos": 22126659189120.0, + "grad_norm": 2.1417111949179812, + "language_loss": 0.83003402, + "learning_rate": 3.990865146569105e-06, + "loss": 0.85239238, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 1.03125, + "step": 990, + "time_per_iteration": 2.4373950958251953 + }, + { + "auxiliary_loss_clip": 0.01125331, + "auxiliary_loss_mlp": 0.01092499, + "balance_loss_clip": 1.01715875, + "balance_loss_mlp": 1.02652311, + "epoch": 0.059582143393957614, + "flos": 20444823308160.0, + "grad_norm": 2.1669255441445934, + "language_loss": 0.87995899, + "learning_rate": 3.990827927994434e-06, + "loss": 0.90213722, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.98828125, + "step": 991, + "time_per_iteration": 2.427744150161743 + }, + { + "auxiliary_loss_clip": 0.01131123, + "auxiliary_loss_mlp": 0.01089779, + "balance_loss_clip": 1.01992249, + "balance_loss_mlp": 1.02690172, + "epoch": 0.059642266646625586, + "flos": 20593832457600.0, + "grad_norm": 1.7707128244897044, + "language_loss": 0.80476785, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.82697684, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 1.0390625, + "step": 992, + "time_per_iteration": 2.4672327041625977 + }, + { + "auxiliary_loss_clip": 0.01129771, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_clip": 1.02878189, + "balance_loss_mlp": 1.02716327, + "epoch": 0.05970238989929355, + "flos": 19351692639360.0, + "grad_norm": 2.3491214801954032, + "language_loss": 0.78657597, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.80882764, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 1.0234375, + "step": 993, + "time_per_iteration": 2.3871331214904785 + }, + { + "auxiliary_loss_clip": 0.01124075, + "auxiliary_loss_mlp": 0.01097182, + "balance_loss_clip": 1.02570462, + "balance_loss_mlp": 1.02677011, + "epoch": 0.05976251315196152, + "flos": 30262713960960.0, + "grad_norm": 2.0933487783578064, + "language_loss": 0.82146722, + "learning_rate": 3.990715819321712e-06, + "loss": 0.84367979, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.97265625, + "step": 994, + "time_per_iteration": 2.4808077812194824 + }, + { + "auxiliary_loss_clip": 0.01129265, + "auxiliary_loss_mlp": 0.0110893, + "balance_loss_clip": 1.0372138, + "balance_loss_mlp": 1.02835047, + "epoch": 0.05982263640462949, + "flos": 23184038759040.0, + "grad_norm": 2.9910757369075758, + "language_loss": 0.82906139, + "learning_rate": 3.99067829878596e-06, + "loss": 0.85144335, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 1.015625, + "step": 995, + "time_per_iteration": 2.4094972610473633 + }, + { + "auxiliary_loss_clip": 0.01124242, + "auxiliary_loss_mlp": 0.01106382, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.02604425, + "epoch": 0.05988275965729746, + "flos": 27849761965440.0, + "grad_norm": 2.160637101156907, + "language_loss": 0.89495534, + "learning_rate": 3.990640702763487e-06, + "loss": 0.9172616, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.984375, + "step": 996, + "time_per_iteration": 2.4765777587890625 + }, + { + "auxiliary_loss_clip": 0.01126012, + "auxiliary_loss_mlp": 0.01098069, + "balance_loss_clip": 1.02663851, + "balance_loss_mlp": 1.02726793, + "epoch": 0.05994288290996543, + "flos": 24678880064640.0, + "grad_norm": 2.783047225703568, + "language_loss": 0.91457498, + "learning_rate": 3.990603031255718e-06, + "loss": 0.93681592, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.98828125, + "step": 997, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01041926, + "auxiliary_loss_mlp": 0.01020887, + "balance_loss_clip": 1.00534177, + "balance_loss_mlp": 1.00967336, + "epoch": 0.0600030061626334, + "flos": 69925965782400.0, + "grad_norm": 1.0421868728509203, + "language_loss": 0.75635278, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77698088, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.32226562, + "step": 998, + "time_per_iteration": 3.1077888011932373 + }, + { + "auxiliary_loss_clip": 0.01119311, + "auxiliary_loss_mlp": 0.01087833, + "balance_loss_clip": 1.02388883, + "balance_loss_mlp": 1.02611005, + "epoch": 0.06006312941530137, + "flos": 26538982680960.0, + "grad_norm": 2.0418756904794084, + "language_loss": 0.7906177, + "learning_rate": 3.990527461790013e-06, + "loss": 0.81268913, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.9296875, + "step": 999, + "time_per_iteration": 2.440178394317627 + }, + { + "auxiliary_loss_clip": 0.01129103, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_clip": 1.018139, + "balance_loss_mlp": 1.02713966, + "epoch": 0.060123252667969335, + "flos": 27342787979520.0, + "grad_norm": 1.705774846965125, + "language_loss": 0.8448599, + "learning_rate": 3.990489563834943e-06, + "loss": 0.86706334, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 1.015625, + "step": 1000, + "time_per_iteration": 2.4486825466156006 + }, + { + "auxiliary_loss_clip": 0.01123979, + "auxiliary_loss_mlp": 0.01092467, + "balance_loss_clip": 1.02103686, + "balance_loss_mlp": 1.02594709, + "epoch": 0.06018337592063731, + "flos": 27015477183360.0, + "grad_norm": 2.0880864904568814, + "language_loss": 0.89541829, + "learning_rate": 3.990451590400309e-06, + "loss": 0.91758275, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.98046875, + "step": 1001, + "time_per_iteration": 2.4317262172698975 + }, + { + "auxiliary_loss_clip": 0.01125175, + "auxiliary_loss_mlp": 0.01088186, + "balance_loss_clip": 1.01952112, + "balance_loss_mlp": 1.02728069, + "epoch": 0.06024349917330528, + "flos": 25591788961920.0, + "grad_norm": 2.0395459777277427, + "language_loss": 0.76422989, + "learning_rate": 3.990413541487551e-06, + "loss": 0.78636342, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.98046875, + "step": 1002, + "time_per_iteration": 2.4245893955230713 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01094445, + "balance_loss_clip": 1.01886606, + "balance_loss_mlp": 1.02682328, + "epoch": 0.060303622425973244, + "flos": 26132279719680.0, + "grad_norm": 2.331451198561699, + "language_loss": 0.79068947, + "learning_rate": 3.990375417098112e-06, + "loss": 0.81290758, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 1.0, + "step": 1003, + "time_per_iteration": 3.8609912395477295 + }, + { + "auxiliary_loss_clip": 0.01133453, + "auxiliary_loss_mlp": 0.01103605, + "balance_loss_clip": 1.02340078, + "balance_loss_mlp": 1.02931833, + "epoch": 0.060363745678641216, + "flos": 20376114019200.0, + "grad_norm": 2.629000738335318, + "language_loss": 0.73495626, + "learning_rate": 3.990337217233437e-06, + "loss": 0.75732684, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 1.046875, + "step": 1004, + "time_per_iteration": 2.411811351776123 + }, + { + "auxiliary_loss_clip": 0.0113358, + "auxiliary_loss_mlp": 0.01103084, + "balance_loss_clip": 1.0242151, + "balance_loss_mlp": 1.02763677, + "epoch": 0.06042386893130918, + "flos": 17748201582720.0, + "grad_norm": 2.7364271437941654, + "language_loss": 0.86672491, + "learning_rate": 3.990298941894976e-06, + "loss": 0.88909155, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 1.0625, + "step": 1005, + "time_per_iteration": 2.40199613571167 + }, + { + "auxiliary_loss_clip": 0.01044407, + "auxiliary_loss_mlp": 0.01021358, + "balance_loss_clip": 1.00161648, + "balance_loss_mlp": 1.01261878, + "epoch": 0.06048399218397715, + "flos": 68535061194240.0, + "grad_norm": 0.9332703699946714, + "language_loss": 0.59206235, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61271989, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.31835938, + "step": 1006, + "time_per_iteration": 4.595729112625122 + }, + { + "auxiliary_loss_clip": 0.01127408, + "auxiliary_loss_mlp": 0.0109366, + "balance_loss_clip": 1.01679337, + "balance_loss_mlp": 1.02574062, + "epoch": 0.060544115436645125, + "flos": 23257391258880.0, + "grad_norm": 2.0632220513829975, + "language_loss": 0.78616261, + "learning_rate": 3.990222164802503e-06, + "loss": 0.80837327, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 1.015625, + "step": 1007, + "time_per_iteration": 3.910252571105957 + }, + { + "auxiliary_loss_clip": 0.01128302, + "auxiliary_loss_mlp": 0.01096327, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.02737665, + "epoch": 0.06060423868931309, + "flos": 23877309093120.0, + "grad_norm": 1.8110772081952613, + "language_loss": 0.839454, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.86170024, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 1.0078125, + "step": 1008, + "time_per_iteration": 2.4718194007873535 + }, + { + "auxiliary_loss_clip": 0.0112531, + "auxiliary_loss_mlp": 0.0109185, + "balance_loss_clip": 1.02142155, + "balance_loss_mlp": 1.02681816, + "epoch": 0.06066436194198106, + "flos": 18727236328320.0, + "grad_norm": 1.680063284662544, + "language_loss": 0.81262296, + "learning_rate": 3.990145085832335e-06, + "loss": 0.83479458, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.984375, + "step": 1009, + "time_per_iteration": 2.403305768966675 + }, + { + "auxiliary_loss_clip": 0.01120654, + "auxiliary_loss_mlp": 0.01093241, + "balance_loss_clip": 1.02152514, + "balance_loss_mlp": 1.02543366, + "epoch": 0.06072448519464903, + "flos": 24639428361600.0, + "grad_norm": 1.7696400988824703, + "language_loss": 0.9448337, + "learning_rate": 3.990106433146769e-06, + "loss": 0.96697271, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.953125, + "step": 1010, + "time_per_iteration": 2.4620039463043213 + }, + { + "auxiliary_loss_clip": 0.01135929, + "auxiliary_loss_mlp": 0.01099439, + "balance_loss_clip": 1.02009344, + "balance_loss_mlp": 1.02835917, + "epoch": 0.060784608447317, + "flos": 17378017770240.0, + "grad_norm": 2.222828808431505, + "language_loss": 0.75482929, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.777183, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 1.078125, + "step": 1011, + "time_per_iteration": 2.4503891468048096 + }, + { + "auxiliary_loss_clip": 0.01125509, + "auxiliary_loss_mlp": 0.0109771, + "balance_loss_clip": 1.02313316, + "balance_loss_mlp": 1.02678943, + "epoch": 0.06084473169998497, + "flos": 23691187301760.0, + "grad_norm": 1.7846043873876567, + "language_loss": 0.89260995, + "learning_rate": 3.990028901381999e-06, + "loss": 0.91484219, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.98828125, + "step": 1012, + "time_per_iteration": 2.4589455127716064 + }, + { + "auxiliary_loss_clip": 0.01124339, + "auxiliary_loss_mlp": 0.01093836, + "balance_loss_clip": 1.02231002, + "balance_loss_mlp": 1.02477717, + "epoch": 0.06090485495265294, + "flos": 23545320174720.0, + "grad_norm": 1.7186659283016437, + "language_loss": 0.79806948, + "learning_rate": 3.989990022305734e-06, + "loss": 0.82025123, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.99609375, + "step": 1013, + "time_per_iteration": 2.413548707962036 + }, + { + "auxiliary_loss_clip": 0.01130347, + "auxiliary_loss_mlp": 0.01100286, + "balance_loss_clip": 1.02547002, + "balance_loss_mlp": 1.02708149, + "epoch": 0.06096497820532091, + "flos": 20338268238720.0, + "grad_norm": 2.2365622617191647, + "language_loss": 0.8890422, + "learning_rate": 3.98995106776885e-06, + "loss": 0.91134852, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.03125, + "step": 1014, + "time_per_iteration": 2.461885452270508 + }, + { + "auxiliary_loss_clip": 0.01132685, + "auxiliary_loss_mlp": 0.01098485, + "balance_loss_clip": 1.02552927, + "balance_loss_mlp": 1.02941501, + "epoch": 0.061025101457988874, + "flos": 26937935320320.0, + "grad_norm": 1.9892464152835103, + "language_loss": 0.77418143, + "learning_rate": 3.98991203777282e-06, + "loss": 0.79649317, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 1.03125, + "step": 1015, + "time_per_iteration": 2.461822509765625 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01093237, + "balance_loss_clip": 1.0259552, + "balance_loss_mlp": 1.02563667, + "epoch": 0.061085224710656846, + "flos": 25373861055360.0, + "grad_norm": 1.9335358993192098, + "language_loss": 0.81727445, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.83939916, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.9375, + "step": 1016, + "time_per_iteration": 2.518801689147949 + }, + { + "auxiliary_loss_clip": 0.01124394, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_clip": 1.01463199, + "balance_loss_mlp": 1.02592719, + "epoch": 0.06114534796332482, + "flos": 24823664939520.0, + "grad_norm": 1.9487878983565214, + "language_loss": 0.78400135, + "learning_rate": 3.989833751409254e-06, + "loss": 0.80608064, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.984375, + "step": 1017, + "time_per_iteration": 2.432973861694336 + }, + { + "auxiliary_loss_clip": 0.01135069, + "auxiliary_loss_mlp": 0.01108516, + "balance_loss_clip": 1.03241277, + "balance_loss_mlp": 1.0312767, + "epoch": 0.061205471215992784, + "flos": 20630386517760.0, + "grad_norm": 1.6789295244751747, + "language_loss": 0.88619113, + "learning_rate": 3.989794495044685e-06, + "loss": 0.90862697, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 1.03125, + "step": 1018, + "time_per_iteration": 2.427917957305908 + }, + { + "auxiliary_loss_clip": 0.01121603, + "auxiliary_loss_mlp": 0.01100773, + "balance_loss_clip": 1.03120232, + "balance_loss_mlp": 1.02634573, + "epoch": 0.061265594468660756, + "flos": 16507423307520.0, + "grad_norm": 3.73986103754787, + "language_loss": 0.8295331, + "learning_rate": 3.989755163226909e-06, + "loss": 0.85175681, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.953125, + "step": 1019, + "time_per_iteration": 2.4019839763641357 + }, + { + "auxiliary_loss_clip": 0.01125408, + "auxiliary_loss_mlp": 0.01091075, + "balance_loss_clip": 1.02026463, + "balance_loss_mlp": 1.02759457, + "epoch": 0.06132571772132872, + "flos": 26245118833920.0, + "grad_norm": 1.8426924374851184, + "language_loss": 0.85705459, + "learning_rate": 3.989715755957418e-06, + "loss": 0.87921941, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.9765625, + "step": 1020, + "time_per_iteration": 2.4584877490997314 + }, + { + "auxiliary_loss_clip": 0.01122658, + "auxiliary_loss_mlp": 0.01089941, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.02635193, + "epoch": 0.06138584097399669, + "flos": 37413275385600.0, + "grad_norm": 1.9112892155786403, + "language_loss": 0.81077987, + "learning_rate": 3.989676273237705e-06, + "loss": 0.83290589, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.96484375, + "step": 1021, + "time_per_iteration": 2.543074607849121 + }, + { + "auxiliary_loss_clip": 0.01120304, + "auxiliary_loss_mlp": 0.01093257, + "balance_loss_clip": 1.02692902, + "balance_loss_mlp": 1.02532387, + "epoch": 0.061445964226664665, + "flos": 17419703800320.0, + "grad_norm": 1.9207476478332044, + "language_loss": 0.90067899, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.92281461, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.94921875, + "step": 1022, + "time_per_iteration": 2.3972439765930176 + }, + { + "auxiliary_loss_clip": 0.01120043, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_clip": 1.01894331, + "balance_loss_mlp": 1.02614224, + "epoch": 0.06150608747933263, + "flos": 22598964328320.0, + "grad_norm": 1.5899042550673956, + "language_loss": 0.84694982, + "learning_rate": 3.989597081453611e-06, + "loss": 0.86900198, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.94140625, + "step": 1023, + "time_per_iteration": 2.425142526626587 + }, + { + "auxiliary_loss_clip": 0.01042381, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.01723135, + "balance_loss_mlp": 1.01605856, + "epoch": 0.0615662107320006, + "flos": 56738712816000.0, + "grad_norm": 0.9173402749138183, + "language_loss": 0.65254587, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67328978, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.26367188, + "step": 1024, + "time_per_iteration": 3.051473617553711 + }, + { + "auxiliary_loss_clip": 0.01123627, + "auxiliary_loss_mlp": 0.01091455, + "balance_loss_clip": 1.02102637, + "balance_loss_mlp": 1.02805865, + "epoch": 0.06162633398466857, + "flos": 22563701988480.0, + "grad_norm": 1.929982686054449, + "language_loss": 0.9136706, + "learning_rate": 3.989517587886636e-06, + "loss": 0.93582141, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.953125, + "step": 1025, + "time_per_iteration": 2.418164014816284 + }, + { + "auxiliary_loss_clip": 0.01121206, + "auxiliary_loss_mlp": 0.0108356, + "balance_loss_clip": 1.01797104, + "balance_loss_mlp": 1.02517581, + "epoch": 0.06168645723733654, + "flos": 25591928607360.0, + "grad_norm": 1.5744310892839763, + "language_loss": 0.86198819, + "learning_rate": 3.989477727938335e-06, + "loss": 0.88403589, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.9609375, + "step": 1026, + "time_per_iteration": 2.4618115425109863 + }, + { + "auxiliary_loss_clip": 0.01125572, + "auxiliary_loss_mlp": 0.01096842, + "balance_loss_clip": 1.02813005, + "balance_loss_mlp": 1.02566648, + "epoch": 0.06174658049000451, + "flos": 15996993096960.0, + "grad_norm": 2.0899207214694018, + "language_loss": 0.85217607, + "learning_rate": 3.989437792548839e-06, + "loss": 0.8744002, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 1.0, + "step": 1027, + "time_per_iteration": 2.383760690689087 + }, + { + "auxiliary_loss_clip": 0.01120497, + "auxiliary_loss_mlp": 0.01090677, + "balance_loss_clip": 1.02816403, + "balance_loss_mlp": 1.02672338, + "epoch": 0.06180670374267248, + "flos": 11285324674560.0, + "grad_norm": 2.2859821390063115, + "language_loss": 0.86628294, + "learning_rate": 3.989397781719663e-06, + "loss": 0.88839465, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.9375, + "step": 1028, + "time_per_iteration": 2.3931970596313477 + }, + { + "auxiliary_loss_clip": 0.01040075, + "auxiliary_loss_mlp": 0.01024563, + "balance_loss_clip": 1.01130724, + "balance_loss_mlp": 1.01365328, + "epoch": 0.06186682699534045, + "flos": 65127224695680.0, + "grad_norm": 0.9543270552395677, + "language_loss": 0.60728455, + "learning_rate": 3.989357695452323e-06, + "loss": 0.627931, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.265625, + "step": 1029, + "time_per_iteration": 2.911198377609253 + }, + { + "auxiliary_loss_clip": 0.01122117, + "auxiliary_loss_mlp": 0.0110372, + "balance_loss_clip": 1.0340066, + "balance_loss_mlp": 1.02625489, + "epoch": 0.061926950248008414, + "flos": 21104681604480.0, + "grad_norm": 1.9962560984165834, + "language_loss": 0.85088837, + "learning_rate": 3.98931753374834e-06, + "loss": 0.87314671, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.95703125, + "step": 1030, + "time_per_iteration": 2.4927802085876465 + }, + { + "auxiliary_loss_clip": 0.01126456, + "auxiliary_loss_mlp": 0.01111307, + "balance_loss_clip": 1.03982925, + "balance_loss_mlp": 1.02838099, + "epoch": 0.061987073500676386, + "flos": 17747503355520.0, + "grad_norm": 2.381949717331762, + "language_loss": 0.84579551, + "learning_rate": 3.989277296609237e-06, + "loss": 0.86817312, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.98046875, + "step": 1031, + "time_per_iteration": 2.5611655712127686 + }, + { + "auxiliary_loss_clip": 0.01119073, + "auxiliary_loss_mlp": 0.01102307, + "balance_loss_clip": 1.03755212, + "balance_loss_mlp": 1.02531815, + "epoch": 0.06204719675334436, + "flos": 21835134403200.0, + "grad_norm": 1.4828372411818636, + "language_loss": 0.78956044, + "learning_rate": 3.98923698403654e-06, + "loss": 0.81177419, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.9375, + "step": 1032, + "time_per_iteration": 2.5441999435424805 + }, + { + "auxiliary_loss_clip": 0.01128614, + "auxiliary_loss_mlp": 0.01093413, + "balance_loss_clip": 1.02675104, + "balance_loss_mlp": 1.02712679, + "epoch": 0.06210732000601232, + "flos": 19352705068800.0, + "grad_norm": 2.128034416515746, + "language_loss": 0.92314547, + "learning_rate": 3.989196596031776e-06, + "loss": 0.94536573, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 1.015625, + "step": 1033, + "time_per_iteration": 2.51632022857666 + }, + { + "auxiliary_loss_clip": 0.01125029, + "auxiliary_loss_mlp": 0.01094553, + "balance_loss_clip": 1.03439975, + "balance_loss_mlp": 1.02745986, + "epoch": 0.062167443258680295, + "flos": 24748357403520.0, + "grad_norm": 2.2873166466435784, + "language_loss": 0.88127214, + "learning_rate": 3.989156132596479e-06, + "loss": 0.90346795, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.9765625, + "step": 1034, + "time_per_iteration": 2.508610486984253 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01085141, + "balance_loss_clip": 1.02632368, + "balance_loss_mlp": 1.02681756, + "epoch": 0.06222756651134827, + "flos": 34457074634880.0, + "grad_norm": 1.8945276774460322, + "language_loss": 0.83572543, + "learning_rate": 3.989115593732182e-06, + "loss": 0.85774082, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8984375, + "step": 1035, + "time_per_iteration": 2.509218454360962 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01078861, + "balance_loss_clip": 1.01563263, + "balance_loss_mlp": 1.03003216, + "epoch": 0.06228768976401623, + "flos": 25665281107200.0, + "grad_norm": 1.8757286985992025, + "language_loss": 0.81553674, + "learning_rate": 3.989074979440421e-06, + "loss": 0.83756799, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.94140625, + "step": 1036, + "time_per_iteration": 2.500920295715332 + }, + { + "auxiliary_loss_clip": 0.01120647, + "auxiliary_loss_mlp": 0.010817, + "balance_loss_clip": 1.02149892, + "balance_loss_mlp": 1.02839124, + "epoch": 0.062347813016684205, + "flos": 25294608535680.0, + "grad_norm": 1.840717521557544, + "language_loss": 0.8892284, + "learning_rate": 3.989034289722739e-06, + "loss": 0.9112519, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.921875, + "step": 1037, + "time_per_iteration": 2.5002129077911377 + }, + { + "auxiliary_loss_clip": 0.01123696, + "auxiliary_loss_mlp": 0.01092036, + "balance_loss_clip": 1.02456403, + "balance_loss_mlp": 1.03040874, + "epoch": 0.06240793626935217, + "flos": 26905815002880.0, + "grad_norm": 2.066469722728655, + "language_loss": 0.84340858, + "learning_rate": 3.988993524580676e-06, + "loss": 0.8655659, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.9296875, + "step": 1038, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.01124296, + "auxiliary_loss_mlp": 0.0109562, + "balance_loss_clip": 1.03155708, + "balance_loss_mlp": 1.0315671, + "epoch": 0.06246805952202014, + "flos": 21614727790080.0, + "grad_norm": 2.079839169787104, + "language_loss": 0.89440054, + "learning_rate": 3.98895268401578e-06, + "loss": 0.91659975, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.9296875, + "step": 1039, + "time_per_iteration": 2.464094638824463 + }, + { + "auxiliary_loss_clip": 0.01121981, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.02666807, + "balance_loss_mlp": 1.02759457, + "epoch": 0.0625281827746881, + "flos": 19311053950080.0, + "grad_norm": 2.024780827523462, + "language_loss": 0.83123702, + "learning_rate": 3.9889117680296e-06, + "loss": 0.85338205, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.9453125, + "step": 1040, + "time_per_iteration": 2.4867732524871826 + }, + { + "auxiliary_loss_clip": 0.01120928, + "auxiliary_loss_mlp": 0.01091156, + "balance_loss_clip": 1.03031123, + "balance_loss_mlp": 1.03144467, + "epoch": 0.06258830602735609, + "flos": 27744533527680.0, + "grad_norm": 2.3310679075405, + "language_loss": 0.73142481, + "learning_rate": 3.988870776623685e-06, + "loss": 0.75354564, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.89453125, + "step": 1041, + "time_per_iteration": 2.522988796234131 + }, + { + "auxiliary_loss_clip": 0.01119462, + "auxiliary_loss_mlp": 0.010963, + "balance_loss_clip": 1.03006768, + "balance_loss_mlp": 1.02643991, + "epoch": 0.06264842928002405, + "flos": 23221465603200.0, + "grad_norm": 1.963326904830616, + "language_loss": 0.84113556, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.86329317, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.9296875, + "step": 1042, + "time_per_iteration": 2.454434633255005 + }, + { + "auxiliary_loss_clip": 0.01121524, + "auxiliary_loss_mlp": 0.01090381, + "balance_loss_clip": 1.03072917, + "balance_loss_mlp": 1.0271163, + "epoch": 0.06270855253269202, + "flos": 38397965771520.0, + "grad_norm": 1.6266546397167794, + "language_loss": 0.79744434, + "learning_rate": 3.988788567558874e-06, + "loss": 0.81956339, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.9453125, + "step": 1043, + "time_per_iteration": 4.0658957958221436 + }, + { + "auxiliary_loss_clip": 0.01114728, + "auxiliary_loss_mlp": 0.01079668, + "balance_loss_clip": 1.02123189, + "balance_loss_mlp": 1.02563965, + "epoch": 0.06276867578535998, + "flos": 22452503708160.0, + "grad_norm": 1.97361183981175, + "language_loss": 0.95319337, + "learning_rate": 3.988747349903097e-06, + "loss": 0.97513735, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.890625, + "step": 1044, + "time_per_iteration": 2.4186480045318604 + }, + { + "auxiliary_loss_clip": 0.01120944, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_clip": 1.02355337, + "balance_loss_mlp": 1.02598763, + "epoch": 0.06282879903802796, + "flos": 22929312412800.0, + "grad_norm": 1.936695404506749, + "language_loss": 0.8825779, + "learning_rate": 3.988706056833821e-06, + "loss": 0.9046396, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.94921875, + "step": 1045, + "time_per_iteration": 2.4632999897003174 + }, + { + "auxiliary_loss_clip": 0.01115101, + "auxiliary_loss_mlp": 0.01082772, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.02473474, + "epoch": 0.06288892229069593, + "flos": 34817937114240.0, + "grad_norm": 2.219264745329713, + "language_loss": 0.81954104, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.84151971, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.90625, + "step": 1046, + "time_per_iteration": 5.377129316329956 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01093742, + "balance_loss_clip": 1.03168166, + "balance_loss_mlp": 1.0258224, + "epoch": 0.06294904554336389, + "flos": 19426127391360.0, + "grad_norm": 2.047324705182207, + "language_loss": 0.80351937, + "learning_rate": 3.988623244461039e-06, + "loss": 0.82563967, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.92578125, + "step": 1047, + "time_per_iteration": 3.8437821865081787 + }, + { + "auxiliary_loss_clip": 0.01126066, + "auxiliary_loss_mlp": 0.01092804, + "balance_loss_clip": 1.02752519, + "balance_loss_mlp": 1.02771652, + "epoch": 0.06300916879603187, + "flos": 40660267783680.0, + "grad_norm": 2.791299523368332, + "language_loss": 0.79515481, + "learning_rate": 3.988581725160672e-06, + "loss": 0.81734347, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.984375, + "step": 1048, + "time_per_iteration": 2.576902389526367 + }, + { + "auxiliary_loss_clip": 0.01122805, + "auxiliary_loss_mlp": 0.01087589, + "balance_loss_clip": 1.02552867, + "balance_loss_mlp": 1.02606678, + "epoch": 0.06306929204869983, + "flos": 23803048897920.0, + "grad_norm": 2.0886795309169455, + "language_loss": 0.80409116, + "learning_rate": 3.988540130453087e-06, + "loss": 0.82619512, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.96875, + "step": 1049, + "time_per_iteration": 2.4208083152770996 + }, + { + "auxiliary_loss_clip": 0.01119807, + "auxiliary_loss_mlp": 0.01092957, + "balance_loss_clip": 1.02505553, + "balance_loss_mlp": 1.02550459, + "epoch": 0.0631294153013678, + "flos": 18914824396800.0, + "grad_norm": 2.0890934669558656, + "language_loss": 0.85480368, + "learning_rate": 3.988498460339862e-06, + "loss": 0.87693131, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.9453125, + "step": 1050, + "time_per_iteration": 2.4068799018859863 + }, + { + "auxiliary_loss_clip": 0.01117889, + "auxiliary_loss_mlp": 0.01074428, + "balance_loss_clip": 1.01751745, + "balance_loss_mlp": 1.02725577, + "epoch": 0.06318953855403578, + "flos": 24279019729920.0, + "grad_norm": 1.7001516318288077, + "language_loss": 0.79598558, + "learning_rate": 3.988456714822575e-06, + "loss": 0.81790876, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.90625, + "step": 1051, + "time_per_iteration": 2.4582767486572266 + }, + { + "auxiliary_loss_clip": 0.01123244, + "auxiliary_loss_mlp": 0.01092525, + "balance_loss_clip": 1.02853346, + "balance_loss_mlp": 1.02813864, + "epoch": 0.06324966180670374, + "flos": 22527811244160.0, + "grad_norm": 2.3002736669031187, + "language_loss": 0.84092963, + "learning_rate": 3.98841489390281e-06, + "loss": 0.8630873, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.953125, + "step": 1052, + "time_per_iteration": 2.4346559047698975 + }, + { + "auxiliary_loss_clip": 0.01122476, + "auxiliary_loss_mlp": 0.01088316, + "balance_loss_clip": 1.02537346, + "balance_loss_mlp": 1.02801478, + "epoch": 0.06330978505937171, + "flos": 15777214888320.0, + "grad_norm": 1.9936604449039912, + "language_loss": 0.82026523, + "learning_rate": 3.988372997582155e-06, + "loss": 0.84237313, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.9453125, + "step": 1053, + "time_per_iteration": 2.3937292098999023 + }, + { + "auxiliary_loss_clip": 0.01122594, + "auxiliary_loss_mlp": 0.01082185, + "balance_loss_clip": 1.01757371, + "balance_loss_mlp": 1.02595663, + "epoch": 0.06336990831203967, + "flos": 21470012737920.0, + "grad_norm": 1.8938669357093418, + "language_loss": 0.87145114, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8934989, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.96875, + "step": 1054, + "time_per_iteration": 2.432512044906616 + }, + { + "auxiliary_loss_clip": 0.01121532, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_clip": 1.02472055, + "balance_loss_mlp": 1.02723479, + "epoch": 0.06343003156470765, + "flos": 18477886331520.0, + "grad_norm": 2.2997934050843134, + "language_loss": 0.88589448, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.90795445, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.9453125, + "step": 1055, + "time_per_iteration": 2.374807596206665 + }, + { + "auxiliary_loss_clip": 0.01124597, + "auxiliary_loss_mlp": 0.01094489, + "balance_loss_clip": 1.0252049, + "balance_loss_mlp": 1.02733898, + "epoch": 0.06349015481737562, + "flos": 25153733733120.0, + "grad_norm": 3.6571621070237463, + "language_loss": 0.87692791, + "learning_rate": 3.988246856230734e-06, + "loss": 0.89911878, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.9765625, + "step": 1056, + "time_per_iteration": 2.461125135421753 + }, + { + "auxiliary_loss_clip": 0.0112646, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_clip": 1.02276111, + "balance_loss_mlp": 1.02575731, + "epoch": 0.06355027807004358, + "flos": 26870517751680.0, + "grad_norm": 2.340082951683335, + "language_loss": 0.85647762, + "learning_rate": 3.988204658322426e-06, + "loss": 0.87871367, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 1.0078125, + "step": 1057, + "time_per_iteration": 2.4642553329467773 + }, + { + "auxiliary_loss_clip": 0.01113159, + "auxiliary_loss_mlp": 0.01081281, + "balance_loss_clip": 1.01905417, + "balance_loss_mlp": 1.02528191, + "epoch": 0.06361040132271156, + "flos": 21395647808640.0, + "grad_norm": 1.8343253404194075, + "language_loss": 0.85614014, + "learning_rate": 3.988162385021196e-06, + "loss": 0.87808454, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.87890625, + "step": 1058, + "time_per_iteration": 2.415755033493042 + }, + { + "auxiliary_loss_clip": 0.01120057, + "auxiliary_loss_mlp": 0.01101414, + "balance_loss_clip": 1.027933, + "balance_loss_mlp": 1.02598333, + "epoch": 0.06367052457537953, + "flos": 25732733587200.0, + "grad_norm": 2.228391340296399, + "language_loss": 0.90098339, + "learning_rate": 3.988120036328651e-06, + "loss": 0.92319798, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.9375, + "step": 1059, + "time_per_iteration": 2.419654369354248 + }, + { + "auxiliary_loss_clip": 0.01122992, + "auxiliary_loss_mlp": 0.01080912, + "balance_loss_clip": 1.01582408, + "balance_loss_mlp": 1.02689362, + "epoch": 0.0637306478280475, + "flos": 17630684346240.0, + "grad_norm": 2.3007004233747494, + "language_loss": 0.94364297, + "learning_rate": 3.988077612246394e-06, + "loss": 0.96568203, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.9609375, + "step": 1060, + "time_per_iteration": 2.395505428314209 + }, + { + "auxiliary_loss_clip": 0.0111891, + "auxiliary_loss_mlp": 0.01084301, + "balance_loss_clip": 1.02240729, + "balance_loss_mlp": 1.02564621, + "epoch": 0.06379077108071547, + "flos": 13661757521280.0, + "grad_norm": 1.947431582406713, + "language_loss": 0.90625364, + "learning_rate": 3.988035112776035e-06, + "loss": 0.92828572, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.93359375, + "step": 1061, + "time_per_iteration": 2.3723816871643066 + }, + { + "auxiliary_loss_clip": 0.01123006, + "auxiliary_loss_mlp": 0.01094058, + "balance_loss_clip": 1.02658546, + "balance_loss_mlp": 1.0237143, + "epoch": 0.06385089433338344, + "flos": 28477499944320.0, + "grad_norm": 2.381135429320362, + "language_loss": 0.80243617, + "learning_rate": 3.987992537919185e-06, + "loss": 0.8246069, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.9921875, + "step": 1062, + "time_per_iteration": 2.4985103607177734 + }, + { + "auxiliary_loss_clip": 0.01120905, + "auxiliary_loss_mlp": 0.01081172, + "balance_loss_clip": 1.01617932, + "balance_loss_mlp": 1.02306414, + "epoch": 0.0639110175860514, + "flos": 24310057795200.0, + "grad_norm": 2.0000955970980203, + "language_loss": 0.89178491, + "learning_rate": 3.987949887677459e-06, + "loss": 0.91380566, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.9765625, + "step": 1063, + "time_per_iteration": 2.4254629611968994 + }, + { + "auxiliary_loss_clip": 0.01123622, + "auxiliary_loss_mlp": 0.01091497, + "balance_loss_clip": 1.0242157, + "balance_loss_mlp": 1.02561641, + "epoch": 0.06397114083871938, + "flos": 22089686192640.0, + "grad_norm": 1.96234639238302, + "language_loss": 0.8321234, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.85427451, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.98046875, + "step": 1064, + "time_per_iteration": 2.4293994903564453 + }, + { + "auxiliary_loss_clip": 0.01120403, + "auxiliary_loss_mlp": 0.01090139, + "balance_loss_clip": 1.02567041, + "balance_loss_mlp": 1.02647913, + "epoch": 0.06403126409138735, + "flos": 19571819961600.0, + "grad_norm": 2.0400156021515072, + "language_loss": 0.8702895, + "learning_rate": 3.987864361045851e-06, + "loss": 0.89239496, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.9375, + "step": 1065, + "time_per_iteration": 2.3946800231933594 + }, + { + "auxiliary_loss_clip": 0.01121485, + "auxiliary_loss_mlp": 0.0108212, + "balance_loss_clip": 1.01774669, + "balance_loss_mlp": 1.02662969, + "epoch": 0.06409138734405531, + "flos": 40805820708480.0, + "grad_norm": 1.7385896987598173, + "language_loss": 0.70013899, + "learning_rate": 3.987821484659211e-06, + "loss": 0.722175, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.9453125, + "step": 1066, + "time_per_iteration": 2.5871217250823975 + }, + { + "auxiliary_loss_clip": 0.01122842, + "auxiliary_loss_mlp": 0.01099292, + "balance_loss_clip": 1.02810073, + "balance_loss_mlp": 1.02791452, + "epoch": 0.06415151059672328, + "flos": 20440773590400.0, + "grad_norm": 2.482341930729839, + "language_loss": 0.93501902, + "learning_rate": 3.987778532894181e-06, + "loss": 0.95724034, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.94921875, + "step": 1067, + "time_per_iteration": 2.3952934741973877 + }, + { + "auxiliary_loss_clip": 0.01124211, + "auxiliary_loss_mlp": 0.01089905, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.02650857, + "epoch": 0.06421163384939126, + "flos": 18071218281600.0, + "grad_norm": 2.1168191476421754, + "language_loss": 0.8741498, + "learning_rate": 3.987735505752391e-06, + "loss": 0.89629096, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.9765625, + "step": 1068, + "time_per_iteration": 2.4017884731292725 + }, + { + "auxiliary_loss_clip": 0.01120972, + "auxiliary_loss_mlp": 0.01083717, + "balance_loss_clip": 1.023206, + "balance_loss_mlp": 1.02648103, + "epoch": 0.06427175710205922, + "flos": 25118261925120.0, + "grad_norm": 2.41480106391959, + "language_loss": 0.92427808, + "learning_rate": 3.987692403235471e-06, + "loss": 0.94632494, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.9453125, + "step": 1069, + "time_per_iteration": 2.434234857559204 + }, + { + "auxiliary_loss_clip": 0.01125251, + "auxiliary_loss_mlp": 0.01095476, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.02661729, + "epoch": 0.06433188035472719, + "flos": 17379693515520.0, + "grad_norm": 2.464797247156926, + "language_loss": 0.99638212, + "learning_rate": 3.987649225345056e-06, + "loss": 1.01858938, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.984375, + "step": 1070, + "time_per_iteration": 2.382152557373047 + }, + { + "auxiliary_loss_clip": 0.01125855, + "auxiliary_loss_mlp": 0.01090611, + "balance_loss_clip": 1.02018189, + "balance_loss_mlp": 1.0274061, + "epoch": 0.06439200360739517, + "flos": 23545250352000.0, + "grad_norm": 1.611547189992938, + "language_loss": 0.89884496, + "learning_rate": 3.987605972082782e-06, + "loss": 0.9210096, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.984375, + "step": 1071, + "time_per_iteration": 2.4362175464630127 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01092878, + "balance_loss_clip": 1.02373648, + "balance_loss_mlp": 1.02501035, + "epoch": 0.06445212686006313, + "flos": 21978732291840.0, + "grad_norm": 1.660792292067065, + "language_loss": 0.80094004, + "learning_rate": 3.987562643450292e-06, + "loss": 0.82305497, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.9375, + "step": 1072, + "time_per_iteration": 2.434375286102295 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01092933, + "balance_loss_clip": 1.02417302, + "balance_loss_mlp": 1.02694046, + "epoch": 0.0645122501127311, + "flos": 25920112187520.0, + "grad_norm": 2.112999654607481, + "language_loss": 0.83960772, + "learning_rate": 3.987519239449226e-06, + "loss": 0.86178148, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.9765625, + "step": 1073, + "time_per_iteration": 2.447216510772705 + }, + { + "auxiliary_loss_clip": 0.01121902, + "auxiliary_loss_mlp": 0.01090209, + "balance_loss_clip": 1.02168775, + "balance_loss_mlp": 1.02630675, + "epoch": 0.06457237336539907, + "flos": 25624956620160.0, + "grad_norm": 1.8802980392483422, + "language_loss": 0.82009876, + "learning_rate": 3.987475760081233e-06, + "loss": 0.84221989, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.95703125, + "step": 1074, + "time_per_iteration": 2.4824140071868896 + }, + { + "auxiliary_loss_clip": 0.01121573, + "auxiliary_loss_mlp": 0.01094172, + "balance_loss_clip": 1.02503073, + "balance_loss_mlp": 1.0261662, + "epoch": 0.06463249661806704, + "flos": 19462960742400.0, + "grad_norm": 1.6310999107561297, + "language_loss": 0.82577121, + "learning_rate": 3.987432205347958e-06, + "loss": 0.8479287, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.953125, + "step": 1075, + "time_per_iteration": 2.394707441329956 + }, + { + "auxiliary_loss_clip": 0.01126363, + "auxiliary_loss_mlp": 0.01085825, + "balance_loss_clip": 1.01968789, + "balance_loss_mlp": 1.02775788, + "epoch": 0.064692619870735, + "flos": 24496912725120.0, + "grad_norm": 3.660783459659181, + "language_loss": 0.91883302, + "learning_rate": 3.987388575251055e-06, + "loss": 0.94095492, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.98828125, + "step": 1076, + "time_per_iteration": 2.4476311206817627 + }, + { + "auxiliary_loss_clip": 0.01121398, + "auxiliary_loss_mlp": 0.01085063, + "balance_loss_clip": 1.01921165, + "balance_loss_mlp": 1.02620292, + "epoch": 0.06475274312340297, + "flos": 17017748784000.0, + "grad_norm": 1.8854817509594821, + "language_loss": 0.84750068, + "learning_rate": 3.98734486979218e-06, + "loss": 0.86956525, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.953125, + "step": 1077, + "time_per_iteration": 2.3808188438415527 + }, + { + "auxiliary_loss_clip": 0.01126964, + "auxiliary_loss_mlp": 0.01098882, + "balance_loss_clip": 1.02916861, + "balance_loss_mlp": 1.02873921, + "epoch": 0.06481286637607095, + "flos": 24571207831680.0, + "grad_norm": 2.015731985806654, + "language_loss": 0.94662684, + "learning_rate": 3.987301088972986e-06, + "loss": 0.96888524, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.98046875, + "step": 1078, + "time_per_iteration": 2.44041109085083 + }, + { + "auxiliary_loss_clip": 0.01131238, + "auxiliary_loss_mlp": 0.01097308, + "balance_loss_clip": 1.02730823, + "balance_loss_mlp": 1.02946138, + "epoch": 0.06487298962873891, + "flos": 21104576870400.0, + "grad_norm": 1.8996078223216881, + "language_loss": 0.81335634, + "learning_rate": 3.987257232795137e-06, + "loss": 0.83564186, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 1.015625, + "step": 1079, + "time_per_iteration": 2.547140121459961 + }, + { + "auxiliary_loss_clip": 0.01122766, + "auxiliary_loss_mlp": 0.01092215, + "balance_loss_clip": 1.02393222, + "balance_loss_mlp": 1.02702093, + "epoch": 0.06493311288140688, + "flos": 24607028753280.0, + "grad_norm": 1.6574004791619237, + "language_loss": 0.72627282, + "learning_rate": 3.987213301260294e-06, + "loss": 0.74842262, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.9609375, + "step": 1080, + "time_per_iteration": 2.4899046421051025 + }, + { + "auxiliary_loss_clip": 0.01121816, + "auxiliary_loss_mlp": 0.01081533, + "balance_loss_clip": 1.01787555, + "balance_loss_mlp": 1.02541375, + "epoch": 0.06499323613407486, + "flos": 25336818236160.0, + "grad_norm": 1.8396558955000455, + "language_loss": 0.7595588, + "learning_rate": 3.987169294370123e-06, + "loss": 0.78159231, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.96484375, + "step": 1081, + "time_per_iteration": 2.4479482173919678 + }, + { + "auxiliary_loss_clip": 0.01118475, + "auxiliary_loss_mlp": 0.0108487, + "balance_loss_clip": 1.02097356, + "balance_loss_mlp": 1.02585471, + "epoch": 0.06505335938674282, + "flos": 20374682653440.0, + "grad_norm": 2.3818817102333636, + "language_loss": 0.87467372, + "learning_rate": 3.987125212126294e-06, + "loss": 0.89670718, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.92578125, + "step": 1082, + "time_per_iteration": 2.3888351917266846 + }, + { + "auxiliary_loss_clip": 0.01133713, + "auxiliary_loss_mlp": 0.0110356, + "balance_loss_clip": 1.02912593, + "balance_loss_mlp": 1.02908659, + "epoch": 0.06511348263941079, + "flos": 25336748413440.0, + "grad_norm": 2.062456197724076, + "language_loss": 0.8509053, + "learning_rate": 3.987081054530478e-06, + "loss": 0.87327802, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 1.046875, + "step": 1083, + "time_per_iteration": 3.9143543243408203 + }, + { + "auxiliary_loss_clip": 0.01119878, + "auxiliary_loss_mlp": 0.01083064, + "balance_loss_clip": 1.01988316, + "balance_loss_mlp": 1.0279175, + "epoch": 0.06517360589207877, + "flos": 20331949282560.0, + "grad_norm": 2.651307631539728, + "language_loss": 0.83543038, + "learning_rate": 3.987036821584348e-06, + "loss": 0.85745978, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.91796875, + "step": 1084, + "time_per_iteration": 3.856062173843384 + }, + { + "auxiliary_loss_clip": 0.01119746, + "auxiliary_loss_mlp": 0.01075087, + "balance_loss_clip": 1.0174613, + "balance_loss_mlp": 1.02678192, + "epoch": 0.06523372914474673, + "flos": 31680432339840.0, + "grad_norm": 2.27366259774032, + "language_loss": 0.68827015, + "learning_rate": 3.986992513289584e-06, + "loss": 0.71021849, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.9296875, + "step": 1085, + "time_per_iteration": 2.518594264984131 + }, + { + "auxiliary_loss_clip": 0.011177, + "auxiliary_loss_mlp": 0.01073352, + "balance_loss_clip": 1.01834846, + "balance_loss_mlp": 1.0260303, + "epoch": 0.0652938523974147, + "flos": 20777091517440.0, + "grad_norm": 1.907629077623935, + "language_loss": 0.79065573, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.81256628, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.9140625, + "step": 1086, + "time_per_iteration": 3.844931125640869 + }, + { + "auxiliary_loss_clip": 0.01118694, + "auxiliary_loss_mlp": 0.01072338, + "balance_loss_clip": 1.01819277, + "balance_loss_mlp": 1.02719188, + "epoch": 0.06535397565008266, + "flos": 16690053962880.0, + "grad_norm": 2.1770142014706977, + "language_loss": 0.87295473, + "learning_rate": 3.986903670660872e-06, + "loss": 0.89486504, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.9140625, + "step": 1087, + "time_per_iteration": 3.793470859527588 + }, + { + "auxiliary_loss_clip": 0.01119643, + "auxiliary_loss_mlp": 0.01086772, + "balance_loss_clip": 1.02654719, + "balance_loss_mlp": 1.02743101, + "epoch": 0.06541409890275064, + "flos": 26867061527040.0, + "grad_norm": 1.9955980550976617, + "language_loss": 0.80957699, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.83164108, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.921875, + "step": 1088, + "time_per_iteration": 2.4624691009521484 + }, + { + "auxiliary_loss_clip": 0.01117962, + "auxiliary_loss_mlp": 0.01077206, + "balance_loss_clip": 1.01834023, + "balance_loss_mlp": 1.0256604, + "epoch": 0.06547422215541861, + "flos": 20520584691840.0, + "grad_norm": 1.8567809076853405, + "language_loss": 0.73969108, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.76164275, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.921875, + "step": 1089, + "time_per_iteration": 2.428581476211548 + }, + { + "auxiliary_loss_clip": 0.0111687, + "auxiliary_loss_mlp": 0.01080845, + "balance_loss_clip": 1.02803564, + "balance_loss_mlp": 1.02648377, + "epoch": 0.06553434540808657, + "flos": 22015565642880.0, + "grad_norm": 1.6623054340837042, + "language_loss": 0.8774364, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.89941359, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.90234375, + "step": 1090, + "time_per_iteration": 2.433183193206787 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01077902, + "balance_loss_clip": 1.01877451, + "balance_loss_mlp": 1.02709794, + "epoch": 0.06559446866075455, + "flos": 24607482600960.0, + "grad_norm": 1.8544865479239219, + "language_loss": 0.7470156, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.76896769, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.90234375, + "step": 1091, + "time_per_iteration": 2.447695255279541 + }, + { + "auxiliary_loss_clip": 0.01117971, + "auxiliary_loss_mlp": 0.0108346, + "balance_loss_clip": 1.02428472, + "balance_loss_mlp": 1.02604508, + "epoch": 0.06565459191342252, + "flos": 24273678291840.0, + "grad_norm": 2.286918635892109, + "language_loss": 0.85563594, + "learning_rate": 3.986680245605936e-06, + "loss": 0.87765026, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.921875, + "step": 1092, + "time_per_iteration": 2.4320003986358643 + }, + { + "auxiliary_loss_clip": 0.0112053, + "auxiliary_loss_mlp": 0.01087363, + "balance_loss_clip": 1.02699494, + "balance_loss_mlp": 1.02588415, + "epoch": 0.06571471516609048, + "flos": 24786063538560.0, + "grad_norm": 2.2270837940348107, + "language_loss": 0.73814356, + "learning_rate": 3.986635334582814e-06, + "loss": 0.76022249, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.9453125, + "step": 1093, + "time_per_iteration": 2.4481000900268555 + }, + { + "auxiliary_loss_clip": 0.01118743, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.02113914, + "balance_loss_mlp": 1.02671051, + "epoch": 0.06577483841875846, + "flos": 26212858871040.0, + "grad_norm": 1.604752633499389, + "language_loss": 0.90374291, + "learning_rate": 3.986590348226282e-06, + "loss": 0.92575741, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.921875, + "step": 1094, + "time_per_iteration": 2.4544970989227295 + }, + { + "auxiliary_loss_clip": 0.01120645, + "auxiliary_loss_mlp": 0.01081226, + "balance_loss_clip": 1.02033389, + "balance_loss_mlp": 1.02774262, + "epoch": 0.06583496167142643, + "flos": 25079683006080.0, + "grad_norm": 1.5905818614331495, + "language_loss": 0.83708948, + "learning_rate": 3.986545286538044e-06, + "loss": 0.85910821, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.9296875, + "step": 1095, + "time_per_iteration": 2.4460747241973877 + }, + { + "auxiliary_loss_clip": 0.01118077, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_clip": 1.02241492, + "balance_loss_mlp": 1.02531183, + "epoch": 0.06589508492409439, + "flos": 25628622312960.0, + "grad_norm": 2.1936971704059385, + "language_loss": 0.7455287, + "learning_rate": 3.986500149519811e-06, + "loss": 0.76748407, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.9296875, + "step": 1096, + "time_per_iteration": 2.473275899887085 + }, + { + "auxiliary_loss_clip": 0.0111817, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.01775455, + "balance_loss_mlp": 1.0258646, + "epoch": 0.06595520817676236, + "flos": 23620173863040.0, + "grad_norm": 1.9152610164693724, + "language_loss": 0.79522955, + "learning_rate": 3.986454937173292e-06, + "loss": 0.81711304, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.921875, + "step": 1097, + "time_per_iteration": 2.443470001220703 + }, + { + "auxiliary_loss_clip": 0.01121931, + "auxiliary_loss_mlp": 0.01078039, + "balance_loss_clip": 1.02201068, + "balance_loss_mlp": 1.02572334, + "epoch": 0.06601533142943034, + "flos": 33800323449600.0, + "grad_norm": 2.0040873426898913, + "language_loss": 0.80652416, + "learning_rate": 3.986409649500203e-06, + "loss": 0.82852387, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.9609375, + "step": 1098, + "time_per_iteration": 2.5048789978027344 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01081685, + "balance_loss_clip": 1.0231055, + "balance_loss_mlp": 1.02524841, + "epoch": 0.0660754546820983, + "flos": 20258352403200.0, + "grad_norm": 1.8866886674671082, + "language_loss": 0.84760594, + "learning_rate": 3.986364286502261e-06, + "loss": 0.86957514, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8984375, + "step": 1099, + "time_per_iteration": 2.405137777328491 + }, + { + "auxiliary_loss_clip": 0.01113493, + "auxiliary_loss_mlp": 0.01071534, + "balance_loss_clip": 1.01891482, + "balance_loss_mlp": 1.02531719, + "epoch": 0.06613557793476627, + "flos": 19353158916480.0, + "grad_norm": 1.9630346137601873, + "language_loss": 0.86324638, + "learning_rate": 3.986318848181186e-06, + "loss": 0.88509667, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.87890625, + "step": 1100, + "time_per_iteration": 2.3810274600982666 + }, + { + "auxiliary_loss_clip": 0.0111834, + "auxiliary_loss_mlp": 0.01085337, + "balance_loss_clip": 1.02427745, + "balance_loss_mlp": 1.02663732, + "epoch": 0.06619570118743424, + "flos": 13771698992640.0, + "grad_norm": 2.194754349172351, + "language_loss": 0.76269424, + "learning_rate": 3.986273334538702e-06, + "loss": 0.78473103, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.9140625, + "step": 1101, + "time_per_iteration": 2.4132373332977295 + }, + { + "auxiliary_loss_clip": 0.01115933, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.02053523, + "balance_loss_mlp": 1.02458668, + "epoch": 0.06625582444010221, + "flos": 17856921156480.0, + "grad_norm": 2.2659518661782663, + "language_loss": 0.90404302, + "learning_rate": 3.986227745576533e-06, + "loss": 0.92600191, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.9140625, + "step": 1102, + "time_per_iteration": 2.389693260192871 + }, + { + "auxiliary_loss_clip": 0.01117092, + "auxiliary_loss_mlp": 0.01087484, + "balance_loss_clip": 1.02656841, + "balance_loss_mlp": 1.02725148, + "epoch": 0.06631594769277017, + "flos": 11837894762880.0, + "grad_norm": 2.100413544997422, + "language_loss": 0.85788357, + "learning_rate": 3.98618208129641e-06, + "loss": 0.87992936, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.8984375, + "step": 1103, + "time_per_iteration": 2.3741064071655273 + }, + { + "auxiliary_loss_clip": 0.01113874, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_clip": 1.02304006, + "balance_loss_mlp": 1.0259726, + "epoch": 0.06637607094543815, + "flos": 19792296397440.0, + "grad_norm": 1.7627943560956827, + "language_loss": 0.84274757, + "learning_rate": 3.986136341700063e-06, + "loss": 0.86466408, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.87890625, + "step": 1104, + "time_per_iteration": 2.401047945022583 + }, + { + "auxiliary_loss_clip": 0.01115866, + "auxiliary_loss_mlp": 0.0107462, + "balance_loss_clip": 1.01825738, + "balance_loss_mlp": 1.02511764, + "epoch": 0.06643619419810612, + "flos": 25484430931200.0, + "grad_norm": 1.7037591587565213, + "language_loss": 0.81663072, + "learning_rate": 3.986090526789227e-06, + "loss": 0.83853555, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.90625, + "step": 1105, + "time_per_iteration": 2.456904172897339 + }, + { + "auxiliary_loss_clip": 0.01114306, + "auxiliary_loss_mlp": 0.0107557, + "balance_loss_clip": 1.01944685, + "balance_loss_mlp": 1.02570915, + "epoch": 0.06649631745077408, + "flos": 16945583270400.0, + "grad_norm": 2.06042190957479, + "language_loss": 0.98223692, + "learning_rate": 3.986044636565639e-06, + "loss": 1.00413585, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8828125, + "step": 1106, + "time_per_iteration": 2.3892481327056885 + }, + { + "auxiliary_loss_clip": 0.0112116, + "auxiliary_loss_mlp": 0.01077339, + "balance_loss_clip": 1.01697171, + "balance_loss_mlp": 1.02492881, + "epoch": 0.06655644070344206, + "flos": 17857619383680.0, + "grad_norm": 1.770210665320836, + "language_loss": 0.85268295, + "learning_rate": 3.985998671031039e-06, + "loss": 0.874668, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.9609375, + "step": 1107, + "time_per_iteration": 2.412248134613037 + }, + { + "auxiliary_loss_clip": 0.01036255, + "auxiliary_loss_mlp": 0.01020116, + "balance_loss_clip": 1.00638294, + "balance_loss_mlp": 1.01024175, + "epoch": 0.06661656395611003, + "flos": 61416236062080.0, + "grad_norm": 0.8238038018806919, + "language_loss": 0.56833071, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58889443, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.26171875, + "step": 1108, + "time_per_iteration": 2.9792981147766113 + }, + { + "auxiliary_loss_clip": 0.01119922, + "auxiliary_loss_mlp": 0.01091585, + "balance_loss_clip": 1.02649713, + "balance_loss_mlp": 1.02596259, + "epoch": 0.066676687208778, + "flos": 20661948253440.0, + "grad_norm": 3.487870976022965, + "language_loss": 0.75580716, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.77792221, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.9375, + "step": 1109, + "time_per_iteration": 2.4156947135925293 + }, + { + "auxiliary_loss_clip": 0.01116635, + "auxiliary_loss_mlp": 0.01073149, + "balance_loss_clip": 1.01645315, + "balance_loss_mlp": 1.02510285, + "epoch": 0.06673681046144596, + "flos": 20922225505920.0, + "grad_norm": 1.602397685293025, + "language_loss": 0.80923939, + "learning_rate": 3.985860322578614e-06, + "loss": 0.83113718, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.91796875, + "step": 1110, + "time_per_iteration": 2.4009132385253906 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01084553, + "balance_loss_clip": 1.02461505, + "balance_loss_mlp": 1.02673864, + "epoch": 0.06679693371411394, + "flos": 31064494400640.0, + "grad_norm": 2.013706849262472, + "language_loss": 0.75132871, + "learning_rate": 3.985814055817427e-06, + "loss": 0.77338445, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.94140625, + "step": 1111, + "time_per_iteration": 2.4835853576660156 + }, + { + "auxiliary_loss_clip": 0.01122443, + "auxiliary_loss_mlp": 0.01088815, + "balance_loss_clip": 1.02975917, + "balance_loss_mlp": 1.02592778, + "epoch": 0.0668570569667819, + "flos": 21725053286400.0, + "grad_norm": 1.7691223670096392, + "language_loss": 0.8153646, + "learning_rate": 3.985767713753971e-06, + "loss": 0.83747715, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.96484375, + "step": 1112, + "time_per_iteration": 2.4132843017578125 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.01089497, + "balance_loss_clip": 1.02636349, + "balance_loss_mlp": 1.02671099, + "epoch": 0.06691718021944987, + "flos": 22746158087040.0, + "grad_norm": 2.012456003082321, + "language_loss": 0.84165847, + "learning_rate": 3.985721296390005e-06, + "loss": 0.86373967, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.91796875, + "step": 1113, + "time_per_iteration": 2.424727439880371 + }, + { + "auxiliary_loss_clip": 0.01113045, + "auxiliary_loss_mlp": 0.0108052, + "balance_loss_clip": 1.01743436, + "balance_loss_mlp": 1.023929, + "epoch": 0.06697730347211785, + "flos": 16544675594880.0, + "grad_norm": 1.8507004345553795, + "language_loss": 0.85286343, + "learning_rate": 3.985674803727289e-06, + "loss": 0.87479907, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.890625, + "step": 1114, + "time_per_iteration": 2.4428517818450928 + }, + { + "auxiliary_loss_clip": 0.0103323, + "auxiliary_loss_mlp": 0.01023788, + "balance_loss_clip": 1.0107224, + "balance_loss_mlp": 1.00802636, + "epoch": 0.06703742672478581, + "flos": 59779123499520.0, + "grad_norm": 0.852720138796611, + "language_loss": 0.5836007, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60417086, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.25195312, + "step": 1115, + "time_per_iteration": 2.9749999046325684 + }, + { + "auxiliary_loss_clip": 0.01124546, + "auxiliary_loss_mlp": 0.0109687, + "balance_loss_clip": 1.0301609, + "balance_loss_mlp": 1.02850115, + "epoch": 0.06709754997745378, + "flos": 16799262295680.0, + "grad_norm": 2.972201632054534, + "language_loss": 0.94455981, + "learning_rate": 3.985581592512658e-06, + "loss": 0.96677411, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.9609375, + "step": 1116, + "time_per_iteration": 2.3827598094940186 + }, + { + "auxiliary_loss_clip": 0.01122696, + "auxiliary_loss_mlp": 0.01083418, + "balance_loss_clip": 1.02371812, + "balance_loss_mlp": 1.02691305, + "epoch": 0.06715767323012176, + "flos": 22122923673600.0, + "grad_norm": 1.7840851487949978, + "language_loss": 0.89795351, + "learning_rate": 3.985534873964279e-06, + "loss": 0.92001468, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.95703125, + "step": 1117, + "time_per_iteration": 2.4365878105163574 + }, + { + "auxiliary_loss_clip": 0.01034245, + "auxiliary_loss_mlp": 0.01014564, + "balance_loss_clip": 1.00345373, + "balance_loss_mlp": 1.00915062, + "epoch": 0.06721779648278972, + "flos": 66615363020160.0, + "grad_norm": 0.8653409761736361, + "language_loss": 0.5997349, + "learning_rate": 3.985488080124218e-06, + "loss": 0.62022305, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.25, + "step": 1118, + "time_per_iteration": 2.9569766521453857 + }, + { + "auxiliary_loss_clip": 0.01122303, + "auxiliary_loss_mlp": 0.01081181, + "balance_loss_clip": 1.02176738, + "balance_loss_mlp": 1.02562976, + "epoch": 0.06727791973545769, + "flos": 22381385535360.0, + "grad_norm": 2.649699368813853, + "language_loss": 0.87965488, + "learning_rate": 3.985441210994251e-06, + "loss": 0.90168977, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.96875, + "step": 1119, + "time_per_iteration": 2.4172894954681396 + }, + { + "auxiliary_loss_clip": 0.01115957, + "auxiliary_loss_mlp": 0.01073688, + "balance_loss_clip": 1.01956713, + "balance_loss_mlp": 1.02567744, + "epoch": 0.06733804298812565, + "flos": 24279054641280.0, + "grad_norm": 1.9909298106634399, + "language_loss": 0.87397975, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.89587623, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.90234375, + "step": 1120, + "time_per_iteration": 2.420795440673828 + }, + { + "auxiliary_loss_clip": 0.01123296, + "auxiliary_loss_mlp": 0.01093124, + "balance_loss_clip": 1.02832222, + "balance_loss_mlp": 1.02761793, + "epoch": 0.06739816624079363, + "flos": 15917496197760.0, + "grad_norm": 1.9998410486036524, + "language_loss": 0.8118242, + "learning_rate": 3.985347246871708e-06, + "loss": 0.83398843, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.95703125, + "step": 1121, + "time_per_iteration": 2.4120850563049316 + }, + { + "auxiliary_loss_clip": 0.01035083, + "auxiliary_loss_mlp": 0.01054152, + "balance_loss_clip": 1.03917885, + "balance_loss_mlp": 1.00847876, + "epoch": 0.0674582894934616, + "flos": 71394656613120.0, + "grad_norm": 0.7980851824815436, + "language_loss": 0.58567643, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60656875, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.265625, + "step": 1122, + "time_per_iteration": 3.184535026550293 + }, + { + "auxiliary_loss_clip": 0.01123495, + "auxiliary_loss_mlp": 0.01085879, + "balance_loss_clip": 1.02193558, + "balance_loss_mlp": 1.02801776, + "epoch": 0.06751841274612956, + "flos": 25263779938560.0, + "grad_norm": 2.214353176070726, + "language_loss": 0.75975633, + "learning_rate": 3.985252981610901e-06, + "loss": 0.7818501, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.95703125, + "step": 1123, + "time_per_iteration": 3.932253837585449 + }, + { + "auxiliary_loss_clip": 0.011247, + "auxiliary_loss_mlp": 0.01084773, + "balance_loss_clip": 1.02340448, + "balance_loss_mlp": 1.02773237, + "epoch": 0.06757853599879754, + "flos": 23801687354880.0, + "grad_norm": 1.8035342248159478, + "language_loss": 0.81587684, + "learning_rate": 3.985205736058114e-06, + "loss": 0.83797157, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.96875, + "step": 1124, + "time_per_iteration": 3.9603469371795654 + }, + { + "auxiliary_loss_clip": 0.01118417, + "auxiliary_loss_mlp": 0.01076933, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.02609396, + "epoch": 0.0676386592514655, + "flos": 21032655736320.0, + "grad_norm": 1.8904031565188635, + "language_loss": 0.74719065, + "learning_rate": 3.985158415226128e-06, + "loss": 0.76914418, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.921875, + "step": 1125, + "time_per_iteration": 3.834554433822632 + }, + { + "auxiliary_loss_clip": 0.01121382, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_clip": 1.02376568, + "balance_loss_mlp": 1.02692199, + "epoch": 0.06769878250413347, + "flos": 25555165079040.0, + "grad_norm": 3.32834237866338, + "language_loss": 0.83633471, + "learning_rate": 3.985111019116736e-06, + "loss": 0.85838127, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.9453125, + "step": 1126, + "time_per_iteration": 3.899857997894287 + }, + { + "auxiliary_loss_clip": 0.01029995, + "auxiliary_loss_mlp": 0.01056445, + "balance_loss_clip": 1.04328477, + "balance_loss_mlp": 1.00515544, + "epoch": 0.06775890575680145, + "flos": 70651740458880.0, + "grad_norm": 0.8617601963732228, + "language_loss": 0.59859312, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61945748, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.24804688, + "step": 1127, + "time_per_iteration": 3.0482993125915527 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_clip": 1.02528024, + "balance_loss_mlp": 1.02925444, + "epoch": 0.06781902900946941, + "flos": 24234575702400.0, + "grad_norm": 2.042128775577491, + "language_loss": 0.83990121, + "learning_rate": 3.985016001072925e-06, + "loss": 0.86198753, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.9375, + "step": 1128, + "time_per_iteration": 2.4523684978485107 + }, + { + "auxiliary_loss_clip": 0.01129507, + "auxiliary_loss_mlp": 0.01094309, + "balance_loss_clip": 1.03141379, + "balance_loss_mlp": 1.03060877, + "epoch": 0.06787915226213738, + "flos": 22416473318400.0, + "grad_norm": 1.9041930503931996, + "language_loss": 0.78820133, + "learning_rate": 3.984968379142109e-06, + "loss": 0.81043947, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.98828125, + "step": 1129, + "time_per_iteration": 2.4221837520599365 + }, + { + "auxiliary_loss_clip": 0.01126317, + "auxiliary_loss_mlp": 0.01088032, + "balance_loss_clip": 1.02589965, + "balance_loss_mlp": 1.03075194, + "epoch": 0.06793927551480534, + "flos": 37705393664640.0, + "grad_norm": 1.9418144790439253, + "language_loss": 0.75519568, + "learning_rate": 3.984920681941094e-06, + "loss": 0.77733916, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.953125, + "step": 1130, + "time_per_iteration": 2.571518659591675 + }, + { + "auxiliary_loss_clip": 0.01125286, + "auxiliary_loss_mlp": 0.01101638, + "balance_loss_clip": 1.03807545, + "balance_loss_mlp": 1.03185916, + "epoch": 0.06799939876747332, + "flos": 20630351606400.0, + "grad_norm": 2.5223262058806584, + "language_loss": 0.83711624, + "learning_rate": 3.984872909471688e-06, + "loss": 0.85938543, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.93359375, + "step": 1131, + "time_per_iteration": 2.482433795928955 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.01100627, + "balance_loss_clip": 1.04192817, + "balance_loss_mlp": 1.0323621, + "epoch": 0.06805952202014129, + "flos": 14863921966080.0, + "grad_norm": 2.0117562256844272, + "language_loss": 0.83521867, + "learning_rate": 3.984825061735701e-06, + "loss": 0.85745865, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.91015625, + "step": 1132, + "time_per_iteration": 2.4478507041931152 + }, + { + "auxiliary_loss_clip": 0.01127269, + "auxiliary_loss_mlp": 0.01092815, + "balance_loss_clip": 1.03416383, + "balance_loss_mlp": 1.03286517, + "epoch": 0.06811964527280925, + "flos": 48907555747200.0, + "grad_norm": 1.5167081980092614, + "language_loss": 0.66709793, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.68929875, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.9453125, + "step": 1133, + "time_per_iteration": 2.676922082901001 + }, + { + "auxiliary_loss_clip": 0.01130865, + "auxiliary_loss_mlp": 0.01103078, + "balance_loss_clip": 1.03584409, + "balance_loss_mlp": 1.0317868, + "epoch": 0.06817976852547723, + "flos": 15376377035520.0, + "grad_norm": 2.2872385688703463, + "language_loss": 0.78900862, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.81134808, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.98828125, + "step": 1134, + "time_per_iteration": 2.41567325592041 + }, + { + "auxiliary_loss_clip": 0.01121873, + "auxiliary_loss_mlp": 0.01091113, + "balance_loss_clip": 1.03389275, + "balance_loss_mlp": 1.03119636, + "epoch": 0.0682398917781452, + "flos": 20154694976640.0, + "grad_norm": 1.7019263422609774, + "language_loss": 0.90251881, + "learning_rate": 3.984681066946423e-06, + "loss": 0.92464864, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.90625, + "step": 1135, + "time_per_iteration": 2.4410014152526855 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01088732, + "balance_loss_clip": 1.03043914, + "balance_loss_mlp": 1.02930117, + "epoch": 0.06830001503081316, + "flos": 23439498243840.0, + "grad_norm": 2.608476546521454, + "language_loss": 0.82385993, + "learning_rate": 3.984632918162291e-06, + "loss": 0.84596151, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.921875, + "step": 1136, + "time_per_iteration": 2.488626003265381 + }, + { + "auxiliary_loss_clip": 0.01125287, + "auxiliary_loss_mlp": 0.01097232, + "balance_loss_clip": 1.0357914, + "balance_loss_mlp": 1.02988625, + "epoch": 0.06836013828348114, + "flos": 34348389972480.0, + "grad_norm": 2.1092199946668257, + "language_loss": 0.874327, + "learning_rate": 3.984584694120679e-06, + "loss": 0.89655221, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.953125, + "step": 1137, + "time_per_iteration": 2.5108256340026855 + }, + { + "auxiliary_loss_clip": 0.01122085, + "auxiliary_loss_mlp": 0.01092169, + "balance_loss_clip": 1.03289843, + "balance_loss_mlp": 1.02879333, + "epoch": 0.06842026153614911, + "flos": 23147729078400.0, + "grad_norm": 1.8941065411664983, + "language_loss": 0.82052231, + "learning_rate": 3.984536394823418e-06, + "loss": 0.84266484, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.93359375, + "step": 1138, + "time_per_iteration": 2.451204299926758 + }, + { + "auxiliary_loss_clip": 0.01122444, + "auxiliary_loss_mlp": 0.01096732, + "balance_loss_clip": 1.03254938, + "balance_loss_mlp": 1.02774584, + "epoch": 0.06848038478881707, + "flos": 24607796803200.0, + "grad_norm": 1.9489130121631686, + "language_loss": 0.8792212, + "learning_rate": 3.984488020272336e-06, + "loss": 0.9014129, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.94921875, + "step": 1139, + "time_per_iteration": 2.4835870265960693 + }, + { + "auxiliary_loss_clip": 0.01126947, + "auxiliary_loss_mlp": 0.01094943, + "balance_loss_clip": 1.03362191, + "balance_loss_mlp": 1.03009427, + "epoch": 0.06854050804148504, + "flos": 40879382676480.0, + "grad_norm": 1.7967079356159903, + "language_loss": 0.77245498, + "learning_rate": 3.984439570469271e-06, + "loss": 0.7946738, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.96875, + "step": 1140, + "time_per_iteration": 2.5894415378570557 + }, + { + "auxiliary_loss_clip": 0.01125202, + "auxiliary_loss_mlp": 0.01098284, + "balance_loss_clip": 1.02580476, + "balance_loss_mlp": 1.03009641, + "epoch": 0.06860063129415302, + "flos": 31685005728000.0, + "grad_norm": 2.184653874082324, + "language_loss": 0.71900249, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.74123734, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.94921875, + "step": 1141, + "time_per_iteration": 2.550922393798828 + }, + { + "auxiliary_loss_clip": 0.0112521, + "auxiliary_loss_mlp": 0.01079015, + "balance_loss_clip": 1.01278222, + "balance_loss_mlp": 1.02810967, + "epoch": 0.06866075454682098, + "flos": 26540798071680.0, + "grad_norm": 2.038034663497038, + "language_loss": 0.81297326, + "learning_rate": 3.984342445114538e-06, + "loss": 0.83501542, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.96875, + "step": 1142, + "time_per_iteration": 2.5274693965911865 + }, + { + "auxiliary_loss_clip": 0.01120549, + "auxiliary_loss_mlp": 0.01084358, + "balance_loss_clip": 1.02069974, + "balance_loss_mlp": 1.02910841, + "epoch": 0.06872087779948895, + "flos": 29788453785600.0, + "grad_norm": 3.283402850429473, + "language_loss": 0.71786511, + "learning_rate": 3.984293769566553e-06, + "loss": 0.73991418, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.9140625, + "step": 1143, + "time_per_iteration": 2.4973251819610596 + }, + { + "auxiliary_loss_clip": 0.01122981, + "auxiliary_loss_mlp": 0.01082928, + "balance_loss_clip": 1.02232218, + "balance_loss_mlp": 1.03154778, + "epoch": 0.06878100105215693, + "flos": 26939960179200.0, + "grad_norm": 1.6362535099889581, + "language_loss": 0.76401097, + "learning_rate": 3.98424501877395e-06, + "loss": 0.78607011, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.9140625, + "step": 1144, + "time_per_iteration": 2.4799752235412598 + }, + { + "auxiliary_loss_clip": 0.01129897, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_clip": 1.02040291, + "balance_loss_mlp": 1.03165627, + "epoch": 0.06884112430482489, + "flos": 10669980228480.0, + "grad_norm": 2.386005081791177, + "language_loss": 0.95492917, + "learning_rate": 3.984196192738577e-06, + "loss": 0.97711122, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.984375, + "step": 1145, + "time_per_iteration": 2.3852670192718506 + }, + { + "auxiliary_loss_clip": 0.01136948, + "auxiliary_loss_mlp": 0.01095834, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.03430319, + "epoch": 0.06890124755749286, + "flos": 20192610579840.0, + "grad_norm": 2.5621384087053105, + "language_loss": 0.8596679, + "learning_rate": 3.984147291462285e-06, + "loss": 0.88199568, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 1.03125, + "step": 1146, + "time_per_iteration": 2.4385712146759033 + }, + { + "auxiliary_loss_clip": 0.01123646, + "auxiliary_loss_mlp": 0.01086992, + "balance_loss_clip": 1.02471733, + "balance_loss_mlp": 1.03171659, + "epoch": 0.06896137081016084, + "flos": 20448174798720.0, + "grad_norm": 1.9064338641776337, + "language_loss": 0.87195885, + "learning_rate": 3.98409831494693e-06, + "loss": 0.8940652, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.91796875, + "step": 1147, + "time_per_iteration": 2.4222569465637207 + }, + { + "auxiliary_loss_clip": 0.01125913, + "auxiliary_loss_mlp": 0.01085966, + "balance_loss_clip": 1.02085328, + "balance_loss_mlp": 1.0294975, + "epoch": 0.0690214940628288, + "flos": 18367735392000.0, + "grad_norm": 1.9764152505946622, + "language_loss": 0.88885844, + "learning_rate": 3.984049263194367e-06, + "loss": 0.91097713, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.96484375, + "step": 1148, + "time_per_iteration": 2.421541929244995 + }, + { + "auxiliary_loss_clip": 0.01123515, + "auxiliary_loss_mlp": 0.01082554, + "balance_loss_clip": 1.02056539, + "balance_loss_mlp": 1.029423, + "epoch": 0.06908161731549677, + "flos": 20556999106560.0, + "grad_norm": 2.392153570835658, + "language_loss": 0.72474724, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.74680793, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.94140625, + "step": 1149, + "time_per_iteration": 2.432215452194214 + }, + { + "auxiliary_loss_clip": 0.0112754, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.02057528, + "balance_loss_mlp": 1.02859437, + "epoch": 0.06914174056816474, + "flos": 27562426542720.0, + "grad_norm": 1.885564513768512, + "language_loss": 0.8711127, + "learning_rate": 3.983950933985064e-06, + "loss": 0.89321804, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.98828125, + "step": 1150, + "time_per_iteration": 2.491286277770996 + }, + { + "auxiliary_loss_clip": 0.01129628, + "auxiliary_loss_mlp": 0.01093329, + "balance_loss_clip": 1.02719188, + "balance_loss_mlp": 1.03058386, + "epoch": 0.06920186382083271, + "flos": 15303129269760.0, + "grad_norm": 3.514703244721381, + "language_loss": 0.87550616, + "learning_rate": 3.983901656532052e-06, + "loss": 0.89773583, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.98828125, + "step": 1151, + "time_per_iteration": 2.3895370960235596 + }, + { + "auxiliary_loss_clip": 0.01125898, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_clip": 1.02399015, + "balance_loss_mlp": 1.02974057, + "epoch": 0.06926198707350067, + "flos": 25190078325120.0, + "grad_norm": 1.7310559545872284, + "language_loss": 0.87590617, + "learning_rate": 3.983852303849291e-06, + "loss": 0.89804876, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.9609375, + "step": 1152, + "time_per_iteration": 2.463397741317749 + }, + { + "auxiliary_loss_clip": 0.01122164, + "auxiliary_loss_mlp": 0.01088429, + "balance_loss_clip": 1.02837181, + "balance_loss_mlp": 1.02923846, + "epoch": 0.06932211032616864, + "flos": 13255438584960.0, + "grad_norm": 2.233448985568123, + "language_loss": 0.93598485, + "learning_rate": 3.983802875938651e-06, + "loss": 0.95809078, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.9296875, + "step": 1153, + "time_per_iteration": 2.383981227874756 + }, + { + "auxiliary_loss_clip": 0.01124264, + "auxiliary_loss_mlp": 0.01086272, + "balance_loss_clip": 1.02342486, + "balance_loss_mlp": 1.02828693, + "epoch": 0.06938223357883662, + "flos": 24826213468800.0, + "grad_norm": 1.977841163568739, + "language_loss": 0.83504844, + "learning_rate": 3.983753372802008e-06, + "loss": 0.85715377, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.95703125, + "step": 1154, + "time_per_iteration": 2.445939540863037 + }, + { + "auxiliary_loss_clip": 0.0111804, + "auxiliary_loss_mlp": 0.0108937, + "balance_loss_clip": 1.02471066, + "balance_loss_mlp": 1.02664399, + "epoch": 0.06944235683150458, + "flos": 27266852039040.0, + "grad_norm": 1.8378016692937635, + "language_loss": 0.77644992, + "learning_rate": 3.983703794441237e-06, + "loss": 0.79852396, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.9140625, + "step": 1155, + "time_per_iteration": 2.468337059020996 + }, + { + "auxiliary_loss_clip": 0.01120091, + "auxiliary_loss_mlp": 0.01086791, + "balance_loss_clip": 1.02341962, + "balance_loss_mlp": 1.02557087, + "epoch": 0.06950248008417255, + "flos": 25806993782400.0, + "grad_norm": 1.7401925724173481, + "language_loss": 0.73281115, + "learning_rate": 3.98365414085822e-06, + "loss": 0.75487995, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.9453125, + "step": 1156, + "time_per_iteration": 2.45001482963562 + }, + { + "auxiliary_loss_clip": 0.01122873, + "auxiliary_loss_mlp": 0.01090399, + "balance_loss_clip": 1.02435744, + "balance_loss_mlp": 1.02761257, + "epoch": 0.06956260333684053, + "flos": 22270501457280.0, + "grad_norm": 1.8370850115205077, + "language_loss": 0.77475703, + "learning_rate": 3.98360441205484e-06, + "loss": 0.79688978, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.953125, + "step": 1157, + "time_per_iteration": 2.4102835655212402 + }, + { + "auxiliary_loss_clip": 0.01122157, + "auxiliary_loss_mlp": 0.01085671, + "balance_loss_clip": 1.02358639, + "balance_loss_mlp": 1.02596426, + "epoch": 0.0696227265895085, + "flos": 29680048414080.0, + "grad_norm": 1.9458102866366156, + "language_loss": 0.74436134, + "learning_rate": 3.983554608032982e-06, + "loss": 0.76643968, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.9609375, + "step": 1158, + "time_per_iteration": 2.4733190536499023 + }, + { + "auxiliary_loss_clip": 0.01127856, + "auxiliary_loss_mlp": 0.01082402, + "balance_loss_clip": 1.01693273, + "balance_loss_mlp": 1.02864301, + "epoch": 0.06968284984217646, + "flos": 25522276711680.0, + "grad_norm": 2.071886089028901, + "language_loss": 0.82614207, + "learning_rate": 3.983504728794533e-06, + "loss": 0.84824467, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.9921875, + "step": 1159, + "time_per_iteration": 2.4483911991119385 + }, + { + "auxiliary_loss_clip": 0.0112646, + "auxiliary_loss_mlp": 0.01089671, + "balance_loss_clip": 1.01881254, + "balance_loss_mlp": 1.02979612, + "epoch": 0.06974297309484444, + "flos": 20697315327360.0, + "grad_norm": 2.7241934538980273, + "language_loss": 0.87037098, + "learning_rate": 3.983454774341387e-06, + "loss": 0.89253217, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.96875, + "step": 1160, + "time_per_iteration": 2.3994834423065186 + }, + { + "auxiliary_loss_clip": 0.01127559, + "auxiliary_loss_mlp": 0.01095395, + "balance_loss_clip": 1.02539492, + "balance_loss_mlp": 1.02781618, + "epoch": 0.0698030963475124, + "flos": 26503999632000.0, + "grad_norm": 1.625587204700965, + "language_loss": 0.78050566, + "learning_rate": 3.983404744675437e-06, + "loss": 0.80273521, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 1.0, + "step": 1161, + "time_per_iteration": 2.4453136920928955 + }, + { + "auxiliary_loss_clip": 0.01121993, + "auxiliary_loss_mlp": 0.01096327, + "balance_loss_clip": 1.02737641, + "balance_loss_mlp": 1.0261029, + "epoch": 0.06986321960018037, + "flos": 23039288795520.0, + "grad_norm": 1.7730575241492539, + "language_loss": 0.85159492, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.87377816, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.9609375, + "step": 1162, + "time_per_iteration": 3.8944971561431885 + }, + { + "auxiliary_loss_clip": 0.01120592, + "auxiliary_loss_mlp": 0.010764, + "balance_loss_clip": 1.01326632, + "balance_loss_mlp": 1.02559292, + "epoch": 0.06992334285284833, + "flos": 28583566254720.0, + "grad_norm": 1.8005549986175502, + "language_loss": 0.81445593, + "learning_rate": 3.983304459712716e-06, + "loss": 0.83642584, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.94921875, + "step": 1163, + "time_per_iteration": 2.4830455780029297 + }, + { + "auxiliary_loss_clip": 0.01123889, + "auxiliary_loss_mlp": 0.01081741, + "balance_loss_clip": 1.01979947, + "balance_loss_mlp": 1.02658606, + "epoch": 0.06998346610551631, + "flos": 20594286305280.0, + "grad_norm": 2.137255250669476, + "language_loss": 0.8066082, + "learning_rate": 3.983254204419749e-06, + "loss": 0.82866442, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.97265625, + "step": 1164, + "time_per_iteration": 5.3419575691223145 + }, + { + "auxiliary_loss_clip": 0.01123909, + "auxiliary_loss_mlp": 0.01085924, + "balance_loss_clip": 1.02093148, + "balance_loss_mlp": 1.02699661, + "epoch": 0.07004358935818428, + "flos": 22527706510080.0, + "grad_norm": 1.7896258245151857, + "language_loss": 0.75779712, + "learning_rate": 3.983203873921583e-06, + "loss": 0.77989542, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.96875, + "step": 1165, + "time_per_iteration": 2.423755645751953 + }, + { + "auxiliary_loss_clip": 0.01123987, + "auxiliary_loss_mlp": 0.01089922, + "balance_loss_clip": 1.02111435, + "balance_loss_mlp": 1.02588546, + "epoch": 0.07010371261085224, + "flos": 28948722831360.0, + "grad_norm": 1.8631089368546352, + "language_loss": 0.82915854, + "learning_rate": 3.983153468220128e-06, + "loss": 0.85129762, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.98046875, + "step": 1166, + "time_per_iteration": 3.87127685546875 + }, + { + "auxiliary_loss_clip": 0.0112414, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.02495956, + "balance_loss_mlp": 1.02779317, + "epoch": 0.07016383586352022, + "flos": 23658054554880.0, + "grad_norm": 2.0286047740789, + "language_loss": 0.87652385, + "learning_rate": 3.983102987317295e-06, + "loss": 0.8986944, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.96484375, + "step": 1167, + "time_per_iteration": 2.425036907196045 + }, + { + "auxiliary_loss_clip": 0.01126049, + "auxiliary_loss_mlp": 0.01091101, + "balance_loss_clip": 1.02067208, + "balance_loss_mlp": 1.02578366, + "epoch": 0.07022395911618819, + "flos": 19791109411200.0, + "grad_norm": 1.989751081251553, + "language_loss": 0.93672723, + "learning_rate": 3.983052431214997e-06, + "loss": 0.95889866, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.0, + "step": 1168, + "time_per_iteration": 2.398099660873413 + }, + { + "auxiliary_loss_clip": 0.01128506, + "auxiliary_loss_mlp": 0.01105523, + "balance_loss_clip": 1.03042114, + "balance_loss_mlp": 1.02679992, + "epoch": 0.07028408236885615, + "flos": 21688080289920.0, + "grad_norm": 1.9694278304428756, + "language_loss": 0.91400796, + "learning_rate": 3.983001799915153e-06, + "loss": 0.93634826, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 1.015625, + "step": 1169, + "time_per_iteration": 2.396488666534424 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_clip": 1.01994097, + "balance_loss_mlp": 1.0284791, + "epoch": 0.07034420562152413, + "flos": 25629076160640.0, + "grad_norm": 2.4510198917683526, + "language_loss": 0.8710537, + "learning_rate": 3.982951093419681e-06, + "loss": 0.89322054, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.98828125, + "step": 1170, + "time_per_iteration": 2.4446558952331543 + }, + { + "auxiliary_loss_clip": 0.01121907, + "auxiliary_loss_mlp": 0.01095745, + "balance_loss_clip": 1.02688932, + "balance_loss_mlp": 1.02577257, + "epoch": 0.0704043288741921, + "flos": 20809491125760.0, + "grad_norm": 1.974348525507159, + "language_loss": 0.78108561, + "learning_rate": 3.982900311730506e-06, + "loss": 0.80326217, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.9609375, + "step": 1171, + "time_per_iteration": 2.4116785526275635 + }, + { + "auxiliary_loss_clip": 0.01122969, + "auxiliary_loss_mlp": 0.01089955, + "balance_loss_clip": 1.02524829, + "balance_loss_mlp": 1.02575743, + "epoch": 0.07046445212686006, + "flos": 25591998430080.0, + "grad_norm": 1.7534652989738828, + "language_loss": 0.91614544, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.93827468, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.97265625, + "step": 1172, + "time_per_iteration": 2.4500577449798584 + }, + { + "auxiliary_loss_clip": 0.01127021, + "auxiliary_loss_mlp": 0.01093403, + "balance_loss_clip": 1.02297401, + "balance_loss_mlp": 1.02703094, + "epoch": 0.07052457537952803, + "flos": 25555793483520.0, + "grad_norm": 1.6369875259069804, + "language_loss": 0.84426713, + "learning_rate": 3.982798522778748e-06, + "loss": 0.86647129, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 1.0, + "step": 1173, + "time_per_iteration": 2.4377236366271973 + }, + { + "auxiliary_loss_clip": 0.01123274, + "auxiliary_loss_mlp": 0.01090843, + "balance_loss_clip": 1.02270269, + "balance_loss_mlp": 1.0266397, + "epoch": 0.070584698632196, + "flos": 17967525943680.0, + "grad_norm": 1.9760185715190408, + "language_loss": 0.83913708, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.8612783, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.96875, + "step": 1174, + "time_per_iteration": 2.4259960651397705 + }, + { + "auxiliary_loss_clip": 0.01125288, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_clip": 1.03031182, + "balance_loss_mlp": 1.02695394, + "epoch": 0.07064482188486397, + "flos": 25369811337600.0, + "grad_norm": 1.8712729103564836, + "language_loss": 0.87261462, + "learning_rate": 3.982696433075317e-06, + "loss": 0.89486587, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.984375, + "step": 1175, + "time_per_iteration": 2.427386999130249 + }, + { + "auxiliary_loss_clip": 0.01124963, + "auxiliary_loss_mlp": 0.01092704, + "balance_loss_clip": 1.02160811, + "balance_loss_mlp": 1.02870667, + "epoch": 0.07070494513753194, + "flos": 24898693184640.0, + "grad_norm": 1.7003345106287127, + "language_loss": 0.86938488, + "learning_rate": 3.982645275446563e-06, + "loss": 0.89156163, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.9609375, + "step": 1176, + "time_per_iteration": 2.447356939315796 + }, + { + "auxiliary_loss_clip": 0.01120673, + "auxiliary_loss_mlp": 0.01099006, + "balance_loss_clip": 1.03072345, + "balance_loss_mlp": 1.02573276, + "epoch": 0.07076506839019991, + "flos": 22337569912320.0, + "grad_norm": 2.622261797986799, + "language_loss": 0.77637994, + "learning_rate": 3.982594042635701e-06, + "loss": 0.79857677, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.9453125, + "step": 1177, + "time_per_iteration": 2.3983569145202637 + }, + { + "auxiliary_loss_clip": 0.01129956, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_clip": 1.0238384, + "balance_loss_mlp": 1.02857423, + "epoch": 0.07082519164286788, + "flos": 18659818759680.0, + "grad_norm": 1.760069954688353, + "language_loss": 0.88014412, + "learning_rate": 3.982542734644673e-06, + "loss": 0.9024412, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 1.015625, + "step": 1178, + "time_per_iteration": 2.430652379989624 + }, + { + "auxiliary_loss_clip": 0.01035933, + "auxiliary_loss_mlp": 0.01042589, + "balance_loss_clip": 1.03057301, + "balance_loss_mlp": 1.00861716, + "epoch": 0.07088531489553584, + "flos": 63650676942720.0, + "grad_norm": 0.8984989359649695, + "language_loss": 0.63537645, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65616167, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.2734375, + "step": 1179, + "time_per_iteration": 3.1636335849761963 + }, + { + "auxiliary_loss_clip": 0.01125684, + "auxiliary_loss_mlp": 0.01095091, + "balance_loss_clip": 1.02518678, + "balance_loss_mlp": 1.02717006, + "epoch": 0.07094543814820382, + "flos": 21571819862400.0, + "grad_norm": 2.0676158963514344, + "language_loss": 0.88718003, + "learning_rate": 3.98243989312991e-06, + "loss": 0.90938783, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.984375, + "step": 1180, + "time_per_iteration": 2.415634870529175 + }, + { + "auxiliary_loss_clip": 0.01122677, + "auxiliary_loss_mlp": 0.01089986, + "balance_loss_clip": 1.02642345, + "balance_loss_mlp": 1.02745593, + "epoch": 0.07100556140087179, + "flos": 22088883231360.0, + "grad_norm": 1.9755949973321099, + "language_loss": 0.9062252, + "learning_rate": 3.982388359610074e-06, + "loss": 0.92835188, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.953125, + "step": 1181, + "time_per_iteration": 2.3916428089141846 + }, + { + "auxiliary_loss_clip": 0.01117525, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_clip": 1.02231574, + "balance_loss_mlp": 1.02573502, + "epoch": 0.07106568465353975, + "flos": 47920491388800.0, + "grad_norm": 1.8169800846786288, + "language_loss": 0.86468613, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.88667059, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.9140625, + "step": 1182, + "time_per_iteration": 2.6625585556030273 + }, + { + "auxiliary_loss_clip": 0.01118918, + "auxiliary_loss_mlp": 0.01092169, + "balance_loss_clip": 1.02874947, + "balance_loss_mlp": 1.02779865, + "epoch": 0.07112580790620772, + "flos": 23439672800640.0, + "grad_norm": 2.175270210512498, + "language_loss": 0.83351719, + "learning_rate": 3.982285067055262e-06, + "loss": 0.85562801, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.91015625, + "step": 1183, + "time_per_iteration": 2.415926694869995 + }, + { + "auxiliary_loss_clip": 0.01123528, + "auxiliary_loss_mlp": 0.01092604, + "balance_loss_clip": 1.0248456, + "balance_loss_mlp": 1.02660775, + "epoch": 0.0711859311588757, + "flos": 31867531649280.0, + "grad_norm": 1.9336690096561036, + "language_loss": 0.82249904, + "learning_rate": 3.982233308024204e-06, + "loss": 0.84466034, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.96875, + "step": 1184, + "time_per_iteration": 2.4871761798858643 + }, + { + "auxiliary_loss_clip": 0.01116557, + "auxiliary_loss_mlp": 0.01102872, + "balance_loss_clip": 1.03845167, + "balance_loss_mlp": 1.02501965, + "epoch": 0.07124605441154366, + "flos": 19609281717120.0, + "grad_norm": 1.8042516016519636, + "language_loss": 0.79406917, + "learning_rate": 3.98218147382666e-06, + "loss": 0.81626344, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.9140625, + "step": 1185, + "time_per_iteration": 2.3854079246520996 + }, + { + "auxiliary_loss_clip": 0.01116748, + "auxiliary_loss_mlp": 0.01096321, + "balance_loss_clip": 1.03581023, + "balance_loss_mlp": 1.02474868, + "epoch": 0.07130617766421163, + "flos": 14683560549120.0, + "grad_norm": 2.262057352365454, + "language_loss": 0.69308913, + "learning_rate": 3.982129564464596e-06, + "loss": 0.71521986, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.921875, + "step": 1186, + "time_per_iteration": 2.3720531463623047 + }, + { + "auxiliary_loss_clip": 0.01112848, + "auxiliary_loss_mlp": 0.01087575, + "balance_loss_clip": 1.02396536, + "balance_loss_mlp": 1.02491748, + "epoch": 0.07136630091687961, + "flos": 26066712453120.0, + "grad_norm": 1.7667968910349536, + "language_loss": 0.71992087, + "learning_rate": 3.98207757993998e-06, + "loss": 0.74192506, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.87890625, + "step": 1187, + "time_per_iteration": 2.4268622398376465 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01081683, + "balance_loss_clip": 1.02472436, + "balance_loss_mlp": 1.0262754, + "epoch": 0.07142642416954757, + "flos": 15668285846400.0, + "grad_norm": 2.7678316515069654, + "language_loss": 0.8119632, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.83394706, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.90625, + "step": 1188, + "time_per_iteration": 2.4028849601745605 + }, + { + "auxiliary_loss_clip": 0.01115977, + "auxiliary_loss_mlp": 0.0108913, + "balance_loss_clip": 1.02346921, + "balance_loss_mlp": 1.02768111, + "epoch": 0.07148654742221554, + "flos": 19754310971520.0, + "grad_norm": 2.0267495769136916, + "language_loss": 0.87510866, + "learning_rate": 3.981973385410981e-06, + "loss": 0.8971597, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.8828125, + "step": 1189, + "time_per_iteration": 2.384150981903076 + }, + { + "auxiliary_loss_clip": 0.01114505, + "auxiliary_loss_mlp": 0.0107604, + "balance_loss_clip": 1.01865232, + "balance_loss_mlp": 1.0250957, + "epoch": 0.07154667067488352, + "flos": 23470850511360.0, + "grad_norm": 1.6568544516657202, + "language_loss": 0.79731643, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.81922185, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.89453125, + "step": 1190, + "time_per_iteration": 2.417663812637329 + }, + { + "auxiliary_loss_clip": 0.0112034, + "auxiliary_loss_mlp": 0.01089419, + "balance_loss_clip": 1.02547491, + "balance_loss_mlp": 1.0268836, + "epoch": 0.07160679392755148, + "flos": 18331949381760.0, + "grad_norm": 2.164297692622935, + "language_loss": 0.7927351, + "learning_rate": 3.981868890255468e-06, + "loss": 0.81483269, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.93359375, + "step": 1191, + "time_per_iteration": 2.3940494060516357 + }, + { + "auxiliary_loss_clip": 0.01116035, + "auxiliary_loss_mlp": 0.01083933, + "balance_loss_clip": 1.0195601, + "balance_loss_mlp": 1.02545905, + "epoch": 0.07166691718021945, + "flos": 17746106901120.0, + "grad_norm": 3.704578111152643, + "language_loss": 0.77571988, + "learning_rate": 3.981816529947719e-06, + "loss": 0.79771954, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.64453125, + "router_z_loss_mlp": 0.90625, + "step": 1192, + "time_per_iteration": 2.3752057552337646 + }, + { + "auxiliary_loss_clip": 0.01115085, + "auxiliary_loss_mlp": 0.01076688, + "balance_loss_clip": 1.01872897, + "balance_loss_mlp": 1.02501416, + "epoch": 0.07172704043288743, + "flos": 22450932696960.0, + "grad_norm": 2.235966015789626, + "language_loss": 0.80903035, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.83094811, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8984375, + "step": 1193, + "time_per_iteration": 2.41412091255188 + }, + { + "auxiliary_loss_clip": 0.01115787, + "auxiliary_loss_mlp": 0.01075892, + "balance_loss_clip": 1.01845717, + "balance_loss_mlp": 1.02688646, + "epoch": 0.07178716368555539, + "flos": 23221081578240.0, + "grad_norm": 2.0258026916721956, + "language_loss": 0.89008176, + "learning_rate": 3.981711583882166e-06, + "loss": 0.91199857, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.890625, + "step": 1194, + "time_per_iteration": 2.401995897293091 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01072062, + "balance_loss_clip": 1.01834655, + "balance_loss_mlp": 1.02682126, + "epoch": 0.07184728693822336, + "flos": 25149788749440.0, + "grad_norm": 2.0168134213511455, + "language_loss": 0.83536619, + "learning_rate": 3.981658998128341e-06, + "loss": 0.85723782, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8828125, + "step": 1195, + "time_per_iteration": 2.4544694423675537 + }, + { + "auxiliary_loss_clip": 0.01112442, + "auxiliary_loss_mlp": 0.010746, + "balance_loss_clip": 1.01945353, + "balance_loss_mlp": 1.02524376, + "epoch": 0.07190741019089132, + "flos": 22710127697280.0, + "grad_norm": 1.75718719432, + "language_loss": 0.80885106, + "learning_rate": 3.981606337229808e-06, + "loss": 0.8307215, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.87109375, + "step": 1196, + "time_per_iteration": 2.4209485054016113 + }, + { + "auxiliary_loss_clip": 0.01112327, + "auxiliary_loss_mlp": 0.01081033, + "balance_loss_clip": 1.02319264, + "balance_loss_mlp": 1.02397537, + "epoch": 0.0719675334435593, + "flos": 29348548254720.0, + "grad_norm": 2.242594260065792, + "language_loss": 0.75135398, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.77328753, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8828125, + "step": 1197, + "time_per_iteration": 2.451167583465576 + }, + { + "auxiliary_loss_clip": 0.01113114, + "auxiliary_loss_mlp": 0.01081348, + "balance_loss_clip": 1.02367496, + "balance_loss_mlp": 1.02495384, + "epoch": 0.07202765669622727, + "flos": 17638818693120.0, + "grad_norm": 2.0360962175838435, + "language_loss": 0.87672806, + "learning_rate": 3.98150079000661e-06, + "loss": 0.89867264, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8828125, + "step": 1198, + "time_per_iteration": 2.3775522708892822 + }, + { + "auxiliary_loss_clip": 0.01112577, + "auxiliary_loss_mlp": 0.01076277, + "balance_loss_clip": 1.01998663, + "balance_loss_mlp": 1.02645516, + "epoch": 0.07208777994889523, + "flos": 21432969918720.0, + "grad_norm": 2.091016387007963, + "language_loss": 0.86180282, + "learning_rate": 3.981447903685947e-06, + "loss": 0.88369143, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.86328125, + "step": 1199, + "time_per_iteration": 2.4240610599517822 + }, + { + "auxiliary_loss_clip": 0.01112717, + "auxiliary_loss_mlp": 0.01075554, + "balance_loss_clip": 1.02369833, + "balance_loss_mlp": 1.0278995, + "epoch": 0.07214790320156321, + "flos": 26939715799680.0, + "grad_norm": 2.0905177417523686, + "language_loss": 0.78654313, + "learning_rate": 3.981394942228581e-06, + "loss": 0.8084259, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.84765625, + "step": 1200, + "time_per_iteration": 2.464064121246338 + }, + { + "auxiliary_loss_clip": 0.0110905, + "auxiliary_loss_mlp": 0.01073413, + "balance_loss_clip": 1.0208416, + "balance_loss_mlp": 1.02513266, + "epoch": 0.07220802645423118, + "flos": 23878775370240.0, + "grad_norm": 1.9300602262812165, + "language_loss": 0.84572875, + "learning_rate": 3.98134190563652e-06, + "loss": 0.86755347, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.83984375, + "step": 1201, + "time_per_iteration": 2.4499619007110596 + }, + { + "auxiliary_loss_clip": 0.01114972, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_clip": 1.02146769, + "balance_loss_mlp": 1.02601802, + "epoch": 0.07226814970689914, + "flos": 19242658863360.0, + "grad_norm": 2.5938350055687387, + "language_loss": 0.72377568, + "learning_rate": 3.981288793911775e-06, + "loss": 0.74566323, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.890625, + "step": 1202, + "time_per_iteration": 3.8819968700408936 + }, + { + "auxiliary_loss_clip": 0.01113223, + "auxiliary_loss_mlp": 0.01079972, + "balance_loss_clip": 1.02437234, + "balance_loss_mlp": 1.02635658, + "epoch": 0.07232827295956712, + "flos": 19171017020160.0, + "grad_norm": 6.548318522025175, + "language_loss": 0.89312029, + "learning_rate": 3.98123560705636e-06, + "loss": 0.91505218, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.87109375, + "step": 1203, + "time_per_iteration": 2.3944551944732666 + }, + { + "auxiliary_loss_clip": 0.01111437, + "auxiliary_loss_mlp": 0.01074299, + "balance_loss_clip": 1.02134633, + "balance_loss_mlp": 1.02428722, + "epoch": 0.07238839621223508, + "flos": 17638783781760.0, + "grad_norm": 1.9967232649731343, + "language_loss": 0.81829417, + "learning_rate": 3.981182345072293e-06, + "loss": 0.84015155, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.87109375, + "step": 1204, + "time_per_iteration": 5.206207990646362 + }, + { + "auxiliary_loss_clip": 0.01113144, + "auxiliary_loss_mlp": 0.01073809, + "balance_loss_clip": 1.02049828, + "balance_loss_mlp": 1.02529883, + "epoch": 0.07244851946490305, + "flos": 28291168684800.0, + "grad_norm": 3.040804559294911, + "language_loss": 0.84102583, + "learning_rate": 3.981129007961593e-06, + "loss": 0.86289537, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.87890625, + "step": 1205, + "time_per_iteration": 3.8415613174438477 + }, + { + "auxiliary_loss_clip": 0.01112206, + "auxiliary_loss_mlp": 0.01068996, + "balance_loss_clip": 1.01792717, + "balance_loss_mlp": 1.0258503, + "epoch": 0.07250864271757101, + "flos": 22563736899840.0, + "grad_norm": 1.8308996784701876, + "language_loss": 0.78907037, + "learning_rate": 3.981075595726283e-06, + "loss": 0.81088239, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.86328125, + "step": 1206, + "time_per_iteration": 2.4230120182037354 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01069985, + "balance_loss_clip": 1.01879692, + "balance_loss_mlp": 1.0238272, + "epoch": 0.072568765970239, + "flos": 21761328055680.0, + "grad_norm": 1.8639185009385169, + "language_loss": 0.79852414, + "learning_rate": 3.981022108368387e-06, + "loss": 0.82031488, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.8515625, + "step": 1207, + "time_per_iteration": 2.4163947105407715 + }, + { + "auxiliary_loss_clip": 0.01106544, + "auxiliary_loss_mlp": 0.01074992, + "balance_loss_clip": 1.02742767, + "balance_loss_mlp": 1.0255847, + "epoch": 0.07262888922290696, + "flos": 25518541196160.0, + "grad_norm": 2.1194911961140206, + "language_loss": 0.82011855, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.84193391, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.8125, + "step": 1208, + "time_per_iteration": 2.433619260787964 + }, + { + "auxiliary_loss_clip": 0.01105386, + "auxiliary_loss_mlp": 0.01063465, + "balance_loss_clip": 1.01687789, + "balance_loss_mlp": 1.02401137, + "epoch": 0.07268901247557492, + "flos": 21245626229760.0, + "grad_norm": 1.862190047866094, + "language_loss": 0.80501086, + "learning_rate": 3.980914908292955e-06, + "loss": 0.82669938, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.8125, + "step": 1209, + "time_per_iteration": 2.4045615196228027 + }, + { + "auxiliary_loss_clip": 0.01109715, + "auxiliary_loss_mlp": 0.01067112, + "balance_loss_clip": 1.02071548, + "balance_loss_mlp": 1.02487123, + "epoch": 0.0727491357282429, + "flos": 25478251620480.0, + "grad_norm": 2.7676543826676516, + "language_loss": 0.84162784, + "learning_rate": 3.980861195579486e-06, + "loss": 0.86339611, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.84765625, + "step": 1210, + "time_per_iteration": 2.424144983291626 + }, + { + "auxiliary_loss_clip": 0.0110776, + "auxiliary_loss_mlp": 0.01065589, + "balance_loss_clip": 1.01943123, + "balance_loss_mlp": 1.02581787, + "epoch": 0.07280925898091087, + "flos": 24461021980800.0, + "grad_norm": 1.8249884414728295, + "language_loss": 0.87354827, + "learning_rate": 3.98080740775156e-06, + "loss": 0.89528167, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.8203125, + "step": 1211, + "time_per_iteration": 2.4516186714172363 + }, + { + "auxiliary_loss_clip": 0.01105759, + "auxiliary_loss_mlp": 0.01074731, + "balance_loss_clip": 1.0216347, + "balance_loss_mlp": 1.02405953, + "epoch": 0.07286938223357883, + "flos": 18287435531520.0, + "grad_norm": 2.3175766095114683, + "language_loss": 0.94591117, + "learning_rate": 3.98075354481122e-06, + "loss": 0.96771604, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.81640625, + "step": 1212, + "time_per_iteration": 2.3611464500427246 + }, + { + "auxiliary_loss_clip": 0.01105698, + "auxiliary_loss_mlp": 0.01072169, + "balance_loss_clip": 1.02145743, + "balance_loss_mlp": 1.02371264, + "epoch": 0.07292950548624681, + "flos": 21213750291840.0, + "grad_norm": 1.8241637870983252, + "language_loss": 0.75461155, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.77639019, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.8203125, + "step": 1213, + "time_per_iteration": 2.4256136417388916 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.01068884, + "balance_loss_clip": 1.01740909, + "balance_loss_mlp": 1.02486157, + "epoch": 0.07298962873891478, + "flos": 24640929550080.0, + "grad_norm": 1.6581661244144796, + "language_loss": 0.87027365, + "learning_rate": 3.980645593601465e-06, + "loss": 0.89206064, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.84765625, + "step": 1214, + "time_per_iteration": 2.4261586666107178 + }, + { + "auxiliary_loss_clip": 0.01111435, + "auxiliary_loss_mlp": 0.0106732, + "balance_loss_clip": 1.01434326, + "balance_loss_mlp": 1.02481461, + "epoch": 0.07304975199158274, + "flos": 27051542484480.0, + "grad_norm": 2.29191356886576, + "language_loss": 0.87316763, + "learning_rate": 3.980591505336144e-06, + "loss": 0.89495516, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8671875, + "step": 1215, + "time_per_iteration": 2.4961042404174805 + }, + { + "auxiliary_loss_clip": 0.01109385, + "auxiliary_loss_mlp": 0.01074217, + "balance_loss_clip": 1.01823676, + "balance_loss_mlp": 1.02366066, + "epoch": 0.07310987524425071, + "flos": 33548075809920.0, + "grad_norm": 1.6020325449163855, + "language_loss": 0.83655798, + "learning_rate": 3.980537341966595e-06, + "loss": 0.85839403, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.85546875, + "step": 1216, + "time_per_iteration": 2.53962779045105 + }, + { + "auxiliary_loss_clip": 0.0111208, + "auxiliary_loss_mlp": 0.01068369, + "balance_loss_clip": 1.01422441, + "balance_loss_mlp": 1.02654088, + "epoch": 0.07316999849691869, + "flos": 28109690104320.0, + "grad_norm": 2.0949301455298226, + "language_loss": 0.79516041, + "learning_rate": 3.980483103494872e-06, + "loss": 0.81696492, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.85546875, + "step": 1217, + "time_per_iteration": 2.4651968479156494 + }, + { + "auxiliary_loss_clip": 0.011084, + "auxiliary_loss_mlp": 0.01076036, + "balance_loss_clip": 1.02527654, + "balance_loss_mlp": 1.02599311, + "epoch": 0.07323012174958665, + "flos": 14391721560960.0, + "grad_norm": 1.8419408599874212, + "language_loss": 0.88053071, + "learning_rate": 3.98042878992303e-06, + "loss": 0.90237504, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.82421875, + "step": 1218, + "time_per_iteration": 2.389381170272827 + }, + { + "auxiliary_loss_clip": 0.01108902, + "auxiliary_loss_mlp": 0.0107483, + "balance_loss_clip": 1.02161479, + "balance_loss_mlp": 1.0239656, + "epoch": 0.07329024500225462, + "flos": 21615356194560.0, + "grad_norm": 2.905376318049401, + "language_loss": 0.8919282, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.91376549, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8515625, + "step": 1219, + "time_per_iteration": 2.4132585525512695 + }, + { + "auxiliary_loss_clip": 0.01105741, + "auxiliary_loss_mlp": 0.01075677, + "balance_loss_clip": 1.02150822, + "balance_loss_mlp": 1.02352095, + "epoch": 0.0733503682549226, + "flos": 13223318267520.0, + "grad_norm": 2.085097348374203, + "language_loss": 0.87277043, + "learning_rate": 3.980319937487235e-06, + "loss": 0.89458454, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.8203125, + "step": 1220, + "time_per_iteration": 2.3701112270355225 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01076356, + "balance_loss_clip": 1.01951706, + "balance_loss_mlp": 1.02584934, + "epoch": 0.07341049150759056, + "flos": 20885915825280.0, + "grad_norm": 2.7781244427758502, + "language_loss": 0.81141949, + "learning_rate": 3.98026539862741e-06, + "loss": 0.83328545, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.84375, + "step": 1221, + "time_per_iteration": 2.3969979286193848 + }, + { + "auxiliary_loss_clip": 0.01109046, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_clip": 1.02324164, + "balance_loss_mlp": 1.0254637, + "epoch": 0.07347061476025853, + "flos": 15412721627520.0, + "grad_norm": 2.0837882954351112, + "language_loss": 0.95044965, + "learning_rate": 3.980210784675722e-06, + "loss": 0.97230923, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8359375, + "step": 1222, + "time_per_iteration": 2.386704444885254 + }, + { + "auxiliary_loss_clip": 0.01114441, + "auxiliary_loss_mlp": 0.01075608, + "balance_loss_clip": 1.02058101, + "balance_loss_mlp": 1.02711535, + "epoch": 0.0735307380129265, + "flos": 11108070368640.0, + "grad_norm": 2.369027230856239, + "language_loss": 0.94117868, + "learning_rate": 3.980156095634242e-06, + "loss": 0.96307921, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.875, + "step": 1223, + "time_per_iteration": 2.3978323936462402 + }, + { + "auxiliary_loss_clip": 0.01112689, + "auxiliary_loss_mlp": 0.01076749, + "balance_loss_clip": 1.02296162, + "balance_loss_mlp": 1.02690721, + "epoch": 0.07359086126559447, + "flos": 23731267409280.0, + "grad_norm": 1.9497707885845108, + "language_loss": 0.85614836, + "learning_rate": 3.980101331505045e-06, + "loss": 0.87804276, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.859375, + "step": 1224, + "time_per_iteration": 2.4262356758117676 + }, + { + "auxiliary_loss_clip": 0.01111227, + "auxiliary_loss_mlp": 0.01085937, + "balance_loss_clip": 1.02013373, + "balance_loss_mlp": 1.02544844, + "epoch": 0.07365098451826244, + "flos": 20992296337920.0, + "grad_norm": 1.9775315444808272, + "language_loss": 0.86926222, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.89123386, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.859375, + "step": 1225, + "time_per_iteration": 2.4234533309936523 + }, + { + "auxiliary_loss_clip": 0.01113064, + "auxiliary_loss_mlp": 0.01075632, + "balance_loss_clip": 1.02000856, + "balance_loss_mlp": 1.02610576, + "epoch": 0.0737111077709304, + "flos": 19932682440960.0, + "grad_norm": 1.8867982280421667, + "language_loss": 0.93087101, + "learning_rate": 3.979991577991808e-06, + "loss": 0.95275795, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8671875, + "step": 1226, + "time_per_iteration": 2.382312059402466 + }, + { + "auxiliary_loss_clip": 0.01121495, + "auxiliary_loss_mlp": 0.01077993, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.02823472, + "epoch": 0.07377123102359838, + "flos": 16580601250560.0, + "grad_norm": 3.9106377283634415, + "language_loss": 0.81725794, + "learning_rate": 3.97993658861193e-06, + "loss": 0.83925283, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.9296875, + "step": 1227, + "time_per_iteration": 2.396059989929199 + }, + { + "auxiliary_loss_clip": 0.01109074, + "auxiliary_loss_mlp": 0.01078504, + "balance_loss_clip": 1.01956725, + "balance_loss_mlp": 1.02651048, + "epoch": 0.07383135427626634, + "flos": 28327338720000.0, + "grad_norm": 1.6325203291660428, + "language_loss": 0.87493557, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.89681137, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.828125, + "step": 1228, + "time_per_iteration": 2.465291976928711 + }, + { + "auxiliary_loss_clip": 0.01109299, + "auxiliary_loss_mlp": 0.01073676, + "balance_loss_clip": 1.01771939, + "balance_loss_mlp": 1.02482212, + "epoch": 0.07389147752893431, + "flos": 20046149959680.0, + "grad_norm": 2.2455498681705266, + "language_loss": 0.83000445, + "learning_rate": 3.97982638461608e-06, + "loss": 0.85183418, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.84375, + "step": 1229, + "time_per_iteration": 2.4005115032196045 + }, + { + "auxiliary_loss_clip": 0.01113006, + "auxiliary_loss_mlp": 0.01078398, + "balance_loss_clip": 1.01755404, + "balance_loss_mlp": 1.02557695, + "epoch": 0.07395160078160229, + "flos": 18113148691200.0, + "grad_norm": 2.103421717651121, + "language_loss": 0.82628107, + "learning_rate": 3.979771170004287e-06, + "loss": 0.84819508, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.875, + "step": 1230, + "time_per_iteration": 2.4553723335266113 + }, + { + "auxiliary_loss_clip": 0.01104364, + "auxiliary_loss_mlp": 0.01067144, + "balance_loss_clip": 1.01521623, + "balance_loss_mlp": 1.02369094, + "epoch": 0.07401172403427025, + "flos": 23585784307200.0, + "grad_norm": 1.773209059859953, + "language_loss": 0.84134996, + "learning_rate": 3.979715880319372e-06, + "loss": 0.86306506, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.8046875, + "step": 1231, + "time_per_iteration": 2.4403328895568848 + }, + { + "auxiliary_loss_clip": 0.01111281, + "auxiliary_loss_mlp": 0.01080462, + "balance_loss_clip": 1.02283573, + "balance_loss_mlp": 1.02384484, + "epoch": 0.07407184728693822, + "flos": 26358691086720.0, + "grad_norm": 2.375642219981619, + "language_loss": 0.98807317, + "learning_rate": 3.979660515563434e-06, + "loss": 1.00999057, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.875, + "step": 1232, + "time_per_iteration": 2.454683542251587 + }, + { + "auxiliary_loss_clip": 0.0110602, + "auxiliary_loss_mlp": 0.01077715, + "balance_loss_clip": 1.02233052, + "balance_loss_mlp": 1.02257073, + "epoch": 0.0741319705396062, + "flos": 22199348373120.0, + "grad_norm": 1.641293591834297, + "language_loss": 0.83461791, + "learning_rate": 3.979605075738569e-06, + "loss": 0.85645527, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8359375, + "step": 1233, + "time_per_iteration": 2.413576364517212 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01076212, + "balance_loss_clip": 1.01570177, + "balance_loss_mlp": 1.02388155, + "epoch": 0.07419209379227416, + "flos": 39198978161280.0, + "grad_norm": 2.0025818126493484, + "language_loss": 0.73032165, + "learning_rate": 3.979549560846883e-06, + "loss": 0.75218701, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.8671875, + "step": 1234, + "time_per_iteration": 2.5814261436462402 + }, + { + "auxiliary_loss_clip": 0.01108463, + "auxiliary_loss_mlp": 0.01074801, + "balance_loss_clip": 1.01767635, + "balance_loss_mlp": 1.02383804, + "epoch": 0.07425221704494213, + "flos": 22780617465600.0, + "grad_norm": 2.004035315832205, + "language_loss": 0.79149365, + "learning_rate": 3.979493970890478e-06, + "loss": 0.81332636, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.84765625, + "step": 1235, + "time_per_iteration": 2.411708116531372 + }, + { + "auxiliary_loss_clip": 0.01108472, + "auxiliary_loss_mlp": 0.01066522, + "balance_loss_clip": 1.01163781, + "balance_loss_mlp": 1.02327132, + "epoch": 0.0743123402976101, + "flos": 22271897911680.0, + "grad_norm": 1.935686887854904, + "language_loss": 0.84948778, + "learning_rate": 3.979438305871464e-06, + "loss": 0.8712377, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8515625, + "step": 1236, + "time_per_iteration": 2.4159562587738037 + }, + { + "auxiliary_loss_clip": 0.01111564, + "auxiliary_loss_mlp": 0.01078823, + "balance_loss_clip": 1.01988626, + "balance_loss_mlp": 1.02411819, + "epoch": 0.07437246355027807, + "flos": 29313739762560.0, + "grad_norm": 2.279897298121074, + "language_loss": 0.78992748, + "learning_rate": 3.979382565791951e-06, + "loss": 0.81183136, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.875, + "step": 1237, + "time_per_iteration": 2.489786386489868 + }, + { + "auxiliary_loss_clip": 0.01110402, + "auxiliary_loss_mlp": 0.01075822, + "balance_loss_clip": 1.01969862, + "balance_loss_mlp": 1.02342427, + "epoch": 0.07443258680294604, + "flos": 31943293032960.0, + "grad_norm": 1.5839640980228518, + "language_loss": 0.79452771, + "learning_rate": 3.979326750654053e-06, + "loss": 0.81638992, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.8671875, + "step": 1238, + "time_per_iteration": 2.5114190578460693 + }, + { + "auxiliary_loss_clip": 0.0111158, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_clip": 1.02221525, + "balance_loss_mlp": 1.02460039, + "epoch": 0.074492710055614, + "flos": 22674167130240.0, + "grad_norm": 2.0712338448934906, + "language_loss": 0.88884497, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.91080785, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.87109375, + "step": 1239, + "time_per_iteration": 2.3974335193634033 + }, + { + "auxiliary_loss_clip": 0.01112152, + "auxiliary_loss_mlp": 0.01082096, + "balance_loss_clip": 1.01686513, + "balance_loss_mlp": 1.02490366, + "epoch": 0.07455283330828198, + "flos": 21283925857920.0, + "grad_norm": 1.9958091208716677, + "language_loss": 0.91966289, + "learning_rate": 3.979214895211569e-06, + "loss": 0.94160545, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.875, + "step": 1240, + "time_per_iteration": 2.414065361022949 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.01080189, + "balance_loss_clip": 1.02218163, + "balance_loss_mlp": 1.02561247, + "epoch": 0.07461295656094995, + "flos": 24387285456000.0, + "grad_norm": 1.66041345755512, + "language_loss": 0.91123086, + "learning_rate": 3.979158854911225e-06, + "loss": 0.93313169, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.84375, + "step": 1241, + "time_per_iteration": 3.871889591217041 + }, + { + "auxiliary_loss_clip": 0.01031941, + "auxiliary_loss_mlp": 0.01019023, + "balance_loss_clip": 1.00652981, + "balance_loss_mlp": 1.00984669, + "epoch": 0.07467307981361791, + "flos": 62106608753280.0, + "grad_norm": 0.9086997194473594, + "language_loss": 0.63194132, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65245092, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.22070312, + "step": 1242, + "time_per_iteration": 3.0794472694396973 + }, + { + "auxiliary_loss_clip": 0.01120338, + "auxiliary_loss_mlp": 0.01082743, + "balance_loss_clip": 1.01903737, + "balance_loss_mlp": 1.02761149, + "epoch": 0.07473320306628589, + "flos": 24861999479040.0, + "grad_norm": 2.705871379581421, + "language_loss": 0.66069138, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.68272221, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.9296875, + "step": 1243, + "time_per_iteration": 3.8724122047424316 + }, + { + "auxiliary_loss_clip": 0.01108328, + "auxiliary_loss_mlp": 0.01071227, + "balance_loss_clip": 1.01634264, + "balance_loss_mlp": 1.02450335, + "epoch": 0.07479332631895386, + "flos": 24896354123520.0, + "grad_norm": 1.800178602083535, + "language_loss": 0.78938186, + "learning_rate": 3.978990283719296e-06, + "loss": 0.81117737, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.8359375, + "step": 1244, + "time_per_iteration": 3.9459540843963623 + }, + { + "auxiliary_loss_clip": 0.01116793, + "auxiliary_loss_mlp": 0.01084247, + "balance_loss_clip": 1.02387989, + "balance_loss_mlp": 1.02684224, + "epoch": 0.07485344957162182, + "flos": 17814467076480.0, + "grad_norm": 2.757158346905854, + "language_loss": 0.72362882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.7456392, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.8984375, + "step": 1245, + "time_per_iteration": 2.3565220832824707 + }, + { + "auxiliary_loss_clip": 0.01111562, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.02514184, + "balance_loss_mlp": 1.02552235, + "epoch": 0.0749135728242898, + "flos": 25009018680960.0, + "grad_norm": 1.9986136969655295, + "language_loss": 0.90960586, + "learning_rate": 3.978877527703576e-06, + "loss": 0.93157637, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.86328125, + "step": 1246, + "time_per_iteration": 3.7871716022491455 + }, + { + "auxiliary_loss_clip": 0.01121845, + "auxiliary_loss_mlp": 0.01089216, + "balance_loss_clip": 1.02717948, + "balance_loss_mlp": 1.02655077, + "epoch": 0.07497369607695777, + "flos": 17821100234880.0, + "grad_norm": 3.1104411953872164, + "language_loss": 0.92641199, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.94852263, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.953125, + "step": 1247, + "time_per_iteration": 2.3781256675720215 + }, + { + "auxiliary_loss_clip": 0.01109982, + "auxiliary_loss_mlp": 0.01076662, + "balance_loss_clip": 1.0206337, + "balance_loss_mlp": 1.0251646, + "epoch": 0.07503381932962573, + "flos": 15120219323520.0, + "grad_norm": 2.321477048376059, + "language_loss": 0.6819427, + "learning_rate": 3.978764471530921e-06, + "loss": 0.70380914, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.84765625, + "step": 1248, + "time_per_iteration": 2.3754279613494873 + }, + { + "auxiliary_loss_clip": 0.0110812, + "auxiliary_loss_mlp": 0.01078073, + "balance_loss_clip": 1.02323639, + "balance_loss_mlp": 1.02528, + "epoch": 0.0750939425822937, + "flos": 12816091635840.0, + "grad_norm": 2.4957260738878477, + "language_loss": 0.76485664, + "learning_rate": 3.978707830891102e-06, + "loss": 0.78671861, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.828125, + "step": 1249, + "time_per_iteration": 2.377328395843506 + }, + { + "auxiliary_loss_clip": 0.01114947, + "auxiliary_loss_mlp": 0.01083832, + "balance_loss_clip": 1.02391791, + "balance_loss_mlp": 1.02607584, + "epoch": 0.07515406583496168, + "flos": 24205702141440.0, + "grad_norm": 2.5642893747076525, + "language_loss": 0.84517086, + "learning_rate": 3.978651115218482e-06, + "loss": 0.86715871, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.88671875, + "step": 1250, + "time_per_iteration": 2.415754556655884 + }, + { + "auxiliary_loss_clip": 0.01109544, + "auxiliary_loss_mlp": 0.01077668, + "balance_loss_clip": 1.0204953, + "balance_loss_mlp": 1.02603602, + "epoch": 0.07521418908762964, + "flos": 26686944489600.0, + "grad_norm": 2.3179918389725236, + "language_loss": 0.70440054, + "learning_rate": 3.978594324515215e-06, + "loss": 0.72627264, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.8359375, + "step": 1251, + "time_per_iteration": 2.4797544479370117 + }, + { + "auxiliary_loss_clip": 0.01031316, + "auxiliary_loss_mlp": 0.01013148, + "balance_loss_clip": 1.00318182, + "balance_loss_mlp": 1.00989604, + "epoch": 0.0752743123402976, + "flos": 59091788096640.0, + "grad_norm": 0.9691640070571871, + "language_loss": 0.70728672, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72773135, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.21484375, + "step": 1252, + "time_per_iteration": 3.060161590576172 + }, + { + "auxiliary_loss_clip": 0.01114069, + "auxiliary_loss_mlp": 0.01074152, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.02649546, + "epoch": 0.07533443559296558, + "flos": 23475912658560.0, + "grad_norm": 3.4173303279108067, + "language_loss": 0.81790322, + "learning_rate": 3.97848051802535e-06, + "loss": 0.83978546, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.875, + "step": 1253, + "time_per_iteration": 2.4284822940826416 + }, + { + "auxiliary_loss_clip": 0.01114018, + "auxiliary_loss_mlp": 0.01076051, + "balance_loss_clip": 1.01508713, + "balance_loss_mlp": 1.02835727, + "epoch": 0.07539455884563355, + "flos": 20878270237440.0, + "grad_norm": 2.3738502271501973, + "language_loss": 0.97283733, + "learning_rate": 3.978423502243069e-06, + "loss": 0.9947381, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.85546875, + "step": 1254, + "time_per_iteration": 2.3995022773742676 + }, + { + "auxiliary_loss_clip": 0.01109642, + "auxiliary_loss_mlp": 0.01073235, + "balance_loss_clip": 1.0222131, + "balance_loss_mlp": 1.02569199, + "epoch": 0.07545468209830151, + "flos": 27671669786880.0, + "grad_norm": 1.765312967636275, + "language_loss": 0.90268552, + "learning_rate": 3.97836641143877e-06, + "loss": 0.92451429, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.83984375, + "step": 1255, + "time_per_iteration": 2.472668409347534 + }, + { + "auxiliary_loss_clip": 0.01109255, + "auxiliary_loss_mlp": 0.01090003, + "balance_loss_clip": 1.02791905, + "balance_loss_mlp": 1.02452517, + "epoch": 0.0755148053509695, + "flos": 14136122430720.0, + "grad_norm": 1.7580557517016258, + "language_loss": 0.81169355, + "learning_rate": 3.978309245614618e-06, + "loss": 0.83368611, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.84765625, + "step": 1256, + "time_per_iteration": 2.3847219944000244 + }, + { + "auxiliary_loss_clip": 0.01027411, + "auxiliary_loss_mlp": 0.01026582, + "balance_loss_clip": 1.01389813, + "balance_loss_mlp": 1.00679183, + "epoch": 0.07557492860363746, + "flos": 58232506780800.0, + "grad_norm": 0.7958810759124006, + "language_loss": 0.58199626, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.6025362, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.20605469, + "step": 1257, + "time_per_iteration": 3.1301071643829346 + }, + { + "auxiliary_loss_clip": 0.01110813, + "auxiliary_loss_mlp": 0.01086886, + "balance_loss_clip": 1.0254221, + "balance_loss_mlp": 1.02576709, + "epoch": 0.07563505185630542, + "flos": 24643233699840.0, + "grad_norm": 2.10426766113452, + "language_loss": 0.92514712, + "learning_rate": 3.978194688915432e-06, + "loss": 0.94712406, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.8515625, + "step": 1258, + "time_per_iteration": 2.415243148803711 + }, + { + "auxiliary_loss_clip": 0.01105832, + "auxiliary_loss_mlp": 0.01078805, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.02439606, + "epoch": 0.07569517510897339, + "flos": 15522104517120.0, + "grad_norm": 2.5227415239909043, + "language_loss": 0.82888377, + "learning_rate": 3.978137298044741e-06, + "loss": 0.85073018, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8125, + "step": 1259, + "time_per_iteration": 2.3969428539276123 + }, + { + "auxiliary_loss_clip": 0.0111098, + "auxiliary_loss_mlp": 0.01080769, + "balance_loss_clip": 1.0237391, + "balance_loss_mlp": 1.0241338, + "epoch": 0.07575529836164137, + "flos": 22927462110720.0, + "grad_norm": 2.247022111498786, + "language_loss": 0.7849946, + "learning_rate": 3.978079832162885e-06, + "loss": 0.80691212, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.8671875, + "step": 1260, + "time_per_iteration": 2.4059765338897705 + }, + { + "auxiliary_loss_clip": 0.01112623, + "auxiliary_loss_mlp": 0.01083664, + "balance_loss_clip": 1.0251081, + "balance_loss_mlp": 1.02541876, + "epoch": 0.07581542161430933, + "flos": 19499410068480.0, + "grad_norm": 1.7031442565244528, + "language_loss": 0.87311423, + "learning_rate": 3.978022291272044e-06, + "loss": 0.89507705, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.87109375, + "step": 1261, + "time_per_iteration": 2.4009759426116943 + }, + { + "auxiliary_loss_clip": 0.01116196, + "auxiliary_loss_mlp": 0.01082479, + "balance_loss_clip": 1.02974129, + "balance_loss_mlp": 1.02728581, + "epoch": 0.0758755448669773, + "flos": 24972290064000.0, + "grad_norm": 1.8298346067670568, + "language_loss": 0.84464765, + "learning_rate": 3.977964675374399e-06, + "loss": 0.86663449, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.890625, + "step": 1262, + "time_per_iteration": 2.4443039894104004 + }, + { + "auxiliary_loss_clip": 0.01111299, + "auxiliary_loss_mlp": 0.01080911, + "balance_loss_clip": 1.02464437, + "balance_loss_mlp": 1.02586579, + "epoch": 0.07593566811964528, + "flos": 22746856314240.0, + "grad_norm": 2.8306592051255066, + "language_loss": 0.86415261, + "learning_rate": 3.977906984472136e-06, + "loss": 0.88607466, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.85546875, + "step": 1263, + "time_per_iteration": 2.423081159591675 + }, + { + "auxiliary_loss_clip": 0.0111389, + "auxiliary_loss_mlp": 0.01078518, + "balance_loss_clip": 1.02301455, + "balance_loss_mlp": 1.02697253, + "epoch": 0.07599579137231324, + "flos": 23111279752320.0, + "grad_norm": 1.8937584893590937, + "language_loss": 0.78919494, + "learning_rate": 3.977849218567442e-06, + "loss": 0.81111908, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8671875, + "step": 1264, + "time_per_iteration": 2.4227380752563477 + }, + { + "auxiliary_loss_clip": 0.01115976, + "auxiliary_loss_mlp": 0.01084785, + "balance_loss_clip": 1.02642047, + "balance_loss_mlp": 1.0276103, + "epoch": 0.07605591462498121, + "flos": 14501174273280.0, + "grad_norm": 5.836269019396336, + "language_loss": 0.84655112, + "learning_rate": 3.977791377662507e-06, + "loss": 0.8685587, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8828125, + "step": 1265, + "time_per_iteration": 2.374069929122925 + }, + { + "auxiliary_loss_clip": 0.0111927, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_clip": 1.02766705, + "balance_loss_mlp": 1.02913237, + "epoch": 0.07611603787764919, + "flos": 23513060211840.0, + "grad_norm": 2.801887092896207, + "language_loss": 0.68210065, + "learning_rate": 3.977733461759524e-06, + "loss": 0.70416844, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.90234375, + "step": 1266, + "time_per_iteration": 2.4233310222625732 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_clip": 1.02563834, + "balance_loss_mlp": 1.02844703, + "epoch": 0.07617616113031715, + "flos": 21506112950400.0, + "grad_norm": 2.138864087768607, + "language_loss": 0.84271103, + "learning_rate": 3.977675470860691e-06, + "loss": 0.86472666, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.90234375, + "step": 1267, + "time_per_iteration": 2.4329049587249756 + }, + { + "auxiliary_loss_clip": 0.01114862, + "auxiliary_loss_mlp": 0.0108758, + "balance_loss_clip": 1.03214765, + "balance_loss_mlp": 1.02739859, + "epoch": 0.07623628438298512, + "flos": 14572327357440.0, + "grad_norm": 2.771044179461256, + "language_loss": 0.76195407, + "learning_rate": 3.977617404968205e-06, + "loss": 0.78397858, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.875, + "step": 1268, + "time_per_iteration": 2.371276378631592 + }, + { + "auxiliary_loss_clip": 0.01118232, + "auxiliary_loss_mlp": 0.01085413, + "balance_loss_clip": 1.02945626, + "balance_loss_mlp": 1.02741373, + "epoch": 0.07629640763565308, + "flos": 14719521116160.0, + "grad_norm": 2.19354742090999, + "language_loss": 0.85878783, + "learning_rate": 3.977559264084269e-06, + "loss": 0.88082427, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.91015625, + "step": 1269, + "time_per_iteration": 2.4100465774536133 + }, + { + "auxiliary_loss_clip": 0.01117596, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_clip": 1.02993524, + "balance_loss_mlp": 1.02872252, + "epoch": 0.07635653088832106, + "flos": 14902047037440.0, + "grad_norm": 2.5239472678751866, + "language_loss": 0.92882013, + "learning_rate": 3.977501048211088e-06, + "loss": 0.95087475, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.890625, + "step": 1270, + "time_per_iteration": 2.390667200088501 + }, + { + "auxiliary_loss_clip": 0.0111819, + "auxiliary_loss_mlp": 0.01088361, + "balance_loss_clip": 1.02823234, + "balance_loss_mlp": 1.02841234, + "epoch": 0.07641665414098903, + "flos": 26650355518080.0, + "grad_norm": 2.1356163807142554, + "language_loss": 0.74612987, + "learning_rate": 3.977442757350869e-06, + "loss": 0.76819539, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.8984375, + "step": 1271, + "time_per_iteration": 2.4645323753356934 + }, + { + "auxiliary_loss_clip": 0.01106961, + "auxiliary_loss_mlp": 0.01072809, + "balance_loss_clip": 1.01928401, + "balance_loss_mlp": 1.02486467, + "epoch": 0.07647677739365699, + "flos": 25191614424960.0, + "grad_norm": 2.00499679941534, + "language_loss": 0.83985341, + "learning_rate": 3.977384391505823e-06, + "loss": 0.86165106, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.8203125, + "step": 1272, + "time_per_iteration": 2.453439950942993 + }, + { + "auxiliary_loss_clip": 0.01112914, + "auxiliary_loss_mlp": 0.01075436, + "balance_loss_clip": 1.01936054, + "balance_loss_mlp": 1.02516222, + "epoch": 0.07653690064632497, + "flos": 20557103840640.0, + "grad_norm": 1.6337958630292642, + "language_loss": 0.83530927, + "learning_rate": 3.977325950678162e-06, + "loss": 0.85719275, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.87890625, + "step": 1273, + "time_per_iteration": 2.404423475265503 + }, + { + "auxiliary_loss_clip": 0.01116261, + "auxiliary_loss_mlp": 0.01080123, + "balance_loss_clip": 1.02066171, + "balance_loss_mlp": 1.02459669, + "epoch": 0.07659702389899294, + "flos": 22268336952960.0, + "grad_norm": 1.7880935307707355, + "language_loss": 0.83093613, + "learning_rate": 3.977267434870103e-06, + "loss": 0.85289991, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.91796875, + "step": 1274, + "time_per_iteration": 2.426405429840088 + }, + { + "auxiliary_loss_clip": 0.0111479, + "auxiliary_loss_mlp": 0.0107794, + "balance_loss_clip": 1.02069569, + "balance_loss_mlp": 1.0269407, + "epoch": 0.0766571471516609, + "flos": 32634713064960.0, + "grad_norm": 1.6413403518417475, + "language_loss": 0.74679124, + "learning_rate": 3.977208844083865e-06, + "loss": 0.7687186, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.87890625, + "step": 1275, + "time_per_iteration": 2.490630865097046 + }, + { + "auxiliary_loss_clip": 0.01114042, + "auxiliary_loss_mlp": 0.01087374, + "balance_loss_clip": 1.02521873, + "balance_loss_mlp": 1.02556527, + "epoch": 0.07671727040432888, + "flos": 15266505386880.0, + "grad_norm": 2.124158014918776, + "language_loss": 0.83142871, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.85344291, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.8828125, + "step": 1276, + "time_per_iteration": 2.3857274055480957 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01076865, + "balance_loss_clip": 1.01792765, + "balance_loss_mlp": 1.02478075, + "epoch": 0.07677739365699685, + "flos": 28182833136000.0, + "grad_norm": 2.33514294870614, + "language_loss": 0.6354655, + "learning_rate": 3.97709143758574e-06, + "loss": 0.6573956, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.9140625, + "step": 1277, + "time_per_iteration": 2.4477226734161377 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01082188, + "balance_loss_clip": 1.02167702, + "balance_loss_mlp": 1.02578163, + "epoch": 0.07683751690966481, + "flos": 18295150942080.0, + "grad_norm": 2.7361114192969347, + "language_loss": 0.78426409, + "learning_rate": 3.977032621878305e-06, + "loss": 0.80626243, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.91796875, + "step": 1278, + "time_per_iteration": 2.3961613178253174 + }, + { + "auxiliary_loss_clip": 0.01111011, + "auxiliary_loss_mlp": 0.01078639, + "balance_loss_clip": 1.02032232, + "balance_loss_mlp": 1.02468765, + "epoch": 0.07689764016233278, + "flos": 21980024012160.0, + "grad_norm": 2.1504266101209026, + "language_loss": 0.90395373, + "learning_rate": 3.976973731201596e-06, + "loss": 0.92585027, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.86328125, + "step": 1279, + "time_per_iteration": 2.408963918685913 + }, + { + "auxiliary_loss_clip": 0.01114736, + "auxiliary_loss_mlp": 0.01081161, + "balance_loss_clip": 1.02448916, + "balance_loss_mlp": 1.02696049, + "epoch": 0.07695776341500075, + "flos": 22234924915200.0, + "grad_norm": 2.903596671162025, + "language_loss": 0.85071397, + "learning_rate": 3.976914765557845e-06, + "loss": 0.87267292, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.87890625, + "step": 1280, + "time_per_iteration": 3.9086408615112305 + }, + { + "auxiliary_loss_clip": 0.01116047, + "auxiliary_loss_mlp": 0.01084506, + "balance_loss_clip": 1.02199268, + "balance_loss_mlp": 1.02780628, + "epoch": 0.07701788666766872, + "flos": 16142825312640.0, + "grad_norm": 2.0221543100916013, + "language_loss": 0.77778512, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.79979062, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.8828125, + "step": 1281, + "time_per_iteration": 2.4197499752044678 + }, + { + "auxiliary_loss_clip": 0.01119853, + "auxiliary_loss_mlp": 0.01084451, + "balance_loss_clip": 1.02625275, + "balance_loss_mlp": 1.02659404, + "epoch": 0.07707800992033668, + "flos": 19462053047040.0, + "grad_norm": 1.8473089572523251, + "language_loss": 0.78039145, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.80243444, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.9296875, + "step": 1282, + "time_per_iteration": 3.84993839263916 + }, + { + "auxiliary_loss_clip": 0.01115278, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_clip": 1.0307976, + "balance_loss_mlp": 1.02454948, + "epoch": 0.07713813317300466, + "flos": 18989259148800.0, + "grad_norm": 2.4401431012498196, + "language_loss": 0.85270166, + "learning_rate": 3.976737418846713e-06, + "loss": 0.87470013, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.90625, + "step": 1283, + "time_per_iteration": 3.811868906021118 + }, + { + "auxiliary_loss_clip": 0.01112127, + "auxiliary_loss_mlp": 0.01088686, + "balance_loss_clip": 1.02974927, + "balance_loss_mlp": 1.02479434, + "epoch": 0.07719825642567263, + "flos": 18112974134400.0, + "grad_norm": 2.2766967210165423, + "language_loss": 0.77546883, + "learning_rate": 3.976678153357181e-06, + "loss": 0.79747695, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.875, + "step": 1284, + "time_per_iteration": 2.3963985443115234 + }, + { + "auxiliary_loss_clip": 0.01113136, + "auxiliary_loss_mlp": 0.01080375, + "balance_loss_clip": 1.02406073, + "balance_loss_mlp": 1.02427959, + "epoch": 0.0772583796783406, + "flos": 42192780312960.0, + "grad_norm": 2.1345674308908213, + "language_loss": 0.78020114, + "learning_rate": 3.976618812911817e-06, + "loss": 0.80213624, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.88671875, + "step": 1285, + "time_per_iteration": 3.9257423877716064 + }, + { + "auxiliary_loss_clip": 0.01112151, + "auxiliary_loss_mlp": 0.01081314, + "balance_loss_clip": 1.02573895, + "balance_loss_mlp": 1.02636755, + "epoch": 0.07731850293100857, + "flos": 24752546766720.0, + "grad_norm": 1.9899012172378996, + "language_loss": 0.86305422, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.8849889, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.859375, + "step": 1286, + "time_per_iteration": 2.4472496509552 + }, + { + "auxiliary_loss_clip": 0.01115797, + "auxiliary_loss_mlp": 0.01078622, + "balance_loss_clip": 1.02040064, + "balance_loss_mlp": 1.02593493, + "epoch": 0.07737862618367654, + "flos": 17564942522880.0, + "grad_norm": 2.7013698388651735, + "language_loss": 0.82241994, + "learning_rate": 3.97649990716259e-06, + "loss": 0.84436411, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.8984375, + "step": 1287, + "time_per_iteration": 2.3523247241973877 + }, + { + "auxiliary_loss_clip": 0.01113506, + "auxiliary_loss_mlp": 0.01078623, + "balance_loss_clip": 1.01923311, + "balance_loss_mlp": 1.0250392, + "epoch": 0.0774387494363445, + "flos": 25626038872320.0, + "grad_norm": 1.840611530581988, + "language_loss": 0.86045706, + "learning_rate": 3.976440341863237e-06, + "loss": 0.88237834, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.88671875, + "step": 1288, + "time_per_iteration": 2.4548630714416504 + }, + { + "auxiliary_loss_clip": 0.0111625, + "auxiliary_loss_mlp": 0.01086655, + "balance_loss_clip": 1.02561998, + "balance_loss_mlp": 1.02570438, + "epoch": 0.07749887268901248, + "flos": 12239046817920.0, + "grad_norm": 2.220347960724826, + "language_loss": 0.88918692, + "learning_rate": 3.976380701617068e-06, + "loss": 0.91121596, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.90625, + "step": 1289, + "time_per_iteration": 2.36944580078125 + }, + { + "auxiliary_loss_clip": 0.01114819, + "auxiliary_loss_mlp": 0.0107578, + "balance_loss_clip": 1.01746273, + "balance_loss_mlp": 1.02522171, + "epoch": 0.07755899594168045, + "flos": 25080590701440.0, + "grad_norm": 1.844656224417422, + "language_loss": 0.87233949, + "learning_rate": 3.976320986426344e-06, + "loss": 0.89424545, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.8984375, + "step": 1290, + "time_per_iteration": 2.450631618499756 + }, + { + "auxiliary_loss_clip": 0.01109913, + "auxiliary_loss_mlp": 0.01088995, + "balance_loss_clip": 1.02924693, + "balance_loss_mlp": 1.02527738, + "epoch": 0.07761911919434841, + "flos": 14245540231680.0, + "grad_norm": 2.0480529724921737, + "language_loss": 0.94071054, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.96269959, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.84765625, + "step": 1291, + "time_per_iteration": 2.3772730827331543 + }, + { + "auxiliary_loss_clip": 0.01037002, + "auxiliary_loss_mlp": 0.01070855, + "balance_loss_clip": 1.06127095, + "balance_loss_mlp": 1.01347315, + "epoch": 0.07767924244701638, + "flos": 67233463597440.0, + "grad_norm": 0.9379739157002828, + "language_loss": 0.65196186, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67304045, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.09570312, + "router_z_loss_mlp": 0.23535156, + "step": 1292, + "time_per_iteration": 3.140533924102783 + }, + { + "auxiliary_loss_clip": 0.01111043, + "auxiliary_loss_mlp": 0.0108101, + "balance_loss_clip": 1.02264476, + "balance_loss_mlp": 1.02411091, + "epoch": 0.07773936569968436, + "flos": 28549316344320.0, + "grad_norm": 1.6000556653965194, + "language_loss": 0.90237826, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.92429888, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.8671875, + "step": 1293, + "time_per_iteration": 2.4698164463043213 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.0108832, + "balance_loss_clip": 1.02516317, + "balance_loss_mlp": 1.02611494, + "epoch": 0.07779948895235232, + "flos": 27489039131520.0, + "grad_norm": 2.3333879269140048, + "language_loss": 0.88249624, + "learning_rate": 3.976081376263239e-06, + "loss": 0.90451515, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.875, + "step": 1294, + "time_per_iteration": 2.50152325630188 + }, + { + "auxiliary_loss_clip": 0.01117477, + "auxiliary_loss_mlp": 0.01092092, + "balance_loss_clip": 1.02714658, + "balance_loss_mlp": 1.02734089, + "epoch": 0.07785961220502029, + "flos": 18222322112640.0, + "grad_norm": 2.292976268986861, + "language_loss": 0.82917792, + "learning_rate": 3.976021286383768e-06, + "loss": 0.85127366, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.8984375, + "step": 1295, + "time_per_iteration": 2.3899710178375244 + }, + { + "auxiliary_loss_clip": 0.01111525, + "auxiliary_loss_mlp": 0.01088247, + "balance_loss_clip": 1.02911878, + "balance_loss_mlp": 1.02379036, + "epoch": 0.07791973545768827, + "flos": 24607063664640.0, + "grad_norm": 1.977872379709689, + "language_loss": 0.91423941, + "learning_rate": 3.975961121573371e-06, + "loss": 0.93623716, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.875, + "step": 1296, + "time_per_iteration": 2.435633420944214 + }, + { + "auxiliary_loss_clip": 0.0111575, + "auxiliary_loss_mlp": 0.01091681, + "balance_loss_clip": 1.02830935, + "balance_loss_mlp": 1.02601528, + "epoch": 0.07797985871035623, + "flos": 14281221507840.0, + "grad_norm": 2.6504782850883415, + "language_loss": 0.99314928, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.0152235, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.8984375, + "step": 1297, + "time_per_iteration": 2.3925867080688477 + }, + { + "auxiliary_loss_clip": 0.0111716, + "auxiliary_loss_mlp": 0.01092027, + "balance_loss_clip": 1.02958512, + "balance_loss_mlp": 1.02448416, + "epoch": 0.0780399819630242, + "flos": 26609367715200.0, + "grad_norm": 2.308454576673817, + "language_loss": 0.78110254, + "learning_rate": 3.97584056716893e-06, + "loss": 0.80319446, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.921875, + "step": 1298, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01113427, + "auxiliary_loss_mlp": 0.01093542, + "balance_loss_clip": 1.0345093, + "balance_loss_mlp": 1.02643466, + "epoch": 0.07810010521569218, + "flos": 21833458657920.0, + "grad_norm": 1.6949417476983086, + "language_loss": 0.82703048, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.84910017, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.8671875, + "step": 1299, + "time_per_iteration": 2.4164834022521973 + }, + { + "auxiliary_loss_clip": 0.01107761, + "auxiliary_loss_mlp": 0.01077304, + "balance_loss_clip": 1.02737927, + "balance_loss_mlp": 1.02589202, + "epoch": 0.07816022846836014, + "flos": 25080101942400.0, + "grad_norm": 3.0651608150289977, + "language_loss": 0.89050609, + "learning_rate": 3.975719713068202e-06, + "loss": 0.91235673, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.81640625, + "step": 1300, + "time_per_iteration": 2.431771755218506 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.01097212, + "balance_loss_clip": 1.03429329, + "balance_loss_mlp": 1.0273807, + "epoch": 0.0782203517210281, + "flos": 40915901825280.0, + "grad_norm": 2.2082953876317792, + "language_loss": 0.74573475, + "learning_rate": 3.975659173637458e-06, + "loss": 0.76786602, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.8828125, + "step": 1301, + "time_per_iteration": 2.5685629844665527 + }, + { + "auxiliary_loss_clip": 0.01118665, + "auxiliary_loss_mlp": 0.01084393, + "balance_loss_clip": 1.02462149, + "balance_loss_mlp": 1.02612078, + "epoch": 0.07828047497369607, + "flos": 41170418703360.0, + "grad_norm": 3.978183644179768, + "language_loss": 0.73210657, + "learning_rate": 3.97559855928952e-06, + "loss": 0.75413716, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.921875, + "step": 1302, + "time_per_iteration": 2.5999114513397217 + }, + { + "auxiliary_loss_clip": 0.01113395, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_clip": 1.0244621, + "balance_loss_mlp": 1.02644074, + "epoch": 0.07834059822636405, + "flos": 23507160192000.0, + "grad_norm": 2.008387485909996, + "language_loss": 0.84534049, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.86726838, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8671875, + "step": 1303, + "time_per_iteration": 2.4097769260406494 + }, + { + "auxiliary_loss_clip": 0.0111355, + "auxiliary_loss_mlp": 0.01082052, + "balance_loss_clip": 1.02661943, + "balance_loss_mlp": 1.02537799, + "epoch": 0.07840072147903202, + "flos": 20192854959360.0, + "grad_norm": 2.123858019636386, + "language_loss": 0.77221608, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.79417211, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.87890625, + "step": 1304, + "time_per_iteration": 2.410893678665161 + }, + { + "auxiliary_loss_clip": 0.0111136, + "auxiliary_loss_mlp": 0.01075512, + "balance_loss_clip": 1.02065158, + "balance_loss_mlp": 1.02583385, + "epoch": 0.07846084473169998, + "flos": 21359757064320.0, + "grad_norm": 1.6187257570693327, + "language_loss": 0.78066891, + "learning_rate": 3.975416266765542e-06, + "loss": 0.80253768, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8515625, + "step": 1305, + "time_per_iteration": 2.3891444206237793 + }, + { + "auxiliary_loss_clip": 0.01117743, + "auxiliary_loss_mlp": 0.01082784, + "balance_loss_clip": 1.0204618, + "balance_loss_mlp": 1.02722454, + "epoch": 0.07852096798436796, + "flos": 25409786711040.0, + "grad_norm": 1.9346441081910533, + "language_loss": 0.89066184, + "learning_rate": 3.975355352771841e-06, + "loss": 0.9126671, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.90625, + "step": 1306, + "time_per_iteration": 2.4632389545440674 + }, + { + "auxiliary_loss_clip": 0.0111108, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_clip": 1.01848888, + "balance_loss_mlp": 1.02584505, + "epoch": 0.07858109123703592, + "flos": 24570335047680.0, + "grad_norm": 2.179967687732444, + "language_loss": 0.92156303, + "learning_rate": 3.975294363872468e-06, + "loss": 0.94339848, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8515625, + "step": 1307, + "time_per_iteration": 2.4139764308929443 + }, + { + "auxiliary_loss_clip": 0.01108242, + "auxiliary_loss_mlp": 0.01078333, + "balance_loss_clip": 1.02058756, + "balance_loss_mlp": 1.02415991, + "epoch": 0.07864121448970389, + "flos": 20697978643200.0, + "grad_norm": 1.936468025474089, + "language_loss": 0.85044777, + "learning_rate": 3.975233300069735e-06, + "loss": 0.8723135, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.83984375, + "step": 1308, + "time_per_iteration": 2.421154737472534 + }, + { + "auxiliary_loss_clip": 0.01109996, + "auxiliary_loss_mlp": 0.01066194, + "balance_loss_clip": 1.01703167, + "balance_loss_mlp": 1.02503967, + "epoch": 0.07870133774237187, + "flos": 22965412625280.0, + "grad_norm": 1.4516141332667416, + "language_loss": 0.7930882, + "learning_rate": 3.975172161365958e-06, + "loss": 0.81485015, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.8515625, + "step": 1309, + "time_per_iteration": 2.4105026721954346 + }, + { + "auxiliary_loss_clip": 0.01112374, + "auxiliary_loss_mlp": 0.01074677, + "balance_loss_clip": 1.02091324, + "balance_loss_mlp": 1.02549911, + "epoch": 0.07876146099503983, + "flos": 18841855921920.0, + "grad_norm": 2.479661519581678, + "language_loss": 0.83342427, + "learning_rate": 3.975110947763453e-06, + "loss": 0.8552947, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8671875, + "step": 1310, + "time_per_iteration": 2.3910391330718994 + }, + { + "auxiliary_loss_clip": 0.01109369, + "auxiliary_loss_mlp": 0.01073696, + "balance_loss_clip": 1.0241046, + "balance_loss_mlp": 1.02740824, + "epoch": 0.0788215842477078, + "flos": 23804654820480.0, + "grad_norm": 1.7860338378880796, + "language_loss": 0.7527582, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.77458882, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.8203125, + "step": 1311, + "time_per_iteration": 2.4259324073791504 + }, + { + "auxiliary_loss_clip": 0.01107963, + "auxiliary_loss_mlp": 0.01075911, + "balance_loss_clip": 1.02608192, + "balance_loss_mlp": 1.0258348, + "epoch": 0.07888170750037576, + "flos": 21578837045760.0, + "grad_norm": 1.7606789402917775, + "language_loss": 0.88043088, + "learning_rate": 3.974988295871553e-06, + "loss": 0.90226966, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.8203125, + "step": 1312, + "time_per_iteration": 2.4214870929718018 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01075993, + "balance_loss_clip": 1.02740359, + "balance_loss_mlp": 1.02531779, + "epoch": 0.07894183075304374, + "flos": 19863833506560.0, + "grad_norm": 1.7732417231147266, + "language_loss": 0.84008086, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.86191571, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.8203125, + "step": 1313, + "time_per_iteration": 2.388854503631592 + }, + { + "auxiliary_loss_clip": 0.01114652, + "auxiliary_loss_mlp": 0.0107273, + "balance_loss_clip": 1.01615298, + "balance_loss_mlp": 1.02576399, + "epoch": 0.07900195400571171, + "flos": 16142546021760.0, + "grad_norm": 2.4525097879860853, + "language_loss": 0.7790637, + "learning_rate": 3.97486534441264e-06, + "loss": 0.80093753, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.890625, + "step": 1314, + "time_per_iteration": 2.43091082572937 + }, + { + "auxiliary_loss_clip": 0.01106784, + "auxiliary_loss_mlp": 0.01072151, + "balance_loss_clip": 1.01984191, + "balance_loss_mlp": 1.02299643, + "epoch": 0.07906207725837967, + "flos": 23729347284480.0, + "grad_norm": 1.5957687776262106, + "language_loss": 0.81942004, + "learning_rate": 3.974803756351379e-06, + "loss": 0.84120941, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.83984375, + "step": 1315, + "time_per_iteration": 2.473597764968872 + }, + { + "auxiliary_loss_clip": 0.01110859, + "auxiliary_loss_mlp": 0.01079685, + "balance_loss_clip": 1.02134371, + "balance_loss_mlp": 1.02446198, + "epoch": 0.07912220051104765, + "flos": 24314770828800.0, + "grad_norm": 2.3508693143268804, + "language_loss": 0.7562601, + "learning_rate": 3.974742093405362e-06, + "loss": 0.77816558, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.8671875, + "step": 1316, + "time_per_iteration": 2.4428412914276123 + }, + { + "auxiliary_loss_clip": 0.01108821, + "auxiliary_loss_mlp": 0.01075307, + "balance_loss_clip": 1.01985073, + "balance_loss_mlp": 1.02337098, + "epoch": 0.07918232376371562, + "flos": 18879038386560.0, + "grad_norm": 3.7468830690884083, + "language_loss": 0.6882624, + "learning_rate": 3.974680355576927e-06, + "loss": 0.71010375, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.85546875, + "step": 1317, + "time_per_iteration": 2.401154041290283 + }, + { + "auxiliary_loss_clip": 0.01115937, + "auxiliary_loss_mlp": 0.01082899, + "balance_loss_clip": 1.0198375, + "balance_loss_mlp": 1.02634323, + "epoch": 0.07924244701638358, + "flos": 27375187587840.0, + "grad_norm": 2.9065938412832373, + "language_loss": 0.77951217, + "learning_rate": 3.974618542868415e-06, + "loss": 0.80150056, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.8984375, + "step": 1318, + "time_per_iteration": 2.4679880142211914 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_clip": 1.02147758, + "balance_loss_mlp": 1.02551961, + "epoch": 0.07930257026905156, + "flos": 25119134709120.0, + "grad_norm": 1.5806039296746688, + "language_loss": 0.92131197, + "learning_rate": 3.97455665528217e-06, + "loss": 0.9431386, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.83203125, + "step": 1319, + "time_per_iteration": 2.4307196140289307 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01071557, + "balance_loss_clip": 1.01819885, + "balance_loss_mlp": 1.02368414, + "epoch": 0.07936269352171953, + "flos": 21833423746560.0, + "grad_norm": 1.8222302199754363, + "language_loss": 0.82644433, + "learning_rate": 3.974494692820539e-06, + "loss": 0.84823799, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.83984375, + "step": 1320, + "time_per_iteration": 3.8100790977478027 + }, + { + "auxiliary_loss_clip": 0.01110221, + "auxiliary_loss_mlp": 0.01077873, + "balance_loss_clip": 1.02284575, + "balance_loss_mlp": 1.0259999, + "epoch": 0.07942281677438749, + "flos": 16939124668800.0, + "grad_norm": 2.203952441167756, + "language_loss": 0.71933568, + "learning_rate": 3.974432655485872e-06, + "loss": 0.74121666, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.84375, + "step": 1321, + "time_per_iteration": 2.352989673614502 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.01075883, + "balance_loss_clip": 1.01956868, + "balance_loss_mlp": 1.02319884, + "epoch": 0.07948294002705546, + "flos": 18986012392320.0, + "grad_norm": 2.003780916195859, + "language_loss": 0.87183511, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.89366043, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8359375, + "step": 1322, + "time_per_iteration": 3.8031857013702393 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.0107431, + "balance_loss_clip": 1.02092791, + "balance_loss_mlp": 1.02308059, + "epoch": 0.07954306327972344, + "flos": 21652364102400.0, + "grad_norm": 1.9804893736623392, + "language_loss": 0.93101752, + "learning_rate": 3.974308356206838e-06, + "loss": 0.95283353, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.83984375, + "step": 1323, + "time_per_iteration": 3.7943084239959717 + }, + { + "auxiliary_loss_clip": 0.0110532, + "auxiliary_loss_mlp": 0.01070733, + "balance_loss_clip": 1.01937795, + "balance_loss_mlp": 1.02309442, + "epoch": 0.0796031865323914, + "flos": 23219196364800.0, + "grad_norm": 1.701213951488955, + "language_loss": 0.84239727, + "learning_rate": 3.974246094267187e-06, + "loss": 0.8641578, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.8203125, + "step": 1324, + "time_per_iteration": 4.064820289611816 + }, + { + "auxiliary_loss_clip": 0.01109688, + "auxiliary_loss_mlp": 0.01073908, + "balance_loss_clip": 1.0191915, + "balance_loss_mlp": 1.02427816, + "epoch": 0.07966330978505937, + "flos": 23293421648640.0, + "grad_norm": 2.06694175959185, + "language_loss": 0.82039601, + "learning_rate": 3.974183757463925e-06, + "loss": 0.84223199, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8515625, + "step": 1325, + "time_per_iteration": 2.572401762008667 + }, + { + "auxiliary_loss_clip": 0.0110625, + "auxiliary_loss_mlp": 0.0108306, + "balance_loss_clip": 1.02476692, + "balance_loss_mlp": 1.02346456, + "epoch": 0.07972343303772735, + "flos": 18362952535680.0, + "grad_norm": 2.8761018157466154, + "language_loss": 0.90512717, + "learning_rate": 3.974121345799418e-06, + "loss": 0.92702031, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.828125, + "step": 1326, + "time_per_iteration": 2.4042999744415283 + }, + { + "auxiliary_loss_clip": 0.01104156, + "auxiliary_loss_mlp": 0.01065993, + "balance_loss_clip": 1.01489997, + "balance_loss_mlp": 1.02272522, + "epoch": 0.07978355629039531, + "flos": 21761432789760.0, + "grad_norm": 1.85635247404814, + "language_loss": 0.85363793, + "learning_rate": 3.974058859276032e-06, + "loss": 0.87533939, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.8125, + "step": 1327, + "time_per_iteration": 2.3813905715942383 + }, + { + "auxiliary_loss_clip": 0.01109646, + "auxiliary_loss_mlp": 0.01082696, + "balance_loss_clip": 1.02225661, + "balance_loss_mlp": 1.02393973, + "epoch": 0.07984367954306328, + "flos": 18550331136000.0, + "grad_norm": 2.3710652503006786, + "language_loss": 0.82645732, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.84838068, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.859375, + "step": 1328, + "time_per_iteration": 2.3928399085998535 + }, + { + "auxiliary_loss_clip": 0.01108894, + "auxiliary_loss_mlp": 0.01075048, + "balance_loss_clip": 1.01503801, + "balance_loss_mlp": 1.02365673, + "epoch": 0.07990380279573125, + "flos": 16903268835840.0, + "grad_norm": 3.316858033803759, + "language_loss": 0.77969515, + "learning_rate": 3.973933661662101e-06, + "loss": 0.80153453, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.8515625, + "step": 1329, + "time_per_iteration": 2.377809762954712 + }, + { + "auxiliary_loss_clip": 0.01108532, + "auxiliary_loss_mlp": 0.01069511, + "balance_loss_clip": 1.01584268, + "balance_loss_mlp": 1.02538419, + "epoch": 0.07996392604839922, + "flos": 24097192035840.0, + "grad_norm": 1.6127542204939962, + "language_loss": 0.83123344, + "learning_rate": 3.973870950576305e-06, + "loss": 0.85301387, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.83203125, + "step": 1330, + "time_per_iteration": 2.5272607803344727 + }, + { + "auxiliary_loss_clip": 0.01109376, + "auxiliary_loss_mlp": 0.01078341, + "balance_loss_clip": 1.02381492, + "balance_loss_mlp": 1.02487612, + "epoch": 0.08002404930106718, + "flos": 14277974751360.0, + "grad_norm": 1.8084030049542563, + "language_loss": 0.91104257, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.9329198, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.84375, + "step": 1331, + "time_per_iteration": 2.372652530670166 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01074205, + "balance_loss_clip": 1.01974988, + "balance_loss_mlp": 1.02541745, + "epoch": 0.08008417255373516, + "flos": 40404633742080.0, + "grad_norm": 1.80529961066992, + "language_loss": 0.75606763, + "learning_rate": 3.973745303858942e-06, + "loss": 0.77794242, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.87890625, + "step": 1332, + "time_per_iteration": 2.570234775543213 + }, + { + "auxiliary_loss_clip": 0.01107836, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_clip": 1.01726389, + "balance_loss_mlp": 1.02548635, + "epoch": 0.08014429580640313, + "flos": 18477921242880.0, + "grad_norm": 1.7278731775416323, + "language_loss": 0.83549148, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85723865, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.82421875, + "step": 1333, + "time_per_iteration": 2.3747293949127197 + }, + { + "auxiliary_loss_clip": 0.01108425, + "auxiliary_loss_mlp": 0.01070924, + "balance_loss_clip": 1.01768541, + "balance_loss_mlp": 1.02411902, + "epoch": 0.0802044190590711, + "flos": 22052398993920.0, + "grad_norm": 2.498379040975243, + "language_loss": 0.79254526, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.81433874, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.84375, + "step": 1334, + "time_per_iteration": 2.4315414428710938 + }, + { + "auxiliary_loss_clip": 0.0110698, + "auxiliary_loss_mlp": 0.01074646, + "balance_loss_clip": 1.01988173, + "balance_loss_mlp": 1.02452981, + "epoch": 0.08026454231173906, + "flos": 24570963452160.0, + "grad_norm": 1.937995151572117, + "language_loss": 0.82187104, + "learning_rate": 3.973556272454221e-06, + "loss": 0.8436873, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.82421875, + "step": 1335, + "time_per_iteration": 2.4134137630462646 + }, + { + "auxiliary_loss_clip": 0.01036358, + "auxiliary_loss_mlp": 0.01014533, + "balance_loss_clip": 1.00466263, + "balance_loss_mlp": 1.01556134, + "epoch": 0.08032466556440704, + "flos": 52579195545600.0, + "grad_norm": 0.7459005631939178, + "language_loss": 0.56162834, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58213723, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.20800781, + "step": 1336, + "time_per_iteration": 3.086026191711426 + }, + { + "auxiliary_loss_clip": 0.01110818, + "auxiliary_loss_mlp": 0.01073064, + "balance_loss_clip": 1.0180608, + "balance_loss_mlp": 1.02682829, + "epoch": 0.080384788817075, + "flos": 23841453260160.0, + "grad_norm": 1.8846504309521057, + "language_loss": 0.69978154, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.72162032, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.83984375, + "step": 1337, + "time_per_iteration": 2.412435293197632 + }, + { + "auxiliary_loss_clip": 0.01108605, + "auxiliary_loss_mlp": 0.01078672, + "balance_loss_clip": 1.02750707, + "balance_loss_mlp": 1.02578926, + "epoch": 0.08044491206974297, + "flos": 25299565948800.0, + "grad_norm": 1.802117850198766, + "language_loss": 0.89150703, + "learning_rate": 3.973366567512453e-06, + "loss": 0.91337979, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.828125, + "step": 1338, + "time_per_iteration": 2.507984161376953 + }, + { + "auxiliary_loss_clip": 0.01108053, + "auxiliary_loss_mlp": 0.01086142, + "balance_loss_clip": 1.03149617, + "balance_loss_mlp": 1.0237062, + "epoch": 0.08050503532241095, + "flos": 22375625160960.0, + "grad_norm": 2.2070546301110285, + "language_loss": 0.89671904, + "learning_rate": 3.973303182868147e-06, + "loss": 0.91866106, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.84375, + "step": 1339, + "time_per_iteration": 2.3635315895080566 + }, + { + "auxiliary_loss_clip": 0.01104289, + "auxiliary_loss_mlp": 0.01065787, + "balance_loss_clip": 1.01827073, + "balance_loss_mlp": 1.02461195, + "epoch": 0.08056515857507891, + "flos": 18368433619200.0, + "grad_norm": 2.026810562942507, + "language_loss": 0.91961873, + "learning_rate": 3.973239723395988e-06, + "loss": 0.94131953, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.796875, + "step": 1340, + "time_per_iteration": 2.395108461380005 + }, + { + "auxiliary_loss_clip": 0.01028235, + "auxiliary_loss_mlp": 0.01020038, + "balance_loss_clip": 1.00959539, + "balance_loss_mlp": 1.00784612, + "epoch": 0.08062528182774688, + "flos": 51345329719680.0, + "grad_norm": 0.9265618169854715, + "language_loss": 0.64822388, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66870654, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.203125, + "step": 1341, + "time_per_iteration": 2.9663238525390625 + }, + { + "auxiliary_loss_clip": 0.01115499, + "auxiliary_loss_mlp": 0.01083281, + "balance_loss_clip": 1.02615595, + "balance_loss_mlp": 1.02748251, + "epoch": 0.08068540508041486, + "flos": 17598843319680.0, + "grad_norm": 1.8554481480279617, + "language_loss": 0.91549879, + "learning_rate": 3.973112579977733e-06, + "loss": 0.93748653, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.8828125, + "step": 1342, + "time_per_iteration": 2.396362781524658 + }, + { + "auxiliary_loss_clip": 0.01116091, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_clip": 1.01815748, + "balance_loss_mlp": 1.02835691, + "epoch": 0.08074552833308282, + "flos": 10560422782080.0, + "grad_norm": 2.4758569954282668, + "language_loss": 0.78880423, + "learning_rate": 3.973048896036459e-06, + "loss": 0.81071007, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.87890625, + "step": 1343, + "time_per_iteration": 2.3771536350250244 + }, + { + "auxiliary_loss_clip": 0.01027631, + "auxiliary_loss_mlp": 0.01013065, + "balance_loss_clip": 1.00276518, + "balance_loss_mlp": 1.00715125, + "epoch": 0.08080565158575079, + "flos": 60837026739840.0, + "grad_norm": 0.8118396813334285, + "language_loss": 0.57663286, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59703982, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.20507812, + "step": 1344, + "time_per_iteration": 2.960801124572754 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01085935, + "balance_loss_clip": 1.02973962, + "balance_loss_mlp": 1.03091156, + "epoch": 0.08086577483841875, + "flos": 18331390800000.0, + "grad_norm": 2.253586511031173, + "language_loss": 0.89439678, + "learning_rate": 3.972921303701695e-06, + "loss": 0.91644335, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.87890625, + "step": 1345, + "time_per_iteration": 2.3916232585906982 + }, + { + "auxiliary_loss_clip": 0.0111543, + "auxiliary_loss_mlp": 0.01078627, + "balance_loss_clip": 1.02681899, + "balance_loss_mlp": 1.03055668, + "epoch": 0.08092589809108673, + "flos": 21542527365120.0, + "grad_norm": 1.6045428472178436, + "language_loss": 0.89417076, + "learning_rate": 3.972857395313042e-06, + "loss": 0.91611147, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.84765625, + "step": 1346, + "time_per_iteration": 2.424100160598755 + }, + { + "auxiliary_loss_clip": 0.01114454, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_clip": 1.02897811, + "balance_loss_mlp": 1.0300796, + "epoch": 0.0809860213437547, + "flos": 22126903568640.0, + "grad_norm": 2.0283170538473114, + "language_loss": 0.94470823, + "learning_rate": 3.972793412113439e-06, + "loss": 0.96664733, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.84375, + "step": 1347, + "time_per_iteration": 2.4152109622955322 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.01086466, + "balance_loss_clip": 1.0314157, + "balance_loss_mlp": 1.03346086, + "epoch": 0.08104614459642266, + "flos": 21724424881920.0, + "grad_norm": 1.7080277963871962, + "language_loss": 0.91198003, + "learning_rate": 3.972729354105312e-06, + "loss": 0.93401814, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.83984375, + "step": 1348, + "time_per_iteration": 2.4430036544799805 + }, + { + "auxiliary_loss_clip": 0.01113425, + "auxiliary_loss_mlp": 0.01083193, + "balance_loss_clip": 1.03145576, + "balance_loss_mlp": 1.03117883, + "epoch": 0.08110626784909064, + "flos": 23950731415680.0, + "grad_norm": 1.8472960606832112, + "language_loss": 0.78028381, + "learning_rate": 3.97266522129109e-06, + "loss": 0.80224997, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.8203125, + "step": 1349, + "time_per_iteration": 2.4375977516174316 + }, + { + "auxiliary_loss_clip": 0.01117057, + "auxiliary_loss_mlp": 0.0109268, + "balance_loss_clip": 1.0311923, + "balance_loss_mlp": 1.03009534, + "epoch": 0.0811663911017586, + "flos": 19024696045440.0, + "grad_norm": 1.9444947526387593, + "language_loss": 0.89922661, + "learning_rate": 3.972601013673205e-06, + "loss": 0.92132401, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 0.8671875, + "step": 1350, + "time_per_iteration": 2.4147348403930664 + }, + { + "auxiliary_loss_clip": 0.01112617, + "auxiliary_loss_mlp": 0.01095476, + "balance_loss_clip": 1.0372777, + "balance_loss_mlp": 1.02918196, + "epoch": 0.08122651435442657, + "flos": 15340381557120.0, + "grad_norm": 1.94539313352845, + "language_loss": 0.8486737, + "learning_rate": 3.972536731254092e-06, + "loss": 0.8707546, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.8359375, + "step": 1351, + "time_per_iteration": 2.4005260467529297 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01089045, + "balance_loss_clip": 1.02853477, + "balance_loss_mlp": 1.02726603, + "epoch": 0.08128663760709455, + "flos": 23220453173760.0, + "grad_norm": 1.9763791643624964, + "language_loss": 0.77385086, + "learning_rate": 3.972472374036189e-06, + "loss": 0.79587245, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.859375, + "step": 1352, + "time_per_iteration": 2.426219940185547 + }, + { + "auxiliary_loss_clip": 0.0111365, + "auxiliary_loss_mlp": 0.01078439, + "balance_loss_clip": 1.01988328, + "balance_loss_mlp": 1.02704501, + "epoch": 0.08134676085976252, + "flos": 22964539841280.0, + "grad_norm": 1.7220478412127043, + "language_loss": 0.85082364, + "learning_rate": 3.972407942021935e-06, + "loss": 0.87274456, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8671875, + "step": 1353, + "time_per_iteration": 2.443525791168213 + }, + { + "auxiliary_loss_clip": 0.01024492, + "auxiliary_loss_mlp": 0.01023138, + "balance_loss_clip": 1.0150317, + "balance_loss_mlp": 1.00437808, + "epoch": 0.08140688411243048, + "flos": 64319369679360.0, + "grad_norm": 0.8590377884044433, + "language_loss": 0.59925461, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61973089, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.08105469, + "router_z_loss_mlp": 0.20117188, + "step": 1354, + "time_per_iteration": 3.0278260707855225 + }, + { + "auxiliary_loss_clip": 0.01111919, + "auxiliary_loss_mlp": 0.01077298, + "balance_loss_clip": 1.01936281, + "balance_loss_mlp": 1.02710986, + "epoch": 0.08146700736509845, + "flos": 22490768424960.0, + "grad_norm": 1.7397507454779344, + "language_loss": 0.84593832, + "learning_rate": 3.972278853614154e-06, + "loss": 0.86783051, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.84765625, + "step": 1355, + "time_per_iteration": 2.4200022220611572 + }, + { + "auxiliary_loss_clip": 0.01110562, + "auxiliary_loss_mlp": 0.01076307, + "balance_loss_clip": 1.01922917, + "balance_loss_mlp": 1.02485931, + "epoch": 0.08152713061776642, + "flos": 20446813255680.0, + "grad_norm": 1.7823100838189185, + "language_loss": 0.74108452, + "learning_rate": 3.972214197225521e-06, + "loss": 0.76295322, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.85546875, + "step": 1356, + "time_per_iteration": 2.402315378189087 + }, + { + "auxiliary_loss_clip": 0.01115896, + "auxiliary_loss_mlp": 0.01083975, + "balance_loss_clip": 1.02484739, + "balance_loss_mlp": 1.02906394, + "epoch": 0.08158725387043439, + "flos": 23549090601600.0, + "grad_norm": 1.9158472627661638, + "language_loss": 0.72575223, + "learning_rate": 3.972149466050329e-06, + "loss": 0.74775088, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.8671875, + "step": 1357, + "time_per_iteration": 2.4154152870178223 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01088367, + "balance_loss_clip": 1.02799928, + "balance_loss_mlp": 1.03011346, + "epoch": 0.08164737712310235, + "flos": 22016263870080.0, + "grad_norm": 2.5200883897254838, + "language_loss": 0.87415963, + "learning_rate": 3.97208466009103e-06, + "loss": 0.89622176, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.875, + "step": 1358, + "time_per_iteration": 2.4459500312805176 + }, + { + "auxiliary_loss_clip": 0.01118647, + "auxiliary_loss_mlp": 0.01076745, + "balance_loss_clip": 1.01692605, + "balance_loss_mlp": 1.03133488, + "epoch": 0.08170750037577033, + "flos": 23366704325760.0, + "grad_norm": 1.9807527731988828, + "language_loss": 1.0362227, + "learning_rate": 3.972019779350084e-06, + "loss": 1.05817676, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.87109375, + "step": 1359, + "time_per_iteration": 2.438546657562256 + }, + { + "auxiliary_loss_clip": 0.01117796, + "auxiliary_loss_mlp": 0.01086262, + "balance_loss_clip": 1.02620423, + "balance_loss_mlp": 1.03030992, + "epoch": 0.0817676236284383, + "flos": 28396850970240.0, + "grad_norm": 1.8875304057098072, + "language_loss": 0.85893643, + "learning_rate": 3.971954823829951e-06, + "loss": 0.88097703, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.875, + "step": 1360, + "time_per_iteration": 3.9236767292022705 + }, + { + "auxiliary_loss_clip": 0.01117644, + "auxiliary_loss_mlp": 0.01093077, + "balance_loss_clip": 1.03361583, + "balance_loss_mlp": 1.02963698, + "epoch": 0.08182774688110626, + "flos": 19207885282560.0, + "grad_norm": 2.1343271935604355, + "language_loss": 0.76211262, + "learning_rate": 3.971889793533093e-06, + "loss": 0.78421992, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.8828125, + "step": 1361, + "time_per_iteration": 3.830503225326538 + }, + { + "auxiliary_loss_clip": 0.01115325, + "auxiliary_loss_mlp": 0.01088676, + "balance_loss_clip": 1.02995372, + "balance_loss_mlp": 1.0295558, + "epoch": 0.08188787013377424, + "flos": 22782991438080.0, + "grad_norm": 2.9674175338757527, + "language_loss": 0.79178232, + "learning_rate": 3.971824688461976e-06, + "loss": 0.81382239, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.85546875, + "step": 1362, + "time_per_iteration": 3.9383528232574463 + }, + { + "auxiliary_loss_clip": 0.01114144, + "auxiliary_loss_mlp": 0.01088155, + "balance_loss_clip": 1.03300905, + "balance_loss_mlp": 1.0316236, + "epoch": 0.08194799338644221, + "flos": 16467273377280.0, + "grad_norm": 2.044144902953067, + "language_loss": 0.75931215, + "learning_rate": 3.971759508619069e-06, + "loss": 0.78133518, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.82421875, + "step": 1363, + "time_per_iteration": 2.3868229389190674 + }, + { + "auxiliary_loss_clip": 0.01116961, + "auxiliary_loss_mlp": 0.01097824, + "balance_loss_clip": 1.04010272, + "balance_loss_mlp": 1.03175497, + "epoch": 0.08200811663911017, + "flos": 23912536521600.0, + "grad_norm": 2.1876829620482177, + "language_loss": 0.80381334, + "learning_rate": 3.971694254006844e-06, + "loss": 0.82596123, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8515625, + "step": 1364, + "time_per_iteration": 3.8066349029541016 + }, + { + "auxiliary_loss_clip": 0.01113448, + "auxiliary_loss_mlp": 0.01083449, + "balance_loss_clip": 1.02670503, + "balance_loss_mlp": 1.02880073, + "epoch": 0.08206823989177814, + "flos": 17895534986880.0, + "grad_norm": 1.76667309511773, + "language_loss": 0.83329529, + "learning_rate": 3.971628924627776e-06, + "loss": 0.85526419, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.84765625, + "step": 1365, + "time_per_iteration": 2.4012794494628906 + }, + { + "auxiliary_loss_clip": 0.01110978, + "auxiliary_loss_mlp": 0.01083732, + "balance_loss_clip": 1.02949214, + "balance_loss_mlp": 1.02886939, + "epoch": 0.08212836314444612, + "flos": 22087172574720.0, + "grad_norm": 1.8620937434407556, + "language_loss": 0.83406579, + "learning_rate": 3.97156352048434e-06, + "loss": 0.85601294, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.8203125, + "step": 1366, + "time_per_iteration": 2.430553436279297 + }, + { + "auxiliary_loss_clip": 0.01114399, + "auxiliary_loss_mlp": 0.01082446, + "balance_loss_clip": 1.02660871, + "balance_loss_mlp": 1.0268085, + "epoch": 0.08218848639711408, + "flos": 17596678815360.0, + "grad_norm": 1.858159419660863, + "language_loss": 0.84323168, + "learning_rate": 3.97149804157902e-06, + "loss": 0.86520016, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.875, + "step": 1367, + "time_per_iteration": 2.366088390350342 + }, + { + "auxiliary_loss_clip": 0.01115816, + "auxiliary_loss_mlp": 0.01082889, + "balance_loss_clip": 1.02698016, + "balance_loss_mlp": 1.0285399, + "epoch": 0.08224860964978205, + "flos": 17856886245120.0, + "grad_norm": 2.0715979750697713, + "language_loss": 0.86516553, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.88715255, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.87109375, + "step": 1368, + "time_per_iteration": 2.4284470081329346 + }, + { + "auxiliary_loss_clip": 0.01106765, + "auxiliary_loss_mlp": 0.01074191, + "balance_loss_clip": 1.02183437, + "balance_loss_mlp": 1.02654362, + "epoch": 0.08230873290245003, + "flos": 25226388005760.0, + "grad_norm": 1.7006888295315814, + "language_loss": 0.82558376, + "learning_rate": 3.971366859492653e-06, + "loss": 0.84739339, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.80078125, + "step": 1369, + "time_per_iteration": 2.443692684173584 + }, + { + "auxiliary_loss_clip": 0.01112208, + "auxiliary_loss_mlp": 0.01078383, + "balance_loss_clip": 1.0241667, + "balance_loss_mlp": 1.02871931, + "epoch": 0.08236885615511799, + "flos": 31758567696000.0, + "grad_norm": 2.4183339853649835, + "language_loss": 0.77598745, + "learning_rate": 3.971301156316582e-06, + "loss": 0.79789335, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8359375, + "step": 1370, + "time_per_iteration": 2.5017385482788086 + }, + { + "auxiliary_loss_clip": 0.01115027, + "auxiliary_loss_mlp": 0.01081006, + "balance_loss_clip": 1.02507341, + "balance_loss_mlp": 1.0285244, + "epoch": 0.08242897940778596, + "flos": 23184702074880.0, + "grad_norm": 1.4787990402702074, + "language_loss": 0.76676124, + "learning_rate": 3.971235378388573e-06, + "loss": 0.78872156, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.86328125, + "step": 1371, + "time_per_iteration": 2.411972761154175 + }, + { + "auxiliary_loss_clip": 0.01114813, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.01644278, + "balance_loss_mlp": 1.02628553, + "epoch": 0.08248910266045394, + "flos": 34490172470400.0, + "grad_norm": 2.0184619927737, + "language_loss": 0.73864537, + "learning_rate": 3.971169525711122e-06, + "loss": 0.760548, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.88671875, + "step": 1372, + "time_per_iteration": 2.5271806716918945 + }, + { + "auxiliary_loss_clip": 0.01114102, + "auxiliary_loss_mlp": 0.01090021, + "balance_loss_clip": 1.02493286, + "balance_loss_mlp": 1.02688265, + "epoch": 0.0825492259131219, + "flos": 13435590533760.0, + "grad_norm": 2.942985855534107, + "language_loss": 0.91760528, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.93964648, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.875, + "step": 1373, + "time_per_iteration": 2.4015536308288574 + }, + { + "auxiliary_loss_clip": 0.01114623, + "auxiliary_loss_mlp": 0.01078616, + "balance_loss_clip": 1.02006078, + "balance_loss_mlp": 1.02902246, + "epoch": 0.08260934916578987, + "flos": 25811252968320.0, + "grad_norm": 1.8169075735936528, + "language_loss": 0.85061991, + "learning_rate": 3.971037596117882e-06, + "loss": 0.87255228, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.85546875, + "step": 1374, + "time_per_iteration": 2.4519412517547607 + }, + { + "auxiliary_loss_clip": 0.01032341, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.02135658, + "balance_loss_mlp": 1.0102607, + "epoch": 0.08266947241845783, + "flos": 63456909563520.0, + "grad_norm": 0.8509303171829613, + "language_loss": 0.60715562, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62777174, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.22070312, + "step": 1375, + "time_per_iteration": 3.0209405422210693 + }, + { + "auxiliary_loss_clip": 0.01031259, + "auxiliary_loss_mlp": 0.01018319, + "balance_loss_clip": 1.01016533, + "balance_loss_mlp": 1.00869417, + "epoch": 0.08272959567112581, + "flos": 69990346062720.0, + "grad_norm": 0.9198688518618698, + "language_loss": 0.62376827, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64426404, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.08154297, + "router_z_loss_mlp": 0.2265625, + "step": 1376, + "time_per_iteration": 3.005570411682129 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01086386, + "balance_loss_clip": 1.02148843, + "balance_loss_mlp": 1.02857971, + "epoch": 0.08278971892379378, + "flos": 20412144408960.0, + "grad_norm": 1.5735052034043406, + "language_loss": 0.84243321, + "learning_rate": 3.970839141169718e-06, + "loss": 0.86445308, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.8671875, + "step": 1377, + "time_per_iteration": 2.4132397174835205 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_clip": 1.0204885, + "balance_loss_mlp": 1.02717841, + "epoch": 0.08284984217646174, + "flos": 26249028906240.0, + "grad_norm": 1.8402358781554387, + "language_loss": 0.86907488, + "learning_rate": 3.970772840048147e-06, + "loss": 0.89102912, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.8671875, + "step": 1378, + "time_per_iteration": 2.470933198928833 + }, + { + "auxiliary_loss_clip": 0.01114856, + "auxiliary_loss_mlp": 0.01092253, + "balance_loss_clip": 1.03014469, + "balance_loss_mlp": 1.02781117, + "epoch": 0.08290996542912972, + "flos": 27193569361920.0, + "grad_norm": 1.8556355902745458, + "language_loss": 0.89778137, + "learning_rate": 3.970706464194672e-06, + "loss": 0.9198525, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.87109375, + "step": 1379, + "time_per_iteration": 2.5366737842559814 + }, + { + "auxiliary_loss_clip": 0.01112084, + "auxiliary_loss_mlp": 0.01081845, + "balance_loss_clip": 1.02567363, + "balance_loss_mlp": 1.02736938, + "epoch": 0.08297008868179769, + "flos": 38616661728000.0, + "grad_norm": 1.9638152864727898, + "language_loss": 0.80275232, + "learning_rate": 3.970640013611812e-06, + "loss": 0.82469153, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.84765625, + "step": 1380, + "time_per_iteration": 2.5874640941619873 + }, + { + "auxiliary_loss_clip": 0.01112256, + "auxiliary_loss_mlp": 0.01090402, + "balance_loss_clip": 1.03277659, + "balance_loss_mlp": 1.02845407, + "epoch": 0.08303021193446565, + "flos": 19973705155200.0, + "grad_norm": 2.10247974081279, + "language_loss": 0.88980794, + "learning_rate": 3.970573488302083e-06, + "loss": 0.91183454, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.8359375, + "step": 1381, + "time_per_iteration": 2.390225410461426 + }, + { + "auxiliary_loss_clip": 0.01123764, + "auxiliary_loss_mlp": 0.01095449, + "balance_loss_clip": 1.02807164, + "balance_loss_mlp": 1.03204083, + "epoch": 0.08309033518713363, + "flos": 13661792432640.0, + "grad_norm": 2.6700346984088696, + "language_loss": 0.90960169, + "learning_rate": 3.970506888268011e-06, + "loss": 0.93179387, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.91796875, + "step": 1382, + "time_per_iteration": 2.3958740234375 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.01102995, + "balance_loss_clip": 1.03766894, + "balance_loss_mlp": 1.03030193, + "epoch": 0.0831504584398016, + "flos": 17967560855040.0, + "grad_norm": 2.0165986644569442, + "language_loss": 0.7833159, + "learning_rate": 3.970440213512121e-06, + "loss": 0.80554134, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.890625, + "step": 1383, + "time_per_iteration": 2.375577688217163 + }, + { + "auxiliary_loss_clip": 0.0112312, + "auxiliary_loss_mlp": 0.01093332, + "balance_loss_clip": 1.02929306, + "balance_loss_mlp": 1.03103316, + "epoch": 0.08321058169246956, + "flos": 22600290960000.0, + "grad_norm": 1.8004299875189334, + "language_loss": 0.85283303, + "learning_rate": 3.97037346403694e-06, + "loss": 0.8749975, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.921875, + "step": 1384, + "time_per_iteration": 2.4694762229919434 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.01092494, + "balance_loss_clip": 1.02773952, + "balance_loss_mlp": 1.03245091, + "epoch": 0.08327070494513754, + "flos": 22849501311360.0, + "grad_norm": 2.403872375738541, + "language_loss": 0.88780528, + "learning_rate": 3.970306639845e-06, + "loss": 0.90998971, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.9375, + "step": 1385, + "time_per_iteration": 2.400082588195801 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01093875, + "balance_loss_clip": 1.02392304, + "balance_loss_mlp": 1.03065276, + "epoch": 0.0833308281978055, + "flos": 22781909185920.0, + "grad_norm": 2.0184492473530082, + "language_loss": 0.71468121, + "learning_rate": 3.970239740938835e-06, + "loss": 0.73683113, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.90625, + "step": 1386, + "time_per_iteration": 2.4170522689819336 + }, + { + "auxiliary_loss_clip": 0.01114436, + "auxiliary_loss_mlp": 0.01079458, + "balance_loss_clip": 1.01932871, + "balance_loss_mlp": 1.0261935, + "epoch": 0.08339095145047347, + "flos": 20811585807360.0, + "grad_norm": 1.611814450062574, + "language_loss": 0.83610451, + "learning_rate": 3.97017276732098e-06, + "loss": 0.85804343, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.8828125, + "step": 1387, + "time_per_iteration": 2.397003650665283 + }, + { + "auxiliary_loss_clip": 0.01115976, + "auxiliary_loss_mlp": 0.01085902, + "balance_loss_clip": 1.02138579, + "balance_loss_mlp": 1.02646101, + "epoch": 0.08345107470314143, + "flos": 18514335657600.0, + "grad_norm": 1.7992570829050043, + "language_loss": 0.79475051, + "learning_rate": 3.970105718993978e-06, + "loss": 0.81676924, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.89453125, + "step": 1388, + "time_per_iteration": 2.399960994720459 + }, + { + "auxiliary_loss_clip": 0.01112153, + "auxiliary_loss_mlp": 0.01081116, + "balance_loss_clip": 1.021631, + "balance_loss_mlp": 1.02725863, + "epoch": 0.08351119795580941, + "flos": 18806558670720.0, + "grad_norm": 2.041565377541685, + "language_loss": 0.81683552, + "learning_rate": 3.970038595960369e-06, + "loss": 0.83876818, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.84765625, + "step": 1389, + "time_per_iteration": 2.3746466636657715 + }, + { + "auxiliary_loss_clip": 0.01118415, + "auxiliary_loss_mlp": 0.01075261, + "balance_loss_clip": 1.01460695, + "balance_loss_mlp": 1.0302484, + "epoch": 0.08357132120847738, + "flos": 18440843512320.0, + "grad_norm": 2.2653982118689875, + "language_loss": 0.90585464, + "learning_rate": 3.969971398222699e-06, + "loss": 0.92779136, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.8828125, + "step": 1390, + "time_per_iteration": 2.369727373123169 + }, + { + "auxiliary_loss_clip": 0.01112828, + "auxiliary_loss_mlp": 0.01083222, + "balance_loss_clip": 1.02411819, + "balance_loss_mlp": 1.02617037, + "epoch": 0.08363144446114534, + "flos": 25921124616960.0, + "grad_norm": 1.6385066683570457, + "language_loss": 0.88485849, + "learning_rate": 3.969904125783517e-06, + "loss": 0.90681899, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.8671875, + "step": 1391, + "time_per_iteration": 2.4261651039123535 + }, + { + "auxiliary_loss_clip": 0.01120452, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_clip": 1.0265727, + "balance_loss_mlp": 1.03008628, + "epoch": 0.08369156771381332, + "flos": 18040319861760.0, + "grad_norm": 2.1010120146315083, + "language_loss": 0.91791081, + "learning_rate": 3.969836778645371e-06, + "loss": 0.9400152, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.90625, + "step": 1392, + "time_per_iteration": 2.418532133102417 + }, + { + "auxiliary_loss_clip": 0.01112852, + "auxiliary_loss_mlp": 0.01081653, + "balance_loss_clip": 1.02698362, + "balance_loss_mlp": 1.02702689, + "epoch": 0.08375169096648129, + "flos": 22673992573440.0, + "grad_norm": 2.412700474191262, + "language_loss": 0.82848752, + "learning_rate": 3.969769356810819e-06, + "loss": 0.85043252, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.859375, + "step": 1393, + "time_per_iteration": 2.411720037460327 + }, + { + "auxiliary_loss_clip": 0.01109279, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_clip": 1.02825701, + "balance_loss_mlp": 1.02592278, + "epoch": 0.08381181421914925, + "flos": 26102044615680.0, + "grad_norm": 1.7798438229829463, + "language_loss": 0.8641988, + "learning_rate": 3.969701860282415e-06, + "loss": 0.88613778, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8359375, + "step": 1394, + "time_per_iteration": 2.447401285171509 + }, + { + "auxiliary_loss_clip": 0.01115185, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_clip": 1.02781987, + "balance_loss_mlp": 1.02782297, + "epoch": 0.08387193747181723, + "flos": 20628780595200.0, + "grad_norm": 2.370356933459279, + "language_loss": 0.84495461, + "learning_rate": 3.969634289062719e-06, + "loss": 0.8669914, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.875, + "step": 1395, + "time_per_iteration": 2.4002926349639893 + }, + { + "auxiliary_loss_clip": 0.01116037, + "auxiliary_loss_mlp": 0.01094812, + "balance_loss_clip": 1.03110683, + "balance_loss_mlp": 1.02817655, + "epoch": 0.0839320607244852, + "flos": 13442363337600.0, + "grad_norm": 2.1400180442785484, + "language_loss": 0.85569286, + "learning_rate": 3.969566643154293e-06, + "loss": 0.8778013, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.87890625, + "step": 1396, + "time_per_iteration": 2.368790864944458 + }, + { + "auxiliary_loss_clip": 0.01109521, + "auxiliary_loss_mlp": 0.01091867, + "balance_loss_clip": 1.02384639, + "balance_loss_mlp": 1.02685153, + "epoch": 0.08399218397715316, + "flos": 23476122126720.0, + "grad_norm": 1.9649349017174513, + "language_loss": 0.78599036, + "learning_rate": 3.969498922559703e-06, + "loss": 0.8080042, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.828125, + "step": 1397, + "time_per_iteration": 2.4267778396606445 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01091288, + "balance_loss_clip": 1.0262953, + "balance_loss_mlp": 1.02805257, + "epoch": 0.08405230722982113, + "flos": 25919553605760.0, + "grad_norm": 2.1799131720060716, + "language_loss": 0.79999208, + "learning_rate": 3.969431127281516e-06, + "loss": 0.82204044, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 0.85546875, + "step": 1398, + "time_per_iteration": 2.432393789291382 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01088126, + "balance_loss_clip": 1.03023756, + "balance_loss_mlp": 1.02603745, + "epoch": 0.0841124304824891, + "flos": 17966478602880.0, + "grad_norm": 2.2132381604744102, + "language_loss": 0.96883905, + "learning_rate": 3.969363257322304e-06, + "loss": 0.9908241, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.84375, + "step": 1399, + "time_per_iteration": 3.844805955886841 + }, + { + "auxiliary_loss_clip": 0.01113905, + "auxiliary_loss_mlp": 0.01084044, + "balance_loss_clip": 1.02243638, + "balance_loss_mlp": 1.02659392, + "epoch": 0.08417255373515707, + "flos": 25628482667520.0, + "grad_norm": 2.293454677068318, + "language_loss": 0.84955823, + "learning_rate": 3.96929531268464e-06, + "loss": 0.87153769, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.875, + "step": 1400, + "time_per_iteration": 2.4963936805725098 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_clip": 1.01870322, + "balance_loss_mlp": 1.02786112, + "epoch": 0.08423267698782504, + "flos": 26248540147200.0, + "grad_norm": 1.848027880186848, + "language_loss": 0.8897863, + "learning_rate": 3.969227293371099e-06, + "loss": 0.91171229, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.84765625, + "step": 1401, + "time_per_iteration": 3.905409097671509 + }, + { + "auxiliary_loss_clip": 0.01116566, + "auxiliary_loss_mlp": 0.01093457, + "balance_loss_clip": 1.03149176, + "balance_loss_mlp": 1.02777958, + "epoch": 0.08429280024049302, + "flos": 20118699498240.0, + "grad_norm": 1.7897395412274912, + "language_loss": 0.89774215, + "learning_rate": 3.969159199384263e-06, + "loss": 0.91984236, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.88671875, + "step": 1402, + "time_per_iteration": 3.7939095497131348 + }, + { + "auxiliary_loss_clip": 0.01113293, + "auxiliary_loss_mlp": 0.01080029, + "balance_loss_clip": 1.020329, + "balance_loss_mlp": 1.02711666, + "epoch": 0.08435292349316098, + "flos": 42922849086720.0, + "grad_norm": 2.0808049087504883, + "language_loss": 0.90966964, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.93160284, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.86328125, + "step": 1403, + "time_per_iteration": 2.590278387069702 + }, + { + "auxiliary_loss_clip": 0.01117132, + "auxiliary_loss_mlp": 0.01081346, + "balance_loss_clip": 1.02164555, + "balance_loss_mlp": 1.02769399, + "epoch": 0.08441304674582895, + "flos": 22856169381120.0, + "grad_norm": 2.1862286044609727, + "language_loss": 0.82894778, + "learning_rate": 3.969022787401033e-06, + "loss": 0.8509326, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.89453125, + "step": 1404, + "time_per_iteration": 3.798909902572632 + }, + { + "auxiliary_loss_clip": 0.01119545, + "auxiliary_loss_mlp": 0.01089754, + "balance_loss_clip": 1.02604866, + "balance_loss_mlp": 1.02864075, + "epoch": 0.08447316999849692, + "flos": 18696512465280.0, + "grad_norm": 1.9164886258212148, + "language_loss": 0.85926306, + "learning_rate": 3.968954469409811e-06, + "loss": 0.881356, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.90625, + "step": 1405, + "time_per_iteration": 2.3572118282318115 + }, + { + "auxiliary_loss_clip": 0.01117086, + "auxiliary_loss_mlp": 0.01078609, + "balance_loss_clip": 1.01580918, + "balance_loss_mlp": 1.02815735, + "epoch": 0.08453329325116489, + "flos": 25482790097280.0, + "grad_norm": 1.456029417431269, + "language_loss": 0.82221991, + "learning_rate": 3.968886076755639e-06, + "loss": 0.84417683, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 0.890625, + "step": 1406, + "time_per_iteration": 2.4397239685058594 + }, + { + "auxiliary_loss_clip": 0.01117915, + "auxiliary_loss_mlp": 0.01087035, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.03029108, + "epoch": 0.08459341650383286, + "flos": 20919083483520.0, + "grad_norm": 1.8756244038628709, + "language_loss": 0.81468219, + "learning_rate": 3.96881760944111e-06, + "loss": 0.83673167, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.875, + "step": 1407, + "time_per_iteration": 2.3878774642944336 + }, + { + "auxiliary_loss_clip": 0.01112223, + "auxiliary_loss_mlp": 0.01083123, + "balance_loss_clip": 1.02661812, + "balance_loss_mlp": 1.0285027, + "epoch": 0.08465353975650082, + "flos": 13042223712000.0, + "grad_norm": 2.064055858098722, + "language_loss": 0.93716156, + "learning_rate": 3.968749067468819e-06, + "loss": 0.95911503, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8359375, + "step": 1408, + "time_per_iteration": 2.3763062953948975 + }, + { + "auxiliary_loss_clip": 0.01048802, + "auxiliary_loss_mlp": 0.01009303, + "balance_loss_clip": 1.00076771, + "balance_loss_mlp": 1.02741575, + "epoch": 0.0847136630091688, + "flos": 60874174293120.0, + "grad_norm": 0.899121895159353, + "language_loss": 0.61954868, + "learning_rate": 3.968680450841368e-06, + "loss": 0.6401298, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.08544922, + "router_z_loss_mlp": 0.21386719, + "step": 1409, + "time_per_iteration": 3.1043546199798584 + }, + { + "auxiliary_loss_clip": 0.011052, + "auxiliary_loss_mlp": 0.0107871, + "balance_loss_clip": 1.02497053, + "balance_loss_mlp": 1.02605641, + "epoch": 0.08477378626183676, + "flos": 22045661101440.0, + "grad_norm": 2.3133840908624625, + "language_loss": 0.89147848, + "learning_rate": 3.968611759561355e-06, + "loss": 0.91331756, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.79296875, + "step": 1410, + "time_per_iteration": 2.4078266620635986 + }, + { + "auxiliary_loss_clip": 0.01113556, + "auxiliary_loss_mlp": 0.01084476, + "balance_loss_clip": 1.02100849, + "balance_loss_mlp": 1.02655125, + "epoch": 0.08483390951450473, + "flos": 16689146267520.0, + "grad_norm": 2.017730094235175, + "language_loss": 0.76440203, + "learning_rate": 3.968542993631388e-06, + "loss": 0.78638232, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.87109375, + "step": 1411, + "time_per_iteration": 2.376321792602539 + }, + { + "auxiliary_loss_clip": 0.01034838, + "auxiliary_loss_mlp": 0.01048379, + "balance_loss_clip": 1.03874671, + "balance_loss_mlp": 1.01362562, + "epoch": 0.08489403276717271, + "flos": 51581341710720.0, + "grad_norm": 0.9844933613043992, + "language_loss": 0.56860018, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58943236, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.09619141, + "router_z_loss_mlp": 0.21289062, + "step": 1412, + "time_per_iteration": 2.970231056213379 + }, + { + "auxiliary_loss_clip": 0.01110336, + "auxiliary_loss_mlp": 0.01076913, + "balance_loss_clip": 1.02260101, + "balance_loss_mlp": 1.02662587, + "epoch": 0.08495415601984067, + "flos": 17091380574720.0, + "grad_norm": 2.318960863034292, + "language_loss": 0.91942322, + "learning_rate": 3.96840523783202e-06, + "loss": 0.94129574, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.8359375, + "step": 1413, + "time_per_iteration": 2.365710496902466 + }, + { + "auxiliary_loss_clip": 0.01108886, + "auxiliary_loss_mlp": 0.01079732, + "balance_loss_clip": 1.02401352, + "balance_loss_mlp": 1.02638221, + "epoch": 0.08501427927250864, + "flos": 23147310142080.0, + "grad_norm": 1.7790083794186835, + "language_loss": 0.90110755, + "learning_rate": 3.968336247967844e-06, + "loss": 0.92299372, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.82421875, + "step": 1414, + "time_per_iteration": 2.426056146621704 + }, + { + "auxiliary_loss_clip": 0.01118333, + "auxiliary_loss_mlp": 0.0108492, + "balance_loss_clip": 1.03380275, + "balance_loss_mlp": 1.03100491, + "epoch": 0.08507440252517662, + "flos": 19062437091840.0, + "grad_norm": 1.6916776567164322, + "language_loss": 0.79130346, + "learning_rate": 3.96826718346416e-06, + "loss": 0.81333601, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.875, + "step": 1415, + "time_per_iteration": 2.3966903686523438 + }, + { + "auxiliary_loss_clip": 0.01109801, + "auxiliary_loss_mlp": 0.01082812, + "balance_loss_clip": 1.03272045, + "balance_loss_mlp": 1.02818453, + "epoch": 0.08513452577784458, + "flos": 60180137775360.0, + "grad_norm": 1.6403556611559988, + "language_loss": 0.72788024, + "learning_rate": 3.968198044323587e-06, + "loss": 0.7498064, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.81640625, + "step": 1416, + "time_per_iteration": 2.780937671661377 + }, + { + "auxiliary_loss_clip": 0.0112044, + "auxiliary_loss_mlp": 0.01098129, + "balance_loss_clip": 1.04224336, + "balance_loss_mlp": 1.0331502, + "epoch": 0.08519464903051255, + "flos": 27307246348800.0, + "grad_norm": 1.8788234481105044, + "language_loss": 0.77030414, + "learning_rate": 3.968128830548748e-06, + "loss": 0.79248983, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.875, + "step": 1417, + "time_per_iteration": 2.44214129447937 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01088158, + "balance_loss_clip": 1.03415632, + "balance_loss_mlp": 1.03219712, + "epoch": 0.08525477228318051, + "flos": 20265404497920.0, + "grad_norm": 2.226908673252253, + "language_loss": 0.85176468, + "learning_rate": 3.968059542142265e-06, + "loss": 0.87380391, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8359375, + "step": 1418, + "time_per_iteration": 2.4115822315216064 + }, + { + "auxiliary_loss_clip": 0.01032178, + "auxiliary_loss_mlp": 0.0104884, + "balance_loss_clip": 1.04082918, + "balance_loss_mlp": 1.01130629, + "epoch": 0.08531489553584849, + "flos": 67611923268480.0, + "grad_norm": 0.9103125170411256, + "language_loss": 0.56791902, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58872914, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.08007812, + "router_z_loss_mlp": 0.20898438, + "step": 1419, + "time_per_iteration": 2.9400699138641357 + }, + { + "auxiliary_loss_clip": 0.01116261, + "auxiliary_loss_mlp": 0.01087167, + "balance_loss_clip": 1.03481054, + "balance_loss_mlp": 1.0308677, + "epoch": 0.08537501878851646, + "flos": 27525732837120.0, + "grad_norm": 2.158622550947699, + "language_loss": 0.73245436, + "learning_rate": 3.967920741444886e-06, + "loss": 0.75448865, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.8515625, + "step": 1420, + "time_per_iteration": 2.505561351776123 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.01077893, + "balance_loss_clip": 1.02517867, + "balance_loss_mlp": 1.03103662, + "epoch": 0.08543514204118442, + "flos": 22783131083520.0, + "grad_norm": 1.6820846281710935, + "language_loss": 0.90052062, + "learning_rate": 3.967851229159252e-06, + "loss": 0.9224329, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.8203125, + "step": 1421, + "time_per_iteration": 2.476797342300415 + }, + { + "auxiliary_loss_clip": 0.01035795, + "auxiliary_loss_mlp": 0.01012584, + "balance_loss_clip": 1.00371516, + "balance_loss_mlp": 1.01580596, + "epoch": 0.0854952652938524, + "flos": 60987362520960.0, + "grad_norm": 0.8083987011504955, + "language_loss": 0.63656533, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65704906, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.08886719, + "router_z_loss_mlp": 0.19921875, + "step": 1422, + "time_per_iteration": 3.1240267753601074 + }, + { + "auxiliary_loss_clip": 0.01112471, + "auxiliary_loss_mlp": 0.01092175, + "balance_loss_clip": 1.03595614, + "balance_loss_mlp": 1.03040361, + "epoch": 0.08555538854652037, + "flos": 28036791452160.0, + "grad_norm": 1.872987604068758, + "language_loss": 0.85021901, + "learning_rate": 3.967711980727276e-06, + "loss": 0.87226552, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8203125, + "step": 1423, + "time_per_iteration": 2.452277183532715 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01089304, + "balance_loss_clip": 1.0351119, + "balance_loss_mlp": 1.02935719, + "epoch": 0.08561551179918833, + "flos": 23508277355520.0, + "grad_norm": 1.7501794811926428, + "language_loss": 0.77323192, + "learning_rate": 3.967642244586213e-06, + "loss": 0.79525679, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.8359375, + "step": 1424, + "time_per_iteration": 2.471280097961426 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01081251, + "balance_loss_clip": 1.02760649, + "balance_loss_mlp": 1.02971339, + "epoch": 0.08567563505185631, + "flos": 17926084293120.0, + "grad_norm": 1.7599352281880454, + "language_loss": 0.78676742, + "learning_rate": 3.96757243383196e-06, + "loss": 0.80869502, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.81640625, + "step": 1425, + "time_per_iteration": 2.3701725006103516 + }, + { + "auxiliary_loss_clip": 0.01111266, + "auxiliary_loss_mlp": 0.01080377, + "balance_loss_clip": 1.02430069, + "balance_loss_mlp": 1.02844954, + "epoch": 0.08573575830452428, + "flos": 19718490049920.0, + "grad_norm": 1.878361620876402, + "language_loss": 0.95259249, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.97450894, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.828125, + "step": 1426, + "time_per_iteration": 2.4057061672210693 + }, + { + "auxiliary_loss_clip": 0.01116925, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_clip": 1.02069998, + "balance_loss_mlp": 1.03035212, + "epoch": 0.08579588155719224, + "flos": 17930587858560.0, + "grad_norm": 2.2873133884261785, + "language_loss": 0.77411801, + "learning_rate": 3.967432588494471e-06, + "loss": 0.79609221, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.8671875, + "step": 1427, + "time_per_iteration": 2.3804595470428467 + }, + { + "auxiliary_loss_clip": 0.01111194, + "auxiliary_loss_mlp": 0.01078663, + "balance_loss_clip": 1.02852321, + "balance_loss_mlp": 1.02917266, + "epoch": 0.08585600480986022, + "flos": 16032429993600.0, + "grad_norm": 2.5994789225577244, + "language_loss": 0.84816945, + "learning_rate": 3.96736255391654e-06, + "loss": 0.87006807, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.8203125, + "step": 1428, + "time_per_iteration": 2.38411283493042 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.01080702, + "balance_loss_clip": 1.02407813, + "balance_loss_mlp": 1.02999592, + "epoch": 0.08591612806252819, + "flos": 28656185616000.0, + "grad_norm": 2.057324733047282, + "language_loss": 0.82921427, + "learning_rate": 3.967292444736023e-06, + "loss": 0.85116065, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.83984375, + "step": 1429, + "time_per_iteration": 2.4644107818603516 + }, + { + "auxiliary_loss_clip": 0.01116789, + "auxiliary_loss_mlp": 0.01086597, + "balance_loss_clip": 1.02804112, + "balance_loss_mlp": 1.03144622, + "epoch": 0.08597625131519615, + "flos": 20958081338880.0, + "grad_norm": 1.9420518808743064, + "language_loss": 0.89817762, + "learning_rate": 3.967222260955578e-06, + "loss": 0.92021149, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8515625, + "step": 1430, + "time_per_iteration": 2.4637722969055176 + }, + { + "auxiliary_loss_clip": 0.01116165, + "auxiliary_loss_mlp": 0.01070056, + "balance_loss_clip": 1.01619685, + "balance_loss_mlp": 1.03415608, + "epoch": 0.08603637456786412, + "flos": 23255296577280.0, + "grad_norm": 1.571123833435201, + "language_loss": 0.83976698, + "learning_rate": 3.96715200257787e-06, + "loss": 0.86162913, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8203125, + "step": 1431, + "time_per_iteration": 2.4456446170806885 + }, + { + "auxiliary_loss_clip": 0.01116124, + "auxiliary_loss_mlp": 0.01072626, + "balance_loss_clip": 1.02117491, + "balance_loss_mlp": 1.03247929, + "epoch": 0.0860964978205321, + "flos": 28692914232960.0, + "grad_norm": 1.7188953589434028, + "language_loss": 0.79189759, + "learning_rate": 3.967081669605559e-06, + "loss": 0.81378508, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.83984375, + "step": 1432, + "time_per_iteration": 2.4799957275390625 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01078601, + "balance_loss_clip": 1.02440858, + "balance_loss_mlp": 1.03194785, + "epoch": 0.08615662107320006, + "flos": 19317372906240.0, + "grad_norm": 1.96902837000507, + "language_loss": 0.76014602, + "learning_rate": 3.967011262041315e-06, + "loss": 0.7820949, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.84375, + "step": 1433, + "time_per_iteration": 2.407606601715088 + }, + { + "auxiliary_loss_clip": 0.0112356, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_clip": 1.02156353, + "balance_loss_mlp": 1.03532553, + "epoch": 0.08621674432586802, + "flos": 15850776856320.0, + "grad_norm": 2.6097833625342957, + "language_loss": 0.89278162, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.91482919, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.8828125, + "step": 1434, + "time_per_iteration": 2.4276256561279297 + }, + { + "auxiliary_loss_clip": 0.01119496, + "auxiliary_loss_mlp": 0.01081947, + "balance_loss_clip": 1.0290184, + "balance_loss_mlp": 1.03286588, + "epoch": 0.086276867578536, + "flos": 14099777838720.0, + "grad_norm": 2.450471605172046, + "language_loss": 0.81907034, + "learning_rate": 3.966870223147707e-06, + "loss": 0.84108478, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.8671875, + "step": 1435, + "time_per_iteration": 2.394925832748413 + }, + { + "auxiliary_loss_clip": 0.01045552, + "auxiliary_loss_mlp": 0.010149, + "balance_loss_clip": 1.00855815, + "balance_loss_mlp": 1.02510858, + "epoch": 0.08633699083120397, + "flos": 70181250710400.0, + "grad_norm": 0.9024021432673198, + "language_loss": 0.57986838, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60047293, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.06347656, + "router_z_loss_mlp": 0.20507812, + "step": 1436, + "time_per_iteration": 3.1369192600250244 + }, + { + "auxiliary_loss_clip": 0.01119287, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.03568101, + "balance_loss_mlp": 1.03313851, + "epoch": 0.08639711408387193, + "flos": 30297592275840.0, + "grad_norm": 2.1584659396296706, + "language_loss": 0.72806776, + "learning_rate": 3.966728885918437e-06, + "loss": 0.75020778, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.859375, + "step": 1437, + "time_per_iteration": 2.47381591796875 + }, + { + "auxiliary_loss_clip": 0.01111881, + "auxiliary_loss_mlp": 0.0107538, + "balance_loss_clip": 1.02516866, + "balance_loss_mlp": 1.03080988, + "epoch": 0.08645723733653991, + "flos": 20296791676800.0, + "grad_norm": 1.7511859793835034, + "language_loss": 0.75384074, + "learning_rate": 3.966658105434627e-06, + "loss": 0.77571332, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.8125, + "step": 1438, + "time_per_iteration": 2.4227497577667236 + }, + { + "auxiliary_loss_clip": 0.01110918, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.02244115, + "balance_loss_mlp": 1.03024793, + "epoch": 0.08651736058920788, + "flos": 32889195031680.0, + "grad_norm": 2.006054621899296, + "language_loss": 0.66755843, + "learning_rate": 3.966587250374945e-06, + "loss": 0.68939036, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.80859375, + "step": 1439, + "time_per_iteration": 3.968137264251709 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01089778, + "balance_loss_clip": 1.03758812, + "balance_loss_mlp": 1.0281558, + "epoch": 0.08657748384187584, + "flos": 22636286438400.0, + "grad_norm": 2.1781412637823956, + "language_loss": 0.90426373, + "learning_rate": 3.966516320742077e-06, + "loss": 0.92625922, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.81640625, + "step": 1440, + "time_per_iteration": 2.426774501800537 + }, + { + "auxiliary_loss_clip": 0.0111726, + "auxiliary_loss_mlp": 0.01104429, + "balance_loss_clip": 1.04463363, + "balance_loss_mlp": 1.02877986, + "epoch": 0.08663760709454381, + "flos": 23657286504960.0, + "grad_norm": 2.1808064570110806, + "language_loss": 0.86329901, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.88551581, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.8828125, + "step": 1441, + "time_per_iteration": 3.8498198986053467 + }, + { + "auxiliary_loss_clip": 0.01025273, + "auxiliary_loss_mlp": 0.01014786, + "balance_loss_clip": 1.00782466, + "balance_loss_mlp": 1.00432384, + "epoch": 0.08669773034721179, + "flos": 62683688482560.0, + "grad_norm": 0.8589133435335505, + "language_loss": 0.60599625, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62639678, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.06982422, + "router_z_loss_mlp": 0.20898438, + "step": 1442, + "time_per_iteration": 4.606550455093384 + }, + { + "auxiliary_loss_clip": 0.01115005, + "auxiliary_loss_mlp": 0.0108462, + "balance_loss_clip": 1.03266871, + "balance_loss_mlp": 1.02724648, + "epoch": 0.08675785359987975, + "flos": 20666451818880.0, + "grad_norm": 2.183011345817947, + "language_loss": 0.82634366, + "learning_rate": 3.96630308443127e-06, + "loss": 0.84833992, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.87890625, + "step": 1443, + "time_per_iteration": 3.9023399353027344 + }, + { + "auxiliary_loss_clip": 0.01112582, + "auxiliary_loss_mlp": 0.01079389, + "balance_loss_clip": 1.02512515, + "balance_loss_mlp": 1.02741575, + "epoch": 0.08681797685254772, + "flos": 26939960179200.0, + "grad_norm": 1.7510366184191377, + "language_loss": 0.8502841, + "learning_rate": 3.966231856532584e-06, + "loss": 0.87220383, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.8515625, + "step": 1444, + "time_per_iteration": 2.4551033973693848 + }, + { + "auxiliary_loss_clip": 0.01115627, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_clip": 1.02037346, + "balance_loss_mlp": 1.02859306, + "epoch": 0.0868781001052157, + "flos": 17711856990720.0, + "grad_norm": 2.0176007667047724, + "language_loss": 0.91051841, + "learning_rate": 3.966160554074189e-06, + "loss": 0.93240941, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.87109375, + "step": 1445, + "time_per_iteration": 2.3741347789764404 + }, + { + "auxiliary_loss_clip": 0.01116336, + "auxiliary_loss_mlp": 0.01070498, + "balance_loss_clip": 1.01835549, + "balance_loss_mlp": 1.03191006, + "epoch": 0.08693822335788366, + "flos": 19895639621760.0, + "grad_norm": 2.147681418847971, + "language_loss": 0.84657621, + "learning_rate": 3.96608917705879e-06, + "loss": 0.86844456, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.84375, + "step": 1446, + "time_per_iteration": 2.423619031906128 + }, + { + "auxiliary_loss_clip": 0.01035002, + "auxiliary_loss_mlp": 0.01007121, + "balance_loss_clip": 1.00063586, + "balance_loss_mlp": 1.01290298, + "epoch": 0.08699834661055163, + "flos": 67020878995200.0, + "grad_norm": 0.7314113702234439, + "language_loss": 0.54856455, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56898582, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.06494141, + "router_z_loss_mlp": 0.22070312, + "step": 1447, + "time_per_iteration": 3.0771048069000244 + }, + { + "auxiliary_loss_clip": 0.0111426, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_clip": 1.0185647, + "balance_loss_mlp": 1.03261232, + "epoch": 0.0870584698632196, + "flos": 13479650536320.0, + "grad_norm": 2.072212645881914, + "language_loss": 0.87208784, + "learning_rate": 3.965946199367804e-06, + "loss": 0.89395702, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.81640625, + "step": 1448, + "time_per_iteration": 2.3829267024993896 + }, + { + "auxiliary_loss_clip": 0.01123087, + "auxiliary_loss_mlp": 0.01074617, + "balance_loss_clip": 1.01801658, + "balance_loss_mlp": 1.03458977, + "epoch": 0.08711859311588757, + "flos": 16106096695680.0, + "grad_norm": 2.5894806760174016, + "language_loss": 0.85140091, + "learning_rate": 3.965874598697638e-06, + "loss": 0.87337792, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.8828125, + "step": 1449, + "time_per_iteration": 2.3910915851593018 + }, + { + "auxiliary_loss_clip": 0.01117386, + "auxiliary_loss_mlp": 0.01079108, + "balance_loss_clip": 1.02322221, + "balance_loss_mlp": 1.03371429, + "epoch": 0.08717871636855554, + "flos": 38470829512320.0, + "grad_norm": 1.514910957918278, + "language_loss": 0.73106921, + "learning_rate": 3.965802923481313e-06, + "loss": 0.75303411, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.8359375, + "step": 1450, + "time_per_iteration": 2.5526249408721924 + }, + { + "auxiliary_loss_clip": 0.01116501, + "auxiliary_loss_mlp": 0.01080257, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.03247833, + "epoch": 0.0872388396212235, + "flos": 17599681192320.0, + "grad_norm": 1.8398225043867875, + "language_loss": 0.8475877, + "learning_rate": 3.965731173721542e-06, + "loss": 0.86955529, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.83984375, + "step": 1451, + "time_per_iteration": 2.4064066410064697 + }, + { + "auxiliary_loss_clip": 0.01115896, + "auxiliary_loss_mlp": 0.01081916, + "balance_loss_clip": 1.02867699, + "balance_loss_mlp": 1.03330588, + "epoch": 0.08729896287389148, + "flos": 25258368677760.0, + "grad_norm": 1.7512759878795952, + "language_loss": 0.76147288, + "learning_rate": 3.965659349421049e-06, + "loss": 0.78345096, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.82421875, + "step": 1452, + "time_per_iteration": 2.4591469764709473 + }, + { + "auxiliary_loss_clip": 0.01117849, + "auxiliary_loss_mlp": 0.01088864, + "balance_loss_clip": 1.03562462, + "balance_loss_mlp": 1.03127027, + "epoch": 0.08735908612655945, + "flos": 15631557229440.0, + "grad_norm": 2.5387254400196277, + "language_loss": 0.83638072, + "learning_rate": 3.965587450582556e-06, + "loss": 0.85844791, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8671875, + "step": 1453, + "time_per_iteration": 2.4037959575653076 + }, + { + "auxiliary_loss_clip": 0.01114814, + "auxiliary_loss_mlp": 0.0107522, + "balance_loss_clip": 1.02684498, + "balance_loss_mlp": 1.0319078, + "epoch": 0.08741920937922741, + "flos": 20338617352320.0, + "grad_norm": 2.5070264808703895, + "language_loss": 0.73180056, + "learning_rate": 3.96551547720879e-06, + "loss": 0.75370097, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.828125, + "step": 1454, + "time_per_iteration": 2.453380823135376 + }, + { + "auxiliary_loss_clip": 0.01029206, + "auxiliary_loss_mlp": 0.01018032, + "balance_loss_clip": 1.01121342, + "balance_loss_mlp": 1.00741255, + "epoch": 0.08747933263189539, + "flos": 62816252515200.0, + "grad_norm": 0.7899442217902768, + "language_loss": 0.58746308, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60793549, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.06835938, + "router_z_loss_mlp": 0.21875, + "step": 1455, + "time_per_iteration": 3.0455079078674316 + }, + { + "auxiliary_loss_clip": 0.01111902, + "auxiliary_loss_mlp": 0.01078802, + "balance_loss_clip": 1.02134299, + "balance_loss_mlp": 1.02725852, + "epoch": 0.08753945588456336, + "flos": 33034503576960.0, + "grad_norm": 1.712183080070253, + "language_loss": 0.79740429, + "learning_rate": 3.965371306866359e-06, + "loss": 0.81931138, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.84765625, + "step": 1456, + "time_per_iteration": 2.514287233352661 + }, + { + "auxiliary_loss_clip": 0.01108973, + "auxiliary_loss_mlp": 0.01077824, + "balance_loss_clip": 1.02050805, + "balance_loss_mlp": 1.02620721, + "epoch": 0.08759957913723132, + "flos": 35545911206400.0, + "grad_norm": 1.8340154401536264, + "language_loss": 0.74541891, + "learning_rate": 3.96529910990316e-06, + "loss": 0.7672869, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.828125, + "step": 1457, + "time_per_iteration": 2.5370287895202637 + }, + { + "auxiliary_loss_clip": 0.01106001, + "auxiliary_loss_mlp": 0.01066732, + "balance_loss_clip": 1.0150907, + "balance_loss_mlp": 1.02479792, + "epoch": 0.0876597023898993, + "flos": 23910092726400.0, + "grad_norm": 1.509206461076867, + "language_loss": 0.88228375, + "learning_rate": 3.965226838415622e-06, + "loss": 0.90401107, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.8125, + "step": 1458, + "time_per_iteration": 2.52462100982666 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01074795, + "balance_loss_clip": 1.02107942, + "balance_loss_mlp": 1.02947068, + "epoch": 0.08771982564256726, + "flos": 18113043957120.0, + "grad_norm": 1.594725376203298, + "language_loss": 0.81962031, + "learning_rate": 3.965154492406486e-06, + "loss": 0.84147942, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.81640625, + "step": 1459, + "time_per_iteration": 2.4338138103485107 + }, + { + "auxiliary_loss_clip": 0.01113952, + "auxiliary_loss_mlp": 0.01083841, + "balance_loss_clip": 1.02750266, + "balance_loss_mlp": 1.02884984, + "epoch": 0.08777994889523523, + "flos": 17711054029440.0, + "grad_norm": 2.12412159317749, + "language_loss": 0.86771858, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.88969648, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8515625, + "step": 1460, + "time_per_iteration": 2.3847978115081787 + }, + { + "auxiliary_loss_clip": 0.01111219, + "auxiliary_loss_mlp": 0.01077052, + "balance_loss_clip": 1.02357459, + "balance_loss_mlp": 1.02895474, + "epoch": 0.0878400721479032, + "flos": 12819198746880.0, + "grad_norm": 2.5102168231036037, + "language_loss": 0.83369768, + "learning_rate": 3.965009576834394e-06, + "loss": 0.85558033, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.82421875, + "step": 1461, + "time_per_iteration": 2.3685266971588135 + }, + { + "auxiliary_loss_clip": 0.01115651, + "auxiliary_loss_mlp": 0.01082455, + "balance_loss_clip": 1.02954984, + "balance_loss_mlp": 1.03143322, + "epoch": 0.08790019540057117, + "flos": 26391579454080.0, + "grad_norm": 1.6498477376973795, + "language_loss": 0.7713989, + "learning_rate": 3.964937007276932e-06, + "loss": 0.79338002, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.84375, + "step": 1462, + "time_per_iteration": 2.486179828643799 + }, + { + "auxiliary_loss_clip": 0.01116504, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.02683353, + "balance_loss_mlp": 1.02971745, + "epoch": 0.08796031865323914, + "flos": 19133066505600.0, + "grad_norm": 2.9582952116501096, + "language_loss": 0.78336728, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.80539954, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.8671875, + "step": 1463, + "time_per_iteration": 2.407639503479004 + }, + { + "auxiliary_loss_clip": 0.01114469, + "auxiliary_loss_mlp": 0.01082092, + "balance_loss_clip": 1.02460885, + "balance_loss_mlp": 1.02797174, + "epoch": 0.0880204419059071, + "flos": 26063186405760.0, + "grad_norm": 1.8364709504617904, + "language_loss": 0.86058027, + "learning_rate": 3.964791644632941e-06, + "loss": 0.88254589, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.8671875, + "step": 1464, + "time_per_iteration": 2.4386796951293945 + }, + { + "auxiliary_loss_clip": 0.01113357, + "auxiliary_loss_mlp": 0.01080794, + "balance_loss_clip": 1.02705455, + "balance_loss_mlp": 1.02911568, + "epoch": 0.08808056515857508, + "flos": 22376881969920.0, + "grad_norm": 2.071622882381408, + "language_loss": 0.80284196, + "learning_rate": 3.964718851551923e-06, + "loss": 0.82478344, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.84375, + "step": 1465, + "time_per_iteration": 2.4034459590911865 + }, + { + "auxiliary_loss_clip": 0.01116493, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_clip": 1.02204871, + "balance_loss_mlp": 1.02890396, + "epoch": 0.08814068841124305, + "flos": 23184178404480.0, + "grad_norm": 1.9811141977764644, + "language_loss": 0.88102806, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.90299541, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.87890625, + "step": 1466, + "time_per_iteration": 2.428858757019043 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01083564, + "balance_loss_clip": 1.02424502, + "balance_loss_mlp": 1.02694619, + "epoch": 0.08820081166391101, + "flos": 25154117758080.0, + "grad_norm": 2.0553448563211307, + "language_loss": 0.85068804, + "learning_rate": 3.964573041885641e-06, + "loss": 0.872639, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.84375, + "step": 1467, + "time_per_iteration": 2.4491524696350098 + }, + { + "auxiliary_loss_clip": 0.01112154, + "auxiliary_loss_mlp": 0.01078099, + "balance_loss_clip": 1.02125967, + "balance_loss_mlp": 1.02750313, + "epoch": 0.08826093491657899, + "flos": 22230735552000.0, + "grad_norm": 1.662114176694134, + "language_loss": 0.78000438, + "learning_rate": 3.964500025305907e-06, + "loss": 0.80190694, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.84375, + "step": 1468, + "time_per_iteration": 2.4559433460235596 + }, + { + "auxiliary_loss_clip": 0.01111113, + "auxiliary_loss_mlp": 0.01075941, + "balance_loss_clip": 1.02358401, + "balance_loss_mlp": 1.02803528, + "epoch": 0.08832105816924696, + "flos": 22125751493760.0, + "grad_norm": 1.5428232121566852, + "language_loss": 0.81364679, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.83551735, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.828125, + "step": 1469, + "time_per_iteration": 2.431121349334717 + }, + { + "auxiliary_loss_clip": 0.01113366, + "auxiliary_loss_mlp": 0.01079233, + "balance_loss_clip": 1.02155972, + "balance_loss_mlp": 1.02842462, + "epoch": 0.08838118142191492, + "flos": 17565536016000.0, + "grad_norm": 2.117264508542001, + "language_loss": 0.80071217, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.82263815, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8515625, + "step": 1470, + "time_per_iteration": 2.396859884262085 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01082593, + "balance_loss_clip": 1.02432382, + "balance_loss_mlp": 1.02704954, + "epoch": 0.0884413046745829, + "flos": 20776148910720.0, + "grad_norm": 1.811908864242278, + "language_loss": 0.86483073, + "learning_rate": 3.964280528613569e-06, + "loss": 0.88675773, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.828125, + "step": 1471, + "time_per_iteration": 2.4096903800964355 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01075301, + "balance_loss_clip": 1.02654409, + "balance_loss_mlp": 1.02547359, + "epoch": 0.08850142792725087, + "flos": 22124424862080.0, + "grad_norm": 1.5157660656074357, + "language_loss": 0.84710777, + "learning_rate": 3.964207214074324e-06, + "loss": 0.8688916, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7734375, + "step": 1472, + "time_per_iteration": 2.4593687057495117 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01075374, + "balance_loss_clip": 1.02053773, + "balance_loss_mlp": 1.02671289, + "epoch": 0.08856155117991883, + "flos": 22417660304640.0, + "grad_norm": 2.4906480891090883, + "language_loss": 0.86922204, + "learning_rate": 3.964133825052146e-06, + "loss": 0.89107454, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.83203125, + "step": 1473, + "time_per_iteration": 2.3947339057922363 + }, + { + "auxiliary_loss_clip": 0.01111052, + "auxiliary_loss_mlp": 0.01075839, + "balance_loss_clip": 1.02250493, + "balance_loss_mlp": 1.02588296, + "epoch": 0.0886216744325868, + "flos": 29935647544320.0, + "grad_norm": 1.6252181455641697, + "language_loss": 0.80898452, + "learning_rate": 3.964060361549816e-06, + "loss": 0.8308534, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8515625, + "step": 1474, + "time_per_iteration": 2.5051679611206055 + }, + { + "auxiliary_loss_clip": 0.01107695, + "auxiliary_loss_mlp": 0.01080868, + "balance_loss_clip": 1.02333748, + "balance_loss_mlp": 1.02549338, + "epoch": 0.08868179768525478, + "flos": 23981839303680.0, + "grad_norm": 1.773515094833119, + "language_loss": 0.81431776, + "learning_rate": 3.963986823570121e-06, + "loss": 0.8362034, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.82421875, + "step": 1475, + "time_per_iteration": 2.412865400314331 + }, + { + "auxiliary_loss_clip": 0.0110928, + "auxiliary_loss_mlp": 0.0107432, + "balance_loss_clip": 1.02103305, + "balance_loss_mlp": 1.02490354, + "epoch": 0.08874192093792274, + "flos": 43175934599040.0, + "grad_norm": 1.556744656878787, + "language_loss": 0.76169324, + "learning_rate": 3.963913211115848e-06, + "loss": 0.78352916, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.84375, + "step": 1476, + "time_per_iteration": 2.580961227416992 + }, + { + "auxiliary_loss_clip": 0.01110321, + "auxiliary_loss_mlp": 0.01077995, + "balance_loss_clip": 1.02296758, + "balance_loss_mlp": 1.0258162, + "epoch": 0.0888020441905907, + "flos": 32851104871680.0, + "grad_norm": 1.5813534333872663, + "language_loss": 0.76812601, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.79000914, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.84375, + "step": 1477, + "time_per_iteration": 2.5251657962799072 + }, + { + "auxiliary_loss_clip": 0.0111123, + "auxiliary_loss_mlp": 0.0108111, + "balance_loss_clip": 1.02529621, + "balance_loss_mlp": 1.0262146, + "epoch": 0.08886216744325869, + "flos": 23148217837440.0, + "grad_norm": 1.871240919147204, + "language_loss": 0.89508778, + "learning_rate": 3.963765762794739e-06, + "loss": 0.9170112, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.8515625, + "step": 1478, + "time_per_iteration": 2.4318556785583496 + }, + { + "auxiliary_loss_clip": 0.01108157, + "auxiliary_loss_mlp": 0.01075162, + "balance_loss_clip": 1.02008748, + "balance_loss_mlp": 1.02472031, + "epoch": 0.08892229069592665, + "flos": 23330464467840.0, + "grad_norm": 1.5450972500791056, + "language_loss": 0.79439819, + "learning_rate": 3.963691926933495e-06, + "loss": 0.81623137, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.8359375, + "step": 1479, + "time_per_iteration": 5.375801086425781 + }, + { + "auxiliary_loss_clip": 0.01107266, + "auxiliary_loss_mlp": 0.01069251, + "balance_loss_clip": 1.01715624, + "balance_loss_mlp": 1.02508116, + "epoch": 0.08898241394859462, + "flos": 26212579580160.0, + "grad_norm": 2.4087970391462044, + "language_loss": 0.80752939, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.82929456, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.8203125, + "step": 1480, + "time_per_iteration": 2.4887309074401855 + }, + { + "auxiliary_loss_clip": 0.01110635, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_clip": 1.0250001, + "balance_loss_mlp": 1.02557898, + "epoch": 0.0890425372012626, + "flos": 23549474626560.0, + "grad_norm": 1.6490382780217814, + "language_loss": 0.69447935, + "learning_rate": 3.963544031823624e-06, + "loss": 0.71641457, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8515625, + "step": 1481, + "time_per_iteration": 3.8958072662353516 + }, + { + "auxiliary_loss_clip": 0.01109083, + "auxiliary_loss_mlp": 0.01069186, + "balance_loss_clip": 1.02095389, + "balance_loss_mlp": 1.02680755, + "epoch": 0.08910266045393056, + "flos": 23001687394560.0, + "grad_norm": 1.9861187832442893, + "language_loss": 0.98841119, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.01019382, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.8203125, + "step": 1482, + "time_per_iteration": 3.8417856693267822 + }, + { + "auxiliary_loss_clip": 0.01114455, + "auxiliary_loss_mlp": 0.0108084, + "balance_loss_clip": 1.02404857, + "balance_loss_mlp": 1.02737927, + "epoch": 0.08916278370659853, + "flos": 31935298331520.0, + "grad_norm": 1.864300427496658, + "language_loss": 0.80082321, + "learning_rate": 3.96339583888261e-06, + "loss": 0.82277614, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.87109375, + "step": 1483, + "time_per_iteration": 2.519275665283203 + }, + { + "auxiliary_loss_clip": 0.01111005, + "auxiliary_loss_mlp": 0.01083248, + "balance_loss_clip": 1.02819729, + "balance_loss_mlp": 1.02736831, + "epoch": 0.08922290695926649, + "flos": 17529435803520.0, + "grad_norm": 2.394137763470711, + "language_loss": 0.87980139, + "learning_rate": 3.963321630732448e-06, + "loss": 0.90174389, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.8359375, + "step": 1484, + "time_per_iteration": 2.386676549911499 + }, + { + "auxiliary_loss_clip": 0.01116719, + "auxiliary_loss_mlp": 0.01077153, + "balance_loss_clip": 1.02095723, + "balance_loss_mlp": 1.02943611, + "epoch": 0.08928303021193447, + "flos": 32123689361280.0, + "grad_norm": 1.650437815783227, + "language_loss": 0.81977254, + "learning_rate": 3.963247348132932e-06, + "loss": 0.84171122, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.87109375, + "step": 1485, + "time_per_iteration": 2.481862783432007 + }, + { + "auxiliary_loss_clip": 0.01107643, + "auxiliary_loss_mlp": 0.01067612, + "balance_loss_clip": 1.02197814, + "balance_loss_mlp": 1.02539802, + "epoch": 0.08934315346460243, + "flos": 22124180482560.0, + "grad_norm": 1.6348100812736432, + "language_loss": 0.84867632, + "learning_rate": 3.96317299108688e-06, + "loss": 0.87042892, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.82421875, + "step": 1486, + "time_per_iteration": 2.427866220474243 + }, + { + "auxiliary_loss_clip": 0.0110823, + "auxiliary_loss_mlp": 0.01079714, + "balance_loss_clip": 1.02537894, + "balance_loss_mlp": 1.02566624, + "epoch": 0.0894032767172704, + "flos": 22564470038400.0, + "grad_norm": 1.6325281266127587, + "language_loss": 0.78536284, + "learning_rate": 3.963098559597111e-06, + "loss": 0.80724233, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.82421875, + "step": 1487, + "time_per_iteration": 2.405565023422241 + }, + { + "auxiliary_loss_clip": 0.01108807, + "auxiliary_loss_mlp": 0.01078202, + "balance_loss_clip": 1.02603602, + "balance_loss_mlp": 1.02558923, + "epoch": 0.08946339996993838, + "flos": 20192366200320.0, + "grad_norm": 2.0813114479462085, + "language_loss": 0.85506034, + "learning_rate": 3.963024053666449e-06, + "loss": 0.87693036, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.83203125, + "step": 1488, + "time_per_iteration": 2.400275945663452 + }, + { + "auxiliary_loss_clip": 0.01105438, + "auxiliary_loss_mlp": 0.01067989, + "balance_loss_clip": 1.02121127, + "balance_loss_mlp": 1.02490723, + "epoch": 0.08952352322260634, + "flos": 48358372060800.0, + "grad_norm": 1.777939991273865, + "language_loss": 0.74371445, + "learning_rate": 3.962949473297718e-06, + "loss": 0.76544875, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.8046875, + "step": 1489, + "time_per_iteration": 2.6290080547332764 + }, + { + "auxiliary_loss_clip": 0.011064, + "auxiliary_loss_mlp": 0.01078983, + "balance_loss_clip": 1.02505255, + "balance_loss_mlp": 1.02541518, + "epoch": 0.08958364647527431, + "flos": 31791805176960.0, + "grad_norm": 1.805984216123992, + "language_loss": 0.91613555, + "learning_rate": 3.962874818493745e-06, + "loss": 0.93798935, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.80859375, + "step": 1490, + "time_per_iteration": 2.4992287158966064 + }, + { + "auxiliary_loss_clip": 0.01110975, + "auxiliary_loss_mlp": 0.01081261, + "balance_loss_clip": 1.02809381, + "balance_loss_mlp": 1.02654743, + "epoch": 0.08964376972794229, + "flos": 23367053439360.0, + "grad_norm": 2.066606195312171, + "language_loss": 0.77037829, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.7923007, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.84375, + "step": 1491, + "time_per_iteration": 2.424330949783325 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.0106472, + "balance_loss_clip": 1.01758516, + "balance_loss_mlp": 1.02610373, + "epoch": 0.08970389298061025, + "flos": 23293666028160.0, + "grad_norm": 1.7139758131889415, + "language_loss": 0.79328859, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.81499797, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.80078125, + "step": 1492, + "time_per_iteration": 2.4186363220214844 + }, + { + "auxiliary_loss_clip": 0.0110577, + "auxiliary_loss_mlp": 0.0107999, + "balance_loss_clip": 1.02975535, + "balance_loss_mlp": 1.02684021, + "epoch": 0.08976401623327822, + "flos": 33760417898880.0, + "grad_norm": 2.0929453957468644, + "language_loss": 0.73093432, + "learning_rate": 3.962650407498707e-06, + "loss": 0.75279194, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7890625, + "step": 1493, + "time_per_iteration": 2.498934507369995 + }, + { + "auxiliary_loss_clip": 0.01107342, + "auxiliary_loss_mlp": 0.01080879, + "balance_loss_clip": 1.02930951, + "balance_loss_mlp": 1.02570963, + "epoch": 0.08982413948594618, + "flos": 23910302194560.0, + "grad_norm": 1.6719924955778092, + "language_loss": 0.88979417, + "learning_rate": 3.962575454982109e-06, + "loss": 0.91167641, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.81640625, + "step": 1494, + "time_per_iteration": 2.436227560043335 + }, + { + "auxiliary_loss_clip": 0.01107432, + "auxiliary_loss_mlp": 0.01076442, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.0265584, + "epoch": 0.08988426273861416, + "flos": 16836584405760.0, + "grad_norm": 1.6152033139692503, + "language_loss": 0.84875274, + "learning_rate": 3.962500428044454e-06, + "loss": 0.8705914, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.80859375, + "step": 1495, + "time_per_iteration": 2.370147466659546 + }, + { + "auxiliary_loss_clip": 0.01111222, + "auxiliary_loss_mlp": 0.01084687, + "balance_loss_clip": 1.03054237, + "balance_loss_mlp": 1.02658439, + "epoch": 0.08994438599128213, + "flos": 14792489591040.0, + "grad_norm": 2.5087939534055232, + "language_loss": 0.72740614, + "learning_rate": 3.962425326688585e-06, + "loss": 0.74936521, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.84765625, + "step": 1496, + "time_per_iteration": 2.4009199142456055 + }, + { + "auxiliary_loss_clip": 0.01108157, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.02210319, + "balance_loss_mlp": 1.02644873, + "epoch": 0.09000450924395009, + "flos": 17383359208320.0, + "grad_norm": 1.5683690833251411, + "language_loss": 0.82072341, + "learning_rate": 3.962350150917351e-06, + "loss": 0.84254199, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.81640625, + "step": 1497, + "time_per_iteration": 2.3679754734039307 + }, + { + "auxiliary_loss_clip": 0.01114222, + "auxiliary_loss_mlp": 0.01091999, + "balance_loss_clip": 1.0322994, + "balance_loss_mlp": 1.02717352, + "epoch": 0.09006463249661807, + "flos": 24279159375360.0, + "grad_norm": 2.078008327489936, + "language_loss": 0.85090685, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.87296909, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.87109375, + "step": 1498, + "time_per_iteration": 2.4333441257476807 + }, + { + "auxiliary_loss_clip": 0.01113483, + "auxiliary_loss_mlp": 0.01088362, + "balance_loss_clip": 1.02928185, + "balance_loss_mlp": 1.027282, + "epoch": 0.09012475574928604, + "flos": 13661094205440.0, + "grad_norm": 2.2239430335605586, + "language_loss": 0.81338161, + "learning_rate": 3.962199576140195e-06, + "loss": 0.8354001, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.86328125, + "step": 1499, + "time_per_iteration": 2.382538080215454 + }, + { + "auxiliary_loss_clip": 0.01108616, + "auxiliary_loss_mlp": 0.01087125, + "balance_loss_clip": 1.03312302, + "balance_loss_mlp": 1.02713943, + "epoch": 0.090184879001954, + "flos": 23326728952320.0, + "grad_norm": 1.6609470057309854, + "language_loss": 0.94312239, + "learning_rate": 3.962124177139981e-06, + "loss": 0.96507972, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8125, + "step": 1500, + "time_per_iteration": 2.4417648315429688 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01081727, + "balance_loss_clip": 1.02560329, + "balance_loss_mlp": 1.02830207, + "epoch": 0.09024500225462198, + "flos": 23001582660480.0, + "grad_norm": 2.384421745382371, + "language_loss": 0.77172077, + "learning_rate": 3.962048703735822e-06, + "loss": 0.79371285, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.890625, + "step": 1501, + "time_per_iteration": 2.412130355834961 + }, + { + "auxiliary_loss_clip": 0.01031203, + "auxiliary_loss_mlp": 0.01012354, + "balance_loss_clip": 1.00381875, + "balance_loss_mlp": 1.0109694, + "epoch": 0.09030512550728995, + "flos": 62185966007040.0, + "grad_norm": 0.7434971847226125, + "language_loss": 0.58397663, + "learning_rate": 3.96197315593058e-06, + "loss": 0.6044122, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.08544922, + "router_z_loss_mlp": 0.20214844, + "step": 1502, + "time_per_iteration": 3.039884328842163 + }, + { + "auxiliary_loss_clip": 0.01113158, + "auxiliary_loss_mlp": 0.01077862, + "balance_loss_clip": 1.02810419, + "balance_loss_mlp": 1.02844501, + "epoch": 0.09036524875995791, + "flos": 38799152737920.0, + "grad_norm": 2.656786334468031, + "language_loss": 0.73868883, + "learning_rate": 3.961897533727119e-06, + "loss": 0.76059902, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.84765625, + "step": 1503, + "time_per_iteration": 2.5744569301605225 + }, + { + "auxiliary_loss_clip": 0.01120002, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_clip": 1.03146553, + "balance_loss_mlp": 1.03034616, + "epoch": 0.09042537201262588, + "flos": 21688987985280.0, + "grad_norm": 1.8825656797466777, + "language_loss": 0.87943518, + "learning_rate": 3.961821837128306e-06, + "loss": 0.9015311, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.8984375, + "step": 1504, + "time_per_iteration": 2.447361469268799 + }, + { + "auxiliary_loss_clip": 0.01122538, + "auxiliary_loss_mlp": 0.01099923, + "balance_loss_clip": 1.03388131, + "balance_loss_mlp": 1.03130198, + "epoch": 0.09048549526529386, + "flos": 22266102625920.0, + "grad_norm": 1.8164687052282962, + "language_loss": 0.75615788, + "learning_rate": 3.961746066137014e-06, + "loss": 0.77838242, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.9140625, + "step": 1505, + "time_per_iteration": 2.3914897441864014 + }, + { + "auxiliary_loss_clip": 0.01113895, + "auxiliary_loss_mlp": 0.01087988, + "balance_loss_clip": 1.02685773, + "balance_loss_mlp": 1.02978277, + "epoch": 0.09054561851796182, + "flos": 14610068403840.0, + "grad_norm": 1.9622807901576116, + "language_loss": 0.84377092, + "learning_rate": 3.961670220756114e-06, + "loss": 0.86578977, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.83984375, + "step": 1506, + "time_per_iteration": 2.4064269065856934 + }, + { + "auxiliary_loss_clip": 0.01115243, + "auxiliary_loss_mlp": 0.01084887, + "balance_loss_clip": 1.02814388, + "balance_loss_mlp": 1.03013754, + "epoch": 0.09060574177062979, + "flos": 27634941169920.0, + "grad_norm": 2.5462207629751776, + "language_loss": 0.79067171, + "learning_rate": 3.961594300988482e-06, + "loss": 0.81267309, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.8515625, + "step": 1507, + "time_per_iteration": 2.4762706756591797 + }, + { + "auxiliary_loss_clip": 0.0103263, + "auxiliary_loss_mlp": 0.01007821, + "balance_loss_clip": 1.00004888, + "balance_loss_mlp": 1.01230764, + "epoch": 0.09066586502329776, + "flos": 66082657495680.0, + "grad_norm": 0.7336673025900186, + "language_loss": 0.57814103, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59854555, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.07763672, + "router_z_loss_mlp": 0.203125, + "step": 1508, + "time_per_iteration": 2.869333028793335 + }, + { + "auxiliary_loss_clip": 0.01116022, + "auxiliary_loss_mlp": 0.01096671, + "balance_loss_clip": 1.03751945, + "balance_loss_mlp": 1.02970862, + "epoch": 0.09072598827596573, + "flos": 18915452801280.0, + "grad_norm": 1.7032013109541906, + "language_loss": 0.86847401, + "learning_rate": 3.961442238304543e-06, + "loss": 0.89060092, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.86328125, + "step": 1509, + "time_per_iteration": 2.4741878509521484 + }, + { + "auxiliary_loss_clip": 0.01124513, + "auxiliary_loss_mlp": 0.01096673, + "balance_loss_clip": 1.03339696, + "balance_loss_mlp": 1.0302484, + "epoch": 0.0907861115286337, + "flos": 24820732385280.0, + "grad_norm": 2.1688221227733795, + "language_loss": 0.87194145, + "learning_rate": 3.961366095394002e-06, + "loss": 0.89415336, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.9453125, + "step": 1510, + "time_per_iteration": 2.433610439300537 + }, + { + "auxiliary_loss_clip": 0.01118004, + "auxiliary_loss_mlp": 0.01099033, + "balance_loss_clip": 1.03714001, + "balance_loss_mlp": 1.02932215, + "epoch": 0.09084623478130167, + "flos": 21651770609280.0, + "grad_norm": 2.0422380703842347, + "language_loss": 0.89015245, + "learning_rate": 3.961289878108262e-06, + "loss": 0.91232282, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.88671875, + "step": 1511, + "time_per_iteration": 2.410454273223877 + }, + { + "auxiliary_loss_clip": 0.01113964, + "auxiliary_loss_mlp": 0.01092176, + "balance_loss_clip": 1.03238022, + "balance_loss_mlp": 1.02987075, + "epoch": 0.09090635803396964, + "flos": 27637943546880.0, + "grad_norm": 1.428868263474219, + "language_loss": 0.86297405, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.8850354, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.83984375, + "step": 1512, + "time_per_iteration": 2.466679334640503 + }, + { + "auxiliary_loss_clip": 0.01111912, + "auxiliary_loss_mlp": 0.0107679, + "balance_loss_clip": 1.02245474, + "balance_loss_mlp": 1.02744186, + "epoch": 0.0909664812866376, + "flos": 17668355569920.0, + "grad_norm": 2.816549002711642, + "language_loss": 0.90307796, + "learning_rate": 3.961137220422749e-06, + "loss": 0.92496502, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.84375, + "step": 1513, + "time_per_iteration": 2.363409996032715 + }, + { + "auxiliary_loss_clip": 0.01112741, + "auxiliary_loss_mlp": 0.01080415, + "balance_loss_clip": 1.02290881, + "balance_loss_mlp": 1.02823675, + "epoch": 0.09102660453930557, + "flos": 23950312479360.0, + "grad_norm": 1.819754572719603, + "language_loss": 0.88657814, + "learning_rate": 3.961060780028764e-06, + "loss": 0.90850973, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.84375, + "step": 1514, + "time_per_iteration": 2.4353113174438477 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.01097078, + "balance_loss_clip": 1.0397861, + "balance_loss_mlp": 1.02902222, + "epoch": 0.09108672779197355, + "flos": 25811741727360.0, + "grad_norm": 1.752178218617194, + "language_loss": 0.92032456, + "learning_rate": 3.960984265271159e-06, + "loss": 0.94244254, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.85546875, + "step": 1515, + "time_per_iteration": 2.4596993923187256 + }, + { + "auxiliary_loss_clip": 0.01113694, + "auxiliary_loss_mlp": 0.01082629, + "balance_loss_clip": 1.02295291, + "balance_loss_mlp": 1.02768445, + "epoch": 0.09114685104464151, + "flos": 29638292561280.0, + "grad_norm": 1.9227267279839466, + "language_loss": 0.87477815, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.89674139, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.859375, + "step": 1516, + "time_per_iteration": 2.463984727859497 + }, + { + "auxiliary_loss_clip": 0.01118362, + "auxiliary_loss_mlp": 0.01085374, + "balance_loss_clip": 1.0278194, + "balance_loss_mlp": 1.03084636, + "epoch": 0.09120697429730948, + "flos": 33728227758720.0, + "grad_norm": 1.4721470117606972, + "language_loss": 0.82415593, + "learning_rate": 3.960831012676692e-06, + "loss": 0.84619325, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.875, + "step": 1517, + "time_per_iteration": 2.518859624862671 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_clip": 1.02607906, + "balance_loss_mlp": 1.03049028, + "epoch": 0.09126709754997746, + "flos": 18400519025280.0, + "grad_norm": 1.7662413571937445, + "language_loss": 0.79791492, + "learning_rate": 3.960754274845642e-06, + "loss": 0.82002568, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.8984375, + "step": 1518, + "time_per_iteration": 3.8782687187194824 + }, + { + "auxiliary_loss_clip": 0.01113453, + "auxiliary_loss_mlp": 0.01088132, + "balance_loss_clip": 1.02557123, + "balance_loss_mlp": 1.02892709, + "epoch": 0.09132722080264542, + "flos": 22090838267520.0, + "grad_norm": 1.6890513801790352, + "language_loss": 0.89372605, + "learning_rate": 3.960677462662594e-06, + "loss": 0.91574192, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 0.84765625, + "step": 1519, + "time_per_iteration": 3.8329615592956543 + }, + { + "auxiliary_loss_clip": 0.011151, + "auxiliary_loss_mlp": 0.01080916, + "balance_loss_clip": 1.0199281, + "balance_loss_mlp": 1.02956319, + "epoch": 0.09138734405531339, + "flos": 21032062243200.0, + "grad_norm": 2.143659652590676, + "language_loss": 0.76293993, + "learning_rate": 3.96060057613046e-06, + "loss": 0.78490007, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.85546875, + "step": 1520, + "time_per_iteration": 3.941753625869751 + }, + { + "auxiliary_loss_clip": 0.0111784, + "auxiliary_loss_mlp": 0.01081193, + "balance_loss_clip": 1.0172019, + "balance_loss_mlp": 1.03070045, + "epoch": 0.09144746730798137, + "flos": 20082913488000.0, + "grad_norm": 2.2890452716644507, + "language_loss": 0.88765359, + "learning_rate": 3.960523615252156e-06, + "loss": 0.90964389, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.87109375, + "step": 1521, + "time_per_iteration": 2.4317142963409424 + }, + { + "auxiliary_loss_clip": 0.01119535, + "auxiliary_loss_mlp": 0.01087612, + "balance_loss_clip": 1.0281024, + "balance_loss_mlp": 1.030599, + "epoch": 0.09150759056064933, + "flos": 22777265975040.0, + "grad_norm": 1.925555600767937, + "language_loss": 0.86278105, + "learning_rate": 3.960446580030599e-06, + "loss": 0.88485247, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.890625, + "step": 1522, + "time_per_iteration": 3.969216823577881 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01087944, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.02874708, + "epoch": 0.0915677138133173, + "flos": 27562950213120.0, + "grad_norm": 1.7411318178844923, + "language_loss": 0.82919931, + "learning_rate": 3.960369470468711e-06, + "loss": 0.85118252, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.81640625, + "step": 1523, + "time_per_iteration": 2.4459054470062256 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01089493, + "balance_loss_clip": 1.02964997, + "balance_loss_mlp": 1.02889049, + "epoch": 0.09162783706598528, + "flos": 17673836653440.0, + "grad_norm": 2.271721925238458, + "language_loss": 0.76478308, + "learning_rate": 3.960292286569418e-06, + "loss": 0.78682315, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.85546875, + "step": 1524, + "time_per_iteration": 2.380953311920166 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01079847, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.02752495, + "epoch": 0.09168796031865324, + "flos": 18477223015680.0, + "grad_norm": 2.346109390527503, + "language_loss": 0.8828454, + "learning_rate": 3.960215028335644e-06, + "loss": 0.90476561, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.84375, + "step": 1525, + "time_per_iteration": 2.3737895488739014 + }, + { + "auxiliary_loss_clip": 0.01114648, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_clip": 1.02301931, + "balance_loss_mlp": 1.02992296, + "epoch": 0.0917480835713212, + "flos": 29386324212480.0, + "grad_norm": 1.9602787547021547, + "language_loss": 0.76971221, + "learning_rate": 3.96013769577032e-06, + "loss": 0.79167777, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.84765625, + "step": 1526, + "time_per_iteration": 2.4519925117492676 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01080243, + "balance_loss_clip": 1.02566838, + "balance_loss_mlp": 1.02896392, + "epoch": 0.09180820682398917, + "flos": 19828222053120.0, + "grad_norm": 1.8519425670605048, + "language_loss": 0.79425347, + "learning_rate": 3.960060288876378e-06, + "loss": 0.81617594, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.828125, + "step": 1527, + "time_per_iteration": 2.388181686401367 + }, + { + "auxiliary_loss_clip": 0.01112667, + "auxiliary_loss_mlp": 0.01081673, + "balance_loss_clip": 1.02254558, + "balance_loss_mlp": 1.02794886, + "epoch": 0.09186833007665715, + "flos": 23840720121600.0, + "grad_norm": 1.9131397859192008, + "language_loss": 0.82477582, + "learning_rate": 3.959982807656753e-06, + "loss": 0.84671915, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.84765625, + "step": 1528, + "time_per_iteration": 2.4230844974517822 + }, + { + "auxiliary_loss_clip": 0.01116468, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_clip": 1.02317202, + "balance_loss_mlp": 1.03010392, + "epoch": 0.09192845332932512, + "flos": 12931898215680.0, + "grad_norm": 2.5570562671691874, + "language_loss": 0.79772675, + "learning_rate": 3.959905252114384e-06, + "loss": 0.81970012, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.86328125, + "step": 1529, + "time_per_iteration": 2.4137628078460693 + }, + { + "auxiliary_loss_clip": 0.01113831, + "auxiliary_loss_mlp": 0.01083875, + "balance_loss_clip": 1.02577209, + "balance_loss_mlp": 1.02598858, + "epoch": 0.09198857658199308, + "flos": 24567123202560.0, + "grad_norm": 2.047433573079776, + "language_loss": 0.85047191, + "learning_rate": 3.959827622252211e-06, + "loss": 0.87244898, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.87890625, + "step": 1530, + "time_per_iteration": 2.42779278755188 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01083326, + "balance_loss_clip": 1.02667737, + "balance_loss_mlp": 1.02954578, + "epoch": 0.09204869983466106, + "flos": 20265893256960.0, + "grad_norm": 1.9283982258582641, + "language_loss": 0.86324906, + "learning_rate": 3.959749918073179e-06, + "loss": 0.8852061, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.828125, + "step": 1531, + "time_per_iteration": 2.397167682647705 + }, + { + "auxiliary_loss_clip": 0.01111363, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_clip": 1.02349126, + "balance_loss_mlp": 1.02722526, + "epoch": 0.09210882308732903, + "flos": 20884624104960.0, + "grad_norm": 1.7640482213845297, + "language_loss": 0.82682145, + "learning_rate": 3.959672139580233e-06, + "loss": 0.84873402, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.84375, + "step": 1532, + "time_per_iteration": 2.4126298427581787 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01083749, + "balance_loss_clip": 1.02714825, + "balance_loss_mlp": 1.02861822, + "epoch": 0.09216894633999699, + "flos": 30955006776960.0, + "grad_norm": 1.825428949142385, + "language_loss": 0.86100858, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.88297808, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.84765625, + "step": 1533, + "time_per_iteration": 2.4915108680725098 + }, + { + "auxiliary_loss_clip": 0.01111485, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.02756143, + "epoch": 0.09222906959266497, + "flos": 13150733817600.0, + "grad_norm": 1.9602486106635217, + "language_loss": 0.92837369, + "learning_rate": 3.959516359664402e-06, + "loss": 0.95029998, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.83984375, + "step": 1534, + "time_per_iteration": 2.403778076171875 + }, + { + "auxiliary_loss_clip": 0.01112943, + "auxiliary_loss_mlp": 0.01096766, + "balance_loss_clip": 1.03699422, + "balance_loss_mlp": 1.02815974, + "epoch": 0.09228919284533293, + "flos": 25993290130560.0, + "grad_norm": 2.368152291143002, + "language_loss": 0.7839613, + "learning_rate": 3.959438358247424e-06, + "loss": 0.80605841, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.84765625, + "step": 1535, + "time_per_iteration": 2.4370861053466797 + }, + { + "auxiliary_loss_clip": 0.01104872, + "auxiliary_loss_mlp": 0.01078993, + "balance_loss_clip": 1.02470541, + "balance_loss_mlp": 1.02602029, + "epoch": 0.0923493160980009, + "flos": 18659818759680.0, + "grad_norm": 1.7222424341723932, + "language_loss": 0.83470994, + "learning_rate": 3.959360282528346e-06, + "loss": 0.85654861, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.7890625, + "step": 1536, + "time_per_iteration": 2.412506580352783 + }, + { + "auxiliary_loss_clip": 0.0110547, + "auxiliary_loss_mlp": 0.01074689, + "balance_loss_clip": 1.02273798, + "balance_loss_mlp": 1.02472973, + "epoch": 0.09240943935066886, + "flos": 21139559919360.0, + "grad_norm": 1.92284312716687, + "language_loss": 0.91189051, + "learning_rate": 3.959282132510131e-06, + "loss": 0.93369216, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.8046875, + "step": 1537, + "time_per_iteration": 2.38391375541687 + }, + { + "auxiliary_loss_clip": 0.01111628, + "auxiliary_loss_mlp": 0.0108499, + "balance_loss_clip": 1.0272212, + "balance_loss_mlp": 1.02669752, + "epoch": 0.09246956260333684, + "flos": 20591458485120.0, + "grad_norm": 1.9652504614992454, + "language_loss": 0.83112502, + "learning_rate": 3.959203908195741e-06, + "loss": 0.85309124, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.84765625, + "step": 1538, + "time_per_iteration": 2.4389352798461914 + }, + { + "auxiliary_loss_clip": 0.01029558, + "auxiliary_loss_mlp": 0.01007044, + "balance_loss_clip": 0.99998689, + "balance_loss_mlp": 1.00782847, + "epoch": 0.09252968585600481, + "flos": 67555153664640.0, + "grad_norm": 0.7462228114839802, + "language_loss": 0.57460278, + "learning_rate": 3.959125609588142e-06, + "loss": 0.5949688, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.07080078, + "router_z_loss_mlp": 0.21679688, + "step": 1539, + "time_per_iteration": 3.1154885292053223 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01081833, + "balance_loss_clip": 1.02308667, + "balance_loss_mlp": 1.0286262, + "epoch": 0.09258980910867277, + "flos": 17382905360640.0, + "grad_norm": 2.20583550041661, + "language_loss": 0.70783305, + "learning_rate": 3.959047236690304e-06, + "loss": 0.72977436, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8359375, + "step": 1540, + "time_per_iteration": 2.381819725036621 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01070341, + "balance_loss_clip": 1.01486063, + "balance_loss_mlp": 1.02877617, + "epoch": 0.09264993236134075, + "flos": 19864880847360.0, + "grad_norm": 1.7652616938640926, + "language_loss": 0.85139537, + "learning_rate": 3.958968789505198e-06, + "loss": 0.87321687, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.83203125, + "step": 1541, + "time_per_iteration": 2.3839406967163086 + }, + { + "auxiliary_loss_clip": 0.01027101, + "auxiliary_loss_mlp": 0.01010737, + "balance_loss_clip": 1.00291705, + "balance_loss_mlp": 1.00495458, + "epoch": 0.09271005561400872, + "flos": 62281558909440.0, + "grad_norm": 0.8929756684182153, + "language_loss": 0.62018621, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64056462, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.078125, + "router_z_loss_mlp": 0.22167969, + "step": 1542, + "time_per_iteration": 3.051844358444214 + }, + { + "auxiliary_loss_clip": 0.01111346, + "auxiliary_loss_mlp": 0.01075436, + "balance_loss_clip": 1.02329421, + "balance_loss_mlp": 1.0284003, + "epoch": 0.09277017886667668, + "flos": 23328788722560.0, + "grad_norm": 1.5237704694615548, + "language_loss": 0.84647286, + "learning_rate": 3.958811672285086e-06, + "loss": 0.86834067, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.83203125, + "step": 1543, + "time_per_iteration": 2.426017999649048 + }, + { + "auxiliary_loss_clip": 0.01107356, + "auxiliary_loss_mlp": 0.01074721, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.02798557, + "epoch": 0.09283030211934466, + "flos": 54743183435520.0, + "grad_norm": 1.5796454495499592, + "language_loss": 0.74377328, + "learning_rate": 3.958733002256038e-06, + "loss": 0.76559407, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.796875, + "step": 1544, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0111193, + "auxiliary_loss_mlp": 0.01073564, + "balance_loss_clip": 1.02135026, + "balance_loss_mlp": 1.02825546, + "epoch": 0.09289042537201263, + "flos": 30333517931520.0, + "grad_norm": 1.660205508839919, + "language_loss": 0.79370642, + "learning_rate": 3.958654257951637e-06, + "loss": 0.81556141, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.8359375, + "step": 1545, + "time_per_iteration": 2.4560794830322266 + }, + { + "auxiliary_loss_clip": 0.01107708, + "auxiliary_loss_mlp": 0.01077919, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.02750731, + "epoch": 0.09295054862468059, + "flos": 17745932344320.0, + "grad_norm": 3.434896832588203, + "language_loss": 0.79653955, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.81839579, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8046875, + "step": 1546, + "time_per_iteration": 2.3745367527008057 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01068469, + "balance_loss_clip": 1.01673222, + "balance_loss_mlp": 1.0264827, + "epoch": 0.09301067187734856, + "flos": 23656937391360.0, + "grad_norm": 1.730644398449994, + "language_loss": 0.86076963, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.88253415, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.81640625, + "step": 1547, + "time_per_iteration": 2.415879726409912 + }, + { + "auxiliary_loss_clip": 0.01111751, + "auxiliary_loss_mlp": 0.01083771, + "balance_loss_clip": 1.03067517, + "balance_loss_mlp": 1.02696967, + "epoch": 0.09307079513001654, + "flos": 27526465975680.0, + "grad_norm": 1.8871385391519144, + "language_loss": 0.7104708, + "learning_rate": 3.958417579416199e-06, + "loss": 0.73242599, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8515625, + "step": 1548, + "time_per_iteration": 2.447303533554077 + }, + { + "auxiliary_loss_clip": 0.01110969, + "auxiliary_loss_mlp": 0.01077747, + "balance_loss_clip": 1.02171826, + "balance_loss_mlp": 1.02755284, + "epoch": 0.0931309183826845, + "flos": 20626406622720.0, + "grad_norm": 2.5382528167317915, + "language_loss": 0.85762775, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.87951493, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.8359375, + "step": 1549, + "time_per_iteration": 2.414156436920166 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01072112, + "balance_loss_clip": 1.02223539, + "balance_loss_mlp": 1.02682269, + "epoch": 0.09319104163535247, + "flos": 29019701358720.0, + "grad_norm": 1.5537684547248862, + "language_loss": 0.77314007, + "learning_rate": 3.958259422403966e-06, + "loss": 0.794909, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.78125, + "step": 1550, + "time_per_iteration": 2.5018465518951416 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01082223, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.02446139, + "epoch": 0.09325116488802045, + "flos": 25300368910080.0, + "grad_norm": 2.112278640344617, + "language_loss": 0.85835373, + "learning_rate": 3.95818023251026e-06, + "loss": 0.8802436, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.8203125, + "step": 1551, + "time_per_iteration": 2.4166924953460693 + }, + { + "auxiliary_loss_clip": 0.0102262, + "auxiliary_loss_mlp": 0.01026248, + "balance_loss_clip": 1.02045465, + "balance_loss_mlp": 1.00239635, + "epoch": 0.09331128814068841, + "flos": 61532880514560.0, + "grad_norm": 0.7650003354319651, + "language_loss": 0.6194067, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63989538, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.20214844, + "step": 1552, + "time_per_iteration": 3.142827033996582 + }, + { + "auxiliary_loss_clip": 0.01025153, + "auxiliary_loss_mlp": 0.01007158, + "balance_loss_clip": 1.00119793, + "balance_loss_mlp": 1.00498664, + "epoch": 0.09337141139335638, + "flos": 53290515052800.0, + "grad_norm": 0.8331778187860939, + "language_loss": 0.59030706, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61063015, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.20117188, + "step": 1553, + "time_per_iteration": 3.1626241207122803 + }, + { + "auxiliary_loss_clip": 0.01109075, + "auxiliary_loss_mlp": 0.0107384, + "balance_loss_clip": 1.02134061, + "balance_loss_mlp": 1.02737617, + "epoch": 0.09343153464602436, + "flos": 23475738101760.0, + "grad_norm": 1.9319505666546912, + "language_loss": 0.9002043, + "learning_rate": 3.957942217314823e-06, + "loss": 0.92203349, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.81640625, + "step": 1554, + "time_per_iteration": 2.427488327026367 + }, + { + "auxiliary_loss_clip": 0.01104856, + "auxiliary_loss_mlp": 0.01069639, + "balance_loss_clip": 1.02298045, + "balance_loss_mlp": 1.02935243, + "epoch": 0.09349165789869232, + "flos": 19352495600640.0, + "grad_norm": 1.758496786718638, + "language_loss": 0.83159781, + "learning_rate": 3.957862730421599e-06, + "loss": 0.85334265, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.75390625, + "step": 1555, + "time_per_iteration": 2.4667415618896484 + }, + { + "auxiliary_loss_clip": 0.01031897, + "auxiliary_loss_mlp": 0.0100935, + "balance_loss_clip": 1.00310326, + "balance_loss_mlp": 1.01125431, + "epoch": 0.09355178115136029, + "flos": 67499572913280.0, + "grad_norm": 0.9059725369078745, + "language_loss": 0.59695101, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61736351, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.0625, + "router_z_loss_mlp": 0.20703125, + "step": 1556, + "time_per_iteration": 3.074983596801758 + }, + { + "auxiliary_loss_clip": 0.01105572, + "auxiliary_loss_mlp": 0.01084063, + "balance_loss_clip": 1.03325653, + "balance_loss_mlp": 1.02782059, + "epoch": 0.09361190440402825, + "flos": 37340132353920.0, + "grad_norm": 1.638913048941182, + "language_loss": 0.8656038, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.88750023, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.77734375, + "step": 1557, + "time_per_iteration": 4.161519527435303 + }, + { + "auxiliary_loss_clip": 0.01106419, + "auxiliary_loss_mlp": 0.01076303, + "balance_loss_clip": 1.02377987, + "balance_loss_mlp": 1.02708817, + "epoch": 0.09367202765669623, + "flos": 24898553539200.0, + "grad_norm": 1.8434474701957655, + "language_loss": 0.79396212, + "learning_rate": 3.957623824299893e-06, + "loss": 0.81578934, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.79296875, + "step": 1558, + "time_per_iteration": 2.4767630100250244 + }, + { + "auxiliary_loss_clip": 0.011109, + "auxiliary_loss_mlp": 0.01088082, + "balance_loss_clip": 1.03489101, + "balance_loss_mlp": 1.03006721, + "epoch": 0.0937321509093642, + "flos": 15704665349760.0, + "grad_norm": 2.0589546542043986, + "language_loss": 0.82168341, + "learning_rate": 3.957544040455379e-06, + "loss": 0.84367323, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.80859375, + "step": 1559, + "time_per_iteration": 3.7921829223632812 + }, + { + "auxiliary_loss_clip": 0.01104497, + "auxiliary_loss_mlp": 0.01079965, + "balance_loss_clip": 1.03275776, + "balance_loss_mlp": 1.02828133, + "epoch": 0.09379227416203216, + "flos": 20482704000000.0, + "grad_norm": 1.8961158718188131, + "language_loss": 0.78434062, + "learning_rate": 3.957464182380599e-06, + "loss": 0.80618531, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.76171875, + "step": 1560, + "time_per_iteration": 3.8214142322540283 + }, + { + "auxiliary_loss_clip": 0.01109541, + "auxiliary_loss_mlp": 0.01083402, + "balance_loss_clip": 1.03509879, + "balance_loss_mlp": 1.02703166, + "epoch": 0.09385239741470014, + "flos": 24351359800320.0, + "grad_norm": 2.131541226283718, + "language_loss": 0.83193266, + "learning_rate": 3.95738425007858e-06, + "loss": 0.85386217, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.82421875, + "step": 1561, + "time_per_iteration": 3.8965706825256348 + }, + { + "auxiliary_loss_clip": 0.01104066, + "auxiliary_loss_mlp": 0.01089738, + "balance_loss_clip": 1.04119647, + "balance_loss_mlp": 1.02517021, + "epoch": 0.0939125206673681, + "flos": 33290102707200.0, + "grad_norm": 1.9692995827690527, + "language_loss": 0.64528763, + "learning_rate": 3.957304243552354e-06, + "loss": 0.66722572, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7890625, + "step": 1562, + "time_per_iteration": 2.4954569339752197 + }, + { + "auxiliary_loss_clip": 0.01101187, + "auxiliary_loss_mlp": 0.01076462, + "balance_loss_clip": 1.03242636, + "balance_loss_mlp": 1.02564025, + "epoch": 0.09397264392003607, + "flos": 19243915672320.0, + "grad_norm": 1.8985637387716423, + "language_loss": 0.87626088, + "learning_rate": 3.957224162804956e-06, + "loss": 0.89803737, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.7578125, + "step": 1563, + "time_per_iteration": 2.398174524307251 + }, + { + "auxiliary_loss_clip": 0.01100595, + "auxiliary_loss_mlp": 0.01075822, + "balance_loss_clip": 1.0278995, + "balance_loss_mlp": 1.02481222, + "epoch": 0.09403276717270405, + "flos": 19316919058560.0, + "grad_norm": 1.8995493509656518, + "language_loss": 0.79113925, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.8129034, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7578125, + "step": 1564, + "time_per_iteration": 2.3925321102142334 + }, + { + "auxiliary_loss_clip": 0.01103273, + "auxiliary_loss_mlp": 0.01081313, + "balance_loss_clip": 1.03370118, + "balance_loss_mlp": 1.02530456, + "epoch": 0.09409289042537201, + "flos": 23582432816640.0, + "grad_norm": 1.7752293655588647, + "language_loss": 0.82152903, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.84337491, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.78125, + "step": 1565, + "time_per_iteration": 2.4311835765838623 + }, + { + "auxiliary_loss_clip": 0.0110452, + "auxiliary_loss_mlp": 0.01074704, + "balance_loss_clip": 1.02387357, + "balance_loss_mlp": 1.02522397, + "epoch": 0.09415301367803998, + "flos": 20077572049920.0, + "grad_norm": 1.8521367298484126, + "language_loss": 0.7796939, + "learning_rate": 3.956983475266103e-06, + "loss": 0.80148613, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7890625, + "step": 1566, + "time_per_iteration": 2.3932371139526367 + }, + { + "auxiliary_loss_clip": 0.01107134, + "auxiliary_loss_mlp": 0.01072801, + "balance_loss_clip": 1.02368712, + "balance_loss_mlp": 1.02791309, + "epoch": 0.09421313693070796, + "flos": 21061215095040.0, + "grad_norm": 2.00594193303136, + "language_loss": 0.80909908, + "learning_rate": 3.956903097664407e-06, + "loss": 0.8308984, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.79296875, + "step": 1567, + "time_per_iteration": 2.4117980003356934 + }, + { + "auxiliary_loss_clip": 0.01111898, + "auxiliary_loss_mlp": 0.01076306, + "balance_loss_clip": 1.0257374, + "balance_loss_mlp": 1.03119612, + "epoch": 0.09427326018337592, + "flos": 24315015208320.0, + "grad_norm": 2.132844068242884, + "language_loss": 0.84984076, + "learning_rate": 3.956822645856749e-06, + "loss": 0.87172282, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.8046875, + "step": 1568, + "time_per_iteration": 2.43041729927063 + }, + { + "auxiliary_loss_clip": 0.01110439, + "auxiliary_loss_mlp": 0.01075039, + "balance_loss_clip": 1.02342153, + "balance_loss_mlp": 1.03023338, + "epoch": 0.09433338343604389, + "flos": 20262925791360.0, + "grad_norm": 2.183146245202465, + "language_loss": 0.78571796, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.80757272, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.80078125, + "step": 1569, + "time_per_iteration": 2.4096498489379883 + }, + { + "auxiliary_loss_clip": 0.01108764, + "auxiliary_loss_mlp": 0.01076066, + "balance_loss_clip": 1.02795291, + "balance_loss_mlp": 1.03015351, + "epoch": 0.09439350668871185, + "flos": 12742355111040.0, + "grad_norm": 2.634157246885169, + "language_loss": 0.87898397, + "learning_rate": 3.956661519635756e-06, + "loss": 0.9008323, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.78515625, + "step": 1570, + "time_per_iteration": 2.359407424926758 + }, + { + "auxiliary_loss_clip": 0.01112552, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.01919878, + "balance_loss_mlp": 1.03211808, + "epoch": 0.09445362994137983, + "flos": 25960960344960.0, + "grad_norm": 1.5666366674499692, + "language_loss": 0.78847367, + "learning_rate": 3.95658084522853e-06, + "loss": 0.81029427, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.8046875, + "step": 1571, + "time_per_iteration": 2.4857497215270996 + }, + { + "auxiliary_loss_clip": 0.01108073, + "auxiliary_loss_mlp": 0.01080379, + "balance_loss_clip": 1.03054905, + "balance_loss_mlp": 1.0318594, + "epoch": 0.0945137531940478, + "flos": 19714440332160.0, + "grad_norm": 1.6296477687990505, + "language_loss": 0.81438005, + "learning_rate": 3.956500096627561e-06, + "loss": 0.83626461, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.76171875, + "step": 1572, + "time_per_iteration": 2.4068503379821777 + }, + { + "auxiliary_loss_clip": 0.01108048, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_clip": 1.02293742, + "balance_loss_mlp": 1.03071332, + "epoch": 0.09457387644671576, + "flos": 23616089233920.0, + "grad_norm": 1.789363533905425, + "language_loss": 0.89132404, + "learning_rate": 3.956419273835913e-06, + "loss": 0.91311669, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.7734375, + "step": 1573, + "time_per_iteration": 2.4506731033325195 + }, + { + "auxiliary_loss_clip": 0.01113773, + "auxiliary_loss_mlp": 0.01090894, + "balance_loss_clip": 1.03581882, + "balance_loss_mlp": 1.03181887, + "epoch": 0.09463399969938374, + "flos": 26906059382400.0, + "grad_norm": 2.0179844319136917, + "language_loss": 0.83331752, + "learning_rate": 3.95633837685665e-06, + "loss": 0.8553642, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.8203125, + "step": 1574, + "time_per_iteration": 2.470216989517212 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.0107776, + "balance_loss_clip": 1.02905083, + "balance_loss_mlp": 1.03025961, + "epoch": 0.0946941229520517, + "flos": 23658438579840.0, + "grad_norm": 1.71571141830514, + "language_loss": 0.8295446, + "learning_rate": 3.95625740569284e-06, + "loss": 0.85139763, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7734375, + "step": 1575, + "time_per_iteration": 2.4480130672454834 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.0107547, + "balance_loss_clip": 1.0231843, + "balance_loss_mlp": 1.02626145, + "epoch": 0.09475424620471967, + "flos": 24132908223360.0, + "grad_norm": 1.8923284697200395, + "language_loss": 0.8921082, + "learning_rate": 3.956176360347553e-06, + "loss": 0.91389608, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.76953125, + "step": 1576, + "time_per_iteration": 2.425128221511841 + }, + { + "auxiliary_loss_clip": 0.0102364, + "auxiliary_loss_mlp": 0.01008138, + "balance_loss_clip": 1.00079513, + "balance_loss_mlp": 1.00480807, + "epoch": 0.09481436945738765, + "flos": 68422815573120.0, + "grad_norm": 0.9813990260561815, + "language_loss": 0.65956259, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67988038, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.07324219, + "router_z_loss_mlp": 0.1875, + "step": 1577, + "time_per_iteration": 2.9893174171447754 + }, + { + "auxiliary_loss_clip": 0.01107265, + "auxiliary_loss_mlp": 0.01075863, + "balance_loss_clip": 1.02734518, + "balance_loss_mlp": 1.02733207, + "epoch": 0.09487449271005562, + "flos": 16653150789120.0, + "grad_norm": 2.031646104724104, + "language_loss": 0.8257004, + "learning_rate": 3.956014047124844e-06, + "loss": 0.84753168, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.796875, + "step": 1578, + "time_per_iteration": 2.3976097106933594 + }, + { + "auxiliary_loss_clip": 0.01105996, + "auxiliary_loss_mlp": 0.01075616, + "balance_loss_clip": 1.02559578, + "balance_loss_mlp": 1.02519178, + "epoch": 0.09493461596272358, + "flos": 24274655809920.0, + "grad_norm": 1.6811323280272412, + "language_loss": 0.79995799, + "learning_rate": 3.955932779253578e-06, + "loss": 0.82177413, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.80859375, + "step": 1579, + "time_per_iteration": 2.469198703765869 + }, + { + "auxiliary_loss_clip": 0.01108438, + "auxiliary_loss_mlp": 0.01081956, + "balance_loss_clip": 1.02709603, + "balance_loss_mlp": 1.02677846, + "epoch": 0.09499473921539155, + "flos": 21869139934080.0, + "grad_norm": 1.8988950420069364, + "language_loss": 0.75379193, + "learning_rate": 3.955851437213144e-06, + "loss": 0.77569592, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8203125, + "step": 1580, + "time_per_iteration": 2.401291847229004 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01077328, + "balance_loss_clip": 1.02444661, + "balance_loss_mlp": 1.02620959, + "epoch": 0.09505486246805953, + "flos": 33545736748800.0, + "grad_norm": 1.6746147552912378, + "language_loss": 0.79085857, + "learning_rate": 3.955770021006627e-06, + "loss": 0.81267834, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.78515625, + "step": 1581, + "time_per_iteration": 2.5123448371887207 + }, + { + "auxiliary_loss_clip": 0.01105023, + "auxiliary_loss_mlp": 0.01078223, + "balance_loss_clip": 1.02324378, + "balance_loss_mlp": 1.02631831, + "epoch": 0.09511498572072749, + "flos": 21214273962240.0, + "grad_norm": 2.1456370893664896, + "language_loss": 0.89219153, + "learning_rate": 3.955688530637116e-06, + "loss": 0.914024, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.78515625, + "step": 1582, + "time_per_iteration": 2.4266607761383057 + }, + { + "auxiliary_loss_clip": 0.01107021, + "auxiliary_loss_mlp": 0.01078792, + "balance_loss_clip": 1.02319264, + "balance_loss_mlp": 1.02714324, + "epoch": 0.09517510897339546, + "flos": 14610382606080.0, + "grad_norm": 1.900717047310488, + "language_loss": 0.69393158, + "learning_rate": 3.955606966107699e-06, + "loss": 0.71578968, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.80078125, + "step": 1583, + "time_per_iteration": 2.4320664405822754 + }, + { + "auxiliary_loss_clip": 0.0111246, + "auxiliary_loss_mlp": 0.01081714, + "balance_loss_clip": 1.020679, + "balance_loss_mlp": 1.02934122, + "epoch": 0.09523523222606343, + "flos": 27816140459520.0, + "grad_norm": 1.6806659134654878, + "language_loss": 0.72713912, + "learning_rate": 3.95552532742147e-06, + "loss": 0.74908078, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.828125, + "step": 1584, + "time_per_iteration": 2.44708251953125 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01082578, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.02794814, + "epoch": 0.0952953554787314, + "flos": 20705170383360.0, + "grad_norm": 1.503810785159523, + "language_loss": 0.83231986, + "learning_rate": 3.955443614581525e-06, + "loss": 0.85422695, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.8046875, + "step": 1585, + "time_per_iteration": 2.465524435043335 + }, + { + "auxiliary_loss_clip": 0.01113662, + "auxiliary_loss_mlp": 0.01081864, + "balance_loss_clip": 1.02564538, + "balance_loss_mlp": 1.02906871, + "epoch": 0.09535547873139937, + "flos": 24786552297600.0, + "grad_norm": 1.7422501134792872, + "language_loss": 0.75063449, + "learning_rate": 3.955361827590961e-06, + "loss": 0.7725898, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.84765625, + "step": 1586, + "time_per_iteration": 2.4342799186706543 + }, + { + "auxiliary_loss_clip": 0.01030788, + "auxiliary_loss_mlp": 0.01010919, + "balance_loss_clip": 1.00371873, + "balance_loss_mlp": 1.0112958, + "epoch": 0.09541560198406734, + "flos": 71909208230400.0, + "grad_norm": 0.8497229837703482, + "language_loss": 0.55547488, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57589197, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.07177734, + "router_z_loss_mlp": 0.1953125, + "step": 1587, + "time_per_iteration": 2.825852155685425 + }, + { + "auxiliary_loss_clip": 0.01109955, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_clip": 1.02874851, + "balance_loss_mlp": 1.02728915, + "epoch": 0.09547572523673531, + "flos": 28981436641920.0, + "grad_norm": 1.7844032546801347, + "language_loss": 0.8340835, + "learning_rate": 3.955198031170391e-06, + "loss": 0.85601628, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.828125, + "step": 1588, + "time_per_iteration": 2.456507921218872 + }, + { + "auxiliary_loss_clip": 0.01103965, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_clip": 1.02252698, + "balance_loss_mlp": 1.02584124, + "epoch": 0.09553584848940327, + "flos": 24132768577920.0, + "grad_norm": 1.4960330355428664, + "language_loss": 0.83697015, + "learning_rate": 3.955116021746594e-06, + "loss": 0.85874009, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.78125, + "step": 1589, + "time_per_iteration": 2.4288554191589355 + }, + { + "auxiliary_loss_clip": 0.01105287, + "auxiliary_loss_mlp": 0.01078504, + "balance_loss_clip": 1.02412093, + "balance_loss_mlp": 1.02553964, + "epoch": 0.09559597174207124, + "flos": 42849706055040.0, + "grad_norm": 1.750539087268331, + "language_loss": 0.67070806, + "learning_rate": 3.955033938184601e-06, + "loss": 0.69254601, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.796875, + "step": 1590, + "time_per_iteration": 2.5858876705169678 + }, + { + "auxiliary_loss_clip": 0.01104613, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_clip": 1.0318048, + "balance_loss_mlp": 1.02535272, + "epoch": 0.09565609499473922, + "flos": 32669486645760.0, + "grad_norm": 1.6314640431535699, + "language_loss": 0.85342956, + "learning_rate": 3.954951780487526e-06, + "loss": 0.87533736, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.79296875, + "step": 1591, + "time_per_iteration": 2.4922778606414795 + }, + { + "auxiliary_loss_clip": 0.01108017, + "auxiliary_loss_mlp": 0.0109249, + "balance_loss_clip": 1.03670049, + "balance_loss_mlp": 1.02587175, + "epoch": 0.09571621824740718, + "flos": 18477432483840.0, + "grad_norm": 2.3849426279399784, + "language_loss": 0.77764428, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.79964936, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8203125, + "step": 1592, + "time_per_iteration": 2.3681893348693848 + }, + { + "auxiliary_loss_clip": 0.01105275, + "auxiliary_loss_mlp": 0.01074097, + "balance_loss_clip": 1.02305174, + "balance_loss_mlp": 1.02530873, + "epoch": 0.09577634150007515, + "flos": 29386219478400.0, + "grad_norm": 2.3232916184256145, + "language_loss": 0.76257885, + "learning_rate": 3.954787242700592e-06, + "loss": 0.78437251, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.80078125, + "step": 1593, + "time_per_iteration": 2.4658350944519043 + }, + { + "auxiliary_loss_clip": 0.01103093, + "auxiliary_loss_mlp": 0.01081601, + "balance_loss_clip": 1.02705038, + "balance_loss_mlp": 1.02460766, + "epoch": 0.09583646475274313, + "flos": 22746716668800.0, + "grad_norm": 2.1606517441671773, + "language_loss": 0.72042239, + "learning_rate": 3.954704862616971e-06, + "loss": 0.74226928, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.78515625, + "step": 1594, + "time_per_iteration": 2.39479923248291 + }, + { + "auxiliary_loss_clip": 0.01107826, + "auxiliary_loss_mlp": 0.01081718, + "balance_loss_clip": 1.02506983, + "balance_loss_mlp": 1.02684975, + "epoch": 0.0958965880054111, + "flos": 23217346062720.0, + "grad_norm": 2.1700569724748457, + "language_loss": 0.84671354, + "learning_rate": 3.954622408410747e-06, + "loss": 0.86860907, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.8125, + "step": 1595, + "time_per_iteration": 2.4211251735687256 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.01077701, + "balance_loss_clip": 1.02360368, + "balance_loss_mlp": 1.02667379, + "epoch": 0.09595671125807906, + "flos": 21323377560960.0, + "grad_norm": 2.0185559424295265, + "language_loss": 0.87429786, + "learning_rate": 3.954539880085045e-06, + "loss": 0.89614993, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.80859375, + "step": 1596, + "time_per_iteration": 2.3998987674713135 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01078782, + "balance_loss_clip": 1.02299201, + "balance_loss_mlp": 1.02714634, + "epoch": 0.09601683451074704, + "flos": 39601910695680.0, + "grad_norm": 1.5286096168479322, + "language_loss": 0.70642501, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.72831523, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.83203125, + "step": 1597, + "time_per_iteration": 4.014523029327393 + }, + { + "auxiliary_loss_clip": 0.01109605, + "auxiliary_loss_mlp": 0.01075291, + "balance_loss_clip": 1.0210743, + "balance_loss_mlp": 1.0256654, + "epoch": 0.096076957763415, + "flos": 23731581611520.0, + "grad_norm": 2.2648826640474438, + "language_loss": 0.77086252, + "learning_rate": 3.954374601087729e-06, + "loss": 0.79271144, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.83984375, + "step": 1598, + "time_per_iteration": 3.853623867034912 + }, + { + "auxiliary_loss_clip": 0.01106219, + "auxiliary_loss_mlp": 0.01078902, + "balance_loss_clip": 1.02287364, + "balance_loss_mlp": 1.02560878, + "epoch": 0.09613708101608297, + "flos": 34676678286720.0, + "grad_norm": 1.666917606142713, + "language_loss": 0.71481001, + "learning_rate": 3.954291850422382e-06, + "loss": 0.73666126, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8046875, + "step": 1599, + "time_per_iteration": 3.916212797164917 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01073854, + "balance_loss_clip": 1.02297568, + "balance_loss_mlp": 1.02742863, + "epoch": 0.09619720426875093, + "flos": 20739001357440.0, + "grad_norm": 2.035548287324398, + "language_loss": 0.86479074, + "learning_rate": 3.954209025650093e-06, + "loss": 0.88662589, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.8203125, + "step": 1600, + "time_per_iteration": 2.4057278633117676 + }, + { + "auxiliary_loss_clip": 0.01109067, + "auxiliary_loss_mlp": 0.01078084, + "balance_loss_clip": 1.02563214, + "balance_loss_mlp": 1.02641964, + "epoch": 0.09625732752141891, + "flos": 13041874598400.0, + "grad_norm": 2.10188966121092, + "language_loss": 0.83140117, + "learning_rate": 3.954126126774001e-06, + "loss": 0.85327268, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.828125, + "step": 1601, + "time_per_iteration": 3.9549529552459717 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01083674, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.02718687, + "epoch": 0.09631745077408688, + "flos": 22272526316160.0, + "grad_norm": 2.172260091146376, + "language_loss": 0.84739375, + "learning_rate": 3.954043153797251e-06, + "loss": 0.86933249, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.83203125, + "step": 1602, + "time_per_iteration": 2.4098377227783203 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.01848888, + "balance_loss_mlp": 1.02790439, + "epoch": 0.09637757402675484, + "flos": 24753105348480.0, + "grad_norm": 2.1743481600705854, + "language_loss": 0.65381384, + "learning_rate": 3.953960106722989e-06, + "loss": 0.67561227, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.796875, + "step": 1603, + "time_per_iteration": 2.4325709342956543 + }, + { + "auxiliary_loss_clip": 0.01111576, + "auxiliary_loss_mlp": 0.01078123, + "balance_loss_clip": 1.02066398, + "balance_loss_mlp": 1.02754831, + "epoch": 0.09643769727942282, + "flos": 22524739044480.0, + "grad_norm": 2.3389055380757733, + "language_loss": 0.74557132, + "learning_rate": 3.953876985554364e-06, + "loss": 0.76746833, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.83984375, + "step": 1604, + "time_per_iteration": 2.4298739433288574 + }, + { + "auxiliary_loss_clip": 0.01106201, + "auxiliary_loss_mlp": 0.01076743, + "balance_loss_clip": 1.02348018, + "balance_loss_mlp": 1.02613711, + "epoch": 0.09649782053209079, + "flos": 30919674614400.0, + "grad_norm": 1.9121618101168492, + "language_loss": 0.81045002, + "learning_rate": 3.953793790294527e-06, + "loss": 0.83227944, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.80078125, + "step": 1605, + "time_per_iteration": 2.4829559326171875 + }, + { + "auxiliary_loss_clip": 0.01109094, + "auxiliary_loss_mlp": 0.01077648, + "balance_loss_clip": 1.01999855, + "balance_loss_mlp": 1.0262568, + "epoch": 0.09655794378475875, + "flos": 25336469122560.0, + "grad_norm": 1.8284130742423268, + "language_loss": 0.77764148, + "learning_rate": 3.953710520946634e-06, + "loss": 0.79950893, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.828125, + "step": 1606, + "time_per_iteration": 2.4197540283203125 + }, + { + "auxiliary_loss_clip": 0.01109385, + "auxiliary_loss_mlp": 0.01069583, + "balance_loss_clip": 1.01622534, + "balance_loss_mlp": 1.02695251, + "epoch": 0.09661806703742673, + "flos": 22344971120640.0, + "grad_norm": 2.232074573932912, + "language_loss": 0.77761221, + "learning_rate": 3.953627177513843e-06, + "loss": 0.79940194, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.828125, + "step": 1607, + "time_per_iteration": 2.4170305728912354 + }, + { + "auxiliary_loss_clip": 0.01108011, + "auxiliary_loss_mlp": 0.01075646, + "balance_loss_clip": 1.02011871, + "balance_loss_mlp": 1.02582693, + "epoch": 0.0966781902900947, + "flos": 17456606974080.0, + "grad_norm": 1.7884519847317093, + "language_loss": 0.89080453, + "learning_rate": 3.953543759999312e-06, + "loss": 0.91264111, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8203125, + "step": 1608, + "time_per_iteration": 2.357280731201172 + }, + { + "auxiliary_loss_clip": 0.01113112, + "auxiliary_loss_mlp": 0.01093932, + "balance_loss_clip": 1.03337407, + "balance_loss_mlp": 1.0284512, + "epoch": 0.09673831354276266, + "flos": 36902496061440.0, + "grad_norm": 2.23730631740532, + "language_loss": 0.74107963, + "learning_rate": 3.953460268406207e-06, + "loss": 0.7631501, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.84765625, + "step": 1609, + "time_per_iteration": 2.530562400817871 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_clip": 1.02398443, + "balance_loss_mlp": 1.0248208, + "epoch": 0.09679843679543064, + "flos": 20700422438400.0, + "grad_norm": 2.6421030902365703, + "language_loss": 0.88088429, + "learning_rate": 3.953376702737693e-06, + "loss": 0.9027921, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.828125, + "step": 1610, + "time_per_iteration": 2.383683204650879 + }, + { + "auxiliary_loss_clip": 0.01106756, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_clip": 1.02199757, + "balance_loss_mlp": 1.02533531, + "epoch": 0.0968585600480986, + "flos": 23513269680000.0, + "grad_norm": 2.0644212487621867, + "language_loss": 0.70081913, + "learning_rate": 3.953293062996939e-06, + "loss": 0.72266126, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.81640625, + "step": 1611, + "time_per_iteration": 2.421774387359619 + }, + { + "auxiliary_loss_clip": 0.01106464, + "auxiliary_loss_mlp": 0.01078324, + "balance_loss_clip": 1.02353513, + "balance_loss_mlp": 1.02658069, + "epoch": 0.09691868330076657, + "flos": 20120026129920.0, + "grad_norm": 1.703965667846029, + "language_loss": 0.83754063, + "learning_rate": 3.953209349187115e-06, + "loss": 0.85938853, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.80078125, + "step": 1612, + "time_per_iteration": 2.428283452987671 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01080526, + "balance_loss_clip": 1.02161276, + "balance_loss_mlp": 1.02682436, + "epoch": 0.09697880655343454, + "flos": 16543767899520.0, + "grad_norm": 3.3711158908387078, + "language_loss": 0.83217096, + "learning_rate": 3.953125561311398e-06, + "loss": 0.85406685, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.82421875, + "step": 1613, + "time_per_iteration": 2.4338459968566895 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01082104, + "balance_loss_clip": 1.02371502, + "balance_loss_mlp": 1.02677321, + "epoch": 0.09703892980610251, + "flos": 26102987222400.0, + "grad_norm": 1.8159604347520433, + "language_loss": 0.86943018, + "learning_rate": 3.953041699372964e-06, + "loss": 0.89133227, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.8125, + "step": 1614, + "time_per_iteration": 2.4420080184936523 + }, + { + "auxiliary_loss_clip": 0.01027149, + "auxiliary_loss_mlp": 0.01009179, + "balance_loss_clip": 1.00097728, + "balance_loss_mlp": 1.00765395, + "epoch": 0.09709905305877048, + "flos": 60440273516160.0, + "grad_norm": 0.7114388910725121, + "language_loss": 0.54640126, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56676459, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.08203125, + "router_z_loss_mlp": 0.1953125, + "step": 1615, + "time_per_iteration": 3.001404047012329 + }, + { + "auxiliary_loss_clip": 0.01024632, + "auxiliary_loss_mlp": 0.01008046, + "balance_loss_clip": 0.99993986, + "balance_loss_mlp": 1.00553751, + "epoch": 0.09715917631143844, + "flos": 57636503228160.0, + "grad_norm": 0.7627212241250961, + "language_loss": 0.58397746, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60430431, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.08105469, + "router_z_loss_mlp": 0.19140625, + "step": 1616, + "time_per_iteration": 3.171184778213501 + }, + { + "auxiliary_loss_clip": 0.01108775, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_clip": 1.02542663, + "balance_loss_mlp": 1.02615321, + "epoch": 0.09721929956410642, + "flos": 20557173663360.0, + "grad_norm": 1.7329379376361493, + "language_loss": 0.71487916, + "learning_rate": 3.952789669213172e-06, + "loss": 0.73679388, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.828125, + "step": 1617, + "time_per_iteration": 2.410926580429077 + }, + { + "auxiliary_loss_clip": 0.01106126, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_clip": 1.02985239, + "balance_loss_mlp": 1.02493536, + "epoch": 0.09727942281677439, + "flos": 27343137093120.0, + "grad_norm": 1.6878754135616512, + "language_loss": 0.82465041, + "learning_rate": 3.952705511055698e-06, + "loss": 0.84656811, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.8125, + "step": 1618, + "time_per_iteration": 2.445171594619751 + }, + { + "auxiliary_loss_clip": 0.0110414, + "auxiliary_loss_mlp": 0.01069679, + "balance_loss_clip": 1.02070832, + "balance_loss_mlp": 1.02560842, + "epoch": 0.09733954606944235, + "flos": 24898867741440.0, + "grad_norm": 1.599561037233907, + "language_loss": 0.94515753, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9668957, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.78515625, + "step": 1619, + "time_per_iteration": 2.4506914615631104 + }, + { + "auxiliary_loss_clip": 0.01102778, + "auxiliary_loss_mlp": 0.01076802, + "balance_loss_clip": 1.01898563, + "balance_loss_mlp": 1.02645719, + "epoch": 0.09739966932211033, + "flos": 31502584540800.0, + "grad_norm": 1.7504334102373715, + "language_loss": 0.90421933, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.92601514, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.765625, + "step": 1620, + "time_per_iteration": 2.5068349838256836 + }, + { + "auxiliary_loss_clip": 0.01105957, + "auxiliary_loss_mlp": 0.01080507, + "balance_loss_clip": 1.02829373, + "balance_loss_mlp": 1.02688682, + "epoch": 0.0974597925747783, + "flos": 23877623295360.0, + "grad_norm": 3.0041560534363776, + "language_loss": 0.79666996, + "learning_rate": 3.952452592315324e-06, + "loss": 0.81853455, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7890625, + "step": 1621, + "time_per_iteration": 2.4402284622192383 + }, + { + "auxiliary_loss_clip": 0.01105235, + "auxiliary_loss_mlp": 0.01090211, + "balance_loss_clip": 1.03592336, + "balance_loss_mlp": 1.02590334, + "epoch": 0.09751991582744626, + "flos": 17018621568000.0, + "grad_norm": 1.985980220000075, + "language_loss": 0.78992647, + "learning_rate": 3.952368137989871e-06, + "loss": 0.81188095, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.79296875, + "step": 1622, + "time_per_iteration": 2.370020866394043 + }, + { + "auxiliary_loss_clip": 0.01107671, + "auxiliary_loss_mlp": 0.01080646, + "balance_loss_clip": 1.02569056, + "balance_loss_mlp": 1.02691627, + "epoch": 0.09758003908011423, + "flos": 28401564003840.0, + "grad_norm": 3.103647140341387, + "language_loss": 0.86719602, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88907921, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.80859375, + "step": 1623, + "time_per_iteration": 2.490595817565918 + }, + { + "auxiliary_loss_clip": 0.0110253, + "auxiliary_loss_mlp": 0.0108473, + "balance_loss_clip": 1.03275466, + "balance_loss_mlp": 1.02556264, + "epoch": 0.09764016233278221, + "flos": 18143488529280.0, + "grad_norm": 1.9044894173336737, + "language_loss": 0.82347345, + "learning_rate": 3.952199007240184e-06, + "loss": 0.84534609, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.76953125, + "step": 1624, + "time_per_iteration": 2.4219815731048584 + }, + { + "auxiliary_loss_clip": 0.0110418, + "auxiliary_loss_mlp": 0.01069204, + "balance_loss_clip": 1.01939845, + "balance_loss_mlp": 1.02603126, + "epoch": 0.09770028558545017, + "flos": 15265004198400.0, + "grad_norm": 2.104520799773898, + "language_loss": 0.88625598, + "learning_rate": 3.952114330822364e-06, + "loss": 0.90798992, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.78125, + "step": 1625, + "time_per_iteration": 2.4146273136138916 + }, + { + "auxiliary_loss_clip": 0.0110774, + "auxiliary_loss_mlp": 0.01070435, + "balance_loss_clip": 1.01936615, + "balance_loss_mlp": 1.0270443, + "epoch": 0.09776040883811814, + "flos": 23471444004480.0, + "grad_norm": 2.7032420062029074, + "language_loss": 0.87488401, + "learning_rate": 3.952029580380172e-06, + "loss": 0.89666575, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.8046875, + "step": 1626, + "time_per_iteration": 2.4074630737304688 + }, + { + "auxiliary_loss_clip": 0.01109466, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_clip": 1.01779664, + "balance_loss_mlp": 1.02637446, + "epoch": 0.09782053209078612, + "flos": 24498309179520.0, + "grad_norm": 1.8556351402213684, + "language_loss": 0.84202832, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.86386955, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.828125, + "step": 1627, + "time_per_iteration": 2.432111978530884 + }, + { + "auxiliary_loss_clip": 0.01105382, + "auxiliary_loss_mlp": 0.01070005, + "balance_loss_clip": 1.02131987, + "balance_loss_mlp": 1.02688503, + "epoch": 0.09788065534345408, + "flos": 21579081425280.0, + "grad_norm": 1.6867048831332374, + "language_loss": 0.86580735, + "learning_rate": 3.951859857435534e-06, + "loss": 0.88756126, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.78515625, + "step": 1628, + "time_per_iteration": 2.392963171005249 + }, + { + "auxiliary_loss_clip": 0.01105305, + "auxiliary_loss_mlp": 0.01075369, + "balance_loss_clip": 1.02415705, + "balance_loss_mlp": 1.02563691, + "epoch": 0.09794077859612205, + "flos": 23841313614720.0, + "grad_norm": 1.5817848201246871, + "language_loss": 0.77773905, + "learning_rate": 3.951774884939523e-06, + "loss": 0.79954582, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.796875, + "step": 1629, + "time_per_iteration": 2.4805216789245605 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.01078042, + "balance_loss_clip": 1.02353954, + "balance_loss_mlp": 1.02616382, + "epoch": 0.09800090184879003, + "flos": 23658752782080.0, + "grad_norm": 1.6901381584867556, + "language_loss": 0.80345124, + "learning_rate": 3.951689838432013e-06, + "loss": 0.82528359, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.7890625, + "step": 1630, + "time_per_iteration": 2.4625566005706787 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.01079806, + "balance_loss_clip": 1.02651906, + "balance_loss_mlp": 1.02607346, + "epoch": 0.09806102510145799, + "flos": 17054826514560.0, + "grad_norm": 2.101403614883873, + "language_loss": 0.89114201, + "learning_rate": 3.951604717916228e-06, + "loss": 0.9129765, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.7734375, + "step": 1631, + "time_per_iteration": 2.384974241256714 + }, + { + "auxiliary_loss_clip": 0.01103291, + "auxiliary_loss_mlp": 0.01071925, + "balance_loss_clip": 1.02049804, + "balance_loss_mlp": 1.02565217, + "epoch": 0.09812114835412596, + "flos": 23877344004480.0, + "grad_norm": 2.2983287885926877, + "language_loss": 0.84923869, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.87099087, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7734375, + "step": 1632, + "time_per_iteration": 2.4236254692077637 + }, + { + "auxiliary_loss_clip": 0.01105379, + "auxiliary_loss_mlp": 0.01090306, + "balance_loss_clip": 1.03570819, + "balance_loss_mlp": 1.0269289, + "epoch": 0.09818127160679392, + "flos": 20594425950720.0, + "grad_norm": 1.8463381223431157, + "language_loss": 0.80762249, + "learning_rate": 3.951434254872751e-06, + "loss": 0.82957935, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.78515625, + "step": 1633, + "time_per_iteration": 2.4258577823638916 + }, + { + "auxiliary_loss_clip": 0.01103628, + "auxiliary_loss_mlp": 0.01072191, + "balance_loss_clip": 1.02124059, + "balance_loss_mlp": 1.02655423, + "epoch": 0.0982413948594619, + "flos": 15486423240960.0, + "grad_norm": 2.860326656266267, + "language_loss": 0.75749266, + "learning_rate": 3.951348912351521e-06, + "loss": 0.77925086, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7734375, + "step": 1634, + "time_per_iteration": 2.3652942180633545 + }, + { + "auxiliary_loss_clip": 0.01108616, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_clip": 1.03067935, + "balance_loss_mlp": 1.02506614, + "epoch": 0.09830151811212987, + "flos": 24206784393600.0, + "grad_norm": 2.5944991562321227, + "language_loss": 0.76422465, + "learning_rate": 3.951263495834947e-06, + "loss": 0.78615546, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8359375, + "step": 1635, + "time_per_iteration": 2.4179203510284424 + }, + { + "auxiliary_loss_clip": 0.01107572, + "auxiliary_loss_mlp": 0.01081586, + "balance_loss_clip": 1.02858508, + "balance_loss_mlp": 1.0257231, + "epoch": 0.09836164136479783, + "flos": 20593553166720.0, + "grad_norm": 2.03612435648842, + "language_loss": 0.79732376, + "learning_rate": 3.951178005326264e-06, + "loss": 0.8192153, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8203125, + "step": 1636, + "time_per_iteration": 2.463616132736206 + }, + { + "auxiliary_loss_clip": 0.01104361, + "auxiliary_loss_mlp": 0.01069919, + "balance_loss_clip": 1.02051902, + "balance_loss_mlp": 1.02598381, + "epoch": 0.09842176461746581, + "flos": 19933241022720.0, + "grad_norm": 1.9874216757811056, + "language_loss": 0.72334164, + "learning_rate": 3.951092440828715e-06, + "loss": 0.7450844, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.78515625, + "step": 1637, + "time_per_iteration": 3.864670753479004 + }, + { + "auxiliary_loss_clip": 0.01103426, + "auxiliary_loss_mlp": 0.01077499, + "balance_loss_clip": 1.02559495, + "balance_loss_mlp": 1.02377057, + "epoch": 0.09848188787013377, + "flos": 21213610646400.0, + "grad_norm": 2.1222354235221528, + "language_loss": 0.79913712, + "learning_rate": 3.951006802345545e-06, + "loss": 0.8209464, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.796875, + "step": 1638, + "time_per_iteration": 3.946472406387329 + }, + { + "auxiliary_loss_clip": 0.01098065, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.02311969, + "balance_loss_mlp": 1.02258277, + "epoch": 0.09854201112280174, + "flos": 30152912135040.0, + "grad_norm": 1.4919503165803094, + "language_loss": 0.74175179, + "learning_rate": 3.950921089880003e-06, + "loss": 0.76343936, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.75390625, + "step": 1639, + "time_per_iteration": 3.848884105682373 + }, + { + "auxiliary_loss_clip": 0.01102439, + "auxiliary_loss_mlp": 0.01062317, + "balance_loss_clip": 1.01513398, + "balance_loss_mlp": 1.02497363, + "epoch": 0.09860213437546972, + "flos": 21794740093440.0, + "grad_norm": 2.290369854834682, + "language_loss": 0.90147913, + "learning_rate": 3.950835303435337e-06, + "loss": 0.9231267, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.7734375, + "step": 1640, + "time_per_iteration": 2.4076266288757324 + }, + { + "auxiliary_loss_clip": 0.01105975, + "auxiliary_loss_mlp": 0.01064786, + "balance_loss_clip": 1.01819932, + "balance_loss_mlp": 1.02790391, + "epoch": 0.09866225762813768, + "flos": 21834471087360.0, + "grad_norm": 1.7458191478006757, + "language_loss": 0.84006023, + "learning_rate": 3.950749443014801e-06, + "loss": 0.86176777, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.78125, + "step": 1641, + "time_per_iteration": 3.8483307361602783 + }, + { + "auxiliary_loss_clip": 0.01103361, + "auxiliary_loss_mlp": 0.01070837, + "balance_loss_clip": 1.02370191, + "balance_loss_mlp": 1.02643633, + "epoch": 0.09872238088080565, + "flos": 17598982965120.0, + "grad_norm": 2.80792561788103, + "language_loss": 0.88879156, + "learning_rate": 3.95066350862165e-06, + "loss": 0.91053355, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.76953125, + "step": 1642, + "time_per_iteration": 2.3969016075134277 + }, + { + "auxiliary_loss_clip": 0.01103133, + "auxiliary_loss_mlp": 0.01071077, + "balance_loss_clip": 1.02332211, + "balance_loss_mlp": 1.02518606, + "epoch": 0.09878250413347361, + "flos": 27634906258560.0, + "grad_norm": 1.7195390827569323, + "language_loss": 0.82884479, + "learning_rate": 3.950577500259144e-06, + "loss": 0.85058689, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.78125, + "step": 1643, + "time_per_iteration": 2.4703660011291504 + }, + { + "auxiliary_loss_clip": 0.01102265, + "auxiliary_loss_mlp": 0.01071512, + "balance_loss_clip": 1.02156353, + "balance_loss_mlp": 1.02477062, + "epoch": 0.0988426273861416, + "flos": 16543802810880.0, + "grad_norm": 1.7163951577795258, + "language_loss": 0.85234201, + "learning_rate": 3.950491417930543e-06, + "loss": 0.87407976, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7734375, + "step": 1644, + "time_per_iteration": 2.3618063926696777 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.01064712, + "balance_loss_clip": 1.01807773, + "balance_loss_mlp": 1.02407956, + "epoch": 0.09890275063880956, + "flos": 21214204139520.0, + "grad_norm": 1.6231948390854696, + "language_loss": 0.70650393, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.72814161, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.75, + "step": 1645, + "time_per_iteration": 2.398422956466675 + }, + { + "auxiliary_loss_clip": 0.01029536, + "auxiliary_loss_mlp": 0.01014002, + "balance_loss_clip": 1.00618172, + "balance_loss_mlp": 1.01121688, + "epoch": 0.09896287389147752, + "flos": 59376225876480.0, + "grad_norm": 0.8510179972440951, + "language_loss": 0.61036646, + "learning_rate": 3.950319031388119e-06, + "loss": 0.6308018, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.078125, + "router_z_loss_mlp": 0.18359375, + "step": 1646, + "time_per_iteration": 2.9981093406677246 + }, + { + "auxiliary_loss_clip": 0.01100304, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.02481687, + "epoch": 0.0990229971441455, + "flos": 29641399672320.0, + "grad_norm": 1.8296214726173763, + "language_loss": 0.74999666, + "learning_rate": 3.950232727180833e-06, + "loss": 0.77169681, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7578125, + "step": 1647, + "time_per_iteration": 2.463477373123169 + }, + { + "auxiliary_loss_clip": 0.01103714, + "auxiliary_loss_mlp": 0.01071137, + "balance_loss_clip": 1.01966262, + "balance_loss_mlp": 1.02644491, + "epoch": 0.09908312039681347, + "flos": 21833807771520.0, + "grad_norm": 1.9046592328881813, + "language_loss": 0.8663798, + "learning_rate": 3.950146349020525e-06, + "loss": 0.88812834, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7734375, + "step": 1648, + "time_per_iteration": 2.4102609157562256 + }, + { + "auxiliary_loss_clip": 0.01025869, + "auxiliary_loss_mlp": 0.01010326, + "balance_loss_clip": 1.00207663, + "balance_loss_mlp": 1.00691056, + "epoch": 0.09914324364948143, + "flos": 57560951312640.0, + "grad_norm": 0.7325394945353588, + "language_loss": 0.5575493, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57791126, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.08251953, + "router_z_loss_mlp": 0.18945312, + "step": 1649, + "time_per_iteration": 2.9665117263793945 + }, + { + "auxiliary_loss_clip": 0.01098228, + "auxiliary_loss_mlp": 0.0106882, + "balance_loss_clip": 1.01975393, + "balance_loss_mlp": 1.02328897, + "epoch": 0.09920336690214941, + "flos": 34122711744000.0, + "grad_norm": 2.1235813948399356, + "language_loss": 0.93026525, + "learning_rate": 3.949973370853954e-06, + "loss": 0.95193571, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.75, + "step": 1650, + "time_per_iteration": 2.4819529056549072 + }, + { + "auxiliary_loss_clip": 0.01024179, + "auxiliary_loss_mlp": 0.01014586, + "balance_loss_clip": 1.00614572, + "balance_loss_mlp": 1.00518847, + "epoch": 0.09926349015481738, + "flos": 71212514716800.0, + "grad_norm": 0.8072824449091115, + "language_loss": 0.63901246, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65940011, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.08447266, + "router_z_loss_mlp": 0.18945312, + "step": 1651, + "time_per_iteration": 3.217590808868408 + }, + { + "auxiliary_loss_clip": 0.01100447, + "auxiliary_loss_mlp": 0.01075628, + "balance_loss_clip": 1.02467823, + "balance_loss_mlp": 1.0248729, + "epoch": 0.09932361340748534, + "flos": 23147589432960.0, + "grad_norm": 1.6063869520628244, + "language_loss": 0.89663959, + "learning_rate": 3.949800096914643e-06, + "loss": 0.91840041, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7578125, + "step": 1652, + "time_per_iteration": 2.4003946781158447 + }, + { + "auxiliary_loss_clip": 0.0110586, + "auxiliary_loss_mlp": 0.01077704, + "balance_loss_clip": 1.02565694, + "balance_loss_mlp": 1.0269829, + "epoch": 0.09938373666015332, + "flos": 19827628560000.0, + "grad_norm": 1.8222551001494267, + "language_loss": 0.83790624, + "learning_rate": 3.949713349038422e-06, + "loss": 0.85974193, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.7890625, + "step": 1653, + "time_per_iteration": 2.4091849327087402 + }, + { + "auxiliary_loss_clip": 0.01108187, + "auxiliary_loss_mlp": 0.01083704, + "balance_loss_clip": 1.02736545, + "balance_loss_mlp": 1.02973485, + "epoch": 0.09944385991282129, + "flos": 22089581458560.0, + "grad_norm": 1.833242030229131, + "language_loss": 0.81321424, + "learning_rate": 3.949626527228875e-06, + "loss": 0.83513319, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.78515625, + "step": 1654, + "time_per_iteration": 2.5021731853485107 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.0107381, + "balance_loss_clip": 1.02581692, + "balance_loss_mlp": 1.02767527, + "epoch": 0.09950398316548925, + "flos": 19827838028160.0, + "grad_norm": 1.566558572440024, + "language_loss": 0.83031702, + "learning_rate": 3.949539631489295e-06, + "loss": 0.85208207, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.75, + "step": 1655, + "time_per_iteration": 2.412677526473999 + }, + { + "auxiliary_loss_clip": 0.01104579, + "auxiliary_loss_mlp": 0.01072711, + "balance_loss_clip": 1.01930523, + "balance_loss_mlp": 1.02801538, + "epoch": 0.09956410641815722, + "flos": 25002699724800.0, + "grad_norm": 1.7761133133741258, + "language_loss": 0.83285224, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.85462517, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.765625, + "step": 1656, + "time_per_iteration": 2.4364984035491943 + }, + { + "auxiliary_loss_clip": 0.01107359, + "auxiliary_loss_mlp": 0.01077246, + "balance_loss_clip": 1.02770281, + "balance_loss_mlp": 1.0314548, + "epoch": 0.0996242296708252, + "flos": 19316709590400.0, + "grad_norm": 1.573705895283084, + "language_loss": 0.90835059, + "learning_rate": 3.949365618233217e-06, + "loss": 0.9301967, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.7578125, + "step": 1657, + "time_per_iteration": 2.4293630123138428 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01082596, + "balance_loss_clip": 1.02358711, + "balance_loss_mlp": 1.02992654, + "epoch": 0.09968435292349316, + "flos": 21870536388480.0, + "grad_norm": 1.9942871250625316, + "language_loss": 0.86624467, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.88818973, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.8203125, + "step": 1658, + "time_per_iteration": 2.4032602310180664 + }, + { + "auxiliary_loss_clip": 0.01023088, + "auxiliary_loss_mlp": 0.01012921, + "balance_loss_clip": 1.00605428, + "balance_loss_mlp": 1.0052948, + "epoch": 0.09974447617616113, + "flos": 65381636839680.0, + "grad_norm": 0.9035744283793647, + "language_loss": 0.61019313, + "learning_rate": 3.949191309296585e-06, + "loss": 0.63055325, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.17773438, + "step": 1659, + "time_per_iteration": 3.042620897293091 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01080048, + "balance_loss_clip": 1.0253787, + "balance_loss_mlp": 1.02740419, + "epoch": 0.0998045994288291, + "flos": 23658682959360.0, + "grad_norm": 1.7876464630642381, + "language_loss": 0.8787992, + "learning_rate": 3.949104043956321e-06, + "loss": 0.9006561, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.78515625, + "step": 1660, + "time_per_iteration": 2.43302059173584 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01076383, + "balance_loss_clip": 1.02590942, + "balance_loss_mlp": 1.02911949, + "epoch": 0.09986472268149707, + "flos": 19608688224000.0, + "grad_norm": 2.043085972937327, + "language_loss": 0.8189894, + "learning_rate": 3.949016704705836e-06, + "loss": 0.84080315, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7578125, + "step": 1661, + "time_per_iteration": 2.4227871894836426 + }, + { + "auxiliary_loss_clip": 0.01109526, + "auxiliary_loss_mlp": 0.01080195, + "balance_loss_clip": 1.02416635, + "balance_loss_mlp": 1.02767491, + "epoch": 0.09992484593416504, + "flos": 26212125732480.0, + "grad_norm": 1.7513440960401827, + "language_loss": 0.85802221, + "learning_rate": 3.948929291548443e-06, + "loss": 0.87991941, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8203125, + "step": 1662, + "time_per_iteration": 2.4701945781707764 + }, + { + "auxiliary_loss_clip": 0.01108106, + "auxiliary_loss_mlp": 0.01076639, + "balance_loss_clip": 1.02354312, + "balance_loss_mlp": 1.02962053, + "epoch": 0.09998496918683301, + "flos": 17492672275200.0, + "grad_norm": 1.9401008221420013, + "language_loss": 0.91297233, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.9348197, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.78515625, + "step": 1663, + "time_per_iteration": 2.4012911319732666 + }, + { + "auxiliary_loss_clip": 0.01105768, + "auxiliary_loss_mlp": 0.0108855, + "balance_loss_clip": 1.03268862, + "balance_loss_mlp": 1.02755368, + "epoch": 0.10004509243950098, + "flos": 22783794399360.0, + "grad_norm": 1.6317062477092048, + "language_loss": 0.72309226, + "learning_rate": 3.948754243526191e-06, + "loss": 0.74503541, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.78125, + "step": 1664, + "time_per_iteration": 2.441054344177246 + }, + { + "auxiliary_loss_clip": 0.01107504, + "auxiliary_loss_mlp": 0.01076544, + "balance_loss_clip": 1.02273309, + "balance_loss_mlp": 1.0297699, + "epoch": 0.10010521569216894, + "flos": 16252452581760.0, + "grad_norm": 1.9524252939564413, + "language_loss": 0.823089, + "learning_rate": 3.94866660866797e-06, + "loss": 0.84492946, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.77734375, + "step": 1665, + "time_per_iteration": 2.3911240100860596 + }, + { + "auxiliary_loss_clip": 0.01106564, + "auxiliary_loss_mlp": 0.01074543, + "balance_loss_clip": 1.02018332, + "balance_loss_mlp": 1.02831328, + "epoch": 0.10016533894483691, + "flos": 23401512817920.0, + "grad_norm": 1.6224831224515477, + "language_loss": 0.71278971, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.73460078, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.78125, + "step": 1666, + "time_per_iteration": 2.420421838760376 + }, + { + "auxiliary_loss_clip": 0.01108363, + "auxiliary_loss_mlp": 0.01082547, + "balance_loss_clip": 1.02804422, + "balance_loss_mlp": 1.03002882, + "epoch": 0.10022546219750489, + "flos": 19353158916480.0, + "grad_norm": 2.0743367313092698, + "language_loss": 0.8221736, + "learning_rate": 3.948491117273956e-06, + "loss": 0.84408271, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.78125, + "step": 1667, + "time_per_iteration": 2.406050443649292 + }, + { + "auxiliary_loss_clip": 0.01105174, + "auxiliary_loss_mlp": 0.01079178, + "balance_loss_clip": 1.02202892, + "balance_loss_mlp": 1.02810073, + "epoch": 0.10028558545017285, + "flos": 27084640320000.0, + "grad_norm": 2.962510225553142, + "language_loss": 0.81216514, + "learning_rate": 3.948403260744817e-06, + "loss": 0.83400869, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.76953125, + "step": 1668, + "time_per_iteration": 2.440906524658203 + }, + { + "auxiliary_loss_clip": 0.01105594, + "auxiliary_loss_mlp": 0.01070959, + "balance_loss_clip": 1.01667142, + "balance_loss_mlp": 1.0277009, + "epoch": 0.10034570870284082, + "flos": 25845991637760.0, + "grad_norm": 1.7095814976337, + "language_loss": 0.80399078, + "learning_rate": 3.948315330332031e-06, + "loss": 0.82575631, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.77734375, + "step": 1669, + "time_per_iteration": 2.447714328765869 + }, + { + "auxiliary_loss_clip": 0.01111113, + "auxiliary_loss_mlp": 0.01082516, + "balance_loss_clip": 1.02701187, + "balance_loss_mlp": 1.03042197, + "epoch": 0.1004058319555088, + "flos": 26248400501760.0, + "grad_norm": 2.1808067693062987, + "language_loss": 0.87943649, + "learning_rate": 3.948227326038933e-06, + "loss": 0.90137285, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8046875, + "step": 1670, + "time_per_iteration": 2.4442763328552246 + }, + { + "auxiliary_loss_clip": 0.01099476, + "auxiliary_loss_mlp": 0.01067475, + "balance_loss_clip": 1.01874185, + "balance_loss_mlp": 1.02605903, + "epoch": 0.10046595520817676, + "flos": 25373302473600.0, + "grad_norm": 1.4678212646482667, + "language_loss": 0.78069031, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.80235982, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.734375, + "step": 1671, + "time_per_iteration": 2.459282636642456 + }, + { + "auxiliary_loss_clip": 0.01026201, + "auxiliary_loss_mlp": 0.01023446, + "balance_loss_clip": 1.01643682, + "balance_loss_mlp": 1.00930071, + "epoch": 0.10052607846084473, + "flos": 67458934224000.0, + "grad_norm": 0.7830368120380419, + "language_loss": 0.60795546, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62845194, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.0703125, + "router_z_loss_mlp": 0.16894531, + "step": 1672, + "time_per_iteration": 3.003394365310669 + }, + { + "auxiliary_loss_clip": 0.0111145, + "auxiliary_loss_mlp": 0.01082955, + "balance_loss_clip": 1.02826142, + "balance_loss_mlp": 1.03193748, + "epoch": 0.10058620171351271, + "flos": 21359442862080.0, + "grad_norm": 2.0430421435278627, + "language_loss": 0.7931211, + "learning_rate": 3.947962869911147e-06, + "loss": 0.81506515, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.796875, + "step": 1673, + "time_per_iteration": 2.4184505939483643 + }, + { + "auxiliary_loss_clip": 0.01110096, + "auxiliary_loss_mlp": 0.01072848, + "balance_loss_clip": 1.0224936, + "balance_loss_mlp": 1.03155243, + "epoch": 0.10064632496618067, + "flos": 16799192472960.0, + "grad_norm": 2.1285189967702527, + "language_loss": 0.77331603, + "learning_rate": 3.947874570130197e-06, + "loss": 0.79514539, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.78515625, + "step": 1674, + "time_per_iteration": 2.3782689571380615 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01084497, + "balance_loss_clip": 1.03171098, + "balance_loss_mlp": 1.03081131, + "epoch": 0.10070644821884864, + "flos": 23623280974080.0, + "grad_norm": 1.7656979150418646, + "language_loss": 0.81593972, + "learning_rate": 3.947786196485649e-06, + "loss": 0.8378858, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.79296875, + "step": 1675, + "time_per_iteration": 2.4475479125976562 + }, + { + "auxiliary_loss_clip": 0.01106209, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_clip": 1.02975428, + "balance_loss_mlp": 1.03001475, + "epoch": 0.1007665714715166, + "flos": 24461406005760.0, + "grad_norm": 1.998815326205301, + "language_loss": 0.83522958, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8571533, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.76171875, + "step": 1676, + "time_per_iteration": 3.846339702606201 + }, + { + "auxiliary_loss_clip": 0.01111108, + "auxiliary_loss_mlp": 0.0107867, + "balance_loss_clip": 1.0241437, + "balance_loss_mlp": 1.03246546, + "epoch": 0.10082669472418458, + "flos": 16798214954880.0, + "grad_norm": 2.3958481830996132, + "language_loss": 0.87914789, + "learning_rate": 3.947609227619163e-06, + "loss": 0.90104556, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.78515625, + "step": 1677, + "time_per_iteration": 3.8398962020874023 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01085804, + "balance_loss_clip": 1.03318465, + "balance_loss_mlp": 1.03134227, + "epoch": 0.10088681797685255, + "flos": 13552653922560.0, + "grad_norm": 1.7430008580850598, + "language_loss": 0.88445795, + "learning_rate": 3.947520632403936e-06, + "loss": 0.90640783, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.77734375, + "step": 1678, + "time_per_iteration": 3.9414219856262207 + }, + { + "auxiliary_loss_clip": 0.01107858, + "auxiliary_loss_mlp": 0.01087762, + "balance_loss_clip": 1.03333139, + "balance_loss_mlp": 1.03052711, + "epoch": 0.10094694122952051, + "flos": 25264513077120.0, + "grad_norm": 2.5474008514443587, + "language_loss": 0.9212302, + "learning_rate": 3.947431963338532e-06, + "loss": 0.9431864, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.7734375, + "step": 1679, + "time_per_iteration": 2.499066114425659 + }, + { + "auxiliary_loss_clip": 0.0102739, + "auxiliary_loss_mlp": 0.01011976, + "balance_loss_clip": 1.00482333, + "balance_loss_mlp": 1.01003671, + "epoch": 0.10100706448218849, + "flos": 69850762980480.0, + "grad_norm": 0.7906405769344667, + "language_loss": 0.53072375, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55111742, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.07128906, + "router_z_loss_mlp": 0.17382812, + "step": 1680, + "time_per_iteration": 4.543371677398682 + }, + { + "auxiliary_loss_clip": 0.01102655, + "auxiliary_loss_mlp": 0.01085409, + "balance_loss_clip": 1.03419662, + "balance_loss_mlp": 1.02732265, + "epoch": 0.10106718773485646, + "flos": 20006244408960.0, + "grad_norm": 1.7271896323681721, + "language_loss": 0.78886855, + "learning_rate": 3.947254403670641e-06, + "loss": 0.81074917, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.75390625, + "step": 1681, + "time_per_iteration": 2.4518518447875977 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01082165, + "balance_loss_clip": 1.02225041, + "balance_loss_mlp": 1.02527046, + "epoch": 0.10112731098752442, + "flos": 13478987220480.0, + "grad_norm": 2.650385534651113, + "language_loss": 0.9594028, + "learning_rate": 3.947165513074889e-06, + "loss": 0.98129779, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.8203125, + "step": 1682, + "time_per_iteration": 2.4166274070739746 + }, + { + "auxiliary_loss_clip": 0.01103846, + "auxiliary_loss_mlp": 0.0109433, + "balance_loss_clip": 1.04101944, + "balance_loss_mlp": 1.0250783, + "epoch": 0.1011874342401924, + "flos": 18514894239360.0, + "grad_norm": 2.728925762999583, + "language_loss": 0.89122677, + "learning_rate": 3.947076548642425e-06, + "loss": 0.91320854, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7890625, + "step": 1683, + "time_per_iteration": 2.3865911960601807 + }, + { + "auxiliary_loss_clip": 0.01100901, + "auxiliary_loss_mlp": 0.01090019, + "balance_loss_clip": 1.03675628, + "balance_loss_mlp": 1.02476954, + "epoch": 0.10124755749286037, + "flos": 20701853804160.0, + "grad_norm": 1.7808197296605388, + "language_loss": 0.77214837, + "learning_rate": 3.946987510376624e-06, + "loss": 0.79405755, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.76171875, + "step": 1684, + "time_per_iteration": 2.394023895263672 + }, + { + "auxiliary_loss_clip": 0.01023718, + "auxiliary_loss_mlp": 0.01014093, + "balance_loss_clip": 1.0078944, + "balance_loss_mlp": 1.00592101, + "epoch": 0.10130768074552833, + "flos": 56106015557760.0, + "grad_norm": 0.7676853116422544, + "language_loss": 0.61160648, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63198459, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.06201172, + "router_z_loss_mlp": 0.17773438, + "step": 1685, + "time_per_iteration": 3.095723867416382 + }, + { + "auxiliary_loss_clip": 0.01108052, + "auxiliary_loss_mlp": 0.01085198, + "balance_loss_clip": 1.03031373, + "balance_loss_mlp": 1.029971, + "epoch": 0.1013678039981963, + "flos": 33400916962560.0, + "grad_norm": 2.111615690317149, + "language_loss": 0.63476104, + "learning_rate": 3.946809212358516e-06, + "loss": 0.65669346, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.78125, + "step": 1686, + "time_per_iteration": 2.5104753971099854 + }, + { + "auxiliary_loss_clip": 0.01108642, + "auxiliary_loss_mlp": 0.01079564, + "balance_loss_clip": 1.0259676, + "balance_loss_mlp": 1.03217244, + "epoch": 0.10142792725086427, + "flos": 31903980975360.0, + "grad_norm": 10.601570834435808, + "language_loss": 0.82949173, + "learning_rate": 3.946719952612972e-06, + "loss": 0.85137379, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.765625, + "step": 1687, + "time_per_iteration": 2.498149871826172 + }, + { + "auxiliary_loss_clip": 0.01119154, + "auxiliary_loss_mlp": 0.01081824, + "balance_loss_clip": 1.02605772, + "balance_loss_mlp": 1.03715932, + "epoch": 0.10148805050353224, + "flos": 28474637212800.0, + "grad_norm": 1.7171874218684928, + "language_loss": 0.74394321, + "learning_rate": 3.94663061904761e-06, + "loss": 0.765953, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.8203125, + "step": 1688, + "time_per_iteration": 2.472393035888672 + }, + { + "auxiliary_loss_clip": 0.01112984, + "auxiliary_loss_mlp": 0.0108364, + "balance_loss_clip": 1.02968574, + "balance_loss_mlp": 1.03614771, + "epoch": 0.1015481737562002, + "flos": 25147903536000.0, + "grad_norm": 2.016415756707416, + "language_loss": 0.89538038, + "learning_rate": 3.94654121166582e-06, + "loss": 0.9173466, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.76953125, + "step": 1689, + "time_per_iteration": 2.453439950942993 + }, + { + "auxiliary_loss_clip": 0.01110808, + "auxiliary_loss_mlp": 0.01085457, + "balance_loss_clip": 1.03174126, + "balance_loss_mlp": 1.0336895, + "epoch": 0.10160829700886818, + "flos": 30881479720320.0, + "grad_norm": 1.7074489867464142, + "language_loss": 0.89970905, + "learning_rate": 3.946451730470993e-06, + "loss": 0.92167169, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.76953125, + "step": 1690, + "time_per_iteration": 2.482327699661255 + }, + { + "auxiliary_loss_clip": 0.01117787, + "auxiliary_loss_mlp": 0.01085114, + "balance_loss_clip": 1.02901411, + "balance_loss_mlp": 1.0359695, + "epoch": 0.10166842026153615, + "flos": 20410992334080.0, + "grad_norm": 1.8633947966149866, + "language_loss": 0.85577786, + "learning_rate": 3.946362175466521e-06, + "loss": 0.87780678, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.81640625, + "step": 1691, + "time_per_iteration": 2.422517776489258 + }, + { + "auxiliary_loss_clip": 0.01119182, + "auxiliary_loss_mlp": 0.01083795, + "balance_loss_clip": 1.02745676, + "balance_loss_mlp": 1.03834581, + "epoch": 0.10172854351420411, + "flos": 33475491360000.0, + "grad_norm": 1.6245598726369788, + "language_loss": 0.68487811, + "learning_rate": 3.946272546655801e-06, + "loss": 0.70690787, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.80859375, + "step": 1692, + "time_per_iteration": 2.5340418815612793 + }, + { + "auxiliary_loss_clip": 0.0111507, + "auxiliary_loss_mlp": 0.0109333, + "balance_loss_clip": 1.03956699, + "balance_loss_mlp": 1.03449917, + "epoch": 0.1017886667668721, + "flos": 23549195335680.0, + "grad_norm": 1.8222298802354826, + "language_loss": 0.7816118, + "learning_rate": 3.94618284404223e-06, + "loss": 0.80369586, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.8046875, + "step": 1693, + "time_per_iteration": 2.4445548057556152 + }, + { + "auxiliary_loss_clip": 0.0111351, + "auxiliary_loss_mlp": 0.01099128, + "balance_loss_clip": 1.0438621, + "balance_loss_mlp": 1.033234, + "epoch": 0.10184879001954006, + "flos": 23294922837120.0, + "grad_norm": 1.7935220108264887, + "language_loss": 0.88515556, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.90728188, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.80078125, + "step": 1694, + "time_per_iteration": 2.4354090690612793 + }, + { + "auxiliary_loss_clip": 0.01118306, + "auxiliary_loss_mlp": 0.01088675, + "balance_loss_clip": 1.02806878, + "balance_loss_mlp": 1.03197527, + "epoch": 0.10190891327220802, + "flos": 18332123938560.0, + "grad_norm": 1.8072700694683024, + "language_loss": 0.81191146, + "learning_rate": 3.946003217420147e-06, + "loss": 0.83398128, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.86328125, + "step": 1695, + "time_per_iteration": 2.406522512435913 + }, + { + "auxiliary_loss_clip": 0.01112203, + "auxiliary_loss_mlp": 0.01087565, + "balance_loss_clip": 1.02724528, + "balance_loss_mlp": 1.02998638, + "epoch": 0.10196903652487599, + "flos": 26464268638080.0, + "grad_norm": 1.6456914367748285, + "language_loss": 0.87992936, + "learning_rate": 3.945913293418447e-06, + "loss": 0.90192705, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.8203125, + "step": 1696, + "time_per_iteration": 2.447374105453491 + }, + { + "auxiliary_loss_clip": 0.01105355, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_clip": 1.02831733, + "balance_loss_mlp": 1.02866495, + "epoch": 0.10202915977754397, + "flos": 21868511529600.0, + "grad_norm": 1.9037190669795645, + "language_loss": 0.83369887, + "learning_rate": 3.945823295627519e-06, + "loss": 0.8555851, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.765625, + "step": 1697, + "time_per_iteration": 2.4468212127685547 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01087715, + "balance_loss_clip": 1.02935028, + "balance_loss_mlp": 1.02713656, + "epoch": 0.10208928303021193, + "flos": 22308661440000.0, + "grad_norm": 1.8315455295002698, + "language_loss": 0.82827985, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.85023034, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.80078125, + "step": 1698, + "time_per_iteration": 2.391206741333008 + }, + { + "auxiliary_loss_clip": 0.0110694, + "auxiliary_loss_mlp": 0.01074481, + "balance_loss_clip": 1.01802373, + "balance_loss_mlp": 1.02673221, + "epoch": 0.1021494062828799, + "flos": 22124529596160.0, + "grad_norm": 2.2395514219476897, + "language_loss": 0.78339553, + "learning_rate": 3.945643078691637e-06, + "loss": 0.80520976, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.80078125, + "step": 1699, + "time_per_iteration": 2.4337379932403564 + }, + { + "auxiliary_loss_clip": 0.01104368, + "auxiliary_loss_mlp": 0.01075843, + "balance_loss_clip": 1.02222311, + "balance_loss_mlp": 1.02649045, + "epoch": 0.10220952953554788, + "flos": 19645696131840.0, + "grad_norm": 1.8037904448179025, + "language_loss": 0.81814075, + "learning_rate": 3.945552859553516e-06, + "loss": 0.83994281, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.78125, + "step": 1700, + "time_per_iteration": 2.3783936500549316 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.0107458, + "balance_loss_clip": 1.01912379, + "balance_loss_mlp": 1.02595687, + "epoch": 0.10226965278821584, + "flos": 29786044901760.0, + "grad_norm": 2.6901443094678332, + "language_loss": 0.78627449, + "learning_rate": 3.945462566639836e-06, + "loss": 0.80809319, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8125, + "step": 1701, + "time_per_iteration": 2.4773969650268555 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01084947, + "balance_loss_clip": 1.02586699, + "balance_loss_mlp": 1.02863884, + "epoch": 0.10232977604088381, + "flos": 27015581917440.0, + "grad_norm": 1.8262557935007746, + "language_loss": 0.79048133, + "learning_rate": 3.945372199954019e-06, + "loss": 0.81244218, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.828125, + "step": 1702, + "time_per_iteration": 2.461958169937134 + }, + { + "auxiliary_loss_clip": 0.01105537, + "auxiliary_loss_mlp": 0.0107428, + "balance_loss_clip": 1.02066016, + "balance_loss_mlp": 1.02783585, + "epoch": 0.10238989929355179, + "flos": 20776463112960.0, + "grad_norm": 2.4703235872792173, + "language_loss": 0.96282721, + "learning_rate": 3.945281759499494e-06, + "loss": 0.9846254, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.77734375, + "step": 1703, + "time_per_iteration": 2.4080567359924316 + }, + { + "auxiliary_loss_clip": 0.0103261, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.0239321, + "balance_loss_mlp": 1.01406717, + "epoch": 0.10245002254621975, + "flos": 57695297690880.0, + "grad_norm": 0.9258191526817878, + "language_loss": 0.55187511, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57252014, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.07958984, + "router_z_loss_mlp": 0.18554688, + "step": 1704, + "time_per_iteration": 2.939100980758667 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01084044, + "balance_loss_clip": 1.02575064, + "balance_loss_mlp": 1.03422976, + "epoch": 0.10251014579888772, + "flos": 16799192472960.0, + "grad_norm": 5.143255582877381, + "language_loss": 0.86511385, + "learning_rate": 3.945100657298039e-06, + "loss": 0.88710612, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.80859375, + "step": 1705, + "time_per_iteration": 2.387413740158081 + }, + { + "auxiliary_loss_clip": 0.01040487, + "auxiliary_loss_mlp": 0.01015939, + "balance_loss_clip": 1.00802302, + "balance_loss_mlp": 1.02154326, + "epoch": 0.1025702690515557, + "flos": 68562328832640.0, + "grad_norm": 0.7745995925100884, + "language_loss": 0.60470951, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62527382, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.18945312, + "step": 1706, + "time_per_iteration": 3.0883145332336426 + }, + { + "auxiliary_loss_clip": 0.01113822, + "auxiliary_loss_mlp": 0.01091965, + "balance_loss_clip": 1.03591275, + "balance_loss_mlp": 1.03332329, + "epoch": 0.10263039230422366, + "flos": 14865737356800.0, + "grad_norm": 2.071868684070822, + "language_loss": 0.88570905, + "learning_rate": 3.94491926006294e-06, + "loss": 0.90776688, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8046875, + "step": 1707, + "time_per_iteration": 2.407362222671509 + }, + { + "auxiliary_loss_clip": 0.01112264, + "auxiliary_loss_mlp": 0.01094323, + "balance_loss_clip": 1.045614, + "balance_loss_mlp": 1.03464532, + "epoch": 0.10269051555689163, + "flos": 25336434211200.0, + "grad_norm": 1.6415747229778852, + "language_loss": 0.74190348, + "learning_rate": 3.944828450816369e-06, + "loss": 0.7639693, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.77734375, + "step": 1708, + "time_per_iteration": 2.447716236114502 + }, + { + "auxiliary_loss_clip": 0.01112415, + "auxiliary_loss_mlp": 0.01111042, + "balance_loss_clip": 1.05611062, + "balance_loss_mlp": 1.03229928, + "epoch": 0.10275063880955959, + "flos": 21067778430720.0, + "grad_norm": 1.6774154183617775, + "language_loss": 0.92570019, + "learning_rate": 3.944737567821709e-06, + "loss": 0.94793481, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.80078125, + "step": 1709, + "time_per_iteration": 2.4175167083740234 + }, + { + "auxiliary_loss_clip": 0.01105327, + "auxiliary_loss_mlp": 0.01103785, + "balance_loss_clip": 1.05045044, + "balance_loss_mlp": 1.0304563, + "epoch": 0.10281076206222757, + "flos": 30365638248960.0, + "grad_norm": 1.8243466391873682, + "language_loss": 0.89839363, + "learning_rate": 3.944646611082406e-06, + "loss": 0.92048472, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.75, + "step": 1710, + "time_per_iteration": 2.471731424331665 + }, + { + "auxiliary_loss_clip": 0.01104399, + "auxiliary_loss_mlp": 0.01097068, + "balance_loss_clip": 1.04456818, + "balance_loss_mlp": 1.02980089, + "epoch": 0.10287088531489554, + "flos": 22417241368320.0, + "grad_norm": 1.642893144258983, + "language_loss": 0.80988669, + "learning_rate": 3.944555580601908e-06, + "loss": 0.83190137, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.74609375, + "step": 1711, + "time_per_iteration": 2.443575143814087 + }, + { + "auxiliary_loss_clip": 0.0110543, + "auxiliary_loss_mlp": 0.01099183, + "balance_loss_clip": 1.04148567, + "balance_loss_mlp": 1.02597201, + "epoch": 0.1029310085675635, + "flos": 25114910434560.0, + "grad_norm": 1.7403236661141095, + "language_loss": 0.74764544, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76969159, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.796875, + "step": 1712, + "time_per_iteration": 2.4352235794067383 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01101998, + "balance_loss_clip": 1.04546881, + "balance_loss_mlp": 1.02839458, + "epoch": 0.10299113182023148, + "flos": 19864601556480.0, + "grad_norm": 1.970794318926384, + "language_loss": 0.88408291, + "learning_rate": 3.94437329843114e-06, + "loss": 0.90615463, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.765625, + "step": 1713, + "time_per_iteration": 2.424957036972046 + }, + { + "auxiliary_loss_clip": 0.01102188, + "auxiliary_loss_mlp": 0.0108087, + "balance_loss_clip": 1.03011107, + "balance_loss_mlp": 1.02580845, + "epoch": 0.10305125507289944, + "flos": 20446603787520.0, + "grad_norm": 1.8000194211968905, + "language_loss": 0.74432325, + "learning_rate": 3.944282046747782e-06, + "loss": 0.76615381, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.765625, + "step": 1714, + "time_per_iteration": 2.427103281021118 + }, + { + "auxiliary_loss_clip": 0.01106951, + "auxiliary_loss_mlp": 0.01083233, + "balance_loss_clip": 1.02770531, + "balance_loss_mlp": 1.02668977, + "epoch": 0.10311137832556741, + "flos": 26249552576640.0, + "grad_norm": 2.155616730845291, + "language_loss": 0.92984933, + "learning_rate": 3.944190721337053e-06, + "loss": 0.95175111, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.80078125, + "step": 1715, + "time_per_iteration": 3.8688347339630127 + }, + { + "auxiliary_loss_clip": 0.01104741, + "auxiliary_loss_mlp": 0.01080761, + "balance_loss_clip": 1.0251621, + "balance_loss_mlp": 1.02737999, + "epoch": 0.10317150157823539, + "flos": 35297468904960.0, + "grad_norm": 1.7389587491597964, + "language_loss": 0.77931786, + "learning_rate": 3.944099322202418e-06, + "loss": 0.80117285, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.7734375, + "step": 1716, + "time_per_iteration": 2.542492151260376 + }, + { + "auxiliary_loss_clip": 0.01112112, + "auxiliary_loss_mlp": 0.01087352, + "balance_loss_clip": 1.02851033, + "balance_loss_mlp": 1.03216183, + "epoch": 0.10323162483090335, + "flos": 25738738341120.0, + "grad_norm": 2.0683965101923794, + "language_loss": 0.87261111, + "learning_rate": 3.944007849347342e-06, + "loss": 0.89460576, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.796875, + "step": 1717, + "time_per_iteration": 3.933896780014038 + }, + { + "auxiliary_loss_clip": 0.01111102, + "auxiliary_loss_mlp": 0.01077799, + "balance_loss_clip": 1.02301013, + "balance_loss_mlp": 1.03202367, + "epoch": 0.10329174808357132, + "flos": 16288936819200.0, + "grad_norm": 1.984285108379828, + "language_loss": 0.85239971, + "learning_rate": 3.943916302775292e-06, + "loss": 0.87428874, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.7890625, + "step": 1718, + "time_per_iteration": 3.857229709625244 + }, + { + "auxiliary_loss_clip": 0.01111893, + "auxiliary_loss_mlp": 0.01072621, + "balance_loss_clip": 1.02097917, + "balance_loss_mlp": 1.03422248, + "epoch": 0.10335187133623928, + "flos": 36685615495680.0, + "grad_norm": 1.775685402296659, + "language_loss": 0.7450586, + "learning_rate": 3.943824682489742e-06, + "loss": 0.76690376, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.77734375, + "step": 1719, + "time_per_iteration": 3.985074520111084 + }, + { + "auxiliary_loss_clip": 0.01111863, + "auxiliary_loss_mlp": 0.01076509, + "balance_loss_clip": 1.02422357, + "balance_loss_mlp": 1.03332925, + "epoch": 0.10341199458890726, + "flos": 14974771132800.0, + "grad_norm": 1.7378824337936356, + "language_loss": 0.94711894, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.9690026, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.78515625, + "step": 1720, + "time_per_iteration": 2.39813232421875 + }, + { + "auxiliary_loss_clip": 0.0111473, + "auxiliary_loss_mlp": 0.01080621, + "balance_loss_clip": 1.02576137, + "balance_loss_mlp": 1.03586745, + "epoch": 0.10347211784157523, + "flos": 21030561054720.0, + "grad_norm": 1.7121834479711338, + "language_loss": 0.81945658, + "learning_rate": 3.943641220792039e-06, + "loss": 0.8414101, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.7890625, + "step": 1721, + "time_per_iteration": 2.451406955718994 + }, + { + "auxiliary_loss_clip": 0.01114688, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_clip": 1.03541446, + "balance_loss_mlp": 1.03259182, + "epoch": 0.1035322410942432, + "flos": 19791074499840.0, + "grad_norm": 1.6744972544471992, + "language_loss": 0.82205379, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.84410751, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8203125, + "step": 1722, + "time_per_iteration": 2.4130895137786865 + }, + { + "auxiliary_loss_clip": 0.01041872, + "auxiliary_loss_mlp": 0.01020019, + "balance_loss_clip": 1.01343882, + "balance_loss_mlp": 1.02324462, + "epoch": 0.10359236434691117, + "flos": 52696014554880.0, + "grad_norm": 1.3863620069761562, + "language_loss": 0.67233062, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69294953, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.18652344, + "step": 1723, + "time_per_iteration": 2.8234076499938965 + }, + { + "auxiliary_loss_clip": 0.0110913, + "auxiliary_loss_mlp": 0.01094734, + "balance_loss_clip": 1.04204345, + "balance_loss_mlp": 1.03068423, + "epoch": 0.10365248759957914, + "flos": 18404429097600.0, + "grad_norm": 2.600924163459291, + "language_loss": 0.80372667, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.82576525, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.78515625, + "step": 1724, + "time_per_iteration": 2.3935773372650146 + }, + { + "auxiliary_loss_clip": 0.01116829, + "auxiliary_loss_mlp": 0.01089709, + "balance_loss_clip": 1.03763819, + "balance_loss_mlp": 1.03290534, + "epoch": 0.1037126108522471, + "flos": 47551878587520.0, + "grad_norm": 2.2011352645150275, + "language_loss": 0.76597255, + "learning_rate": 3.943273412987676e-06, + "loss": 0.7880379, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.8359375, + "step": 1725, + "time_per_iteration": 2.6224093437194824 + }, + { + "auxiliary_loss_clip": 0.01104009, + "auxiliary_loss_mlp": 0.01077861, + "balance_loss_clip": 1.03005838, + "balance_loss_mlp": 1.029616, + "epoch": 0.10377273410491508, + "flos": 22815670337280.0, + "grad_norm": 1.9498744616912764, + "language_loss": 0.77007508, + "learning_rate": 3.943181276805054e-06, + "loss": 0.79189378, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.74609375, + "step": 1726, + "time_per_iteration": 2.4405667781829834 + }, + { + "auxiliary_loss_clip": 0.01103183, + "auxiliary_loss_mlp": 0.01086102, + "balance_loss_clip": 1.03529525, + "balance_loss_mlp": 1.02632451, + "epoch": 0.10383285735758305, + "flos": 26137551335040.0, + "grad_norm": 1.8901683432943996, + "language_loss": 0.76808071, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.78997362, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.765625, + "step": 1727, + "time_per_iteration": 2.490493059158325 + }, + { + "auxiliary_loss_clip": 0.01100129, + "auxiliary_loss_mlp": 0.01074938, + "balance_loss_clip": 1.02315354, + "balance_loss_mlp": 1.02456856, + "epoch": 0.10389298061025101, + "flos": 17090856904320.0, + "grad_norm": 2.2252696229426134, + "language_loss": 0.86654198, + "learning_rate": 3.942996783386422e-06, + "loss": 0.88829267, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.7578125, + "step": 1728, + "time_per_iteration": 2.4056386947631836 + }, + { + "auxiliary_loss_clip": 0.0110147, + "auxiliary_loss_mlp": 0.01085257, + "balance_loss_clip": 1.03099298, + "balance_loss_mlp": 1.02684975, + "epoch": 0.10395310386291898, + "flos": 20775485594880.0, + "grad_norm": 2.1085349470638364, + "language_loss": 0.72606266, + "learning_rate": 3.942904426157406e-06, + "loss": 0.74792993, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.7421875, + "step": 1729, + "time_per_iteration": 2.426069498062134 + }, + { + "auxiliary_loss_clip": 0.0110546, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.02645743, + "balance_loss_mlp": 1.02849364, + "epoch": 0.10401322711558696, + "flos": 12819792240000.0, + "grad_norm": 2.3180731850441143, + "language_loss": 0.85371125, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.87558097, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.76953125, + "step": 1730, + "time_per_iteration": 2.344053030014038 + }, + { + "auxiliary_loss_clip": 0.01102588, + "auxiliary_loss_mlp": 0.01069443, + "balance_loss_clip": 1.02204537, + "balance_loss_mlp": 1.02747202, + "epoch": 0.10407335036825492, + "flos": 23183584911360.0, + "grad_norm": 1.6471010996759012, + "language_loss": 0.77910888, + "learning_rate": 3.942719490677489e-06, + "loss": 0.80082923, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.75, + "step": 1731, + "time_per_iteration": 2.453585147857666 + }, + { + "auxiliary_loss_clip": 0.0110041, + "auxiliary_loss_mlp": 0.01068512, + "balance_loss_clip": 1.02290297, + "balance_loss_mlp": 1.02684975, + "epoch": 0.10413347362092289, + "flos": 26102987222400.0, + "grad_norm": 1.8566926893903308, + "language_loss": 0.855335, + "learning_rate": 3.9426269124336e-06, + "loss": 0.87702429, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.734375, + "step": 1732, + "time_per_iteration": 2.450479030609131 + }, + { + "auxiliary_loss_clip": 0.0110461, + "auxiliary_loss_mlp": 0.01065566, + "balance_loss_clip": 1.02019525, + "balance_loss_mlp": 1.02991307, + "epoch": 0.10419359687359087, + "flos": 12640233784320.0, + "grad_norm": 2.1366048150256103, + "language_loss": 0.85656518, + "learning_rate": 3.942534260525104e-06, + "loss": 0.87826693, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.74609375, + "step": 1733, + "time_per_iteration": 2.3979127407073975 + }, + { + "auxiliary_loss_clip": 0.01108081, + "auxiliary_loss_mlp": 0.0107207, + "balance_loss_clip": 1.02095342, + "balance_loss_mlp": 1.02968049, + "epoch": 0.10425372012625883, + "flos": 12124427224320.0, + "grad_norm": 2.1451602273606136, + "language_loss": 0.79590678, + "learning_rate": 3.942441534955514e-06, + "loss": 0.81770831, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.78515625, + "step": 1734, + "time_per_iteration": 2.3754119873046875 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01068708, + "balance_loss_clip": 1.02119088, + "balance_loss_mlp": 1.03000927, + "epoch": 0.1043138433789268, + "flos": 25336399299840.0, + "grad_norm": 1.651585380082517, + "language_loss": 0.76703095, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.78877187, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.75390625, + "step": 1735, + "time_per_iteration": 2.441539764404297 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.010779, + "balance_loss_clip": 1.02623487, + "balance_loss_mlp": 1.03130734, + "epoch": 0.10437396663159478, + "flos": 29165917599360.0, + "grad_norm": 1.593149538973987, + "language_loss": 0.80877233, + "learning_rate": 3.94225586284712e-06, + "loss": 0.83064324, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.77734375, + "step": 1736, + "time_per_iteration": 2.4682459831237793 + }, + { + "auxiliary_loss_clip": 0.01108014, + "auxiliary_loss_mlp": 0.01086474, + "balance_loss_clip": 1.0309937, + "balance_loss_mlp": 1.03250098, + "epoch": 0.10443408988426274, + "flos": 25079822651520.0, + "grad_norm": 1.758983206856964, + "language_loss": 0.72942877, + "learning_rate": 3.942162916315356e-06, + "loss": 0.75137365, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.75390625, + "step": 1737, + "time_per_iteration": 2.4726333618164062 + }, + { + "auxiliary_loss_clip": 0.0111101, + "auxiliary_loss_mlp": 0.01083496, + "balance_loss_clip": 1.02355814, + "balance_loss_mlp": 1.02849495, + "epoch": 0.1044942131369307, + "flos": 26758481598720.0, + "grad_norm": 2.106372038071898, + "language_loss": 0.83633351, + "learning_rate": 3.942069896136581e-06, + "loss": 0.85827851, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.82421875, + "step": 1738, + "time_per_iteration": 2.4630205631256104 + }, + { + "auxiliary_loss_clip": 0.01110793, + "auxiliary_loss_mlp": 0.01091925, + "balance_loss_clip": 1.03093743, + "balance_loss_mlp": 1.02955198, + "epoch": 0.10455433638959867, + "flos": 18441576650880.0, + "grad_norm": 1.7722587618717178, + "language_loss": 0.77094793, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.79297507, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.8125, + "step": 1739, + "time_per_iteration": 2.411649465560913 + }, + { + "auxiliary_loss_clip": 0.01105986, + "auxiliary_loss_mlp": 0.01084402, + "balance_loss_clip": 1.03063858, + "balance_loss_mlp": 1.02936029, + "epoch": 0.10461445964226665, + "flos": 23217939555840.0, + "grad_norm": 2.262531061026446, + "language_loss": 0.78922421, + "learning_rate": 3.941883634852104e-06, + "loss": 0.81112808, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.765625, + "step": 1740, + "time_per_iteration": 2.430528402328491 + }, + { + "auxiliary_loss_clip": 0.01103, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_clip": 1.03047299, + "balance_loss_mlp": 1.02723455, + "epoch": 0.10467458289493461, + "flos": 24344307705600.0, + "grad_norm": 2.1227922078655226, + "language_loss": 0.8819254, + "learning_rate": 3.941790393753467e-06, + "loss": 0.9037599, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7578125, + "step": 1741, + "time_per_iteration": 2.4366374015808105 + }, + { + "auxiliary_loss_clip": 0.01104656, + "auxiliary_loss_mlp": 0.01082952, + "balance_loss_clip": 1.02623248, + "balance_loss_mlp": 1.02602303, + "epoch": 0.10473470614760258, + "flos": 21286893323520.0, + "grad_norm": 2.8649686200634403, + "language_loss": 0.78106952, + "learning_rate": 3.941697079021942e-06, + "loss": 0.80294561, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.7890625, + "step": 1742, + "time_per_iteration": 2.4112653732299805 + }, + { + "auxiliary_loss_clip": 0.01103632, + "auxiliary_loss_mlp": 0.01090508, + "balance_loss_clip": 1.0376029, + "balance_loss_mlp": 1.02753162, + "epoch": 0.10479482940027056, + "flos": 21686195076480.0, + "grad_norm": 1.9432176912097687, + "language_loss": 0.88935453, + "learning_rate": 3.94160369066107e-06, + "loss": 0.91129589, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.76171875, + "step": 1743, + "time_per_iteration": 2.458625555038452 + }, + { + "auxiliary_loss_clip": 0.01102449, + "auxiliary_loss_mlp": 0.01074502, + "balance_loss_clip": 1.0164237, + "balance_loss_mlp": 1.02577698, + "epoch": 0.10485495265293852, + "flos": 21572797380480.0, + "grad_norm": 2.1551523198318656, + "language_loss": 0.77628118, + "learning_rate": 3.941510228674391e-06, + "loss": 0.79805064, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.76953125, + "step": 1744, + "time_per_iteration": 2.39650559425354 + }, + { + "auxiliary_loss_clip": 0.01100994, + "auxiliary_loss_mlp": 0.01067006, + "balance_loss_clip": 1.01972735, + "balance_loss_mlp": 1.02608323, + "epoch": 0.10491507590560649, + "flos": 37960399301760.0, + "grad_norm": 1.9718095596353262, + "language_loss": 0.83021355, + "learning_rate": 3.941416693065451e-06, + "loss": 0.85189354, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.75, + "step": 1745, + "time_per_iteration": 2.558744430541992 + }, + { + "auxiliary_loss_clip": 0.01101938, + "auxiliary_loss_mlp": 0.01080051, + "balance_loss_clip": 1.02731323, + "balance_loss_mlp": 1.02485788, + "epoch": 0.10497519915827447, + "flos": 26395070590080.0, + "grad_norm": 1.9358127498644706, + "language_loss": 0.86234146, + "learning_rate": 3.941323083837794e-06, + "loss": 0.88416135, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.7734375, + "step": 1746, + "time_per_iteration": 2.4348719120025635 + }, + { + "auxiliary_loss_clip": 0.01103265, + "auxiliary_loss_mlp": 0.01071581, + "balance_loss_clip": 1.02261019, + "balance_loss_mlp": 1.02674568, + "epoch": 0.10503532241094243, + "flos": 40660581985920.0, + "grad_norm": 1.725517830137955, + "language_loss": 0.72621816, + "learning_rate": 3.941229400994971e-06, + "loss": 0.74796665, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.765625, + "step": 1747, + "time_per_iteration": 2.5897576808929443 + }, + { + "auxiliary_loss_clip": 0.01108677, + "auxiliary_loss_mlp": 0.01085365, + "balance_loss_clip": 1.03069568, + "balance_loss_mlp": 1.02755857, + "epoch": 0.1050954456636104, + "flos": 29788104672000.0, + "grad_norm": 2.170442113125856, + "language_loss": 0.86797142, + "learning_rate": 3.941135644540535e-06, + "loss": 0.88991189, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.8125, + "step": 1748, + "time_per_iteration": 2.4490978717803955 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01077528, + "balance_loss_clip": 1.02254891, + "balance_loss_mlp": 1.02556896, + "epoch": 0.10515556891627838, + "flos": 23947694127360.0, + "grad_norm": 1.78316055913918, + "language_loss": 0.7382955, + "learning_rate": 3.941041814478041e-06, + "loss": 0.76008803, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.76171875, + "step": 1749, + "time_per_iteration": 2.454747200012207 + }, + { + "auxiliary_loss_clip": 0.01097965, + "auxiliary_loss_mlp": 0.01072972, + "balance_loss_clip": 1.02023411, + "balance_loss_mlp": 1.02401686, + "epoch": 0.10521569216894634, + "flos": 18258631793280.0, + "grad_norm": 3.2437011177144996, + "language_loss": 0.83870012, + "learning_rate": 3.940947910811047e-06, + "loss": 0.8604095, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.73828125, + "step": 1750, + "time_per_iteration": 2.3645286560058594 + }, + { + "auxiliary_loss_clip": 0.01104413, + "auxiliary_loss_mlp": 0.0107844, + "balance_loss_clip": 1.02234054, + "balance_loss_mlp": 1.02754259, + "epoch": 0.10527581542161431, + "flos": 15630056040960.0, + "grad_norm": 2.4064263129332835, + "language_loss": 0.95029044, + "learning_rate": 3.940853933543114e-06, + "loss": 0.97211897, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.765625, + "step": 1751, + "time_per_iteration": 2.392998456954956 + }, + { + "auxiliary_loss_clip": 0.01102059, + "auxiliary_loss_mlp": 0.01077734, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.025594, + "epoch": 0.10533593867428227, + "flos": 18295569878400.0, + "grad_norm": 1.9450340763664844, + "language_loss": 0.80375457, + "learning_rate": 3.940759882677805e-06, + "loss": 0.82555246, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.765625, + "step": 1752, + "time_per_iteration": 2.38059663772583 + }, + { + "auxiliary_loss_clip": 0.01101173, + "auxiliary_loss_mlp": 0.0107712, + "balance_loss_clip": 1.02338016, + "balance_loss_mlp": 1.02617431, + "epoch": 0.10539606192695025, + "flos": 29021935685760.0, + "grad_norm": 1.7824144705570768, + "language_loss": 0.77227348, + "learning_rate": 3.940665758218686e-06, + "loss": 0.79405642, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.75, + "step": 1753, + "time_per_iteration": 2.46714186668396 + }, + { + "auxiliary_loss_clip": 0.01109271, + "auxiliary_loss_mlp": 0.01092933, + "balance_loss_clip": 1.03111124, + "balance_loss_mlp": 1.02777219, + "epoch": 0.10545618517961822, + "flos": 19968433539840.0, + "grad_norm": 1.8498816916365897, + "language_loss": 0.85656959, + "learning_rate": 3.940571560169328e-06, + "loss": 0.87859166, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.81640625, + "step": 1754, + "time_per_iteration": 2.383728265762329 + }, + { + "auxiliary_loss_clip": 0.0110962, + "auxiliary_loss_mlp": 0.01087991, + "balance_loss_clip": 1.02969813, + "balance_loss_mlp": 1.02937365, + "epoch": 0.10551630843228618, + "flos": 16142511110400.0, + "grad_norm": 2.4053003192181555, + "language_loss": 0.71640629, + "learning_rate": 3.940477288533302e-06, + "loss": 0.73838246, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.80078125, + "step": 1755, + "time_per_iteration": 3.890162229537964 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01085541, + "balance_loss_clip": 1.02860713, + "balance_loss_mlp": 1.02744937, + "epoch": 0.10557643168495416, + "flos": 23439009484800.0, + "grad_norm": 2.3103513893585466, + "language_loss": 0.79771608, + "learning_rate": 3.940382943314182e-06, + "loss": 0.81965208, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.8046875, + "step": 1756, + "time_per_iteration": 2.418246269226074 + }, + { + "auxiliary_loss_clip": 0.01105716, + "auxiliary_loss_mlp": 0.01082168, + "balance_loss_clip": 1.02807117, + "balance_loss_mlp": 1.0261395, + "epoch": 0.10563655493762213, + "flos": 21797951938560.0, + "grad_norm": 1.7831141851609307, + "language_loss": 0.81864619, + "learning_rate": 3.940288524515547e-06, + "loss": 0.84052503, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.796875, + "step": 1757, + "time_per_iteration": 3.865386962890625 + }, + { + "auxiliary_loss_clip": 0.01104999, + "auxiliary_loss_mlp": 0.01080074, + "balance_loss_clip": 1.02199554, + "balance_loss_mlp": 1.02627003, + "epoch": 0.10569667819029009, + "flos": 53798782625280.0, + "grad_norm": 1.640322340769754, + "language_loss": 0.80951768, + "learning_rate": 3.940194032140976e-06, + "loss": 0.83136839, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.58203125, + "router_z_loss_mlp": 0.7890625, + "step": 1758, + "time_per_iteration": 4.0562357902526855 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.01078501, + "balance_loss_clip": 1.02337837, + "balance_loss_mlp": 1.02605987, + "epoch": 0.10575680144295807, + "flos": 22924529556480.0, + "grad_norm": 1.7387165765084127, + "language_loss": 0.93444455, + "learning_rate": 3.940099466194054e-06, + "loss": 0.9562968, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.80859375, + "step": 1759, + "time_per_iteration": 2.4169657230377197 + }, + { + "auxiliary_loss_clip": 0.011073, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_clip": 1.0261898, + "balance_loss_mlp": 1.02522445, + "epoch": 0.10581692469562604, + "flos": 14135808228480.0, + "grad_norm": 2.900651299958461, + "language_loss": 0.7968415, + "learning_rate": 3.940004826678365e-06, + "loss": 0.81874669, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.8203125, + "step": 1760, + "time_per_iteration": 2.3788199424743652 + }, + { + "auxiliary_loss_clip": 0.01104948, + "auxiliary_loss_mlp": 0.01076679, + "balance_loss_clip": 1.01759946, + "balance_loss_mlp": 1.02429938, + "epoch": 0.105877047948294, + "flos": 25957469208960.0, + "grad_norm": 2.090556124116569, + "language_loss": 0.92227876, + "learning_rate": 3.939910113597498e-06, + "loss": 0.94409502, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.58984375, + "router_z_loss_mlp": 0.80859375, + "step": 1761, + "time_per_iteration": 2.4436850547790527 + }, + { + "auxiliary_loss_clip": 0.0110351, + "auxiliary_loss_mlp": 0.01090403, + "balance_loss_clip": 1.03380227, + "balance_loss_mlp": 1.02528715, + "epoch": 0.10593717120096197, + "flos": 30663447079680.0, + "grad_norm": 1.905793487116791, + "language_loss": 0.80323756, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.82517666, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.78125, + "step": 1762, + "time_per_iteration": 2.4932267665863037 + }, + { + "auxiliary_loss_clip": 0.01029151, + "auxiliary_loss_mlp": 0.01023788, + "balance_loss_clip": 1.0159204, + "balance_loss_mlp": 1.01038504, + "epoch": 0.10599729445362994, + "flos": 66432905055360.0, + "grad_norm": 0.7872640798473728, + "language_loss": 0.60670274, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62723213, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.07861328, + "router_z_loss_mlp": 0.1875, + "step": 1763, + "time_per_iteration": 3.200045347213745 + }, + { + "auxiliary_loss_clip": 0.01102062, + "auxiliary_loss_mlp": 0.01076692, + "balance_loss_clip": 1.02304816, + "balance_loss_mlp": 1.02437401, + "epoch": 0.10605741770629791, + "flos": 23947135545600.0, + "grad_norm": 1.7498856337444955, + "language_loss": 0.82034898, + "learning_rate": 3.939625532999763e-06, + "loss": 0.8421365, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.77734375, + "step": 1764, + "time_per_iteration": 2.4080810546875 + }, + { + "auxiliary_loss_clip": 0.01101561, + "auxiliary_loss_mlp": 0.01082916, + "balance_loss_clip": 1.02564776, + "balance_loss_mlp": 1.02468348, + "epoch": 0.10611754095896588, + "flos": 19386605865600.0, + "grad_norm": 1.6836110639743302, + "language_loss": 0.81733233, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.83917707, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.765625, + "step": 1765, + "time_per_iteration": 2.3842990398406982 + }, + { + "auxiliary_loss_clip": 0.01100262, + "auxiliary_loss_mlp": 0.01086566, + "balance_loss_clip": 1.02963209, + "balance_loss_mlp": 1.02377963, + "epoch": 0.10617766421163385, + "flos": 22236635571840.0, + "grad_norm": 1.9145436044122668, + "language_loss": 0.7815975, + "learning_rate": 3.939435444841306e-06, + "loss": 0.80346572, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.765625, + "step": 1766, + "time_per_iteration": 2.391209602355957 + }, + { + "auxiliary_loss_clip": 0.0110268, + "auxiliary_loss_mlp": 0.01084346, + "balance_loss_clip": 1.03144097, + "balance_loss_mlp": 1.02505374, + "epoch": 0.10623778746430182, + "flos": 28403100103680.0, + "grad_norm": 1.6850992759018937, + "language_loss": 0.78949219, + "learning_rate": 3.939340290444895e-06, + "loss": 0.81136245, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7734375, + "step": 1767, + "time_per_iteration": 2.449960708618164 + }, + { + "auxiliary_loss_clip": 0.01024884, + "auxiliary_loss_mlp": 0.01018669, + "balance_loss_clip": 1.00989532, + "balance_loss_mlp": 1.00626087, + "epoch": 0.10629791071696978, + "flos": 64231282719360.0, + "grad_norm": 0.7268573682080434, + "language_loss": 0.58080262, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60123819, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.18554688, + "step": 1768, + "time_per_iteration": 3.1312811374664307 + }, + { + "auxiliary_loss_clip": 0.01102663, + "auxiliary_loss_mlp": 0.01080966, + "balance_loss_clip": 1.02636826, + "balance_loss_mlp": 1.02605498, + "epoch": 0.10635803396963776, + "flos": 22746472289280.0, + "grad_norm": 1.8171339001980529, + "language_loss": 0.8795433, + "learning_rate": 3.939149761035749e-06, + "loss": 0.90137959, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.765625, + "step": 1769, + "time_per_iteration": 2.4456729888916016 + }, + { + "auxiliary_loss_clip": 0.01106284, + "auxiliary_loss_mlp": 0.0108993, + "balance_loss_clip": 1.02922893, + "balance_loss_mlp": 1.02607942, + "epoch": 0.10641815722230573, + "flos": 31394214080640.0, + "grad_norm": 1.9790553190782525, + "language_loss": 0.63328922, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.65525138, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.60546875, + "router_z_loss_mlp": 0.80078125, + "step": 1770, + "time_per_iteration": 2.4703547954559326 + }, + { + "auxiliary_loss_clip": 0.01026773, + "auxiliary_loss_mlp": 0.01018848, + "balance_loss_clip": 1.01212454, + "balance_loss_mlp": 1.00814915, + "epoch": 0.1064782804749737, + "flos": 58550077307520.0, + "grad_norm": 0.8862819607823016, + "language_loss": 0.57190061, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.5923568, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.06738281, + "router_z_loss_mlp": 0.18554688, + "step": 1771, + "time_per_iteration": 2.936384439468384 + }, + { + "auxiliary_loss_clip": 0.01102323, + "auxiliary_loss_mlp": 0.01091041, + "balance_loss_clip": 1.03372538, + "balance_loss_mlp": 1.02577662, + "epoch": 0.10653840372764166, + "flos": 23986691982720.0, + "grad_norm": 1.754576749718091, + "language_loss": 0.9086321, + "learning_rate": 3.938863415435429e-06, + "loss": 0.93056571, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.765625, + "step": 1772, + "time_per_iteration": 2.5058155059814453 + }, + { + "auxiliary_loss_clip": 0.01107295, + "auxiliary_loss_mlp": 0.01092642, + "balance_loss_clip": 1.03051054, + "balance_loss_mlp": 1.02517343, + "epoch": 0.10659852698030964, + "flos": 18293719576320.0, + "grad_norm": 2.7009871763587276, + "language_loss": 0.80025184, + "learning_rate": 3.93876781985337e-06, + "loss": 0.8222512, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.8203125, + "step": 1773, + "time_per_iteration": 2.407862663269043 + }, + { + "auxiliary_loss_clip": 0.01104662, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_clip": 1.02822495, + "balance_loss_mlp": 1.02659631, + "epoch": 0.1066586502329776, + "flos": 32159230992000.0, + "grad_norm": 2.021912391354111, + "language_loss": 0.8612777, + "learning_rate": 3.938672150753041e-06, + "loss": 0.88318115, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.78125, + "step": 1774, + "time_per_iteration": 2.516582489013672 + }, + { + "auxiliary_loss_clip": 0.01108602, + "auxiliary_loss_mlp": 0.01079221, + "balance_loss_clip": 1.02326417, + "balance_loss_mlp": 1.0266552, + "epoch": 0.10671877348564557, + "flos": 17784197061120.0, + "grad_norm": 2.738987551332105, + "language_loss": 0.80211937, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.82399762, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.8203125, + "step": 1775, + "time_per_iteration": 2.428105592727661 + }, + { + "auxiliary_loss_clip": 0.01025724, + "auxiliary_loss_mlp": 0.01022127, + "balance_loss_clip": 1.01492631, + "balance_loss_mlp": 1.0059725, + "epoch": 0.10677889673831355, + "flos": 63506695029120.0, + "grad_norm": 0.8426283761754608, + "language_loss": 0.57612145, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59659994, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.07177734, + "router_z_loss_mlp": 0.19726562, + "step": 1776, + "time_per_iteration": 3.0867412090301514 + }, + { + "auxiliary_loss_clip": 0.01104121, + "auxiliary_loss_mlp": 0.0108124, + "balance_loss_clip": 1.02392364, + "balance_loss_mlp": 1.0284574, + "epoch": 0.10683901999098151, + "flos": 22016612983680.0, + "grad_norm": 1.5670006323144838, + "language_loss": 0.84922236, + "learning_rate": 3.938384702378727e-06, + "loss": 0.87107599, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.75390625, + "step": 1777, + "time_per_iteration": 2.4592878818511963 + }, + { + "auxiliary_loss_clip": 0.01105015, + "auxiliary_loss_mlp": 0.01077855, + "balance_loss_clip": 1.02275634, + "balance_loss_mlp": 1.02937341, + "epoch": 0.10689914324364948, + "flos": 25041872136960.0, + "grad_norm": 2.253557687486277, + "language_loss": 0.89016473, + "learning_rate": 3.938288739241625e-06, + "loss": 0.91199344, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.7578125, + "step": 1778, + "time_per_iteration": 2.4878475666046143 + }, + { + "auxiliary_loss_clip": 0.01110961, + "auxiliary_loss_mlp": 0.01091808, + "balance_loss_clip": 1.03527892, + "balance_loss_mlp": 1.03283834, + "epoch": 0.10695926649631746, + "flos": 16434210453120.0, + "grad_norm": 2.3985440398511293, + "language_loss": 0.86333096, + "learning_rate": 3.938192702604417e-06, + "loss": 0.88535857, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.78125, + "step": 1779, + "time_per_iteration": 2.4623663425445557 + }, + { + "auxiliary_loss_clip": 0.01105912, + "auxiliary_loss_mlp": 0.01076924, + "balance_loss_clip": 1.02525902, + "balance_loss_mlp": 1.03076243, + "epoch": 0.10701938974898542, + "flos": 16978366903680.0, + "grad_norm": 1.9241107119409386, + "language_loss": 0.68562496, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.70745331, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.75, + "step": 1780, + "time_per_iteration": 2.430586099624634 + }, + { + "auxiliary_loss_clip": 0.01108171, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_clip": 1.02619958, + "balance_loss_mlp": 1.03187001, + "epoch": 0.10707951300165339, + "flos": 15887191271040.0, + "grad_norm": 2.2635731432969513, + "language_loss": 0.94581503, + "learning_rate": 3.938000408844265e-06, + "loss": 0.96771145, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.76171875, + "step": 1781, + "time_per_iteration": 2.4462666511535645 + }, + { + "auxiliary_loss_clip": 0.01112088, + "auxiliary_loss_mlp": 0.01091707, + "balance_loss_clip": 1.0363698, + "balance_loss_mlp": 1.03361332, + "epoch": 0.10713963625432135, + "flos": 14246273370240.0, + "grad_norm": 1.9007409248390699, + "language_loss": 0.80847782, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.8305158, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.78515625, + "step": 1782, + "time_per_iteration": 2.431711196899414 + }, + { + "auxiliary_loss_clip": 0.01112488, + "auxiliary_loss_mlp": 0.01092532, + "balance_loss_clip": 1.029351, + "balance_loss_mlp": 1.03159404, + "epoch": 0.10719975950698933, + "flos": 16756040165760.0, + "grad_norm": 2.5355711094729965, + "language_loss": 0.81853294, + "learning_rate": 3.937807821127436e-06, + "loss": 0.84058315, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.80859375, + "step": 1783, + "time_per_iteration": 2.466233730316162 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.01086229, + "balance_loss_clip": 1.03072476, + "balance_loss_mlp": 1.02974701, + "epoch": 0.1072598827596573, + "flos": 22709534204160.0, + "grad_norm": 2.2699161573974274, + "language_loss": 0.89580917, + "learning_rate": 3.937711417044395e-06, + "loss": 0.91776586, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.796875, + "step": 1784, + "time_per_iteration": 2.4875853061676025 + }, + { + "auxiliary_loss_clip": 0.01109738, + "auxiliary_loss_mlp": 0.01101137, + "balance_loss_clip": 1.0390532, + "balance_loss_mlp": 1.03087699, + "epoch": 0.10732000601232526, + "flos": 23257146879360.0, + "grad_norm": 2.4666659234288852, + "language_loss": 1.04631567, + "learning_rate": 3.937614939483143e-06, + "loss": 1.06842446, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.7890625, + "step": 1785, + "time_per_iteration": 2.463488817214966 + }, + { + "auxiliary_loss_clip": 0.01107039, + "auxiliary_loss_mlp": 0.01091889, + "balance_loss_clip": 1.03803027, + "balance_loss_mlp": 1.03054619, + "epoch": 0.10738012926499324, + "flos": 24205911609600.0, + "grad_norm": 1.361971881963877, + "language_loss": 0.85996342, + "learning_rate": 3.937518388447339e-06, + "loss": 0.8819527, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.765625, + "step": 1786, + "time_per_iteration": 2.4628236293792725 + }, + { + "auxiliary_loss_clip": 0.01108516, + "auxiliary_loss_mlp": 0.01089435, + "balance_loss_clip": 1.02954459, + "balance_loss_mlp": 1.02760863, + "epoch": 0.1074402525176612, + "flos": 20922016037760.0, + "grad_norm": 1.875708549236498, + "language_loss": 0.80072117, + "learning_rate": 3.937421763940642e-06, + "loss": 0.82270074, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.80859375, + "step": 1787, + "time_per_iteration": 2.4292001724243164 + }, + { + "auxiliary_loss_clip": 0.01108543, + "auxiliary_loss_mlp": 0.01093566, + "balance_loss_clip": 1.03176785, + "balance_loss_mlp": 1.02777445, + "epoch": 0.10750037577032917, + "flos": 16945967295360.0, + "grad_norm": 1.7860093625280076, + "language_loss": 0.86416298, + "learning_rate": 3.937325065966719e-06, + "loss": 0.8861841, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.80859375, + "step": 1788, + "time_per_iteration": 2.399644136428833 + }, + { + "auxiliary_loss_clip": 0.01103189, + "auxiliary_loss_mlp": 0.01085771, + "balance_loss_clip": 1.03060114, + "balance_loss_mlp": 1.02529407, + "epoch": 0.10756049902299715, + "flos": 20265509232000.0, + "grad_norm": 2.053981299260295, + "language_loss": 0.80566937, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.82755888, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.78125, + "step": 1789, + "time_per_iteration": 2.430898666381836 + }, + { + "auxiliary_loss_clip": 0.0110825, + "auxiliary_loss_mlp": 0.01097361, + "balance_loss_clip": 1.03241563, + "balance_loss_mlp": 1.02742267, + "epoch": 0.10762062227566511, + "flos": 23585400282240.0, + "grad_norm": 2.5226822089913132, + "language_loss": 0.78892046, + "learning_rate": 3.937131449631859e-06, + "loss": 0.81097662, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.80859375, + "step": 1790, + "time_per_iteration": 2.4127004146575928 + }, + { + "auxiliary_loss_clip": 0.01106577, + "auxiliary_loss_mlp": 0.01104172, + "balance_loss_clip": 1.03889334, + "balance_loss_mlp": 1.02771962, + "epoch": 0.10768074552833308, + "flos": 24309638858880.0, + "grad_norm": 2.1810887277185493, + "language_loss": 0.8067987, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.82890618, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.7890625, + "step": 1791, + "time_per_iteration": 2.4471685886383057 + }, + { + "auxiliary_loss_clip": 0.01099337, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_clip": 1.01885617, + "balance_loss_mlp": 1.02518487, + "epoch": 0.10774086878100106, + "flos": 25298832810240.0, + "grad_norm": 1.8081157187861745, + "language_loss": 0.73336768, + "learning_rate": 3.936937539472126e-06, + "loss": 0.7550801, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7421875, + "step": 1792, + "time_per_iteration": 2.4370737075805664 + }, + { + "auxiliary_loss_clip": 0.01105259, + "auxiliary_loss_mlp": 0.01084122, + "balance_loss_clip": 1.01989162, + "balance_loss_mlp": 1.02604699, + "epoch": 0.10780099203366902, + "flos": 22052957575680.0, + "grad_norm": 1.7162053470356557, + "language_loss": 0.78707123, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.80896509, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.7890625, + "step": 1793, + "time_per_iteration": 2.429750680923462 + }, + { + "auxiliary_loss_clip": 0.01103944, + "auxiliary_loss_mlp": 0.01080391, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.0294826, + "epoch": 0.10786111528633699, + "flos": 22746367555200.0, + "grad_norm": 2.0704457453356135, + "language_loss": 0.86846381, + "learning_rate": 3.936743335516936e-06, + "loss": 0.89030719, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.74609375, + "step": 1794, + "time_per_iteration": 2.4511470794677734 + }, + { + "auxiliary_loss_clip": 0.01113186, + "auxiliary_loss_mlp": 0.01085709, + "balance_loss_clip": 1.0249362, + "balance_loss_mlp": 1.03084242, + "epoch": 0.10792123853900495, + "flos": 20849990169600.0, + "grad_norm": 1.6622183139610038, + "language_loss": 0.77430236, + "learning_rate": 3.936646123375246e-06, + "loss": 0.79629123, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.82421875, + "step": 1795, + "time_per_iteration": 3.8470277786254883 + }, + { + "auxiliary_loss_clip": 0.01111658, + "auxiliary_loss_mlp": 0.0108933, + "balance_loss_clip": 1.02624404, + "balance_loss_mlp": 1.02906227, + "epoch": 0.10798136179167293, + "flos": 17747747735040.0, + "grad_norm": 2.43360129851574, + "language_loss": 0.85200572, + "learning_rate": 3.936548837795741e-06, + "loss": 0.87401557, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.82421875, + "step": 1796, + "time_per_iteration": 2.391935348510742 + }, + { + "auxiliary_loss_clip": 0.01112981, + "auxiliary_loss_mlp": 0.01093153, + "balance_loss_clip": 1.03331017, + "balance_loss_mlp": 1.03083777, + "epoch": 0.1080414850443409, + "flos": 13588789046400.0, + "grad_norm": 2.3434917788046796, + "language_loss": 0.77301693, + "learning_rate": 3.936451478782111e-06, + "loss": 0.79507822, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 0.8203125, + "step": 1797, + "time_per_iteration": 6.6293559074401855 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_clip": 1.02563453, + "balance_loss_mlp": 1.02958536, + "epoch": 0.10810160829700886, + "flos": 16252487493120.0, + "grad_norm": 2.120633774622022, + "language_loss": 0.83814836, + "learning_rate": 3.936354046338046e-06, + "loss": 0.86007309, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.81640625, + "step": 1798, + "time_per_iteration": 2.382735013961792 + }, + { + "auxiliary_loss_clip": 0.01110223, + "auxiliary_loss_mlp": 0.01084513, + "balance_loss_clip": 1.02905643, + "balance_loss_mlp": 1.02816665, + "epoch": 0.10816173154967684, + "flos": 15157122497280.0, + "grad_norm": 2.490878376981136, + "language_loss": 0.89662349, + "learning_rate": 3.936256540467242e-06, + "loss": 0.91857082, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.8203125, + "step": 1799, + "time_per_iteration": 2.3897128105163574 + }, + { + "auxiliary_loss_clip": 0.0110706, + "auxiliary_loss_mlp": 0.01087878, + "balance_loss_clip": 1.03144479, + "balance_loss_mlp": 1.02830184, + "epoch": 0.10822185480234481, + "flos": 17784371617920.0, + "grad_norm": 2.005832505660918, + "language_loss": 0.8002528, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.82220221, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.78515625, + "step": 1800, + "time_per_iteration": 2.392831563949585 + }, + { + "auxiliary_loss_clip": 0.01104785, + "auxiliary_loss_mlp": 0.01083678, + "balance_loss_clip": 1.0259093, + "balance_loss_mlp": 1.0265398, + "epoch": 0.10828197805501277, + "flos": 25555479281280.0, + "grad_norm": 1.585579711025779, + "language_loss": 0.7437495, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.76563412, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.78515625, + "step": 1801, + "time_per_iteration": 2.45788836479187 + }, + { + "auxiliary_loss_clip": 0.01109108, + "auxiliary_loss_mlp": 0.01082036, + "balance_loss_clip": 1.02548313, + "balance_loss_mlp": 1.02760434, + "epoch": 0.10834210130768075, + "flos": 28983217121280.0, + "grad_norm": 2.0517478025170517, + "language_loss": 0.68840164, + "learning_rate": 3.935963582331381e-06, + "loss": 0.71031308, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.8125, + "step": 1802, + "time_per_iteration": 2.458113670349121 + }, + { + "auxiliary_loss_clip": 0.01103922, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_clip": 1.03433609, + "balance_loss_mlp": 1.02603364, + "epoch": 0.10840222456034872, + "flos": 20263239993600.0, + "grad_norm": 1.68790812593393, + "language_loss": 0.83659798, + "learning_rate": 3.935865782790621e-06, + "loss": 0.85856569, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.78125, + "step": 1803, + "time_per_iteration": 2.4181807041168213 + }, + { + "auxiliary_loss_clip": 0.01101154, + "auxiliary_loss_mlp": 0.01079098, + "balance_loss_clip": 1.02402294, + "balance_loss_mlp": 1.02442694, + "epoch": 0.10846234781301668, + "flos": 19862087938560.0, + "grad_norm": 1.6038514852205523, + "language_loss": 0.92416739, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.94596988, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.765625, + "step": 1804, + "time_per_iteration": 2.3894455432891846 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01076114, + "balance_loss_clip": 1.01891756, + "balance_loss_mlp": 1.02494025, + "epoch": 0.10852247106568465, + "flos": 26467829596800.0, + "grad_norm": 1.9022023289830055, + "language_loss": 0.78340149, + "learning_rate": 3.935669963488139e-06, + "loss": 0.80519527, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.78125, + "step": 1805, + "time_per_iteration": 2.4441189765930176 + }, + { + "auxiliary_loss_clip": 0.0109866, + "auxiliary_loss_mlp": 0.01071766, + "balance_loss_clip": 1.02238917, + "balance_loss_mlp": 1.02457011, + "epoch": 0.10858259431835263, + "flos": 30080188039680.0, + "grad_norm": 1.7681877395895296, + "language_loss": 0.87335479, + "learning_rate": 3.935571943733843e-06, + "loss": 0.89505905, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.7421875, + "step": 1806, + "time_per_iteration": 2.4484691619873047 + }, + { + "auxiliary_loss_clip": 0.01105445, + "auxiliary_loss_mlp": 0.0108059, + "balance_loss_clip": 1.02589726, + "balance_loss_mlp": 1.02533329, + "epoch": 0.10864271757102059, + "flos": 19062157800960.0, + "grad_norm": 2.097358810791154, + "language_loss": 0.82492584, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.84678614, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.80078125, + "step": 1807, + "time_per_iteration": 2.3942699432373047 + }, + { + "auxiliary_loss_clip": 0.01100054, + "auxiliary_loss_mlp": 0.0107129, + "balance_loss_clip": 1.02222371, + "balance_loss_mlp": 1.02492642, + "epoch": 0.10870284082368856, + "flos": 24713967847680.0, + "grad_norm": 1.7698734313011755, + "language_loss": 0.80686378, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.82857722, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.75390625, + "step": 1808, + "time_per_iteration": 2.4194867610931396 + }, + { + "auxiliary_loss_clip": 0.01103773, + "auxiliary_loss_mlp": 0.01075431, + "balance_loss_clip": 1.02428973, + "balance_loss_mlp": 1.02712774, + "epoch": 0.10876296407635654, + "flos": 20626720824960.0, + "grad_norm": 1.728962795467947, + "language_loss": 0.80291831, + "learning_rate": 3.935277444103342e-06, + "loss": 0.82471037, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.765625, + "step": 1809, + "time_per_iteration": 2.416457414627075 + }, + { + "auxiliary_loss_clip": 0.01101859, + "auxiliary_loss_mlp": 0.01074955, + "balance_loss_clip": 1.02579308, + "balance_loss_mlp": 1.02672076, + "epoch": 0.1088230873290245, + "flos": 21578767223040.0, + "grad_norm": 2.060769033204955, + "language_loss": 0.88350594, + "learning_rate": 3.935179130783046e-06, + "loss": 0.90527409, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.75, + "step": 1810, + "time_per_iteration": 2.3957698345184326 + }, + { + "auxiliary_loss_clip": 0.01111289, + "auxiliary_loss_mlp": 0.0108179, + "balance_loss_clip": 1.02595186, + "balance_loss_mlp": 1.02955151, + "epoch": 0.10888321058169247, + "flos": 26467829596800.0, + "grad_norm": 2.4125826690210683, + "language_loss": 0.65641016, + "learning_rate": 3.935080744080564e-06, + "loss": 0.67834091, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.81640625, + "step": 1811, + "time_per_iteration": 2.448526620864868 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01084385, + "balance_loss_clip": 1.03255248, + "balance_loss_mlp": 1.02673995, + "epoch": 0.10894333383436045, + "flos": 25847423003520.0, + "grad_norm": 1.8778586847031504, + "language_loss": 0.76418281, + "learning_rate": 3.934982283999626e-06, + "loss": 0.78606355, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.76953125, + "step": 1812, + "time_per_iteration": 2.432039737701416 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01072442, + "balance_loss_clip": 1.0257597, + "balance_loss_mlp": 1.02623367, + "epoch": 0.10900345708702841, + "flos": 19536068862720.0, + "grad_norm": 2.7185472842113785, + "language_loss": 0.74803507, + "learning_rate": 3.934883750543966e-06, + "loss": 0.76978689, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.765625, + "step": 1813, + "time_per_iteration": 2.381136417388916 + }, + { + "auxiliary_loss_clip": 0.01100547, + "auxiliary_loss_mlp": 0.01074623, + "balance_loss_clip": 1.02879846, + "balance_loss_mlp": 1.02725816, + "epoch": 0.10906358033969638, + "flos": 23622163810560.0, + "grad_norm": 1.8292889523661957, + "language_loss": 0.84670591, + "learning_rate": 3.93478514371732e-06, + "loss": 0.86845762, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.734375, + "step": 1814, + "time_per_iteration": 2.5078063011169434 + }, + { + "auxiliary_loss_clip": 0.01104288, + "auxiliary_loss_mlp": 0.01076016, + "balance_loss_clip": 1.02795124, + "balance_loss_mlp": 1.02705216, + "epoch": 0.10912370359236434, + "flos": 21213680469120.0, + "grad_norm": 2.4664526514741794, + "language_loss": 0.86996579, + "learning_rate": 3.934686463523429e-06, + "loss": 0.89176887, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7734375, + "step": 1815, + "time_per_iteration": 2.4300730228424072 + }, + { + "auxiliary_loss_clip": 0.01100588, + "auxiliary_loss_mlp": 0.0107573, + "balance_loss_clip": 1.02559042, + "balance_loss_mlp": 1.02637064, + "epoch": 0.10918382684503232, + "flos": 13552339720320.0, + "grad_norm": 2.1726986316347983, + "language_loss": 0.75033742, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.77210057, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7421875, + "step": 1816, + "time_per_iteration": 2.393874406814575 + }, + { + "auxiliary_loss_clip": 0.01104792, + "auxiliary_loss_mlp": 0.01078006, + "balance_loss_clip": 1.02567279, + "balance_loss_mlp": 1.02782249, + "epoch": 0.10924395009770028, + "flos": 27963089838720.0, + "grad_norm": 2.5136614161580138, + "language_loss": 0.7655099, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.7873379, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.76953125, + "step": 1817, + "time_per_iteration": 2.4470858573913574 + }, + { + "auxiliary_loss_clip": 0.01100335, + "auxiliary_loss_mlp": 0.010728, + "balance_loss_clip": 1.02285075, + "balance_loss_mlp": 1.02527452, + "epoch": 0.10930407335036825, + "flos": 25592557011840.0, + "grad_norm": 1.8053648135546518, + "language_loss": 0.69606841, + "learning_rate": 3.934389982775706e-06, + "loss": 0.71779972, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.75, + "step": 1818, + "time_per_iteration": 2.4431748390197754 + }, + { + "auxiliary_loss_clip": 0.0110365, + "auxiliary_loss_mlp": 0.01086034, + "balance_loss_clip": 1.03539348, + "balance_loss_mlp": 1.02706945, + "epoch": 0.10936419660303623, + "flos": 18405197147520.0, + "grad_norm": 2.047310951877223, + "language_loss": 0.75600147, + "learning_rate": 3.934291009150275e-06, + "loss": 0.77789837, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.765625, + "step": 1819, + "time_per_iteration": 2.38191294670105 + }, + { + "auxiliary_loss_clip": 0.01099158, + "auxiliary_loss_mlp": 0.01071375, + "balance_loss_clip": 1.02378702, + "balance_loss_mlp": 1.02459538, + "epoch": 0.1094243198557042, + "flos": 23838974553600.0, + "grad_norm": 2.279546498192579, + "language_loss": 0.7643944, + "learning_rate": 3.934191962176335e-06, + "loss": 0.78609967, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.74609375, + "step": 1820, + "time_per_iteration": 2.446497678756714 + }, + { + "auxiliary_loss_clip": 0.01098986, + "auxiliary_loss_mlp": 0.0107538, + "balance_loss_clip": 1.02445364, + "balance_loss_mlp": 1.02423286, + "epoch": 0.10948444310837216, + "flos": 14643166239360.0, + "grad_norm": 2.3469975415300772, + "language_loss": 0.843799, + "learning_rate": 3.934092841857642e-06, + "loss": 0.86554271, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.74609375, + "step": 1821, + "time_per_iteration": 2.3553030490875244 + }, + { + "auxiliary_loss_clip": 0.01096828, + "auxiliary_loss_mlp": 0.01064266, + "balance_loss_clip": 1.01682115, + "balance_loss_mlp": 1.0230161, + "epoch": 0.10954456636104014, + "flos": 27817571825280.0, + "grad_norm": 1.8614427716209658, + "language_loss": 0.78595269, + "learning_rate": 3.933993648197955e-06, + "loss": 0.80756366, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.7421875, + "step": 1822, + "time_per_iteration": 2.4424335956573486 + }, + { + "auxiliary_loss_clip": 0.01096434, + "auxiliary_loss_mlp": 0.0106469, + "balance_loss_clip": 1.01767373, + "balance_loss_mlp": 1.02380204, + "epoch": 0.1096046896137081, + "flos": 33619508184960.0, + "grad_norm": 1.7277554588408512, + "language_loss": 0.81826091, + "learning_rate": 3.933894381201034e-06, + "loss": 0.83987218, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.7265625, + "step": 1823, + "time_per_iteration": 2.485976219177246 + }, + { + "auxiliary_loss_clip": 0.01097882, + "auxiliary_loss_mlp": 0.01064804, + "balance_loss_clip": 1.01998162, + "balance_loss_mlp": 1.02550673, + "epoch": 0.10966481286637607, + "flos": 26978783477760.0, + "grad_norm": 1.5270541878701929, + "language_loss": 0.80978745, + "learning_rate": 3.933795040870645e-06, + "loss": 0.8314144, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.7265625, + "step": 1824, + "time_per_iteration": 2.44006609916687 + }, + { + "auxiliary_loss_clip": 0.01096874, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.02169585, + "balance_loss_mlp": 1.02444458, + "epoch": 0.10972493611904403, + "flos": 23035518368640.0, + "grad_norm": 1.8924730153705107, + "language_loss": 0.8929252, + "learning_rate": 3.933695627210554e-06, + "loss": 0.91457653, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.7265625, + "step": 1825, + "time_per_iteration": 2.404723882675171 + }, + { + "auxiliary_loss_clip": 0.01095236, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.02390122, + "epoch": 0.10978505937171201, + "flos": 38103194229120.0, + "grad_norm": 1.9663823765769586, + "language_loss": 0.77945828, + "learning_rate": 3.933596140224532e-06, + "loss": 0.80111194, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.7109375, + "step": 1826, + "time_per_iteration": 2.5327086448669434 + }, + { + "auxiliary_loss_clip": 0.01028265, + "auxiliary_loss_mlp": 0.01012016, + "balance_loss_clip": 1.00524461, + "balance_loss_mlp": 1.00821185, + "epoch": 0.10984518262437998, + "flos": 59846645802240.0, + "grad_norm": 0.8678077399200091, + "language_loss": 0.55114448, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57154727, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.06787109, + "router_z_loss_mlp": 0.20117188, + "step": 1827, + "time_per_iteration": 3.0208489894866943 + }, + { + "auxiliary_loss_clip": 0.01028251, + "auxiliary_loss_mlp": 0.01009823, + "balance_loss_clip": 1.00324261, + "balance_loss_mlp": 1.00909328, + "epoch": 0.10990530587704794, + "flos": 66716295494400.0, + "grad_norm": 0.7487388422310868, + "language_loss": 0.55467093, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57505167, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.19140625, + "step": 1828, + "time_per_iteration": 3.0644068717956543 + }, + { + "auxiliary_loss_clip": 0.01103204, + "auxiliary_loss_mlp": 0.01074077, + "balance_loss_clip": 1.02439022, + "balance_loss_mlp": 1.0261848, + "epoch": 0.10996542912971592, + "flos": 25446026568960.0, + "grad_norm": 2.357445326420788, + "language_loss": 0.86911714, + "learning_rate": 3.933297239348612e-06, + "loss": 0.89088994, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.76953125, + "step": 1829, + "time_per_iteration": 2.4586398601531982 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.01088642, + "balance_loss_clip": 1.04150617, + "balance_loss_mlp": 1.02706814, + "epoch": 0.11002555238238389, + "flos": 44016503425920.0, + "grad_norm": 1.778390409450466, + "language_loss": 0.90687287, + "learning_rate": 3.933197459096614e-06, + "loss": 0.92877674, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.74609375, + "step": 1830, + "time_per_iteration": 2.5847115516662598 + }, + { + "auxiliary_loss_clip": 0.01026855, + "auxiliary_loss_mlp": 0.01016094, + "balance_loss_clip": 1.01013315, + "balance_loss_mlp": 1.00797558, + "epoch": 0.11008567563505185, + "flos": 54061781097600.0, + "grad_norm": 0.7792404114377771, + "language_loss": 0.55553281, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57596231, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.18945312, + "step": 1831, + "time_per_iteration": 3.023843288421631 + }, + { + "auxiliary_loss_clip": 0.01103685, + "auxiliary_loss_mlp": 0.01082983, + "balance_loss_clip": 1.03799391, + "balance_loss_mlp": 1.02574992, + "epoch": 0.11014579888771983, + "flos": 24242011822080.0, + "grad_norm": 2.0010826920105207, + "language_loss": 0.93781084, + "learning_rate": 3.932997678675282e-06, + "loss": 0.95967752, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.77734375, + "step": 1832, + "time_per_iteration": 2.437107801437378 + }, + { + "auxiliary_loss_clip": 0.01021277, + "auxiliary_loss_mlp": 0.01016282, + "balance_loss_clip": 1.00932062, + "balance_loss_mlp": 1.00323629, + "epoch": 0.1102059221403878, + "flos": 57740684325120.0, + "grad_norm": 0.7346182357216329, + "language_loss": 0.60079539, + "learning_rate": 3.932897678513523e-06, + "loss": 0.621171, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.06982422, + "router_z_loss_mlp": 0.1796875, + "step": 1833, + "time_per_iteration": 3.0854196548461914 + }, + { + "auxiliary_loss_clip": 0.01095056, + "auxiliary_loss_mlp": 0.01083554, + "balance_loss_clip": 1.03744376, + "balance_loss_mlp": 1.02258062, + "epoch": 0.11026604539305576, + "flos": 16795107843840.0, + "grad_norm": 3.1958613139616094, + "language_loss": 0.84289914, + "learning_rate": 3.93279760505609e-06, + "loss": 0.86468518, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7265625, + "step": 1834, + "time_per_iteration": 3.814936876296997 + }, + { + "auxiliary_loss_clip": 0.01104762, + "auxiliary_loss_mlp": 0.01082172, + "balance_loss_clip": 1.03160393, + "balance_loss_mlp": 1.0299623, + "epoch": 0.11032616864572373, + "flos": 23986936362240.0, + "grad_norm": 2.4030834456948242, + "language_loss": 0.93920034, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9610697, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.75, + "step": 1835, + "time_per_iteration": 2.431258201599121 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.01077761, + "balance_loss_clip": 1.02962446, + "balance_loss_mlp": 1.02722776, + "epoch": 0.1103862918983917, + "flos": 19682110546560.0, + "grad_norm": 2.1275767017001606, + "language_loss": 0.67645168, + "learning_rate": 3.932597238269386e-06, + "loss": 0.6982466, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.74609375, + "step": 1836, + "time_per_iteration": 3.8317441940307617 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01072603, + "balance_loss_clip": 1.02692163, + "balance_loss_mlp": 1.02717841, + "epoch": 0.11044641515105967, + "flos": 32159510282880.0, + "grad_norm": 2.145617527556642, + "language_loss": 0.75259566, + "learning_rate": 3.932496944947711e-06, + "loss": 0.7743237, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.73046875, + "step": 1837, + "time_per_iteration": 4.054734468460083 + }, + { + "auxiliary_loss_clip": 0.01104348, + "auxiliary_loss_mlp": 0.01065398, + "balance_loss_clip": 1.02143335, + "balance_loss_mlp": 1.02987385, + "epoch": 0.11050653840372764, + "flos": 16688343306240.0, + "grad_norm": 2.1565137843797135, + "language_loss": 0.80223441, + "learning_rate": 3.93239657834556e-06, + "loss": 0.82393193, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.74609375, + "step": 1838, + "time_per_iteration": 2.3821582794189453 + }, + { + "auxiliary_loss_clip": 0.01104585, + "auxiliary_loss_mlp": 0.01076735, + "balance_loss_clip": 1.02993393, + "balance_loss_mlp": 1.03211856, + "epoch": 0.11056666165639562, + "flos": 21207989917440.0, + "grad_norm": 2.0813863834827484, + "language_loss": 0.72923666, + "learning_rate": 3.932296138466736e-06, + "loss": 0.75104982, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.72265625, + "step": 1839, + "time_per_iteration": 2.420905590057373 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01065261, + "balance_loss_clip": 1.01755381, + "balance_loss_mlp": 1.03310728, + "epoch": 0.11062678490906358, + "flos": 19164663152640.0, + "grad_norm": 2.4166196259860775, + "language_loss": 0.81130153, + "learning_rate": 3.93219562531505e-06, + "loss": 0.83305889, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.7734375, + "step": 1840, + "time_per_iteration": 2.407621145248413 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01061593, + "balance_loss_clip": 1.01839209, + "balance_loss_mlp": 1.03393877, + "epoch": 0.11068690816173155, + "flos": 24894259441920.0, + "grad_norm": 2.0426768268797812, + "language_loss": 0.89381814, + "learning_rate": 3.932095038894311e-06, + "loss": 0.91548836, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.71484375, + "step": 1841, + "time_per_iteration": 2.4570319652557373 + }, + { + "auxiliary_loss_clip": 0.01105203, + "auxiliary_loss_mlp": 0.01058487, + "balance_loss_clip": 1.01416469, + "balance_loss_mlp": 1.03446412, + "epoch": 0.11074703141439952, + "flos": 16471427829120.0, + "grad_norm": 1.8489573155321657, + "language_loss": 0.93145823, + "learning_rate": 3.931994379208334e-06, + "loss": 0.95309508, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.703125, + "step": 1842, + "time_per_iteration": 2.3950109481811523 + }, + { + "auxiliary_loss_clip": 0.01107564, + "auxiliary_loss_mlp": 0.01063946, + "balance_loss_clip": 1.02043509, + "balance_loss_mlp": 1.0356915, + "epoch": 0.11080715466706749, + "flos": 19171401045120.0, + "grad_norm": 1.7782385739968325, + "language_loss": 0.8774333, + "learning_rate": 3.931893646260937e-06, + "loss": 0.89914834, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.71875, + "step": 1843, + "time_per_iteration": 2.424804210662842 + }, + { + "auxiliary_loss_clip": 0.011083, + "auxiliary_loss_mlp": 0.01073619, + "balance_loss_clip": 1.0271517, + "balance_loss_mlp": 1.03437555, + "epoch": 0.11086727791973545, + "flos": 27703580636160.0, + "grad_norm": 1.673495552347332, + "language_loss": 0.76399124, + "learning_rate": 3.931792840055941e-06, + "loss": 0.78581047, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.73828125, + "step": 1844, + "time_per_iteration": 2.5054471492767334 + }, + { + "auxiliary_loss_clip": 0.01107511, + "auxiliary_loss_mlp": 0.01067027, + "balance_loss_clip": 1.02103651, + "balance_loss_mlp": 1.03420258, + "epoch": 0.11092740117240343, + "flos": 18513986544000.0, + "grad_norm": 2.1790993874468616, + "language_loss": 0.78027058, + "learning_rate": 3.931691960597165e-06, + "loss": 0.80201602, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.73046875, + "step": 1845, + "time_per_iteration": 2.4181275367736816 + }, + { + "auxiliary_loss_clip": 0.01104626, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.02362001, + "balance_loss_mlp": 1.03229225, + "epoch": 0.1109875244250714, + "flos": 20521387653120.0, + "grad_norm": 1.5254687551044002, + "language_loss": 0.78269351, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.80444372, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.7265625, + "step": 1846, + "time_per_iteration": 2.4165163040161133 + }, + { + "auxiliary_loss_clip": 0.0110536, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_clip": 1.02299798, + "balance_loss_mlp": 1.03062904, + "epoch": 0.11104764767773936, + "flos": 14097787891200.0, + "grad_norm": 2.573279062157269, + "language_loss": 0.89371645, + "learning_rate": 3.931489981933584e-06, + "loss": 0.91549039, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.75, + "step": 1847, + "time_per_iteration": 2.3723669052124023 + }, + { + "auxiliary_loss_clip": 0.01101728, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_clip": 1.02374923, + "balance_loss_mlp": 1.02788234, + "epoch": 0.11110777093040733, + "flos": 20593483344000.0, + "grad_norm": 1.8875165049366052, + "language_loss": 0.7923075, + "learning_rate": 3.931388882736438e-06, + "loss": 0.81401581, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.73828125, + "step": 1848, + "time_per_iteration": 2.4484691619873047 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.01067607, + "balance_loss_clip": 1.02385712, + "balance_loss_mlp": 1.02787519, + "epoch": 0.11116789418307531, + "flos": 21869035200000.0, + "grad_norm": 1.8282645621155653, + "language_loss": 0.79645181, + "learning_rate": 3.931287710300832e-06, + "loss": 0.81810129, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6953125, + "step": 1849, + "time_per_iteration": 2.40321683883667 + }, + { + "auxiliary_loss_clip": 0.01100239, + "auxiliary_loss_mlp": 0.01074533, + "balance_loss_clip": 1.02611017, + "balance_loss_mlp": 1.02469242, + "epoch": 0.11122801743574327, + "flos": 15522209251200.0, + "grad_norm": 2.4313512144389353, + "language_loss": 0.74787056, + "learning_rate": 3.931186464630601e-06, + "loss": 0.76961827, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.75390625, + "step": 1850, + "time_per_iteration": 2.3929550647735596 + }, + { + "auxiliary_loss_clip": 0.01099535, + "auxiliary_loss_mlp": 0.01070649, + "balance_loss_clip": 1.02441978, + "balance_loss_mlp": 1.02484059, + "epoch": 0.11128814068841124, + "flos": 14391407358720.0, + "grad_norm": 2.8999611090764046, + "language_loss": 0.84610397, + "learning_rate": 3.931085145729588e-06, + "loss": 0.86780584, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.74609375, + "step": 1851, + "time_per_iteration": 2.3492844104766846 + }, + { + "auxiliary_loss_clip": 0.01095434, + "auxiliary_loss_mlp": 0.0106692, + "balance_loss_clip": 1.02309883, + "balance_loss_mlp": 1.02511406, + "epoch": 0.11134826394107922, + "flos": 16653011143680.0, + "grad_norm": 2.334413820695967, + "language_loss": 0.9049499, + "learning_rate": 3.930983753601631e-06, + "loss": 0.92657351, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.703125, + "step": 1852, + "time_per_iteration": 2.3694043159484863 + }, + { + "auxiliary_loss_clip": 0.01098553, + "auxiliary_loss_mlp": 0.01074721, + "balance_loss_clip": 1.02596474, + "balance_loss_mlp": 1.025208, + "epoch": 0.11140838719374718, + "flos": 16690053962880.0, + "grad_norm": 1.9403445925353926, + "language_loss": 0.74049991, + "learning_rate": 3.930882288250578e-06, + "loss": 0.76223266, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.734375, + "step": 1853, + "time_per_iteration": 2.379237651824951 + }, + { + "auxiliary_loss_clip": 0.01024428, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.02666807, + "balance_loss_mlp": 1.00678837, + "epoch": 0.11146851044641515, + "flos": 60973397976960.0, + "grad_norm": 0.8069928914208974, + "language_loss": 0.53854942, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55912519, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.06494141, + "router_z_loss_mlp": 0.17578125, + "step": 1854, + "time_per_iteration": 2.990219831466675 + }, + { + "auxiliary_loss_clip": 0.01106473, + "auxiliary_loss_mlp": 0.01068632, + "balance_loss_clip": 1.01911259, + "balance_loss_mlp": 1.02787971, + "epoch": 0.11152863369908313, + "flos": 22192924682880.0, + "grad_norm": 3.8747024588761905, + "language_loss": 0.87201589, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.89376694, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.78515625, + "step": 1855, + "time_per_iteration": 2.415235996246338 + }, + { + "auxiliary_loss_clip": 0.01099131, + "auxiliary_loss_mlp": 0.01073064, + "balance_loss_clip": 1.02757335, + "balance_loss_mlp": 1.02733386, + "epoch": 0.11158875695175109, + "flos": 19536487799040.0, + "grad_norm": 2.4871528569716626, + "language_loss": 0.84162831, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.86335027, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.71875, + "step": 1856, + "time_per_iteration": 2.4062297344207764 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01068074, + "balance_loss_clip": 1.02148652, + "balance_loss_mlp": 1.02964246, + "epoch": 0.11164888020441906, + "flos": 25441662648960.0, + "grad_norm": 1.605153216250298, + "language_loss": 0.84211034, + "learning_rate": 3.93047569469238e-06, + "loss": 0.86379945, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.7109375, + "step": 1857, + "time_per_iteration": 2.4475197792053223 + }, + { + "auxiliary_loss_clip": 0.01106089, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_clip": 1.0191915, + "balance_loss_mlp": 1.03082228, + "epoch": 0.11170900345708702, + "flos": 15631836520320.0, + "grad_norm": 2.0495630928987607, + "language_loss": 0.86022991, + "learning_rate": 3.930373863283608e-06, + "loss": 0.88192928, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.75390625, + "step": 1858, + "time_per_iteration": 2.403580904006958 + }, + { + "auxiliary_loss_clip": 0.01104848, + "auxiliary_loss_mlp": 0.01080399, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.03090286, + "epoch": 0.111769126709755, + "flos": 23038311277440.0, + "grad_norm": 2.2432384240293497, + "language_loss": 0.94252133, + "learning_rate": 3.930271958674866e-06, + "loss": 0.96437383, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7421875, + "step": 1859, + "time_per_iteration": 2.4300200939178467 + }, + { + "auxiliary_loss_clip": 0.01105106, + "auxiliary_loss_mlp": 0.01082846, + "balance_loss_clip": 1.0347805, + "balance_loss_mlp": 1.0297296, + "epoch": 0.11182924996242297, + "flos": 20849641056000.0, + "grad_norm": 2.044764732911658, + "language_loss": 0.852723, + "learning_rate": 3.930169980870018e-06, + "loss": 0.87460256, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.75390625, + "step": 1860, + "time_per_iteration": 2.4387805461883545 + }, + { + "auxiliary_loss_clip": 0.01104898, + "auxiliary_loss_mlp": 0.01078633, + "balance_loss_clip": 1.03218937, + "balance_loss_mlp": 1.03237712, + "epoch": 0.11188937321509093, + "flos": 17454407558400.0, + "grad_norm": 1.8532894520242558, + "language_loss": 0.78312159, + "learning_rate": 3.930067929872931e-06, + "loss": 0.80495703, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.72265625, + "step": 1861, + "time_per_iteration": 2.412659168243408 + }, + { + "auxiliary_loss_clip": 0.01102847, + "auxiliary_loss_mlp": 0.01074804, + "balance_loss_clip": 1.03210342, + "balance_loss_mlp": 1.03050554, + "epoch": 0.11194949646775891, + "flos": 24094818063360.0, + "grad_norm": 1.9076652727283643, + "language_loss": 0.91204727, + "learning_rate": 3.929965805687474e-06, + "loss": 0.9338237, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.72265625, + "step": 1862, + "time_per_iteration": 2.4636106491088867 + }, + { + "auxiliary_loss_clip": 0.01104875, + "auxiliary_loss_mlp": 0.0107759, + "balance_loss_clip": 1.0315752, + "balance_loss_mlp": 1.02988827, + "epoch": 0.11200961972042688, + "flos": 25152756215040.0, + "grad_norm": 2.523365775097726, + "language_loss": 0.89980108, + "learning_rate": 3.92986360831752e-06, + "loss": 0.92162573, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.75, + "step": 1863, + "time_per_iteration": 2.4306387901306152 + }, + { + "auxiliary_loss_clip": 0.01102974, + "auxiliary_loss_mlp": 0.01069934, + "balance_loss_clip": 1.01957941, + "balance_loss_mlp": 1.027457, + "epoch": 0.11206974297309484, + "flos": 21287242437120.0, + "grad_norm": 2.0566667389338744, + "language_loss": 0.65794426, + "learning_rate": 3.929761337766945e-06, + "loss": 0.67967331, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.75390625, + "step": 1864, + "time_per_iteration": 2.447702646255493 + }, + { + "auxiliary_loss_clip": 0.01102412, + "auxiliary_loss_mlp": 0.0107741, + "balance_loss_clip": 1.03058505, + "balance_loss_mlp": 1.0282352, + "epoch": 0.11212986622576282, + "flos": 18914998953600.0, + "grad_norm": 2.0821785378618505, + "language_loss": 0.76102626, + "learning_rate": 3.929658994039627e-06, + "loss": 0.78282446, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.7421875, + "step": 1865, + "time_per_iteration": 2.405714511871338 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01078346, + "balance_loss_clip": 1.02529812, + "balance_loss_mlp": 1.0248909, + "epoch": 0.11218998947843078, + "flos": 22053655802880.0, + "grad_norm": 13.303501424570783, + "language_loss": 0.8824231, + "learning_rate": 3.929556577139446e-06, + "loss": 0.90419436, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.73828125, + "step": 1866, + "time_per_iteration": 2.4739158153533936 + }, + { + "auxiliary_loss_clip": 0.01099612, + "auxiliary_loss_mlp": 0.01077632, + "balance_loss_clip": 1.02622902, + "balance_loss_mlp": 1.0240593, + "epoch": 0.11225011273109875, + "flos": 24570544515840.0, + "grad_norm": 2.8087754590914193, + "language_loss": 0.83187294, + "learning_rate": 3.929454087070286e-06, + "loss": 0.85364538, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.75390625, + "step": 1867, + "time_per_iteration": 2.453341484069824 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01073689, + "balance_loss_clip": 1.025648, + "balance_loss_mlp": 1.02437031, + "epoch": 0.11231023598376672, + "flos": 28437419836800.0, + "grad_norm": 2.9863625959634277, + "language_loss": 0.89498687, + "learning_rate": 3.929351523836035e-06, + "loss": 0.91670704, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7421875, + "step": 1868, + "time_per_iteration": 2.494337558746338 + }, + { + "auxiliary_loss_clip": 0.01096515, + "auxiliary_loss_mlp": 0.01071824, + "balance_loss_clip": 1.0233295, + "balance_loss_mlp": 1.02432346, + "epoch": 0.1123703592364347, + "flos": 14425657269120.0, + "grad_norm": 2.2671150608784596, + "language_loss": 0.70128286, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.72296625, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.72265625, + "step": 1869, + "time_per_iteration": 2.403276205062866 + }, + { + "auxiliary_loss_clip": 0.0110261, + "auxiliary_loss_mlp": 0.01073153, + "balance_loss_clip": 1.02091527, + "balance_loss_mlp": 1.02499545, + "epoch": 0.11243048248910266, + "flos": 22235204206080.0, + "grad_norm": 1.7999289715022688, + "language_loss": 0.79256278, + "learning_rate": 3.929146177887814e-06, + "loss": 0.81432039, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.77734375, + "step": 1870, + "time_per_iteration": 2.4106569290161133 + }, + { + "auxiliary_loss_clip": 0.01100605, + "auxiliary_loss_mlp": 0.01078561, + "balance_loss_clip": 1.02658582, + "balance_loss_mlp": 1.02378428, + "epoch": 0.11249060574177062, + "flos": 18583289326080.0, + "grad_norm": 1.8754945887567247, + "language_loss": 0.78173065, + "learning_rate": 3.929043395181631e-06, + "loss": 0.80352229, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.76953125, + "step": 1871, + "time_per_iteration": 2.3714852333068848 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01075277, + "balance_loss_clip": 1.02675927, + "balance_loss_mlp": 1.02688217, + "epoch": 0.1125507289944386, + "flos": 22855471153920.0, + "grad_norm": 2.109045831003895, + "language_loss": 0.84588909, + "learning_rate": 3.928940539325929e-06, + "loss": 0.86765969, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.75, + "step": 1872, + "time_per_iteration": 2.426419258117676 + }, + { + "auxiliary_loss_clip": 0.01100046, + "auxiliary_loss_mlp": 0.01070506, + "balance_loss_clip": 1.01977015, + "balance_loss_mlp": 1.02552271, + "epoch": 0.11261085224710657, + "flos": 19675547210880.0, + "grad_norm": 2.302261445487002, + "language_loss": 0.84636939, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.86807501, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7421875, + "step": 1873, + "time_per_iteration": 3.8020644187927246 + }, + { + "auxiliary_loss_clip": 0.01104129, + "auxiliary_loss_mlp": 0.01080662, + "balance_loss_clip": 1.02148616, + "balance_loss_mlp": 1.02533126, + "epoch": 0.11267097549977453, + "flos": 26062173976320.0, + "grad_norm": 1.8693041633854317, + "language_loss": 0.93537807, + "learning_rate": 3.928734608181575e-06, + "loss": 0.95722592, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 0.7890625, + "step": 1874, + "time_per_iteration": 2.450397491455078 + }, + { + "auxiliary_loss_clip": 0.01098637, + "auxiliary_loss_mlp": 0.01069755, + "balance_loss_clip": 1.01789856, + "balance_loss_mlp": 1.02503562, + "epoch": 0.11273109875244251, + "flos": 21067010380800.0, + "grad_norm": 1.5444510406006333, + "language_loss": 0.76353008, + "learning_rate": 3.928631532900729e-06, + "loss": 0.78521401, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.734375, + "step": 1875, + "time_per_iteration": 2.4390599727630615 + }, + { + "auxiliary_loss_clip": 0.0109686, + "auxiliary_loss_mlp": 0.01072392, + "balance_loss_clip": 1.02878559, + "balance_loss_mlp": 1.02631259, + "epoch": 0.11279122200511048, + "flos": 27087782342400.0, + "grad_norm": 1.86897390320642, + "language_loss": 0.74271333, + "learning_rate": 3.928528384485984e-06, + "loss": 0.76440585, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.70703125, + "step": 1876, + "time_per_iteration": 3.879782199859619 + }, + { + "auxiliary_loss_clip": 0.01099706, + "auxiliary_loss_mlp": 0.01066436, + "balance_loss_clip": 1.01941967, + "balance_loss_mlp": 1.02761054, + "epoch": 0.11285134525777844, + "flos": 20187024762240.0, + "grad_norm": 1.8917786796660516, + "language_loss": 0.78641903, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.80808043, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.71875, + "step": 1877, + "time_per_iteration": 5.265125036239624 + }, + { + "auxiliary_loss_clip": 0.01101101, + "auxiliary_loss_mlp": 0.01069747, + "balance_loss_clip": 1.01853502, + "balance_loss_mlp": 1.02697456, + "epoch": 0.11291146851044641, + "flos": 12457638040320.0, + "grad_norm": 2.1759545031813112, + "language_loss": 0.90824407, + "learning_rate": 3.928321868270436e-06, + "loss": 0.92995256, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.7421875, + "step": 1878, + "time_per_iteration": 2.5670459270477295 + }, + { + "auxiliary_loss_clip": 0.01097836, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_clip": 1.0203259, + "balance_loss_mlp": 1.02607751, + "epoch": 0.11297159176311439, + "flos": 23841173969280.0, + "grad_norm": 1.9428841447663734, + "language_loss": 0.84095526, + "learning_rate": 3.928218500477466e-06, + "loss": 0.86261284, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.71875, + "step": 1879, + "time_per_iteration": 2.4591658115386963 + }, + { + "auxiliary_loss_clip": 0.01100415, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_clip": 1.02478814, + "balance_loss_mlp": 1.0252378, + "epoch": 0.11303171501578235, + "flos": 29929363499520.0, + "grad_norm": 1.8849349429999354, + "language_loss": 0.72397721, + "learning_rate": 3.928115059566259e-06, + "loss": 0.74576426, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.75, + "step": 1880, + "time_per_iteration": 2.5262112617492676 + }, + { + "auxiliary_loss_clip": 0.01096647, + "auxiliary_loss_mlp": 0.01069077, + "balance_loss_clip": 1.02127421, + "balance_loss_mlp": 1.02491975, + "epoch": 0.11309183826845032, + "flos": 16179623752320.0, + "grad_norm": 1.590189952746905, + "language_loss": 0.73756194, + "learning_rate": 3.928011545540734e-06, + "loss": 0.75921917, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.71875, + "step": 1881, + "time_per_iteration": 2.40313720703125 + }, + { + "auxiliary_loss_clip": 0.01099009, + "auxiliary_loss_mlp": 0.01070966, + "balance_loss_clip": 1.02261496, + "balance_loss_mlp": 1.02470291, + "epoch": 0.1131519615211183, + "flos": 12019897013760.0, + "grad_norm": 2.13825440647513, + "language_loss": 0.76745653, + "learning_rate": 3.927907958404819e-06, + "loss": 0.78915632, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7421875, + "step": 1882, + "time_per_iteration": 2.4102942943573 + }, + { + "auxiliary_loss_clip": 0.01097094, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.01936352, + "balance_loss_mlp": 1.02425206, + "epoch": 0.11321208477378626, + "flos": 26248924172160.0, + "grad_norm": 2.028690310013846, + "language_loss": 0.82195008, + "learning_rate": 3.92780429816244e-06, + "loss": 0.84357095, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.7265625, + "step": 1883, + "time_per_iteration": 2.461684226989746 + }, + { + "auxiliary_loss_clip": 0.01098811, + "auxiliary_loss_mlp": 0.01076146, + "balance_loss_clip": 1.02283561, + "balance_loss_mlp": 1.02347541, + "epoch": 0.11327220802645423, + "flos": 13625517663360.0, + "grad_norm": 1.8184585007930745, + "language_loss": 0.79082727, + "learning_rate": 3.927700564817529e-06, + "loss": 0.81257689, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.75, + "step": 1884, + "time_per_iteration": 2.4012253284454346 + }, + { + "auxiliary_loss_clip": 0.01034132, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.02719951, + "balance_loss_mlp": 1.01575351, + "epoch": 0.1133323312791222, + "flos": 57188672818560.0, + "grad_norm": 0.8346576611494941, + "language_loss": 0.55341917, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57409185, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.05932617, + "router_z_loss_mlp": 0.18359375, + "step": 1885, + "time_per_iteration": 2.9715123176574707 + }, + { + "auxiliary_loss_clip": 0.01092564, + "auxiliary_loss_mlp": 0.01066852, + "balance_loss_clip": 1.02133799, + "balance_loss_mlp": 1.02258849, + "epoch": 0.11339245453179017, + "flos": 24350591750400.0, + "grad_norm": 2.1762888962388653, + "language_loss": 0.924577, + "learning_rate": 3.927492878835848e-06, + "loss": 0.94617116, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.69921875, + "step": 1886, + "time_per_iteration": 2.5209672451019287 + }, + { + "auxiliary_loss_clip": 0.01096971, + "auxiliary_loss_mlp": 0.01073669, + "balance_loss_clip": 1.02577114, + "balance_loss_mlp": 1.0243454, + "epoch": 0.11345257778445814, + "flos": 22669698476160.0, + "grad_norm": 1.6569463502935136, + "language_loss": 0.86806983, + "learning_rate": 3.927388926206953e-06, + "loss": 0.88977629, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.7265625, + "step": 1887, + "time_per_iteration": 2.4785964488983154 + }, + { + "auxiliary_loss_clip": 0.01097146, + "auxiliary_loss_mlp": 0.01070763, + "balance_loss_clip": 1.02448606, + "balance_loss_mlp": 1.02490258, + "epoch": 0.11351270103712612, + "flos": 20987408747520.0, + "grad_norm": 2.460425572651278, + "language_loss": 0.80043852, + "learning_rate": 3.927284900491277e-06, + "loss": 0.82211769, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.72265625, + "step": 1888, + "time_per_iteration": 2.492938756942749 + }, + { + "auxiliary_loss_clip": 0.01099098, + "auxiliary_loss_mlp": 0.01077062, + "balance_loss_clip": 1.02666032, + "balance_loss_mlp": 1.02547514, + "epoch": 0.11357282428979408, + "flos": 37346241841920.0, + "grad_norm": 1.9772865667537864, + "language_loss": 0.69375616, + "learning_rate": 3.927180801692764e-06, + "loss": 0.71551776, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.734375, + "step": 1889, + "time_per_iteration": 2.601879596710205 + }, + { + "auxiliary_loss_clip": 0.01096481, + "auxiliary_loss_mlp": 0.01070475, + "balance_loss_clip": 1.01985884, + "balance_loss_mlp": 1.0249536, + "epoch": 0.11363294754246205, + "flos": 21756091351680.0, + "grad_norm": 1.6606015685371962, + "language_loss": 0.85298574, + "learning_rate": 3.927076629815362e-06, + "loss": 0.87465525, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.71484375, + "step": 1890, + "time_per_iteration": 2.4335434436798096 + }, + { + "auxiliary_loss_clip": 0.01095659, + "auxiliary_loss_mlp": 0.01071225, + "balance_loss_clip": 1.02623594, + "balance_loss_mlp": 1.02563465, + "epoch": 0.11369307079513001, + "flos": 22600535339520.0, + "grad_norm": 2.6521065676607343, + "language_loss": 0.68373477, + "learning_rate": 3.926972384863022e-06, + "loss": 0.70540369, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.69921875, + "step": 1891, + "time_per_iteration": 2.396352529525757 + }, + { + "auxiliary_loss_clip": 0.01101738, + "auxiliary_loss_mlp": 0.01070181, + "balance_loss_clip": 1.02244902, + "balance_loss_mlp": 1.02642405, + "epoch": 0.11375319404779799, + "flos": 21943190661120.0, + "grad_norm": 2.0437717535521975, + "language_loss": 0.90590245, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.9276216, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.75390625, + "step": 1892, + "time_per_iteration": 2.4096405506134033 + }, + { + "auxiliary_loss_clip": 0.01103202, + "auxiliary_loss_mlp": 0.0108169, + "balance_loss_clip": 1.03004885, + "balance_loss_mlp": 1.02683306, + "epoch": 0.11381331730046595, + "flos": 26394267628800.0, + "grad_norm": 2.7223509585698085, + "language_loss": 0.75923604, + "learning_rate": 3.926763675749339e-06, + "loss": 0.78108495, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.765625, + "step": 1893, + "time_per_iteration": 2.4340016841888428 + }, + { + "auxiliary_loss_clip": 0.01094557, + "auxiliary_loss_mlp": 0.01073107, + "balance_loss_clip": 1.02554297, + "balance_loss_mlp": 1.02400064, + "epoch": 0.11387344055313392, + "flos": 23803607479680.0, + "grad_norm": 1.8893833949054872, + "language_loss": 0.81409949, + "learning_rate": 3.92665921159591e-06, + "loss": 0.83577621, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.703125, + "step": 1894, + "time_per_iteration": 2.4373579025268555 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.01071936, + "balance_loss_clip": 1.02193952, + "balance_loss_mlp": 1.02791119, + "epoch": 0.1139335638058019, + "flos": 34521699294720.0, + "grad_norm": 4.128387348864333, + "language_loss": 0.84122884, + "learning_rate": 3.926554674383371e-06, + "loss": 0.86298394, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7578125, + "step": 1895, + "time_per_iteration": 2.4865076541900635 + }, + { + "auxiliary_loss_clip": 0.01026502, + "auxiliary_loss_mlp": 0.01017288, + "balance_loss_clip": 1.00980127, + "balance_loss_mlp": 1.0084796, + "epoch": 0.11399368705846986, + "flos": 70584148333440.0, + "grad_norm": 0.8147085719504553, + "language_loss": 0.63502467, + "learning_rate": 3.926450064115686e-06, + "loss": 0.6554625, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.07470703, + "router_z_loss_mlp": 0.1796875, + "step": 1896, + "time_per_iteration": 3.1232588291168213 + }, + { + "auxiliary_loss_clip": 0.01101255, + "auxiliary_loss_mlp": 0.01067352, + "balance_loss_clip": 1.016927, + "balance_loss_mlp": 1.02822733, + "epoch": 0.11405381031113783, + "flos": 21323203004160.0, + "grad_norm": 1.9318551735230916, + "language_loss": 0.86146086, + "learning_rate": 3.926345380796821e-06, + "loss": 0.88314694, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.73046875, + "step": 1897, + "time_per_iteration": 2.425869941711426 + }, + { + "auxiliary_loss_clip": 0.01104157, + "auxiliary_loss_mlp": 0.01071224, + "balance_loss_clip": 1.02303958, + "balance_loss_mlp": 1.02957845, + "epoch": 0.11411393356380581, + "flos": 19718594784000.0, + "grad_norm": 2.2199132149606893, + "language_loss": 0.81340933, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.83516318, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.74609375, + "step": 1898, + "time_per_iteration": 2.443129062652588 + }, + { + "auxiliary_loss_clip": 0.01104505, + "auxiliary_loss_mlp": 0.01077304, + "balance_loss_clip": 1.02661633, + "balance_loss_mlp": 1.02875268, + "epoch": 0.11417405681647377, + "flos": 17529470714880.0, + "grad_norm": 2.578378824415495, + "language_loss": 0.76418275, + "learning_rate": 3.926135795021435e-06, + "loss": 0.78600085, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7578125, + "step": 1899, + "time_per_iteration": 2.3868911266326904 + }, + { + "auxiliary_loss_clip": 0.01035545, + "auxiliary_loss_mlp": 0.01018162, + "balance_loss_clip": 1.01322711, + "balance_loss_mlp": 1.01728582, + "epoch": 0.11423418006914174, + "flos": 59671416355200.0, + "grad_norm": 0.9306140439127988, + "language_loss": 0.63517708, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65571415, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.04931641, + "router_z_loss_mlp": 0.18261719, + "step": 1900, + "time_per_iteration": 3.0043721199035645 + }, + { + "auxiliary_loss_clip": 0.01105093, + "auxiliary_loss_mlp": 0.01079259, + "balance_loss_clip": 1.03095543, + "balance_loss_mlp": 1.02963984, + "epoch": 0.1142943033218097, + "flos": 22962096046080.0, + "grad_norm": 1.9113502691584607, + "language_loss": 0.79754734, + "learning_rate": 3.925925917089001e-06, + "loss": 0.81939083, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.75390625, + "step": 1901, + "time_per_iteration": 2.4261326789855957 + }, + { + "auxiliary_loss_clip": 0.01103142, + "auxiliary_loss_mlp": 0.01073913, + "balance_loss_clip": 1.02596688, + "balance_loss_mlp": 1.02781212, + "epoch": 0.11435442657447768, + "flos": 18255385036800.0, + "grad_norm": 2.0978125028630017, + "language_loss": 0.86818624, + "learning_rate": 3.925820868573839e-06, + "loss": 0.88995683, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.75390625, + "step": 1902, + "time_per_iteration": 2.423414707183838 + }, + { + "auxiliary_loss_clip": 0.01098273, + "auxiliary_loss_mlp": 0.01075791, + "balance_loss_clip": 1.02646255, + "balance_loss_mlp": 1.02488196, + "epoch": 0.11441454982714565, + "flos": 24060044482560.0, + "grad_norm": 3.69791193643829, + "language_loss": 0.79799533, + "learning_rate": 3.925715747031356e-06, + "loss": 0.819736, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.734375, + "step": 1903, + "time_per_iteration": 2.439527988433838 + }, + { + "auxiliary_loss_clip": 0.01100175, + "auxiliary_loss_mlp": 0.01073945, + "balance_loss_clip": 1.02514064, + "balance_loss_mlp": 1.02542341, + "epoch": 0.11447467307981361, + "flos": 25336538945280.0, + "grad_norm": 5.006334007746795, + "language_loss": 0.76973581, + "learning_rate": 3.925610552465539e-06, + "loss": 0.79147708, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.75, + "step": 1904, + "time_per_iteration": 2.4378955364227295 + }, + { + "auxiliary_loss_clip": 0.01098442, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_clip": 1.02762282, + "balance_loss_mlp": 1.0257175, + "epoch": 0.11453479633248159, + "flos": 21724983463680.0, + "grad_norm": 2.322709133589434, + "language_loss": 0.94894862, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.9707545, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.7265625, + "step": 1905, + "time_per_iteration": 2.4019157886505127 + }, + { + "auxiliary_loss_clip": 0.01106759, + "auxiliary_loss_mlp": 0.01076048, + "balance_loss_clip": 1.02273726, + "balance_loss_mlp": 1.02536607, + "epoch": 0.11459491958514956, + "flos": 12968871212160.0, + "grad_norm": 2.3173862953568123, + "language_loss": 0.80984318, + "learning_rate": 3.925399944279861e-06, + "loss": 0.83167124, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.8125, + "step": 1906, + "time_per_iteration": 2.4461934566497803 + }, + { + "auxiliary_loss_clip": 0.01099596, + "auxiliary_loss_mlp": 0.01078397, + "balance_loss_clip": 1.02499104, + "balance_loss_mlp": 1.02467704, + "epoch": 0.11465504283781752, + "flos": 22710162608640.0, + "grad_norm": 2.0420346689084563, + "language_loss": 0.85179579, + "learning_rate": 3.925294530667986e-06, + "loss": 0.87357569, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.75, + "step": 1907, + "time_per_iteration": 2.4013493061065674 + }, + { + "auxiliary_loss_clip": 0.01104586, + "auxiliary_loss_mlp": 0.01094775, + "balance_loss_clip": 1.04182243, + "balance_loss_mlp": 1.02952981, + "epoch": 0.1147151660904855, + "flos": 23397428188800.0, + "grad_norm": 2.3984462271129345, + "language_loss": 0.86514479, + "learning_rate": 3.92518904404875e-06, + "loss": 0.88713837, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.75, + "step": 1908, + "time_per_iteration": 2.466416120529175 + }, + { + "auxiliary_loss_clip": 0.01037684, + "auxiliary_loss_mlp": 0.01008842, + "balance_loss_clip": 1.00393057, + "balance_loss_mlp": 1.01954818, + "epoch": 0.11477528934315347, + "flos": 63009044242560.0, + "grad_norm": 0.9278183448357621, + "language_loss": 0.61119366, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63165897, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.04907227, + "router_z_loss_mlp": 0.18164062, + "step": 1909, + "time_per_iteration": 2.7610809803009033 + }, + { + "auxiliary_loss_clip": 0.01104264, + "auxiliary_loss_mlp": 0.01075453, + "balance_loss_clip": 1.02409756, + "balance_loss_mlp": 1.02895105, + "epoch": 0.11483541259582143, + "flos": 16324687918080.0, + "grad_norm": 1.842888055947887, + "language_loss": 0.81009704, + "learning_rate": 3.924977851804197e-06, + "loss": 0.83189416, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.75390625, + "step": 1910, + "time_per_iteration": 2.38220477104187 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01065713, + "balance_loss_clip": 1.01478636, + "balance_loss_mlp": 1.02840471, + "epoch": 0.1148955358484894, + "flos": 21579325804800.0, + "grad_norm": 2.040757855259629, + "language_loss": 0.78414232, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.80583549, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 1911, + "time_per_iteration": 2.3916642665863037 + }, + { + "auxiliary_loss_clip": 0.01098167, + "auxiliary_loss_mlp": 0.01070481, + "balance_loss_clip": 1.02191544, + "balance_loss_mlp": 1.02687752, + "epoch": 0.11495565910115738, + "flos": 27672437836800.0, + "grad_norm": 1.6259707657283342, + "language_loss": 0.80389506, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.82558155, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.71484375, + "step": 1912, + "time_per_iteration": 2.457498550415039 + }, + { + "auxiliary_loss_clip": 0.01103763, + "auxiliary_loss_mlp": 0.01081574, + "balance_loss_clip": 1.02969384, + "balance_loss_mlp": 1.02839506, + "epoch": 0.11501578235382534, + "flos": 20631294213120.0, + "grad_norm": 1.9691481128065385, + "language_loss": 0.803545, + "learning_rate": 3.924660515982246e-06, + "loss": 0.82539833, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.75390625, + "step": 1913, + "time_per_iteration": 3.8680005073547363 + }, + { + "auxiliary_loss_clip": 0.01104216, + "auxiliary_loss_mlp": 0.0106984, + "balance_loss_clip": 1.0177691, + "balance_loss_mlp": 1.02767372, + "epoch": 0.1150759056064933, + "flos": 19828012584960.0, + "grad_norm": 4.113344882486828, + "language_loss": 0.72709507, + "learning_rate": 3.924554591402939e-06, + "loss": 0.74883562, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.765625, + "step": 1914, + "time_per_iteration": 2.395423173904419 + }, + { + "auxiliary_loss_clip": 0.01039858, + "auxiliary_loss_mlp": 0.01006677, + "balance_loss_clip": 1.00071704, + "balance_loss_mlp": 1.02014196, + "epoch": 0.11513602885916129, + "flos": 70041981830400.0, + "grad_norm": 0.8619132263675272, + "language_loss": 0.61130953, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63177478, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.19726562, + "step": 1915, + "time_per_iteration": 3.123317241668701 + }, + { + "auxiliary_loss_clip": 0.011031, + "auxiliary_loss_mlp": 0.01085096, + "balance_loss_clip": 1.030617, + "balance_loss_mlp": 1.02827954, + "epoch": 0.11519615211182925, + "flos": 15740835384960.0, + "grad_norm": 2.3587560287586697, + "language_loss": 0.96246397, + "learning_rate": 3.924342523310436e-06, + "loss": 0.98434597, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.75, + "step": 1916, + "time_per_iteration": 5.260532379150391 + }, + { + "auxiliary_loss_clip": 0.01102699, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_clip": 1.02787995, + "balance_loss_mlp": 1.02652657, + "epoch": 0.11525627536449722, + "flos": 20666591464320.0, + "grad_norm": 1.8663706166075011, + "language_loss": 0.74269754, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7645517, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.76171875, + "step": 1917, + "time_per_iteration": 2.389037609100342 + }, + { + "auxiliary_loss_clip": 0.01096504, + "auxiliary_loss_mlp": 0.01071462, + "balance_loss_clip": 1.01815176, + "balance_loss_mlp": 1.02505946, + "epoch": 0.1153163986171652, + "flos": 20302237848960.0, + "grad_norm": 1.878223501492166, + "language_loss": 0.77126813, + "learning_rate": 3.92413016333289e-06, + "loss": 0.79294789, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.71484375, + "step": 1918, + "time_per_iteration": 2.420306921005249 + }, + { + "auxiliary_loss_clip": 0.01101986, + "auxiliary_loss_mlp": 0.01073936, + "balance_loss_clip": 1.02081609, + "balance_loss_mlp": 1.02552521, + "epoch": 0.11537652186983316, + "flos": 17638364845440.0, + "grad_norm": 1.9435062321325158, + "language_loss": 0.89015043, + "learning_rate": 3.92402387389729e-06, + "loss": 0.91190958, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.765625, + "step": 1919, + "time_per_iteration": 2.3609936237335205 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01078531, + "balance_loss_clip": 1.02443421, + "balance_loss_mlp": 1.02426577, + "epoch": 0.11543664512250112, + "flos": 21068337012480.0, + "grad_norm": 1.8801302635587271, + "language_loss": 0.88996416, + "learning_rate": 3.923917511502512e-06, + "loss": 0.91174942, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7578125, + "step": 1920, + "time_per_iteration": 2.412687301635742 + }, + { + "auxiliary_loss_clip": 0.01097556, + "auxiliary_loss_mlp": 0.01073687, + "balance_loss_clip": 1.0231421, + "balance_loss_mlp": 1.02450156, + "epoch": 0.11549676837516909, + "flos": 22746437377920.0, + "grad_norm": 1.9015909137237426, + "language_loss": 0.8145293, + "learning_rate": 3.923811076152589e-06, + "loss": 0.83624172, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.734375, + "step": 1921, + "time_per_iteration": 2.4050183296203613 + }, + { + "auxiliary_loss_clip": 0.01105586, + "auxiliary_loss_mlp": 0.01079158, + "balance_loss_clip": 1.02150798, + "balance_loss_mlp": 1.02614427, + "epoch": 0.11555689162783707, + "flos": 19168049554560.0, + "grad_norm": 1.9489528284703288, + "language_loss": 0.79503161, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81687903, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 0.79296875, + "step": 1922, + "time_per_iteration": 2.402738094329834 + }, + { + "auxiliary_loss_clip": 0.01103091, + "auxiliary_loss_mlp": 0.01084399, + "balance_loss_clip": 1.03073144, + "balance_loss_mlp": 1.02614999, + "epoch": 0.11561701488050503, + "flos": 24570893629440.0, + "grad_norm": 1.8604942293116362, + "language_loss": 0.85317189, + "learning_rate": 3.923597986603456e-06, + "loss": 0.87504685, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.76953125, + "step": 1923, + "time_per_iteration": 2.4516875743865967 + }, + { + "auxiliary_loss_clip": 0.01104156, + "auxiliary_loss_mlp": 0.0107577, + "balance_loss_clip": 1.01885962, + "balance_loss_mlp": 1.02692676, + "epoch": 0.115677138133173, + "flos": 17091590042880.0, + "grad_norm": 2.0027427251759735, + "language_loss": 0.83251321, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.85431242, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.7734375, + "step": 1924, + "time_per_iteration": 2.420670509338379 + }, + { + "auxiliary_loss_clip": 0.01024496, + "auxiliary_loss_mlp": 0.01009615, + "balance_loss_clip": 1.0039643, + "balance_loss_mlp": 1.00571227, + "epoch": 0.11573726138584098, + "flos": 62700515758080.0, + "grad_norm": 0.816670549206094, + "language_loss": 0.61332083, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63366193, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.1875, + "step": 1925, + "time_per_iteration": 3.0350961685180664 + }, + { + "auxiliary_loss_clip": 0.01100762, + "auxiliary_loss_mlp": 0.01078035, + "balance_loss_clip": 1.02160156, + "balance_loss_mlp": 1.02532911, + "epoch": 0.11579738463850894, + "flos": 22600046580480.0, + "grad_norm": 1.6592144619966165, + "language_loss": 0.76847106, + "learning_rate": 3.923277805217161e-06, + "loss": 0.79025906, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.75390625, + "step": 1926, + "time_per_iteration": 2.4421029090881348 + }, + { + "auxiliary_loss_clip": 0.01105084, + "auxiliary_loss_mlp": 0.01081601, + "balance_loss_clip": 1.02404654, + "balance_loss_mlp": 1.02691829, + "epoch": 0.11585750789117691, + "flos": 21725053286400.0, + "grad_norm": 2.81843715438092, + "language_loss": 0.7573036, + "learning_rate": 3.923170932221222e-06, + "loss": 0.77917039, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.78125, + "step": 1927, + "time_per_iteration": 2.4649441242218018 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01075424, + "balance_loss_clip": 1.01879978, + "balance_loss_mlp": 1.02573895, + "epoch": 0.11591763114384489, + "flos": 26286316104960.0, + "grad_norm": 1.6026120684954417, + "language_loss": 0.88697374, + "learning_rate": 3.92306398629845e-06, + "loss": 0.90874124, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.75390625, + "step": 1928, + "time_per_iteration": 2.4759366512298584 + }, + { + "auxiliary_loss_clip": 0.01105063, + "auxiliary_loss_mlp": 0.0109223, + "balance_loss_clip": 1.03725076, + "balance_loss_mlp": 1.02782726, + "epoch": 0.11597775439651285, + "flos": 22999418156160.0, + "grad_norm": 1.969625707307952, + "language_loss": 0.80208433, + "learning_rate": 3.922956967452898e-06, + "loss": 0.82405722, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.7734375, + "step": 1929, + "time_per_iteration": 2.4990079402923584 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01080886, + "balance_loss_clip": 1.03248727, + "balance_loss_mlp": 1.02691579, + "epoch": 0.11603787764918082, + "flos": 31940360478720.0, + "grad_norm": 1.7275403423455795, + "language_loss": 0.79079658, + "learning_rate": 3.922849875688626e-06, + "loss": 0.81260639, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.73046875, + "step": 1930, + "time_per_iteration": 2.504427433013916 + }, + { + "auxiliary_loss_clip": 0.01101347, + "auxiliary_loss_mlp": 0.0107767, + "balance_loss_clip": 1.02576673, + "balance_loss_mlp": 1.0264256, + "epoch": 0.1160980009018488, + "flos": 22270606191360.0, + "grad_norm": 1.9524965515627912, + "language_loss": 0.74097633, + "learning_rate": 3.922742711009693e-06, + "loss": 0.76276648, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.75, + "step": 1931, + "time_per_iteration": 2.4005420207977295 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01079492, + "balance_loss_clip": 1.02765942, + "balance_loss_mlp": 1.02877855, + "epoch": 0.11615812415451676, + "flos": 22782537590400.0, + "grad_norm": 1.7454332805812756, + "language_loss": 0.84138036, + "learning_rate": 3.922635473420164e-06, + "loss": 0.8632288, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.765625, + "step": 1932, + "time_per_iteration": 2.4524455070495605 + }, + { + "auxiliary_loss_clip": 0.01024423, + "auxiliary_loss_mlp": 0.01016575, + "balance_loss_clip": 1.00985181, + "balance_loss_mlp": 1.00607646, + "epoch": 0.11621824740718473, + "flos": 67142864885760.0, + "grad_norm": 0.7795563159617596, + "language_loss": 0.61040831, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63081825, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.06738281, + "router_z_loss_mlp": 0.18359375, + "step": 1933, + "time_per_iteration": 2.9197466373443604 + }, + { + "auxiliary_loss_clip": 0.0110241, + "auxiliary_loss_mlp": 0.01074543, + "balance_loss_clip": 1.02287745, + "balance_loss_mlp": 1.02558279, + "epoch": 0.11627837065985269, + "flos": 20374892121600.0, + "grad_norm": 1.9751206780424375, + "language_loss": 0.88423049, + "learning_rate": 3.922420779525586e-06, + "loss": 0.90600002, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.76953125, + "step": 1934, + "time_per_iteration": 2.421251058578491 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01080654, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.02898979, + "epoch": 0.11633849391252067, + "flos": 21724739084160.0, + "grad_norm": 2.26167043084455, + "language_loss": 0.69919413, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.72106302, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.7734375, + "step": 1935, + "time_per_iteration": 2.385678291320801 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.01077429, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.02478611, + "epoch": 0.11639861716518864, + "flos": 18804394166400.0, + "grad_norm": 2.087801591831859, + "language_loss": 0.77685356, + "learning_rate": 3.922205794037456e-06, + "loss": 0.79862463, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.75, + "step": 1936, + "time_per_iteration": 2.394496202468872 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01076216, + "balance_loss_clip": 1.02376366, + "balance_loss_mlp": 1.02456009, + "epoch": 0.1164587404178566, + "flos": 21213924848640.0, + "grad_norm": 1.8307292919040705, + "language_loss": 0.86297512, + "learning_rate": 3.922098191955998e-06, + "loss": 0.88473749, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.75390625, + "step": 1937, + "time_per_iteration": 2.388935089111328 + }, + { + "auxiliary_loss_clip": 0.01094732, + "auxiliary_loss_mlp": 0.01067897, + "balance_loss_clip": 1.01997471, + "balance_loss_mlp": 1.02352262, + "epoch": 0.11651886367052458, + "flos": 27817397268480.0, + "grad_norm": 1.9335887271623498, + "language_loss": 0.7779693, + "learning_rate": 3.921990516988384e-06, + "loss": 0.79959559, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7109375, + "step": 1938, + "time_per_iteration": 2.473313808441162 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_clip": 1.02837622, + "balance_loss_mlp": 1.0257802, + "epoch": 0.11657898692319255, + "flos": 22888568989440.0, + "grad_norm": 1.7367867235260621, + "language_loss": 0.80767649, + "learning_rate": 3.921882769138696e-06, + "loss": 0.82956362, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.78515625, + "step": 1939, + "time_per_iteration": 2.3945810794830322 + }, + { + "auxiliary_loss_clip": 0.01099419, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_clip": 1.0319519, + "balance_loss_mlp": 1.0249927, + "epoch": 0.11663911017586051, + "flos": 24314770828800.0, + "grad_norm": 2.7873221709203815, + "language_loss": 0.8819685, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.90377975, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.7421875, + "step": 1940, + "time_per_iteration": 2.4464802742004395 + }, + { + "auxiliary_loss_clip": 0.01094395, + "auxiliary_loss_mlp": 0.01078484, + "balance_loss_clip": 1.03428137, + "balance_loss_mlp": 1.02451062, + "epoch": 0.11669923342852849, + "flos": 42338507794560.0, + "grad_norm": 1.5274539315744171, + "language_loss": 0.77823973, + "learning_rate": 3.921667054809449e-06, + "loss": 0.79996848, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.69921875, + "step": 1941, + "time_per_iteration": 2.5917911529541016 + }, + { + "auxiliary_loss_clip": 0.01099636, + "auxiliary_loss_mlp": 0.01085479, + "balance_loss_clip": 1.0362215, + "balance_loss_mlp": 1.02487659, + "epoch": 0.11675935668119646, + "flos": 14641560316800.0, + "grad_norm": 1.9065487899793008, + "language_loss": 0.89950025, + "learning_rate": 3.921559088338068e-06, + "loss": 0.92135143, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.75, + "step": 1942, + "time_per_iteration": 2.3784279823303223 + }, + { + "auxiliary_loss_clip": 0.01097409, + "auxiliary_loss_mlp": 0.01071666, + "balance_loss_clip": 1.02534151, + "balance_loss_mlp": 1.02689183, + "epoch": 0.11681947993386442, + "flos": 35115012806400.0, + "grad_norm": 2.053962402357319, + "language_loss": 0.69737881, + "learning_rate": 3.921451049000975e-06, + "loss": 0.7190696, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.703125, + "step": 1943, + "time_per_iteration": 2.524909496307373 + }, + { + "auxiliary_loss_clip": 0.01097538, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_clip": 1.01818633, + "balance_loss_mlp": 1.02541852, + "epoch": 0.11687960318653239, + "flos": 38981713570560.0, + "grad_norm": 1.871225335178146, + "language_loss": 0.7164948, + "learning_rate": 3.921342936802265e-06, + "loss": 0.73814678, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.72265625, + "step": 1944, + "time_per_iteration": 2.56483793258667 + }, + { + "auxiliary_loss_clip": 0.01102197, + "auxiliary_loss_mlp": 0.01076505, + "balance_loss_clip": 1.02689004, + "balance_loss_mlp": 1.02652848, + "epoch": 0.11693972643920036, + "flos": 25993778889600.0, + "grad_norm": 1.5659902023576129, + "language_loss": 0.83571392, + "learning_rate": 3.921234751746038e-06, + "loss": 0.85750091, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.7578125, + "step": 1945, + "time_per_iteration": 2.440305471420288 + }, + { + "auxiliary_loss_clip": 0.01095979, + "auxiliary_loss_mlp": 0.01075449, + "balance_loss_clip": 1.02600121, + "balance_loss_mlp": 1.02307653, + "epoch": 0.11699984969186833, + "flos": 27270866845440.0, + "grad_norm": 2.0354172485559996, + "language_loss": 0.78286022, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.80457449, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.73046875, + "step": 1946, + "time_per_iteration": 2.443709373474121 + }, + { + "auxiliary_loss_clip": 0.01094886, + "auxiliary_loss_mlp": 0.0106774, + "balance_loss_clip": 1.02208304, + "balance_loss_mlp": 1.02453065, + "epoch": 0.1170599729445363, + "flos": 15266959234560.0, + "grad_norm": 1.9162103734297344, + "language_loss": 0.70844942, + "learning_rate": 3.921018163077448e-06, + "loss": 0.73007572, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.703125, + "step": 1947, + "time_per_iteration": 2.3867526054382324 + }, + { + "auxiliary_loss_clip": 0.01098535, + "auxiliary_loss_mlp": 0.01070628, + "balance_loss_clip": 1.02010679, + "balance_loss_mlp": 1.0261786, + "epoch": 0.11712009619720427, + "flos": 17163511176960.0, + "grad_norm": 1.86112873725718, + "language_loss": 0.87185425, + "learning_rate": 3.920909759473295e-06, + "loss": 0.89354587, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.72265625, + "step": 1948, + "time_per_iteration": 2.383723020553589 + }, + { + "auxiliary_loss_clip": 0.01025867, + "auxiliary_loss_mlp": 0.01020997, + "balance_loss_clip": 1.01515615, + "balance_loss_mlp": 1.00658453, + "epoch": 0.11718021944987224, + "flos": 70937644515840.0, + "grad_norm": 0.8347948317840764, + "language_loss": 0.65263897, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67310762, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.19335938, + "step": 1949, + "time_per_iteration": 3.0580947399139404 + }, + { + "auxiliary_loss_clip": 0.01095745, + "auxiliary_loss_mlp": 0.01069687, + "balance_loss_clip": 1.02431607, + "balance_loss_mlp": 1.02404857, + "epoch": 0.1172403427025402, + "flos": 27452240691840.0, + "grad_norm": 1.6309491835037033, + "language_loss": 0.73445642, + "learning_rate": 3.920692733745835e-06, + "loss": 0.75611079, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.71875, + "step": 1950, + "time_per_iteration": 2.453089714050293 + }, + { + "auxiliary_loss_clip": 0.01099548, + "auxiliary_loss_mlp": 0.01074748, + "balance_loss_clip": 1.026564, + "balance_loss_mlp": 1.02574944, + "epoch": 0.11730046595520818, + "flos": 15667831998720.0, + "grad_norm": 2.140938309932894, + "language_loss": 0.79714155, + "learning_rate": 3.920584111630755e-06, + "loss": 0.81888449, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.73828125, + "step": 1951, + "time_per_iteration": 2.3543148040771484 + }, + { + "auxiliary_loss_clip": 0.0109912, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_clip": 1.03141642, + "balance_loss_mlp": 1.02570367, + "epoch": 0.11736058920787615, + "flos": 25628971426560.0, + "grad_norm": 1.7707328433413079, + "language_loss": 0.78388196, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.80567276, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.734375, + "step": 1952, + "time_per_iteration": 2.4808359146118164 + }, + { + "auxiliary_loss_clip": 0.01099645, + "auxiliary_loss_mlp": 0.01075363, + "balance_loss_clip": 1.02608204, + "balance_loss_mlp": 1.02539825, + "epoch": 0.11742071246054411, + "flos": 21433214298240.0, + "grad_norm": 1.8347422267290776, + "language_loss": 0.74187744, + "learning_rate": 3.920366648918491e-06, + "loss": 0.76362753, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.7421875, + "step": 1953, + "time_per_iteration": 4.628724098205566 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_clip": 1.03519344, + "balance_loss_mlp": 1.02541363, + "epoch": 0.11748083571321208, + "flos": 15996923274240.0, + "grad_norm": 2.576622216994708, + "language_loss": 0.83628595, + "learning_rate": 3.920257808329552e-06, + "loss": 0.85819519, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.78515625, + "step": 1954, + "time_per_iteration": 2.4205286502838135 + }, + { + "auxiliary_loss_clip": 0.0109953, + "auxiliary_loss_mlp": 0.01078657, + "balance_loss_clip": 1.02725363, + "balance_loss_mlp": 1.02441096, + "epoch": 0.11754095896588006, + "flos": 16179134993280.0, + "grad_norm": 2.1752875628262167, + "language_loss": 0.87702525, + "learning_rate": 3.920148894924246e-06, + "loss": 0.89880705, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.75, + "step": 1955, + "time_per_iteration": 5.291463613510132 + }, + { + "auxiliary_loss_clip": 0.01096756, + "auxiliary_loss_mlp": 0.0107593, + "balance_loss_clip": 1.0284605, + "balance_loss_mlp": 1.02308774, + "epoch": 0.11760108221854802, + "flos": 13260745111680.0, + "grad_norm": 2.1927197636241345, + "language_loss": 0.79162407, + "learning_rate": 3.920039908706701e-06, + "loss": 0.81335092, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.734375, + "step": 1956, + "time_per_iteration": 2.428910732269287 + }, + { + "auxiliary_loss_clip": 0.01094802, + "auxiliary_loss_mlp": 0.01083694, + "balance_loss_clip": 1.03326809, + "balance_loss_mlp": 1.02414382, + "epoch": 0.11766120547121599, + "flos": 24497296750080.0, + "grad_norm": 1.9578382747614873, + "language_loss": 0.82200289, + "learning_rate": 3.91993084968105e-06, + "loss": 0.84378785, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.70703125, + "step": 1957, + "time_per_iteration": 3.9153811931610107 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.02011645, + "balance_loss_mlp": 1.02493358, + "epoch": 0.11772132872388397, + "flos": 17783079897600.0, + "grad_norm": 2.008533326891821, + "language_loss": 0.81396335, + "learning_rate": 3.919821717851428e-06, + "loss": 0.83564234, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.74609375, + "step": 1958, + "time_per_iteration": 2.547597646713257 + }, + { + "auxiliary_loss_clip": 0.01097261, + "auxiliary_loss_mlp": 0.01065679, + "balance_loss_clip": 1.01840091, + "balance_loss_mlp": 1.02544034, + "epoch": 0.11778145197655193, + "flos": 13216405818240.0, + "grad_norm": 1.8823968293413278, + "language_loss": 0.79106855, + "learning_rate": 3.919712513221976e-06, + "loss": 0.81269795, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.71875, + "step": 1959, + "time_per_iteration": 2.4653778076171875 + }, + { + "auxiliary_loss_clip": 0.01095478, + "auxiliary_loss_mlp": 0.01069631, + "balance_loss_clip": 1.02173281, + "balance_loss_mlp": 1.02389824, + "epoch": 0.1178415752292199, + "flos": 20229164640000.0, + "grad_norm": 1.9011261186504385, + "language_loss": 0.72540903, + "learning_rate": 3.919603235796832e-06, + "loss": 0.74706012, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.71484375, + "step": 1960, + "time_per_iteration": 2.467146873474121 + }, + { + "auxiliary_loss_clip": 0.01101117, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_clip": 1.02945721, + "balance_loss_mlp": 1.02518308, + "epoch": 0.11790169848188788, + "flos": 13039360980480.0, + "grad_norm": 3.2353078315259984, + "language_loss": 0.83770704, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.85949981, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7578125, + "step": 1961, + "time_per_iteration": 2.3909335136413574 + }, + { + "auxiliary_loss_clip": 0.01094421, + "auxiliary_loss_mlp": 0.01075894, + "balance_loss_clip": 1.02811527, + "balance_loss_mlp": 1.02373433, + "epoch": 0.11796182173455584, + "flos": 22264845816960.0, + "grad_norm": 1.764938619687065, + "language_loss": 0.94603556, + "learning_rate": 3.919384462576049e-06, + "loss": 0.96773869, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.70703125, + "step": 1962, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01078368, + "balance_loss_clip": 1.03151906, + "balance_loss_mlp": 1.02517033, + "epoch": 0.1180219449872238, + "flos": 10634229129600.0, + "grad_norm": 2.2805685847614856, + "language_loss": 0.90378129, + "learning_rate": 3.919274966788707e-06, + "loss": 0.92555201, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.734375, + "step": 1963, + "time_per_iteration": 2.397088050842285 + }, + { + "auxiliary_loss_clip": 0.01103203, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_clip": 1.03015637, + "balance_loss_mlp": 1.02593577, + "epoch": 0.11808206823989177, + "flos": 20922469885440.0, + "grad_norm": 1.8935345525425222, + "language_loss": 0.86051631, + "learning_rate": 3.919165398222265e-06, + "loss": 0.88232338, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.7734375, + "step": 1964, + "time_per_iteration": 2.473780632019043 + }, + { + "auxiliary_loss_clip": 0.01098289, + "auxiliary_loss_mlp": 0.01071061, + "balance_loss_clip": 1.02538013, + "balance_loss_mlp": 1.02582359, + "epoch": 0.11814219149255975, + "flos": 20776707492480.0, + "grad_norm": 2.0143657817907665, + "language_loss": 0.85223615, + "learning_rate": 3.919055756880879e-06, + "loss": 0.87392962, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.7265625, + "step": 1965, + "time_per_iteration": 2.431307792663574 + }, + { + "auxiliary_loss_clip": 0.01098774, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_clip": 1.03172922, + "balance_loss_mlp": 1.02437353, + "epoch": 0.11820231474522772, + "flos": 48758162572800.0, + "grad_norm": 1.6564758975121856, + "language_loss": 0.76597512, + "learning_rate": 3.918946042768707e-06, + "loss": 0.78776532, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.74609375, + "step": 1966, + "time_per_iteration": 2.6666133403778076 + }, + { + "auxiliary_loss_clip": 0.01102013, + "auxiliary_loss_mlp": 0.01089877, + "balance_loss_clip": 1.04131079, + "balance_loss_mlp": 1.02631295, + "epoch": 0.11826243799789568, + "flos": 16689669937920.0, + "grad_norm": 2.23096624147491, + "language_loss": 0.74836242, + "learning_rate": 3.918836255889908e-06, + "loss": 0.77028131, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7578125, + "step": 1967, + "time_per_iteration": 2.401096820831299 + }, + { + "auxiliary_loss_clip": 0.01095578, + "auxiliary_loss_mlp": 0.01074592, + "balance_loss_clip": 1.02581167, + "balance_loss_mlp": 1.02498949, + "epoch": 0.11832256125056366, + "flos": 16908924476160.0, + "grad_norm": 2.2520420532733962, + "language_loss": 0.9109596, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.93266129, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.703125, + "step": 1968, + "time_per_iteration": 2.4904839992523193 + }, + { + "auxiliary_loss_clip": 0.01100343, + "auxiliary_loss_mlp": 0.01076785, + "balance_loss_clip": 1.03046072, + "balance_loss_mlp": 1.02577674, + "epoch": 0.11838268450323162, + "flos": 22819301118720.0, + "grad_norm": 1.75858492264819, + "language_loss": 0.69124985, + "learning_rate": 3.918616463849087e-06, + "loss": 0.71302116, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.74609375, + "step": 1969, + "time_per_iteration": 2.440962076187134 + }, + { + "auxiliary_loss_clip": 0.01098371, + "auxiliary_loss_mlp": 0.0107058, + "balance_loss_clip": 1.02156162, + "balance_loss_mlp": 1.02679706, + "epoch": 0.11844280775589959, + "flos": 33544479939840.0, + "grad_norm": 2.1169594305573227, + "language_loss": 0.83471286, + "learning_rate": 3.918506458695399e-06, + "loss": 0.85640246, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.71484375, + "step": 1970, + "time_per_iteration": 2.54463529586792 + }, + { + "auxiliary_loss_clip": 0.01027703, + "auxiliary_loss_mlp": 0.0100801, + "balance_loss_clip": 1.00195396, + "balance_loss_mlp": 1.00787377, + "epoch": 0.11850293100856757, + "flos": 66347577959040.0, + "grad_norm": 0.8082849103206722, + "language_loss": 0.66209161, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68244874, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.06054688, + "router_z_loss_mlp": 0.19921875, + "step": 1971, + "time_per_iteration": 3.0194623470306396 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01068902, + "balance_loss_clip": 1.02140939, + "balance_loss_mlp": 1.02438641, + "epoch": 0.11856305426123553, + "flos": 24679892494080.0, + "grad_norm": 2.826370033863772, + "language_loss": 0.81743836, + "learning_rate": 3.918286230142327e-06, + "loss": 0.83912718, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.7578125, + "step": 1972, + "time_per_iteration": 2.431324005126953 + }, + { + "auxiliary_loss_clip": 0.01095262, + "auxiliary_loss_mlp": 0.01076755, + "balance_loss_clip": 1.02837968, + "balance_loss_mlp": 1.02409101, + "epoch": 0.1186231775139035, + "flos": 24278949907200.0, + "grad_norm": 2.212271199992865, + "language_loss": 0.74296468, + "learning_rate": 3.918176006751292e-06, + "loss": 0.76468486, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7109375, + "step": 1973, + "time_per_iteration": 2.4557363986968994 + }, + { + "auxiliary_loss_clip": 0.01093986, + "auxiliary_loss_mlp": 0.01074244, + "balance_loss_clip": 1.02453411, + "balance_loss_mlp": 1.02348924, + "epoch": 0.11868330076657148, + "flos": 21756475376640.0, + "grad_norm": 1.590519927574492, + "language_loss": 0.74300444, + "learning_rate": 3.918065710622832e-06, + "loss": 0.76468676, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.703125, + "step": 1974, + "time_per_iteration": 2.4105241298675537 + }, + { + "auxiliary_loss_clip": 0.01098038, + "auxiliary_loss_mlp": 0.01075033, + "balance_loss_clip": 1.02193689, + "balance_loss_mlp": 1.02419257, + "epoch": 0.11874342401923944, + "flos": 17192559294720.0, + "grad_norm": 2.522444835456934, + "language_loss": 0.80826199, + "learning_rate": 3.917955341761128e-06, + "loss": 0.82999265, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.73828125, + "step": 1975, + "time_per_iteration": 2.3541975021362305 + }, + { + "auxiliary_loss_clip": 0.01098601, + "auxiliary_loss_mlp": 0.01072573, + "balance_loss_clip": 1.02744079, + "balance_loss_mlp": 1.02585196, + "epoch": 0.11880354727190741, + "flos": 15228729429120.0, + "grad_norm": 2.9093295250655937, + "language_loss": 0.78025293, + "learning_rate": 3.917844900170364e-06, + "loss": 0.80196464, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.7265625, + "step": 1976, + "time_per_iteration": 2.3779454231262207 + }, + { + "auxiliary_loss_clip": 0.01099631, + "auxiliary_loss_mlp": 0.01075919, + "balance_loss_clip": 1.02363396, + "balance_loss_mlp": 1.02605879, + "epoch": 0.11886367052457537, + "flos": 27308433335040.0, + "grad_norm": 1.5022591072757743, + "language_loss": 0.76572037, + "learning_rate": 3.91773438585473e-06, + "loss": 0.78747582, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.734375, + "step": 1977, + "time_per_iteration": 2.451704740524292 + }, + { + "auxiliary_loss_clip": 0.01102803, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_clip": 1.02749419, + "balance_loss_mlp": 1.02580285, + "epoch": 0.11892379377724335, + "flos": 21797218800000.0, + "grad_norm": 2.2810410563787453, + "language_loss": 0.76764417, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.7894845, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7734375, + "step": 1978, + "time_per_iteration": 2.420731544494629 + }, + { + "auxiliary_loss_clip": 0.01098206, + "auxiliary_loss_mlp": 0.01068732, + "balance_loss_clip": 1.0235281, + "balance_loss_mlp": 1.02554226, + "epoch": 0.11898391702991132, + "flos": 13990150569600.0, + "grad_norm": 1.6703888744720572, + "language_loss": 0.74642849, + "learning_rate": 3.917513139065616e-06, + "loss": 0.76809794, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.7265625, + "step": 1979, + "time_per_iteration": 2.424344062805176 + }, + { + "auxiliary_loss_clip": 0.01099319, + "auxiliary_loss_mlp": 0.01070426, + "balance_loss_clip": 1.02178907, + "balance_loss_mlp": 1.02521837, + "epoch": 0.11904404028257928, + "flos": 32233176984960.0, + "grad_norm": 1.6881998049047522, + "language_loss": 1.00195682, + "learning_rate": 3.917402406600525e-06, + "loss": 1.02365422, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.7421875, + "step": 1980, + "time_per_iteration": 2.5174827575683594 + }, + { + "auxiliary_loss_clip": 0.01104547, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_clip": 1.02946401, + "balance_loss_mlp": 1.02820432, + "epoch": 0.11910416353524726, + "flos": 23585155902720.0, + "grad_norm": 1.6600414445236873, + "language_loss": 0.87702686, + "learning_rate": 3.917291601427342e-06, + "loss": 0.89888936, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.765625, + "step": 1981, + "time_per_iteration": 2.4694221019744873 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01071673, + "balance_loss_clip": 1.01905417, + "balance_loss_mlp": 1.02628779, + "epoch": 0.11916428678791523, + "flos": 25332000468480.0, + "grad_norm": 1.8443650429948728, + "language_loss": 0.87343848, + "learning_rate": 3.91718072355027e-06, + "loss": 0.89517963, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.76171875, + "step": 1982, + "time_per_iteration": 2.4658966064453125 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01065145, + "balance_loss_clip": 1.01674581, + "balance_loss_mlp": 1.02529323, + "epoch": 0.11922441004058319, + "flos": 19787513541120.0, + "grad_norm": 1.7884512626553415, + "language_loss": 0.86241335, + "learning_rate": 3.917069772973513e-06, + "loss": 0.88404, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.71875, + "step": 1983, + "time_per_iteration": 2.503352403640747 + }, + { + "auxiliary_loss_clip": 0.01104388, + "auxiliary_loss_mlp": 0.0107926, + "balance_loss_clip": 1.02263546, + "balance_loss_mlp": 1.02738404, + "epoch": 0.11928453329325117, + "flos": 21535475270400.0, + "grad_norm": 2.933758828510656, + "language_loss": 0.80892384, + "learning_rate": 3.916958749701277e-06, + "loss": 0.83076036, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.76953125, + "step": 1984, + "time_per_iteration": 2.4047012329101562 + }, + { + "auxiliary_loss_clip": 0.01101427, + "auxiliary_loss_mlp": 0.01069592, + "balance_loss_clip": 1.02250385, + "balance_loss_mlp": 1.02641273, + "epoch": 0.11934465654591914, + "flos": 20813924868480.0, + "grad_norm": 1.8658768617307708, + "language_loss": 0.85052133, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.8722316, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.75, + "step": 1985, + "time_per_iteration": 2.4365234375 + }, + { + "auxiliary_loss_clip": 0.01099644, + "auxiliary_loss_mlp": 0.01078192, + "balance_loss_clip": 1.02821922, + "balance_loss_mlp": 1.02602315, + "epoch": 0.1194047797985871, + "flos": 19059539448960.0, + "grad_norm": 1.9833492736668135, + "language_loss": 0.7667771, + "learning_rate": 3.916736485087216e-06, + "loss": 0.7885555, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.734375, + "step": 1986, + "time_per_iteration": 2.3927929401397705 + }, + { + "auxiliary_loss_clip": 0.01102733, + "auxiliary_loss_mlp": 0.01079752, + "balance_loss_clip": 1.02739501, + "balance_loss_mlp": 1.02693653, + "epoch": 0.11946490305125507, + "flos": 27189798935040.0, + "grad_norm": 2.0307746150811474, + "language_loss": 0.75117689, + "learning_rate": 3.916625243753819e-06, + "loss": 0.77300179, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7578125, + "step": 1987, + "time_per_iteration": 2.4350292682647705 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01079398, + "balance_loss_clip": 1.02839994, + "balance_loss_mlp": 1.02607214, + "epoch": 0.11952502630392305, + "flos": 21139769387520.0, + "grad_norm": 1.8618209477601615, + "language_loss": 0.75083005, + "learning_rate": 3.916513929741799e-06, + "loss": 0.77262479, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.73828125, + "step": 1988, + "time_per_iteration": 2.4165165424346924 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01086501, + "balance_loss_clip": 1.02901793, + "balance_loss_mlp": 1.02569008, + "epoch": 0.11958514955659101, + "flos": 22123237875840.0, + "grad_norm": 2.036405786914528, + "language_loss": 0.83034456, + "learning_rate": 3.91640254305538e-06, + "loss": 0.85221964, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.75390625, + "step": 1989, + "time_per_iteration": 2.403749465942383 + }, + { + "auxiliary_loss_clip": 0.01099889, + "auxiliary_loss_mlp": 0.01086348, + "balance_loss_clip": 1.0322032, + "balance_loss_mlp": 1.02468252, + "epoch": 0.11964527280925898, + "flos": 17420471850240.0, + "grad_norm": 2.370735707533791, + "language_loss": 0.78127801, + "learning_rate": 3.916291083698784e-06, + "loss": 0.80314028, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.75, + "step": 1990, + "time_per_iteration": 2.4581000804901123 + }, + { + "auxiliary_loss_clip": 0.01026246, + "auxiliary_loss_mlp": 0.01017792, + "balance_loss_clip": 1.01149738, + "balance_loss_mlp": 1.00718284, + "epoch": 0.11970539606192696, + "flos": 70676564302080.0, + "grad_norm": 0.8745206723342456, + "language_loss": 0.55380261, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57424301, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.06298828, + "router_z_loss_mlp": 0.19140625, + "step": 1991, + "time_per_iteration": 3.0395352840423584 + }, + { + "auxiliary_loss_clip": 0.01096793, + "auxiliary_loss_mlp": 0.01064705, + "balance_loss_clip": 1.0157572, + "balance_loss_mlp": 1.02505863, + "epoch": 0.11976551931459492, + "flos": 21213959760000.0, + "grad_norm": 2.65704373762825, + "language_loss": 0.80542469, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8270396, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.71875, + "step": 1992, + "time_per_iteration": 3.9649112224578857 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.01076277, + "balance_loss_clip": 1.02408671, + "balance_loss_mlp": 1.02408791, + "epoch": 0.11982564256726289, + "flos": 25988262894720.0, + "grad_norm": 1.9484625021473028, + "language_loss": 0.80804837, + "learning_rate": 3.915956269650216e-06, + "loss": 0.82981521, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.765625, + "step": 1993, + "time_per_iteration": 2.4504408836364746 + }, + { + "auxiliary_loss_clip": 0.01098994, + "auxiliary_loss_mlp": 0.01082082, + "balance_loss_clip": 1.0281992, + "balance_loss_mlp": 1.02334023, + "epoch": 0.11988576581993086, + "flos": 21649850484480.0, + "grad_norm": 1.819952999341646, + "language_loss": 0.84539223, + "learning_rate": 3.915844519655208e-06, + "loss": 0.86720294, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7578125, + "step": 1994, + "time_per_iteration": 3.8368923664093018 + }, + { + "auxiliary_loss_clip": 0.01097663, + "auxiliary_loss_mlp": 0.01073072, + "balance_loss_clip": 1.02631783, + "balance_loss_mlp": 1.02424538, + "epoch": 0.11994588907259883, + "flos": 17856467308800.0, + "grad_norm": 2.053111139127266, + "language_loss": 0.91079462, + "learning_rate": 3.915732697011183e-06, + "loss": 0.93250197, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.734375, + "step": 1995, + "time_per_iteration": 3.7944447994232178 + }, + { + "auxiliary_loss_clip": 0.01101169, + "auxiliary_loss_mlp": 0.01085154, + "balance_loss_clip": 1.03265452, + "balance_loss_mlp": 1.02711642, + "epoch": 0.1200060123252668, + "flos": 24461580562560.0, + "grad_norm": 2.1947525367916465, + "language_loss": 0.76161927, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.78348249, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7421875, + "step": 1996, + "time_per_iteration": 3.8567821979522705 + }, + { + "auxiliary_loss_clip": 0.01102286, + "auxiliary_loss_mlp": 0.01082867, + "balance_loss_clip": 1.02822185, + "balance_loss_mlp": 1.02760184, + "epoch": 0.12006613557793476, + "flos": 18731251134720.0, + "grad_norm": 1.8991834544527353, + "language_loss": 0.89453328, + "learning_rate": 3.915508833793048e-06, + "loss": 0.91638482, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.74609375, + "step": 1997, + "time_per_iteration": 2.374429941177368 + }, + { + "auxiliary_loss_clip": 0.01101032, + "auxiliary_loss_mlp": 0.01083592, + "balance_loss_clip": 1.02925682, + "balance_loss_mlp": 1.02612877, + "epoch": 0.12012625883060274, + "flos": 22266800853120.0, + "grad_norm": 1.7461969063410048, + "language_loss": 0.80896258, + "learning_rate": 3.915396793227428e-06, + "loss": 0.83080888, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.75, + "step": 1998, + "time_per_iteration": 2.4020864963531494 + }, + { + "auxiliary_loss_clip": 0.01101154, + "auxiliary_loss_mlp": 0.01080212, + "balance_loss_clip": 1.02866566, + "balance_loss_mlp": 1.02690041, + "epoch": 0.1201863820832707, + "flos": 21757906742400.0, + "grad_norm": 1.5485039456893595, + "language_loss": 0.73996842, + "learning_rate": 3.915284680029769e-06, + "loss": 0.76178205, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7421875, + "step": 1999, + "time_per_iteration": 2.4123663902282715 + }, + { + "auxiliary_loss_clip": 0.01104924, + "auxiliary_loss_mlp": 0.01077225, + "balance_loss_clip": 1.02603638, + "balance_loss_mlp": 1.02838016, + "epoch": 0.12024650533593867, + "flos": 21906915891840.0, + "grad_norm": 2.0148322726718813, + "language_loss": 0.77953172, + "learning_rate": 3.915172494204323e-06, + "loss": 0.80135322, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.765625, + "step": 2000, + "time_per_iteration": 2.4312736988067627 + }, + { + "auxiliary_loss_clip": 0.01100827, + "auxiliary_loss_mlp": 0.01077749, + "balance_loss_clip": 1.02386618, + "balance_loss_mlp": 1.02605891, + "epoch": 0.12030662858860665, + "flos": 21688150112640.0, + "grad_norm": 1.5357069174431184, + "language_loss": 0.87022698, + "learning_rate": 3.915060235755344e-06, + "loss": 0.89201272, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.75, + "step": 2001, + "time_per_iteration": 2.4093170166015625 + }, + { + "auxiliary_loss_clip": 0.01102482, + "auxiliary_loss_mlp": 0.01074914, + "balance_loss_clip": 1.0234158, + "balance_loss_mlp": 1.02663589, + "epoch": 0.12036675184127461, + "flos": 12932386974720.0, + "grad_norm": 2.1521722000856673, + "language_loss": 0.77409619, + "learning_rate": 3.91494790468709e-06, + "loss": 0.79587018, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7578125, + "step": 2002, + "time_per_iteration": 2.3760640621185303 + }, + { + "auxiliary_loss_clip": 0.01104877, + "auxiliary_loss_mlp": 0.01078146, + "balance_loss_clip": 1.0212357, + "balance_loss_mlp": 1.02616906, + "epoch": 0.12042687509394258, + "flos": 20849955258240.0, + "grad_norm": 1.9742932210525594, + "language_loss": 0.80461574, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.82644594, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.7890625, + "step": 2003, + "time_per_iteration": 2.3926310539245605 + }, + { + "auxiliary_loss_clip": 0.01100392, + "auxiliary_loss_mlp": 0.01072534, + "balance_loss_clip": 1.02048755, + "balance_loss_mlp": 1.02594829, + "epoch": 0.12048699834661056, + "flos": 23877378915840.0, + "grad_norm": 1.7516043441517906, + "language_loss": 0.73556221, + "learning_rate": 3.914723024709793e-06, + "loss": 0.75729144, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.74609375, + "step": 2004, + "time_per_iteration": 2.4246857166290283 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01077836, + "balance_loss_clip": 1.022475, + "balance_loss_mlp": 1.02415192, + "epoch": 0.12054712159927852, + "flos": 19755323400960.0, + "grad_norm": 1.5278522372437715, + "language_loss": 0.8042835, + "learning_rate": 3.914610475809279e-06, + "loss": 0.82606846, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.765625, + "step": 2005, + "time_per_iteration": 2.388758897781372 + }, + { + "auxiliary_loss_clip": 0.0102637, + "auxiliary_loss_mlp": 0.01011751, + "balance_loss_clip": 1.00521803, + "balance_loss_mlp": 1.00709653, + "epoch": 0.12060724485194649, + "flos": 51670057075200.0, + "grad_norm": 0.9355256713301027, + "language_loss": 0.58146429, + "learning_rate": 3.914497854306543e-06, + "loss": 0.6018455, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.06542969, + "router_z_loss_mlp": 0.19335938, + "step": 2006, + "time_per_iteration": 2.8250136375427246 + }, + { + "auxiliary_loss_clip": 0.01097827, + "auxiliary_loss_mlp": 0.01072638, + "balance_loss_clip": 1.02268934, + "balance_loss_mlp": 1.02444279, + "epoch": 0.12066736810461445, + "flos": 18989398794240.0, + "grad_norm": 1.9284007458093175, + "language_loss": 0.78901267, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.81071734, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.734375, + "step": 2007, + "time_per_iteration": 2.413078546524048 + }, + { + "auxiliary_loss_clip": 0.01100918, + "auxiliary_loss_mlp": 0.01079794, + "balance_loss_clip": 1.02507722, + "balance_loss_mlp": 1.02491093, + "epoch": 0.12072749135728243, + "flos": 16471043804160.0, + "grad_norm": 2.750731650383216, + "language_loss": 0.8665058, + "learning_rate": 3.914272393511494e-06, + "loss": 0.88831294, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.7578125, + "step": 2008, + "time_per_iteration": 2.42895770072937 + }, + { + "auxiliary_loss_clip": 0.01097345, + "auxiliary_loss_mlp": 0.01078803, + "balance_loss_clip": 1.02804422, + "balance_loss_mlp": 1.02364898, + "epoch": 0.1207876146099504, + "flos": 18076140783360.0, + "grad_norm": 1.8207046029444316, + "language_loss": 0.86446536, + "learning_rate": 3.91415955422773e-06, + "loss": 0.88622683, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.73828125, + "step": 2009, + "time_per_iteration": 2.393139362335205 + }, + { + "auxiliary_loss_clip": 0.01099216, + "auxiliary_loss_mlp": 0.01082942, + "balance_loss_clip": 1.03122866, + "balance_loss_mlp": 1.02429438, + "epoch": 0.12084773786261836, + "flos": 21870501477120.0, + "grad_norm": 1.947446039372563, + "language_loss": 0.8604666, + "learning_rate": 3.914046642358844e-06, + "loss": 0.88228816, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.75, + "step": 2010, + "time_per_iteration": 2.393658399581909 + }, + { + "auxiliary_loss_clip": 0.01101846, + "auxiliary_loss_mlp": 0.01081283, + "balance_loss_clip": 1.02799702, + "balance_loss_mlp": 1.0261147, + "epoch": 0.12090786111528634, + "flos": 18332054115840.0, + "grad_norm": 1.7712058122923342, + "language_loss": 0.85864633, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.88047767, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7578125, + "step": 2011, + "time_per_iteration": 2.373638153076172 + }, + { + "auxiliary_loss_clip": 0.01100197, + "auxiliary_loss_mlp": 0.01084869, + "balance_loss_clip": 1.03391862, + "balance_loss_mlp": 1.02493131, + "epoch": 0.1209679843679543, + "flos": 21104786338560.0, + "grad_norm": 1.9122767260484372, + "language_loss": 0.98636723, + "learning_rate": 3.913820600882834e-06, + "loss": 1.00821793, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2012, + "time_per_iteration": 2.404940605163574 + }, + { + "auxiliary_loss_clip": 0.01098806, + "auxiliary_loss_mlp": 0.0107676, + "balance_loss_clip": 1.02457047, + "balance_loss_mlp": 1.02681601, + "epoch": 0.12102810762062227, + "flos": 29239793769600.0, + "grad_norm": 2.132585782696584, + "language_loss": 0.81927824, + "learning_rate": 3.913707471284283e-06, + "loss": 0.84103394, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.71875, + "step": 2013, + "time_per_iteration": 2.4583590030670166 + }, + { + "auxiliary_loss_clip": 0.01100676, + "auxiliary_loss_mlp": 0.01073223, + "balance_loss_clip": 1.02065194, + "balance_loss_mlp": 1.02498317, + "epoch": 0.12108823087329025, + "flos": 17929749985920.0, + "grad_norm": 2.6129780929695943, + "language_loss": 0.79474306, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.81648207, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7578125, + "step": 2014, + "time_per_iteration": 2.3796372413635254 + }, + { + "auxiliary_loss_clip": 0.01101031, + "auxiliary_loss_mlp": 0.01077186, + "balance_loss_clip": 1.02411413, + "balance_loss_mlp": 1.02689409, + "epoch": 0.12114835412595822, + "flos": 22090733533440.0, + "grad_norm": 6.252650995932334, + "language_loss": 0.8912667, + "learning_rate": 3.913480994387535e-06, + "loss": 0.91304892, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7421875, + "step": 2015, + "time_per_iteration": 2.396007537841797 + }, + { + "auxiliary_loss_clip": 0.01094611, + "auxiliary_loss_mlp": 0.01072568, + "balance_loss_clip": 1.02278614, + "balance_loss_mlp": 1.02302539, + "epoch": 0.12120847737862618, + "flos": 20411306536320.0, + "grad_norm": 2.158719660804708, + "language_loss": 0.72802806, + "learning_rate": 3.913367647097926e-06, + "loss": 0.74969989, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.71484375, + "step": 2016, + "time_per_iteration": 2.389314651489258 + }, + { + "auxiliary_loss_clip": 0.01103517, + "auxiliary_loss_mlp": 0.0107349, + "balance_loss_clip": 1.01681769, + "balance_loss_mlp": 1.02737439, + "epoch": 0.12126860063129415, + "flos": 22307963212800.0, + "grad_norm": 2.4298115875707773, + "language_loss": 0.82641542, + "learning_rate": 3.913254227253225e-06, + "loss": 0.84818554, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.76171875, + "step": 2017, + "time_per_iteration": 2.407562017440796 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01069905, + "balance_loss_clip": 1.01835895, + "balance_loss_mlp": 1.02759886, + "epoch": 0.12132872388396213, + "flos": 13698416315520.0, + "grad_norm": 2.5426549681984727, + "language_loss": 0.72382724, + "learning_rate": 3.913140734857731e-06, + "loss": 0.74555147, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.75, + "step": 2018, + "time_per_iteration": 2.3722572326660156 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.01073024, + "balance_loss_clip": 1.02419543, + "balance_loss_mlp": 1.02634835, + "epoch": 0.12138884713663009, + "flos": 26465804737920.0, + "grad_norm": 1.6765216002555856, + "language_loss": 0.73807836, + "learning_rate": 3.91302716991575e-06, + "loss": 0.75979853, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7265625, + "step": 2019, + "time_per_iteration": 2.4477133750915527 + }, + { + "auxiliary_loss_clip": 0.01098821, + "auxiliary_loss_mlp": 0.01073841, + "balance_loss_clip": 1.02348661, + "balance_loss_mlp": 1.02545166, + "epoch": 0.12144897038929806, + "flos": 26140379155200.0, + "grad_norm": 1.6391196921058766, + "language_loss": 0.94323379, + "learning_rate": 3.912913532431586e-06, + "loss": 0.96496046, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.734375, + "step": 2020, + "time_per_iteration": 2.4295859336853027 + }, + { + "auxiliary_loss_clip": 0.01101166, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_clip": 1.02155137, + "balance_loss_mlp": 1.02704287, + "epoch": 0.12150909364196603, + "flos": 24716376731520.0, + "grad_norm": 3.016657971970353, + "language_loss": 0.80003452, + "learning_rate": 3.912799822409549e-06, + "loss": 0.82173038, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.7421875, + "step": 2021, + "time_per_iteration": 2.4687986373901367 + }, + { + "auxiliary_loss_clip": 0.01097673, + "auxiliary_loss_mlp": 0.01071001, + "balance_loss_clip": 1.02634466, + "balance_loss_mlp": 1.02644575, + "epoch": 0.121569216894634, + "flos": 25185958784640.0, + "grad_norm": 1.992559760619375, + "language_loss": 0.82123077, + "learning_rate": 3.912686039853952e-06, + "loss": 0.84291744, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.7109375, + "step": 2022, + "time_per_iteration": 2.4219000339508057 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01071895, + "balance_loss_clip": 1.0240922, + "balance_loss_mlp": 1.02610743, + "epoch": 0.12162934014730196, + "flos": 13443236121600.0, + "grad_norm": 1.7304771420679124, + "language_loss": 0.87428689, + "learning_rate": 3.912572184769108e-06, + "loss": 0.8960095, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.7421875, + "step": 2023, + "time_per_iteration": 2.380903482437134 + }, + { + "auxiliary_loss_clip": 0.0109664, + "auxiliary_loss_mlp": 0.01072186, + "balance_loss_clip": 1.02314281, + "balance_loss_mlp": 1.02475595, + "epoch": 0.12168946339996994, + "flos": 16945199245440.0, + "grad_norm": 2.24343236522217, + "language_loss": 0.88681155, + "learning_rate": 3.912458257159335e-06, + "loss": 0.90849984, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.71875, + "step": 2024, + "time_per_iteration": 2.3560869693756104 + }, + { + "auxiliary_loss_clip": 0.01096249, + "auxiliary_loss_mlp": 0.01074245, + "balance_loss_clip": 1.02524996, + "balance_loss_mlp": 1.02344918, + "epoch": 0.12174958665263791, + "flos": 29820399546240.0, + "grad_norm": 1.9362419442833214, + "language_loss": 0.73329562, + "learning_rate": 3.912344257028954e-06, + "loss": 0.75500059, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7265625, + "step": 2025, + "time_per_iteration": 2.4567461013793945 + }, + { + "auxiliary_loss_clip": 0.01098404, + "auxiliary_loss_mlp": 0.0106188, + "balance_loss_clip": 1.01662803, + "balance_loss_mlp": 1.0265224, + "epoch": 0.12180970990530587, + "flos": 24640824816000.0, + "grad_norm": 1.7373914318618429, + "language_loss": 0.77455968, + "learning_rate": 3.912230184382286e-06, + "loss": 0.79616255, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.71875, + "step": 2026, + "time_per_iteration": 2.4456303119659424 + }, + { + "auxiliary_loss_clip": 0.0109705, + "auxiliary_loss_mlp": 0.01073019, + "balance_loss_clip": 1.02335608, + "balance_loss_mlp": 1.02471697, + "epoch": 0.12186983315797385, + "flos": 20520654514560.0, + "grad_norm": 2.0902975911179387, + "language_loss": 0.91106635, + "learning_rate": 3.912116039223659e-06, + "loss": 0.93276703, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.72265625, + "step": 2027, + "time_per_iteration": 2.3931589126586914 + }, + { + "auxiliary_loss_clip": 0.01094783, + "auxiliary_loss_mlp": 0.01059222, + "balance_loss_clip": 1.01554406, + "balance_loss_mlp": 1.02400589, + "epoch": 0.12192995641064182, + "flos": 27817117977600.0, + "grad_norm": 1.727760463443916, + "language_loss": 0.77637988, + "learning_rate": 3.912001821557399e-06, + "loss": 0.79791999, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.70703125, + "step": 2028, + "time_per_iteration": 2.442688465118408 + }, + { + "auxiliary_loss_clip": 0.01097496, + "auxiliary_loss_mlp": 0.01071325, + "balance_loss_clip": 1.02337885, + "balance_loss_mlp": 1.02546561, + "epoch": 0.12199007966330978, + "flos": 22016054401920.0, + "grad_norm": 2.158207453716294, + "language_loss": 0.78875971, + "learning_rate": 3.911887531387839e-06, + "loss": 0.81044793, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.71875, + "step": 2029, + "time_per_iteration": 2.424067735671997 + }, + { + "auxiliary_loss_clip": 0.0109384, + "auxiliary_loss_mlp": 0.01062802, + "balance_loss_clip": 1.01731169, + "balance_loss_mlp": 1.02459514, + "epoch": 0.12205020291597775, + "flos": 23294084964480.0, + "grad_norm": 1.821161545617599, + "language_loss": 0.81320238, + "learning_rate": 3.911773168719313e-06, + "loss": 0.83476877, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.69140625, + "step": 2030, + "time_per_iteration": 2.52874493598938 + }, + { + "auxiliary_loss_clip": 0.01094847, + "auxiliary_loss_mlp": 0.01065589, + "balance_loss_clip": 1.02040911, + "balance_loss_mlp": 1.02542949, + "epoch": 0.12211032616864573, + "flos": 26030402772480.0, + "grad_norm": 2.1154043574589516, + "language_loss": 0.77602094, + "learning_rate": 3.911658733556155e-06, + "loss": 0.7976253, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.6953125, + "step": 2031, + "time_per_iteration": 2.462101936340332 + }, + { + "auxiliary_loss_clip": 0.01094894, + "auxiliary_loss_mlp": 0.0106596, + "balance_loss_clip": 1.0229013, + "balance_loss_mlp": 1.02499247, + "epoch": 0.12217044942131369, + "flos": 20409944993280.0, + "grad_norm": 1.777755454066369, + "language_loss": 0.76476181, + "learning_rate": 3.911544225902707e-06, + "loss": 0.78637034, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6953125, + "step": 2032, + "time_per_iteration": 3.8741612434387207 + }, + { + "auxiliary_loss_clip": 0.01090749, + "auxiliary_loss_mlp": 0.01055686, + "balance_loss_clip": 1.01880288, + "balance_loss_mlp": 1.02480173, + "epoch": 0.12223057267398166, + "flos": 22856029735680.0, + "grad_norm": 1.5465644651536612, + "language_loss": 0.90926754, + "learning_rate": 3.911429645763311e-06, + "loss": 0.93073189, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.66015625, + "step": 2033, + "time_per_iteration": 2.4857101440429688 + }, + { + "auxiliary_loss_clip": 0.0109937, + "auxiliary_loss_mlp": 0.01065909, + "balance_loss_clip": 1.02001321, + "balance_loss_mlp": 1.02723336, + "epoch": 0.12229069592664964, + "flos": 20046533984640.0, + "grad_norm": 2.2788883366632278, + "language_loss": 0.67856073, + "learning_rate": 3.911314993142311e-06, + "loss": 0.70021349, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.72265625, + "step": 2034, + "time_per_iteration": 3.8004846572875977 + }, + { + "auxiliary_loss_clip": 0.01095609, + "auxiliary_loss_mlp": 0.01061323, + "balance_loss_clip": 1.01933777, + "balance_loss_mlp": 1.02516866, + "epoch": 0.1223508191793176, + "flos": 22273119809280.0, + "grad_norm": 2.112112280464022, + "language_loss": 0.78143543, + "learning_rate": 3.911200268044055e-06, + "loss": 0.80300474, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.703125, + "step": 2035, + "time_per_iteration": 5.227928876876831 + }, + { + "auxiliary_loss_clip": 0.01097873, + "auxiliary_loss_mlp": 0.01070365, + "balance_loss_clip": 1.02473176, + "balance_loss_mlp": 1.026021, + "epoch": 0.12241094243198557, + "flos": 21284973198720.0, + "grad_norm": 2.1026026932902924, + "language_loss": 0.73358333, + "learning_rate": 3.911085470472892e-06, + "loss": 0.75526571, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.71875, + "step": 2036, + "time_per_iteration": 2.43646240234375 + }, + { + "auxiliary_loss_clip": 0.0109354, + "auxiliary_loss_mlp": 0.01063849, + "balance_loss_clip": 1.01919365, + "balance_loss_mlp": 1.0246433, + "epoch": 0.12247106568465355, + "flos": 17381473994880.0, + "grad_norm": 1.6312314502071057, + "language_loss": 0.84441555, + "learning_rate": 3.910970600433178e-06, + "loss": 0.86598945, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6875, + "step": 2037, + "time_per_iteration": 2.403738260269165 + }, + { + "auxiliary_loss_clip": 0.01097735, + "auxiliary_loss_mlp": 0.01068362, + "balance_loss_clip": 1.02234674, + "balance_loss_mlp": 1.0257194, + "epoch": 0.12253118893732151, + "flos": 27044420567040.0, + "grad_norm": 2.7378280757705764, + "language_loss": 0.83843458, + "learning_rate": 3.910855657929267e-06, + "loss": 0.8600955, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.72265625, + "step": 2038, + "time_per_iteration": 2.4489896297454834 + }, + { + "auxiliary_loss_clip": 0.0102724, + "auxiliary_loss_mlp": 0.01014643, + "balance_loss_clip": 1.00932586, + "balance_loss_mlp": 1.01020432, + "epoch": 0.12259131218998948, + "flos": 53858762208000.0, + "grad_norm": 0.8288294661885447, + "language_loss": 0.58852047, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60893929, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.16992188, + "step": 2039, + "time_per_iteration": 2.899082660675049 + }, + { + "auxiliary_loss_clip": 0.01095161, + "auxiliary_loss_mlp": 0.01065488, + "balance_loss_clip": 1.01856744, + "balance_loss_mlp": 1.0251404, + "epoch": 0.12265143544265744, + "flos": 17891031421440.0, + "grad_norm": 3.475170532777588, + "language_loss": 0.83230424, + "learning_rate": 3.910625555546292e-06, + "loss": 0.85391068, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.69921875, + "step": 2040, + "time_per_iteration": 2.396820306777954 + }, + { + "auxiliary_loss_clip": 0.01094307, + "auxiliary_loss_mlp": 0.01059766, + "balance_loss_clip": 1.01580119, + "balance_loss_mlp": 1.02418733, + "epoch": 0.12271155869532542, + "flos": 21798824722560.0, + "grad_norm": 1.8250902053084934, + "language_loss": 0.84842402, + "learning_rate": 3.910510395675953e-06, + "loss": 0.86996472, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.69921875, + "step": 2041, + "time_per_iteration": 2.4108901023864746 + }, + { + "auxiliary_loss_clip": 0.01100113, + "auxiliary_loss_mlp": 0.01074342, + "balance_loss_clip": 1.02332067, + "balance_loss_mlp": 1.02571213, + "epoch": 0.12277168194799339, + "flos": 19827733294080.0, + "grad_norm": 2.04895345969584, + "language_loss": 0.70323157, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.72497612, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.7421875, + "step": 2042, + "time_per_iteration": 2.396517515182495 + }, + { + "auxiliary_loss_clip": 0.01102181, + "auxiliary_loss_mlp": 0.01074845, + "balance_loss_clip": 1.02532506, + "balance_loss_mlp": 1.02647924, + "epoch": 0.12283180520066135, + "flos": 23219929503360.0, + "grad_norm": 1.84579155283626, + "language_loss": 0.8289305, + "learning_rate": 3.910279858599409e-06, + "loss": 0.85070086, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.7578125, + "step": 2043, + "time_per_iteration": 2.4528863430023193 + }, + { + "auxiliary_loss_clip": 0.01104281, + "auxiliary_loss_mlp": 0.01072384, + "balance_loss_clip": 1.01983666, + "balance_loss_mlp": 1.02717531, + "epoch": 0.12289192845332933, + "flos": 18587478689280.0, + "grad_norm": 1.5994070044506536, + "language_loss": 0.81927812, + "learning_rate": 3.910164481401946e-06, + "loss": 0.84104484, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.7734375, + "step": 2044, + "time_per_iteration": 2.430948257446289 + }, + { + "auxiliary_loss_clip": 0.01100648, + "auxiliary_loss_mlp": 0.01070459, + "balance_loss_clip": 1.02391958, + "balance_loss_mlp": 1.03021646, + "epoch": 0.1229520517059973, + "flos": 25768519597440.0, + "grad_norm": 2.0812957377747, + "language_loss": 0.79213905, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8138501, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.703125, + "step": 2045, + "time_per_iteration": 2.4407637119293213 + }, + { + "auxiliary_loss_clip": 0.01104067, + "auxiliary_loss_mlp": 0.01084356, + "balance_loss_clip": 1.02770782, + "balance_loss_mlp": 1.02807474, + "epoch": 0.12301217495866526, + "flos": 20886090382080.0, + "grad_norm": 2.6035637219847394, + "language_loss": 0.69962198, + "learning_rate": 3.90993350971051e-06, + "loss": 0.72150624, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.76171875, + "step": 2046, + "time_per_iteration": 2.403661012649536 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01072368, + "balance_loss_clip": 1.02525663, + "balance_loss_mlp": 1.02933848, + "epoch": 0.12307229821133324, + "flos": 22377824576640.0, + "grad_norm": 3.3185583231086673, + "language_loss": 0.74473155, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7664808, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.734375, + "step": 2047, + "time_per_iteration": 2.439704179763794 + }, + { + "auxiliary_loss_clip": 0.01106395, + "auxiliary_loss_mlp": 0.01066882, + "balance_loss_clip": 1.01702857, + "balance_loss_mlp": 1.03078938, + "epoch": 0.1231324214640012, + "flos": 23366285389440.0, + "grad_norm": 1.6507888626790412, + "language_loss": 0.79207921, + "learning_rate": 3.909702248319597e-06, + "loss": 0.81381202, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.7578125, + "step": 2048, + "time_per_iteration": 2.4446001052856445 + }, + { + "auxiliary_loss_clip": 0.01101361, + "auxiliary_loss_mlp": 0.01075816, + "balance_loss_clip": 1.02856112, + "balance_loss_mlp": 1.02796519, + "epoch": 0.12319254471666917, + "flos": 23766075901440.0, + "grad_norm": 2.1154955643251507, + "language_loss": 0.8764677, + "learning_rate": 3.909586508997797e-06, + "loss": 0.89823949, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.734375, + "step": 2049, + "time_per_iteration": 2.439011573791504 + }, + { + "auxiliary_loss_clip": 0.01104118, + "auxiliary_loss_mlp": 0.01073127, + "balance_loss_clip": 1.02220082, + "balance_loss_mlp": 1.02865362, + "epoch": 0.12325266796933713, + "flos": 23549020778880.0, + "grad_norm": 1.4742929209807694, + "language_loss": 0.78183985, + "learning_rate": 3.909470697264285e-06, + "loss": 0.80361229, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75390625, + "step": 2050, + "time_per_iteration": 2.433151960372925 + }, + { + "auxiliary_loss_clip": 0.01101641, + "auxiliary_loss_mlp": 0.01072036, + "balance_loss_clip": 1.02325547, + "balance_loss_mlp": 1.02693617, + "epoch": 0.12331279122200511, + "flos": 24422896909440.0, + "grad_norm": 2.0343430057138967, + "language_loss": 0.83939666, + "learning_rate": 3.909354813123452e-06, + "loss": 0.8611334, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.74609375, + "step": 2051, + "time_per_iteration": 2.451732635498047 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01070648, + "balance_loss_clip": 1.02451372, + "balance_loss_mlp": 1.02879846, + "epoch": 0.12337291447467308, + "flos": 25483104299520.0, + "grad_norm": 1.558967632639031, + "language_loss": 0.81091416, + "learning_rate": 3.909238856579693e-06, + "loss": 0.8326394, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.73046875, + "step": 2052, + "time_per_iteration": 2.447704315185547 + }, + { + "auxiliary_loss_clip": 0.01101289, + "auxiliary_loss_mlp": 0.0107629, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.02740407, + "epoch": 0.12343303772734104, + "flos": 23548881133440.0, + "grad_norm": 2.200160358247917, + "language_loss": 0.76149035, + "learning_rate": 3.909122827637406e-06, + "loss": 0.78326607, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.73828125, + "step": 2053, + "time_per_iteration": 2.4194579124450684 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01068813, + "balance_loss_clip": 1.01872158, + "balance_loss_mlp": 1.02509356, + "epoch": 0.12349316098000902, + "flos": 47555299900800.0, + "grad_norm": 1.4602745343085053, + "language_loss": 0.7615273, + "learning_rate": 3.909006726300991e-06, + "loss": 0.78322452, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7578125, + "step": 2054, + "time_per_iteration": 2.62043833732605 + }, + { + "auxiliary_loss_clip": 0.01094362, + "auxiliary_loss_mlp": 0.010621, + "balance_loss_clip": 1.01725316, + "balance_loss_mlp": 1.02342665, + "epoch": 0.12355328423267699, + "flos": 25044804691200.0, + "grad_norm": 1.654600054283012, + "language_loss": 0.86399066, + "learning_rate": 3.908890552574849e-06, + "loss": 0.88555527, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.7109375, + "step": 2055, + "time_per_iteration": 2.4431726932525635 + }, + { + "auxiliary_loss_clip": 0.0110285, + "auxiliary_loss_mlp": 0.01076264, + "balance_loss_clip": 1.02548099, + "balance_loss_mlp": 1.02694726, + "epoch": 0.12361340748534495, + "flos": 27707909644800.0, + "grad_norm": 1.8772361462237437, + "language_loss": 0.80354464, + "learning_rate": 3.908774306463384e-06, + "loss": 0.82533586, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7578125, + "step": 2056, + "time_per_iteration": 2.453562021255493 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01071278, + "balance_loss_clip": 1.01610827, + "balance_loss_mlp": 1.02515936, + "epoch": 0.12367353073801293, + "flos": 26139401637120.0, + "grad_norm": 2.133033260603137, + "language_loss": 0.84793949, + "learning_rate": 3.908657987971009e-06, + "loss": 0.86964554, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.7421875, + "step": 2057, + "time_per_iteration": 2.4447827339172363 + }, + { + "auxiliary_loss_clip": 0.01102722, + "auxiliary_loss_mlp": 0.01075825, + "balance_loss_clip": 1.02399325, + "balance_loss_mlp": 1.02526152, + "epoch": 0.1237336539906809, + "flos": 25154850896640.0, + "grad_norm": 1.5515261900462065, + "language_loss": 0.80114263, + "learning_rate": 3.90854159710213e-06, + "loss": 0.82292807, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7734375, + "step": 2058, + "time_per_iteration": 2.4445579051971436 + }, + { + "auxiliary_loss_clip": 0.01102928, + "auxiliary_loss_mlp": 0.01077654, + "balance_loss_clip": 1.02369976, + "balance_loss_mlp": 1.02615619, + "epoch": 0.12379377724334886, + "flos": 15303687851520.0, + "grad_norm": 1.8414191723691204, + "language_loss": 0.85353434, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.8753401, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.765625, + "step": 2059, + "time_per_iteration": 2.396862745285034 + }, + { + "auxiliary_loss_clip": 0.01104923, + "auxiliary_loss_mlp": 0.01083039, + "balance_loss_clip": 1.02696323, + "balance_loss_mlp": 1.02706861, + "epoch": 0.12385390049601683, + "flos": 21315871618560.0, + "grad_norm": 2.5997271222125455, + "language_loss": 0.8355009, + "learning_rate": 3.908308598252523e-06, + "loss": 0.85738051, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.77734375, + "step": 2060, + "time_per_iteration": 2.385915756225586 + }, + { + "auxiliary_loss_clip": 0.01100441, + "auxiliary_loss_mlp": 0.01074287, + "balance_loss_clip": 1.02340877, + "balance_loss_mlp": 1.02541351, + "epoch": 0.1239140237486848, + "flos": 15115576112640.0, + "grad_norm": 1.8742015410889903, + "language_loss": 0.88349009, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.90523732, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2061, + "time_per_iteration": 2.385793685913086 + }, + { + "auxiliary_loss_clip": 0.01097072, + "auxiliary_loss_mlp": 0.01072707, + "balance_loss_clip": 1.01970649, + "balance_loss_mlp": 1.02607954, + "epoch": 0.12397414700135277, + "flos": 21975834648960.0, + "grad_norm": 1.76867728680214, + "language_loss": 0.86641526, + "learning_rate": 3.908075309949906e-06, + "loss": 0.88811308, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7109375, + "step": 2062, + "time_per_iteration": 2.377290725708008 + }, + { + "auxiliary_loss_clip": 0.01097417, + "auxiliary_loss_mlp": 0.0107754, + "balance_loss_clip": 1.02618504, + "balance_loss_mlp": 1.02531052, + "epoch": 0.12403427025402074, + "flos": 13400223459840.0, + "grad_norm": 1.711058523759988, + "language_loss": 0.80737555, + "learning_rate": 3.907958557264774e-06, + "loss": 0.82912517, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.71875, + "step": 2063, + "time_per_iteration": 2.3697285652160645 + }, + { + "auxiliary_loss_clip": 0.01101941, + "auxiliary_loss_mlp": 0.01068925, + "balance_loss_clip": 1.01811814, + "balance_loss_mlp": 1.0266819, + "epoch": 0.12409439350668872, + "flos": 15303478383360.0, + "grad_norm": 1.9201969062870314, + "language_loss": 0.81433487, + "learning_rate": 3.907841732229663e-06, + "loss": 0.83604348, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2064, + "time_per_iteration": 2.356920003890991 + }, + { + "auxiliary_loss_clip": 0.0109826, + "auxiliary_loss_mlp": 0.01076059, + "balance_loss_clip": 1.02627718, + "balance_loss_mlp": 1.02642202, + "epoch": 0.12415451675935668, + "flos": 25008215719680.0, + "grad_norm": 2.3321194103369125, + "language_loss": 0.93810534, + "learning_rate": 3.907724834849002e-06, + "loss": 0.95984852, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.71875, + "step": 2065, + "time_per_iteration": 2.4324593544006348 + }, + { + "auxiliary_loss_clip": 0.0110207, + "auxiliary_loss_mlp": 0.01070047, + "balance_loss_clip": 1.01757085, + "balance_loss_mlp": 1.02560151, + "epoch": 0.12421464001202465, + "flos": 23658543313920.0, + "grad_norm": 1.5702033567887026, + "language_loss": 0.83040226, + "learning_rate": 3.907607865127225e-06, + "loss": 0.8521235, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.765625, + "step": 2066, + "time_per_iteration": 2.417567729949951 + }, + { + "auxiliary_loss_clip": 0.01031145, + "auxiliary_loss_mlp": 0.01019319, + "balance_loss_clip": 1.01245248, + "balance_loss_mlp": 1.01137173, + "epoch": 0.12427476326469263, + "flos": 65729440604160.0, + "grad_norm": 0.9655542385750268, + "language_loss": 0.63411349, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65461808, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.19726562, + "step": 2067, + "time_per_iteration": 2.9953720569610596 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01071899, + "balance_loss_clip": 1.02459669, + "balance_loss_mlp": 1.02608716, + "epoch": 0.12433488651736059, + "flos": 24534269746560.0, + "grad_norm": 1.798849508054672, + "language_loss": 0.95304084, + "learning_rate": 3.907373708678063e-06, + "loss": 0.97475964, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.73828125, + "step": 2068, + "time_per_iteration": 2.410287857055664 + }, + { + "auxiliary_loss_clip": 0.01100036, + "auxiliary_loss_mlp": 0.01064338, + "balance_loss_clip": 1.02228117, + "balance_loss_mlp": 1.0278132, + "epoch": 0.12439500977002856, + "flos": 21030630877440.0, + "grad_norm": 1.8968719470765192, + "language_loss": 0.82948923, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.85113299, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.72265625, + "step": 2069, + "time_per_iteration": 2.4397928714752197 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01075235, + "balance_loss_clip": 1.02578688, + "balance_loss_mlp": 1.02767277, + "epoch": 0.12445513302269653, + "flos": 26829495037440.0, + "grad_norm": 1.712623774292522, + "language_loss": 0.78713715, + "learning_rate": 3.907139262917696e-06, + "loss": 0.80890441, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.73828125, + "step": 2070, + "time_per_iteration": 2.4469263553619385 + }, + { + "auxiliary_loss_clip": 0.01100706, + "auxiliary_loss_mlp": 0.01076139, + "balance_loss_clip": 1.02425909, + "balance_loss_mlp": 1.0269618, + "epoch": 0.1245152562753645, + "flos": 18367944860160.0, + "grad_norm": 2.495067701395955, + "language_loss": 0.83349824, + "learning_rate": 3.907021931556922e-06, + "loss": 0.85526669, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.73828125, + "step": 2071, + "time_per_iteration": 3.8380000591278076 + }, + { + "auxiliary_loss_clip": 0.01096935, + "auxiliary_loss_mlp": 0.01081845, + "balance_loss_clip": 1.03323174, + "balance_loss_mlp": 1.0257802, + "epoch": 0.12457537952803246, + "flos": 33106634179200.0, + "grad_norm": 1.74452817456631, + "language_loss": 0.80133927, + "learning_rate": 3.906904527881684e-06, + "loss": 0.82312709, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7109375, + "step": 2072, + "time_per_iteration": 2.5089497566223145 + }, + { + "auxiliary_loss_clip": 0.01098201, + "auxiliary_loss_mlp": 0.01071654, + "balance_loss_clip": 1.02461421, + "balance_loss_mlp": 1.0265168, + "epoch": 0.12463550278070043, + "flos": 22269209736960.0, + "grad_norm": 1.8763199360852083, + "language_loss": 0.77721858, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.79891711, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.71875, + "step": 2073, + "time_per_iteration": 3.8386478424072266 + }, + { + "auxiliary_loss_clip": 0.01095482, + "auxiliary_loss_mlp": 0.01070278, + "balance_loss_clip": 1.02516913, + "balance_loss_mlp": 1.02402532, + "epoch": 0.12469562603336841, + "flos": 14678288933760.0, + "grad_norm": 2.0039301053183087, + "language_loss": 0.91479802, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93645567, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.71484375, + "step": 2074, + "time_per_iteration": 3.768892526626587 + }, + { + "auxiliary_loss_clip": 0.01100886, + "auxiliary_loss_mlp": 0.01072452, + "balance_loss_clip": 1.02126396, + "balance_loss_mlp": 1.02616143, + "epoch": 0.12475574928603637, + "flos": 24643617724800.0, + "grad_norm": 3.2533629162476467, + "language_loss": 0.86380816, + "learning_rate": 3.906551883013728e-06, + "loss": 0.88554156, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.75, + "step": 2075, + "time_per_iteration": 3.8505442142486572 + }, + { + "auxiliary_loss_clip": 0.01099339, + "auxiliary_loss_mlp": 0.01079183, + "balance_loss_clip": 1.03049755, + "balance_loss_mlp": 1.02490568, + "epoch": 0.12481587253870434, + "flos": 21761886637440.0, + "grad_norm": 1.9032532322010431, + "language_loss": 0.74654835, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76833355, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.74609375, + "step": 2076, + "time_per_iteration": 2.401890516281128 + }, + { + "auxiliary_loss_clip": 0.0109499, + "auxiliary_loss_mlp": 0.01069416, + "balance_loss_clip": 1.02476048, + "balance_loss_mlp": 1.02540016, + "epoch": 0.12487599579137232, + "flos": 21431503641600.0, + "grad_norm": 1.9187105930682862, + "language_loss": 0.779791, + "learning_rate": 3.906316424944469e-06, + "loss": 0.80143499, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6953125, + "step": 2077, + "time_per_iteration": 2.3947150707244873 + }, + { + "auxiliary_loss_clip": 0.01097937, + "auxiliary_loss_mlp": 0.01077504, + "balance_loss_clip": 1.02784157, + "balance_loss_mlp": 1.02377677, + "epoch": 0.12493611904404028, + "flos": 16106690188800.0, + "grad_norm": 2.002542384901918, + "language_loss": 0.84979808, + "learning_rate": 3.906198587476043e-06, + "loss": 0.87155259, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.7421875, + "step": 2078, + "time_per_iteration": 2.3877267837524414 + }, + { + "auxiliary_loss_clip": 0.01102501, + "auxiliary_loss_mlp": 0.0106665, + "balance_loss_clip": 1.02044487, + "balance_loss_mlp": 1.02737248, + "epoch": 0.12499624229670825, + "flos": 21579186159360.0, + "grad_norm": 1.6538365073071768, + "language_loss": 0.771119, + "learning_rate": 3.906080677724374e-06, + "loss": 0.79281044, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.75, + "step": 2079, + "time_per_iteration": 2.485616683959961 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.0107151, + "balance_loss_clip": 1.02444601, + "balance_loss_mlp": 1.02710176, + "epoch": 0.1250563655493762, + "flos": 25697960006400.0, + "grad_norm": 2.917230522348832, + "language_loss": 0.86151254, + "learning_rate": 3.905962695693935e-06, + "loss": 0.8832469, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.75, + "step": 2080, + "time_per_iteration": 2.4205875396728516 + }, + { + "auxiliary_loss_clip": 0.01098276, + "auxiliary_loss_mlp": 0.01067822, + "balance_loss_clip": 1.02087712, + "balance_loss_mlp": 1.02595782, + "epoch": 0.12511648880204418, + "flos": 16908575362560.0, + "grad_norm": 2.169931869684115, + "language_loss": 0.87034643, + "learning_rate": 3.9058446413892e-06, + "loss": 0.89200735, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.72265625, + "step": 2081, + "time_per_iteration": 2.3674209117889404 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.0106888, + "balance_loss_clip": 1.02257872, + "balance_loss_mlp": 1.02497804, + "epoch": 0.12517661205471217, + "flos": 17566513534080.0, + "grad_norm": 2.038809405738405, + "language_loss": 0.78053987, + "learning_rate": 3.905726514814646e-06, + "loss": 0.80220252, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.72265625, + "step": 2082, + "time_per_iteration": 2.364903450012207 + }, + { + "auxiliary_loss_clip": 0.01109731, + "auxiliary_loss_mlp": 0.01073246, + "balance_loss_clip": 1.01891017, + "balance_loss_mlp": 1.02954555, + "epoch": 0.12523673530738014, + "flos": 16032883841280.0, + "grad_norm": 2.5628209157990898, + "language_loss": 0.82240182, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.84423161, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.80078125, + "step": 2083, + "time_per_iteration": 2.401794910430908 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01065215, + "balance_loss_clip": 1.01457548, + "balance_loss_mlp": 1.02602339, + "epoch": 0.1252968585600481, + "flos": 18806733227520.0, + "grad_norm": 2.259297114636559, + "language_loss": 0.91939288, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.94105411, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2084, + "time_per_iteration": 2.3839058876037598 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01070343, + "balance_loss_clip": 1.02330339, + "balance_loss_mlp": 1.02845871, + "epoch": 0.12535698181271607, + "flos": 27270343175040.0, + "grad_norm": 1.9295354144979746, + "language_loss": 0.8259905, + "learning_rate": 3.905371701516869e-06, + "loss": 0.84772325, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.74609375, + "step": 2085, + "time_per_iteration": 2.4521946907043457 + }, + { + "auxiliary_loss_clip": 0.01098976, + "auxiliary_loss_mlp": 0.0107354, + "balance_loss_clip": 1.02452087, + "balance_loss_mlp": 1.02685952, + "epoch": 0.12541710506538403, + "flos": 22053027398400.0, + "grad_norm": 1.7945688715036465, + "language_loss": 0.90490174, + "learning_rate": 3.905253285907856e-06, + "loss": 0.92662692, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.72265625, + "step": 2086, + "time_per_iteration": 2.4161536693573 + }, + { + "auxiliary_loss_clip": 0.01095198, + "auxiliary_loss_mlp": 0.01067396, + "balance_loss_clip": 1.02352715, + "balance_loss_mlp": 1.02654982, + "epoch": 0.125477228318052, + "flos": 12602388003840.0, + "grad_norm": 3.906863675830979, + "language_loss": 0.88276929, + "learning_rate": 3.905134798051447e-06, + "loss": 0.90439522, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6875, + "step": 2087, + "time_per_iteration": 2.374539375305176 + }, + { + "auxiliary_loss_clip": 0.01100024, + "auxiliary_loss_mlp": 0.01069471, + "balance_loss_clip": 1.0197612, + "balance_loss_mlp": 1.02631855, + "epoch": 0.12553735157071996, + "flos": 23877413827200.0, + "grad_norm": 1.8864615784360474, + "language_loss": 0.75776803, + "learning_rate": 3.905016237952136e-06, + "loss": 0.77946299, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.73828125, + "step": 2088, + "time_per_iteration": 2.4770774841308594 + }, + { + "auxiliary_loss_clip": 0.01028493, + "auxiliary_loss_mlp": 0.01009165, + "balance_loss_clip": 1.00215578, + "balance_loss_mlp": 1.00877571, + "epoch": 0.12559747482338796, + "flos": 69917482321920.0, + "grad_norm": 0.7543294700646492, + "language_loss": 0.61870003, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63907659, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.0703125, + "router_z_loss_mlp": 0.19726562, + "step": 2089, + "time_per_iteration": 3.0809853076934814 + }, + { + "auxiliary_loss_clip": 0.01096597, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_clip": 1.01793671, + "balance_loss_mlp": 1.02545571, + "epoch": 0.12565759807605592, + "flos": 24278426236800.0, + "grad_norm": 2.480972761368767, + "language_loss": 0.80346167, + "learning_rate": 3.904778901042793e-06, + "loss": 0.82508767, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7109375, + "step": 2090, + "time_per_iteration": 2.4240455627441406 + }, + { + "auxiliary_loss_clip": 0.01026659, + "auxiliary_loss_mlp": 0.01007729, + "balance_loss_clip": 1.00033796, + "balance_loss_mlp": 1.0068574, + "epoch": 0.12571772132872389, + "flos": 56448375016320.0, + "grad_norm": 0.7659677819536875, + "language_loss": 0.59696865, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61731255, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.07373047, + "router_z_loss_mlp": 0.19824219, + "step": 2091, + "time_per_iteration": 2.939903974533081 + }, + { + "auxiliary_loss_clip": 0.01101397, + "auxiliary_loss_mlp": 0.01070255, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.02838159, + "epoch": 0.12577784458139185, + "flos": 41244225050880.0, + "grad_norm": 2.294099353437529, + "language_loss": 0.67198324, + "learning_rate": 3.904541275215825e-06, + "loss": 0.69369972, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.73046875, + "step": 2092, + "time_per_iteration": 2.5664381980895996 + }, + { + "auxiliary_loss_clip": 0.0110318, + "auxiliary_loss_mlp": 0.01075065, + "balance_loss_clip": 1.02556968, + "balance_loss_mlp": 1.02850103, + "epoch": 0.12583796783405982, + "flos": 19754485528320.0, + "grad_norm": 2.0937845431267648, + "language_loss": 0.83230537, + "learning_rate": 3.904422353969493e-06, + "loss": 0.85408783, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.74609375, + "step": 2093, + "time_per_iteration": 2.409550189971924 + }, + { + "auxiliary_loss_clip": 0.01100343, + "auxiliary_loss_mlp": 0.0107023, + "balance_loss_clip": 1.02252293, + "balance_loss_mlp": 1.02747798, + "epoch": 0.12589809108672778, + "flos": 22600989187200.0, + "grad_norm": 4.010100292576967, + "language_loss": 0.77535617, + "learning_rate": 3.904303360507276e-06, + "loss": 0.79706192, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.7265625, + "step": 2094, + "time_per_iteration": 2.412766695022583 + }, + { + "auxiliary_loss_clip": 0.01095888, + "auxiliary_loss_mlp": 0.01065553, + "balance_loss_clip": 1.02061129, + "balance_loss_mlp": 1.02553749, + "epoch": 0.12595821433939577, + "flos": 45221111665920.0, + "grad_norm": 1.6178884577198958, + "language_loss": 0.78845894, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.81007338, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.703125, + "step": 2095, + "time_per_iteration": 2.613774061203003 + }, + { + "auxiliary_loss_clip": 0.0110077, + "auxiliary_loss_mlp": 0.0106413, + "balance_loss_clip": 1.01709008, + "balance_loss_mlp": 1.02578056, + "epoch": 0.12601833759206374, + "flos": 14318927642880.0, + "grad_norm": 2.298427310107454, + "language_loss": 0.86728024, + "learning_rate": 3.904065156953232e-06, + "loss": 0.88892925, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.75, + "step": 2096, + "time_per_iteration": 2.3652350902557373 + }, + { + "auxiliary_loss_clip": 0.01099921, + "auxiliary_loss_mlp": 0.010802, + "balance_loss_clip": 1.02924979, + "balance_loss_mlp": 1.02683473, + "epoch": 0.1260784608447317, + "flos": 21287172614400.0, + "grad_norm": 2.041857833389885, + "language_loss": 0.77012503, + "learning_rate": 3.903945946870439e-06, + "loss": 0.79192626, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.73046875, + "step": 2097, + "time_per_iteration": 2.422727346420288 + }, + { + "auxiliary_loss_clip": 0.01099006, + "auxiliary_loss_mlp": 0.01074696, + "balance_loss_clip": 1.0268451, + "balance_loss_mlp": 1.02711439, + "epoch": 0.12613858409739967, + "flos": 26250076247040.0, + "grad_norm": 1.8806725712123253, + "language_loss": 0.88966548, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.91140246, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.71875, + "step": 2098, + "time_per_iteration": 2.4385225772857666 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.01077606, + "balance_loss_clip": 1.02293706, + "balance_loss_mlp": 1.0277276, + "epoch": 0.12619870735006763, + "flos": 21578906868480.0, + "grad_norm": 1.80885434019419, + "language_loss": 0.71467018, + "learning_rate": 3.903707310115912e-06, + "loss": 0.73648471, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.76171875, + "step": 2099, + "time_per_iteration": 2.4187145233154297 + }, + { + "auxiliary_loss_clip": 0.01100844, + "auxiliary_loss_mlp": 0.01076761, + "balance_loss_clip": 1.02621675, + "balance_loss_mlp": 1.0267241, + "epoch": 0.1262588306027356, + "flos": 23365936275840.0, + "grad_norm": 2.0459697087326916, + "language_loss": 0.83882058, + "learning_rate": 3.903587883453228e-06, + "loss": 0.86059666, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7421875, + "step": 2100, + "time_per_iteration": 2.4331295490264893 + }, + { + "auxiliary_loss_clip": 0.01103684, + "auxiliary_loss_mlp": 0.01076921, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.02695155, + "epoch": 0.12631895385540357, + "flos": 23948113063680.0, + "grad_norm": 1.937344077331392, + "language_loss": 0.83285069, + "learning_rate": 3.903468384606302e-06, + "loss": 0.8546567, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.765625, + "step": 2101, + "time_per_iteration": 2.4277701377868652 + }, + { + "auxiliary_loss_clip": 0.01026852, + "auxiliary_loss_mlp": 0.01007164, + "balance_loss_clip": 1.00058401, + "balance_loss_mlp": 1.00643969, + "epoch": 0.12637907710807156, + "flos": 70278868471680.0, + "grad_norm": 0.709846291096711, + "language_loss": 0.57129288, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59163308, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.20410156, + "step": 2102, + "time_per_iteration": 3.0667166709899902 + }, + { + "auxiliary_loss_clip": 0.01103495, + "auxiliary_loss_mlp": 0.0107421, + "balance_loss_clip": 1.02550149, + "balance_loss_mlp": 1.02853513, + "epoch": 0.12643920036073952, + "flos": 18914126169600.0, + "grad_norm": 2.213121025184689, + "language_loss": 0.95580673, + "learning_rate": 3.903229170377845e-06, + "loss": 0.97758371, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.75, + "step": 2103, + "time_per_iteration": 2.3907690048217773 + }, + { + "auxiliary_loss_clip": 0.0109308, + "auxiliary_loss_mlp": 0.01059057, + "balance_loss_clip": 1.01652336, + "balance_loss_mlp": 1.0243901, + "epoch": 0.1264993236134075, + "flos": 27781227233280.0, + "grad_norm": 1.6982247439133973, + "language_loss": 0.79580879, + "learning_rate": 3.903109455005387e-06, + "loss": 0.81733012, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6875, + "step": 2104, + "time_per_iteration": 2.449740409851074 + }, + { + "auxiliary_loss_clip": 0.01103013, + "auxiliary_loss_mlp": 0.01075021, + "balance_loss_clip": 1.0258112, + "balance_loss_mlp": 1.02943814, + "epoch": 0.12655944686607545, + "flos": 24753524284800.0, + "grad_norm": 1.7608372495154618, + "language_loss": 0.84060472, + "learning_rate": 3.902989667466828e-06, + "loss": 0.86238503, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.734375, + "step": 2105, + "time_per_iteration": 2.453557014465332 + }, + { + "auxiliary_loss_clip": 0.01102965, + "auxiliary_loss_mlp": 0.01076518, + "balance_loss_clip": 1.02640271, + "balance_loss_mlp": 1.02763617, + "epoch": 0.12661957011874342, + "flos": 24131930705280.0, + "grad_norm": 1.7849529316768553, + "language_loss": 0.84455574, + "learning_rate": 3.90286980776671e-06, + "loss": 0.86635053, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.75390625, + "step": 2106, + "time_per_iteration": 2.4228100776672363 + }, + { + "auxiliary_loss_clip": 0.01098242, + "auxiliary_loss_mlp": 0.01076602, + "balance_loss_clip": 1.02813125, + "balance_loss_mlp": 1.02697301, + "epoch": 0.12667969337141138, + "flos": 24568519656960.0, + "grad_norm": 1.7737080614408403, + "language_loss": 0.75314415, + "learning_rate": 3.902749875909578e-06, + "loss": 0.77489257, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7109375, + "step": 2107, + "time_per_iteration": 2.4417243003845215 + }, + { + "auxiliary_loss_clip": 0.01098884, + "auxiliary_loss_mlp": 0.01066212, + "balance_loss_clip": 1.02010202, + "balance_loss_mlp": 1.02894425, + "epoch": 0.12673981662407935, + "flos": 22960699591680.0, + "grad_norm": 1.978240378070436, + "language_loss": 0.8083272, + "learning_rate": 3.90262987189998e-06, + "loss": 0.82997811, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.69921875, + "step": 2108, + "time_per_iteration": 2.41288423538208 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01057533, + "balance_loss_clip": 1.01263857, + "balance_loss_mlp": 1.02517676, + "epoch": 0.12679993987674734, + "flos": 17273906496000.0, + "grad_norm": 1.7686333410104331, + "language_loss": 0.78784013, + "learning_rate": 3.902509795742467e-06, + "loss": 0.80939424, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.7265625, + "step": 2109, + "time_per_iteration": 2.397465944290161 + }, + { + "auxiliary_loss_clip": 0.0109429, + "auxiliary_loss_mlp": 0.01066094, + "balance_loss_clip": 1.02205801, + "balance_loss_mlp": 1.02429247, + "epoch": 0.1268600631294153, + "flos": 17274115964160.0, + "grad_norm": 1.6423418252906739, + "language_loss": 0.85298687, + "learning_rate": 3.902389647441592e-06, + "loss": 0.87459069, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.69921875, + "step": 2110, + "time_per_iteration": 2.4018049240112305 + }, + { + "auxiliary_loss_clip": 0.01097519, + "auxiliary_loss_mlp": 0.01068986, + "balance_loss_clip": 1.02223182, + "balance_loss_mlp": 1.02688062, + "epoch": 0.12692018638208327, + "flos": 24059904837120.0, + "grad_norm": 2.0053995889375873, + "language_loss": 0.8123129, + "learning_rate": 3.90226942700191e-06, + "loss": 0.83397794, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.7109375, + "step": 2111, + "time_per_iteration": 3.853485584259033 + }, + { + "auxiliary_loss_clip": 0.01105653, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_clip": 1.02750802, + "balance_loss_mlp": 1.02796817, + "epoch": 0.12698030963475124, + "flos": 31830558652800.0, + "grad_norm": 1.9923017631396902, + "language_loss": 0.79077971, + "learning_rate": 3.902149134427982e-06, + "loss": 0.81263793, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.77734375, + "step": 2112, + "time_per_iteration": 2.465670347213745 + }, + { + "auxiliary_loss_clip": 0.01097859, + "auxiliary_loss_mlp": 0.0106057, + "balance_loss_clip": 1.01934743, + "balance_loss_mlp": 1.02654886, + "epoch": 0.1270404328874192, + "flos": 25186691923200.0, + "grad_norm": 1.7615828064080397, + "language_loss": 0.86774397, + "learning_rate": 3.902028769724367e-06, + "loss": 0.8893283, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.7109375, + "step": 2113, + "time_per_iteration": 3.850452423095703 + }, + { + "auxiliary_loss_clip": 0.01095139, + "auxiliary_loss_mlp": 0.01067165, + "balance_loss_clip": 1.02083993, + "balance_loss_mlp": 1.02468586, + "epoch": 0.12710055614008717, + "flos": 15996434515200.0, + "grad_norm": 1.8993008356207108, + "language_loss": 0.75400722, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.77563024, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.70703125, + "step": 2114, + "time_per_iteration": 5.231007814407349 + }, + { + "auxiliary_loss_clip": 0.01099663, + "auxiliary_loss_mlp": 0.01067381, + "balance_loss_clip": 1.02241492, + "balance_loss_mlp": 1.02786088, + "epoch": 0.12716067939275516, + "flos": 15084747515520.0, + "grad_norm": 3.732960096500092, + "language_loss": 0.85291243, + "learning_rate": 3.901787823946341e-06, + "loss": 0.87458289, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.71875, + "step": 2115, + "time_per_iteration": 2.377838134765625 + }, + { + "auxiliary_loss_clip": 0.01101968, + "auxiliary_loss_mlp": 0.01074416, + "balance_loss_clip": 1.02563524, + "balance_loss_mlp": 1.02876568, + "epoch": 0.12722080264542313, + "flos": 28365463791360.0, + "grad_norm": 1.8239713692432853, + "language_loss": 0.8873347, + "learning_rate": 3.901667242881065e-06, + "loss": 0.90909851, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.734375, + "step": 2116, + "time_per_iteration": 2.472651958465576 + }, + { + "auxiliary_loss_clip": 0.01094542, + "auxiliary_loss_mlp": 0.01063708, + "balance_loss_clip": 1.01998186, + "balance_loss_mlp": 1.02571571, + "epoch": 0.1272809258980911, + "flos": 32378520441600.0, + "grad_norm": 1.8977255185679933, + "language_loss": 0.72030169, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.74188417, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6875, + "step": 2117, + "time_per_iteration": 2.506897449493408 + }, + { + "auxiliary_loss_clip": 0.01097908, + "auxiliary_loss_mlp": 0.01064841, + "balance_loss_clip": 1.02087629, + "balance_loss_mlp": 1.02690625, + "epoch": 0.12734104915075906, + "flos": 16033477334400.0, + "grad_norm": 2.1123390325401674, + "language_loss": 0.88026881, + "learning_rate": 3.901425864420852e-06, + "loss": 0.9018963, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.7109375, + "step": 2118, + "time_per_iteration": 2.3871867656707764 + }, + { + "auxiliary_loss_clip": 0.01094619, + "auxiliary_loss_mlp": 0.01072641, + "balance_loss_clip": 1.02707911, + "balance_loss_mlp": 1.02455306, + "epoch": 0.12740117240342702, + "flos": 18259330020480.0, + "grad_norm": 1.777095038175777, + "language_loss": 0.89981472, + "learning_rate": 3.901305067035068e-06, + "loss": 0.92148739, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.703125, + "step": 2119, + "time_per_iteration": 2.392143964767456 + }, + { + "auxiliary_loss_clip": 0.01098346, + "auxiliary_loss_mlp": 0.01069956, + "balance_loss_clip": 1.02572966, + "balance_loss_mlp": 1.02686262, + "epoch": 0.127461295656095, + "flos": 12121215379200.0, + "grad_norm": 2.338813219628736, + "language_loss": 0.89659369, + "learning_rate": 3.901184197551605e-06, + "loss": 0.91827679, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.71484375, + "step": 2120, + "time_per_iteration": 2.338932514190674 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_clip": 1.01883233, + "balance_loss_mlp": 1.02729821, + "epoch": 0.12752141890876295, + "flos": 23147973457920.0, + "grad_norm": 1.7869999541678403, + "language_loss": 0.7796129, + "learning_rate": 3.901063255975046e-06, + "loss": 0.80125207, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.71875, + "step": 2121, + "time_per_iteration": 2.4427952766418457 + }, + { + "auxiliary_loss_clip": 0.01096737, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.02339578, + "balance_loss_mlp": 1.02585292, + "epoch": 0.12758154216143094, + "flos": 21614937258240.0, + "grad_norm": 2.2278393657309, + "language_loss": 0.84214371, + "learning_rate": 3.900942242309978e-06, + "loss": 0.86383355, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7109375, + "step": 2122, + "time_per_iteration": 2.390829086303711 + }, + { + "auxiliary_loss_clip": 0.01096579, + "auxiliary_loss_mlp": 0.01065645, + "balance_loss_clip": 1.02137053, + "balance_loss_mlp": 1.02663898, + "epoch": 0.1276416654140989, + "flos": 15923954799360.0, + "grad_norm": 1.8299033224575254, + "language_loss": 0.8157838, + "learning_rate": 3.90082115656099e-06, + "loss": 0.83740604, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.69921875, + "step": 2123, + "time_per_iteration": 2.38773775100708 + }, + { + "auxiliary_loss_clip": 0.011, + "auxiliary_loss_mlp": 0.01070653, + "balance_loss_clip": 1.02695119, + "balance_loss_mlp": 1.02825522, + "epoch": 0.12770178866676687, + "flos": 22381595003520.0, + "grad_norm": 1.626708526647291, + "language_loss": 0.81140411, + "learning_rate": 3.900699998732673e-06, + "loss": 0.83311069, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.71875, + "step": 2124, + "time_per_iteration": 2.4497182369232178 + }, + { + "auxiliary_loss_clip": 0.01098105, + "auxiliary_loss_mlp": 0.01075566, + "balance_loss_clip": 1.02921736, + "balance_loss_mlp": 1.02514982, + "epoch": 0.12776191191943484, + "flos": 21651421495680.0, + "grad_norm": 1.920609281260963, + "language_loss": 0.76993394, + "learning_rate": 3.900578768829623e-06, + "loss": 0.79167068, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.7265625, + "step": 2125, + "time_per_iteration": 2.389789581298828 + }, + { + "auxiliary_loss_clip": 0.01098346, + "auxiliary_loss_mlp": 0.01065025, + "balance_loss_clip": 1.0198921, + "balance_loss_mlp": 1.0265789, + "epoch": 0.1278220351721028, + "flos": 25734479155200.0, + "grad_norm": 2.1006449814837818, + "language_loss": 0.7968539, + "learning_rate": 3.900457466856434e-06, + "loss": 0.81848764, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.71875, + "step": 2126, + "time_per_iteration": 2.4288575649261475 + }, + { + "auxiliary_loss_clip": 0.01097925, + "auxiliary_loss_mlp": 0.0107224, + "balance_loss_clip": 1.02713084, + "balance_loss_mlp": 1.02741504, + "epoch": 0.12788215842477077, + "flos": 41241676521600.0, + "grad_norm": 1.471003300986322, + "language_loss": 0.71418953, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7358911, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.70703125, + "step": 2127, + "time_per_iteration": 2.591374158859253 + }, + { + "auxiliary_loss_clip": 0.01035868, + "auxiliary_loss_mlp": 0.01010503, + "balance_loss_clip": 1.00416136, + "balance_loss_mlp": 1.01463032, + "epoch": 0.12794228167743876, + "flos": 70873822817280.0, + "grad_norm": 0.852645922483622, + "language_loss": 0.62999916, + "learning_rate": 3.900214646718047e-06, + "loss": 0.65046287, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.06347656, + "router_z_loss_mlp": 0.21289062, + "step": 2128, + "time_per_iteration": 3.0717384815216064 + }, + { + "auxiliary_loss_clip": 0.01098301, + "auxiliary_loss_mlp": 0.01067938, + "balance_loss_clip": 1.02177978, + "balance_loss_mlp": 1.02620006, + "epoch": 0.12800240493010673, + "flos": 16288797173760.0, + "grad_norm": 2.755545132945294, + "language_loss": 0.8059653, + "learning_rate": 3.900093128562056e-06, + "loss": 0.82762766, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.71875, + "step": 2129, + "time_per_iteration": 2.473583459854126 + }, + { + "auxiliary_loss_clip": 0.01103813, + "auxiliary_loss_mlp": 0.01071043, + "balance_loss_clip": 1.01854289, + "balance_loss_mlp": 1.02782166, + "epoch": 0.1280625281827747, + "flos": 20630491251840.0, + "grad_norm": 2.4974660448896677, + "language_loss": 0.8254205, + "learning_rate": 3.899971538354343e-06, + "loss": 0.84716904, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7578125, + "step": 2130, + "time_per_iteration": 2.421613931655884 + }, + { + "auxiliary_loss_clip": 0.01097476, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.02278936, + "balance_loss_mlp": 1.02525949, + "epoch": 0.12812265143544266, + "flos": 22637124311040.0, + "grad_norm": 1.7984073623287196, + "language_loss": 0.72785008, + "learning_rate": 3.899849876099518e-06, + "loss": 0.749542, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.72265625, + "step": 2131, + "time_per_iteration": 2.433311700820923 + }, + { + "auxiliary_loss_clip": 0.01099444, + "auxiliary_loss_mlp": 0.01066751, + "balance_loss_clip": 1.0191623, + "balance_loss_mlp": 1.02744675, + "epoch": 0.12818277468811062, + "flos": 34713267258240.0, + "grad_norm": 2.6532069950252097, + "language_loss": 0.75361168, + "learning_rate": 3.899728141802197e-06, + "loss": 0.77527362, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.71875, + "step": 2132, + "time_per_iteration": 2.512260675430298 + }, + { + "auxiliary_loss_clip": 0.01097022, + "auxiliary_loss_mlp": 0.01066395, + "balance_loss_clip": 1.01923525, + "balance_loss_mlp": 1.02742743, + "epoch": 0.1282428979407786, + "flos": 23111000461440.0, + "grad_norm": 2.130297013143634, + "language_loss": 0.8366586, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.85829276, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6953125, + "step": 2133, + "time_per_iteration": 2.459123134613037 + }, + { + "auxiliary_loss_clip": 0.0110779, + "auxiliary_loss_mlp": 0.01078903, + "balance_loss_clip": 1.02707124, + "balance_loss_mlp": 1.02944136, + "epoch": 0.12830302119344655, + "flos": 20885461977600.0, + "grad_norm": 2.5477247497376783, + "language_loss": 0.82169557, + "learning_rate": 3.899484457098528e-06, + "loss": 0.84356248, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.78515625, + "step": 2134, + "time_per_iteration": 2.4053640365600586 + }, + { + "auxiliary_loss_clip": 0.0110127, + "auxiliary_loss_mlp": 0.01077543, + "balance_loss_clip": 1.02759445, + "balance_loss_mlp": 1.02866387, + "epoch": 0.12836314444611455, + "flos": 21396695149440.0, + "grad_norm": 1.7665007290235353, + "language_loss": 0.85576051, + "learning_rate": 3.899362506701421e-06, + "loss": 0.87754869, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7265625, + "step": 2135, + "time_per_iteration": 2.4040963649749756 + }, + { + "auxiliary_loss_clip": 0.0109949, + "auxiliary_loss_mlp": 0.01073129, + "balance_loss_clip": 1.02484977, + "balance_loss_mlp": 1.02775538, + "epoch": 0.1284232676987825, + "flos": 13661617875840.0, + "grad_norm": 2.178735363994609, + "language_loss": 0.80063188, + "learning_rate": 3.899240484280298e-06, + "loss": 0.82235807, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.71875, + "step": 2136, + "time_per_iteration": 2.4106063842773438 + }, + { + "auxiliary_loss_clip": 0.01029612, + "auxiliary_loss_mlp": 0.01024085, + "balance_loss_clip": 1.01564538, + "balance_loss_mlp": 1.00863743, + "epoch": 0.12848339095145048, + "flos": 59991709968000.0, + "grad_norm": 0.902831602181296, + "language_loss": 0.59206522, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61260223, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.08447266, + "router_z_loss_mlp": 0.20996094, + "step": 2137, + "time_per_iteration": 3.139983892440796 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_clip": 1.02513027, + "balance_loss_mlp": 1.02843142, + "epoch": 0.12854351420411844, + "flos": 13880523300480.0, + "grad_norm": 2.503153097920799, + "language_loss": 0.85267538, + "learning_rate": 3.898996223384512e-06, + "loss": 0.87443519, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.73828125, + "step": 2138, + "time_per_iteration": 2.374898910522461 + }, + { + "auxiliary_loss_clip": 0.01106013, + "auxiliary_loss_mlp": 0.0108006, + "balance_loss_clip": 1.03034997, + "balance_loss_mlp": 1.02957308, + "epoch": 0.1286036374567864, + "flos": 22636845020160.0, + "grad_norm": 3.723604201752595, + "language_loss": 0.81704056, + "learning_rate": 3.898873984919113e-06, + "loss": 0.83890128, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.765625, + "step": 2139, + "time_per_iteration": 2.4395737648010254 + }, + { + "auxiliary_loss_clip": 0.01104622, + "auxiliary_loss_mlp": 0.01087183, + "balance_loss_clip": 1.03661466, + "balance_loss_mlp": 1.02929151, + "epoch": 0.12866376070945437, + "flos": 16323884956800.0, + "grad_norm": 1.7946937930524334, + "language_loss": 0.86959648, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.89151454, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.75390625, + "step": 2140, + "time_per_iteration": 2.397221326828003 + }, + { + "auxiliary_loss_clip": 0.01098999, + "auxiliary_loss_mlp": 0.0107144, + "balance_loss_clip": 1.02797592, + "balance_loss_mlp": 1.02807331, + "epoch": 0.12872388396212234, + "flos": 11873750595840.0, + "grad_norm": 2.005436340285127, + "language_loss": 0.8735441, + "learning_rate": 3.898629291976476e-06, + "loss": 0.89524841, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.7109375, + "step": 2141, + "time_per_iteration": 2.399759292602539 + }, + { + "auxiliary_loss_clip": 0.01105523, + "auxiliary_loss_mlp": 0.01082706, + "balance_loss_clip": 1.03282905, + "balance_loss_mlp": 1.02897584, + "epoch": 0.12878400721479033, + "flos": 28365428880000.0, + "grad_norm": 1.9538901436228513, + "language_loss": 0.70832264, + "learning_rate": 3.898506837508518e-06, + "loss": 0.73020494, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.765625, + "step": 2142, + "time_per_iteration": 2.4588537216186523 + }, + { + "auxiliary_loss_clip": 0.01106471, + "auxiliary_loss_mlp": 0.01081504, + "balance_loss_clip": 1.03145981, + "balance_loss_mlp": 1.02978516, + "epoch": 0.1288441304674583, + "flos": 25884430911360.0, + "grad_norm": 2.273505944389879, + "language_loss": 0.86100835, + "learning_rate": 3.89838431104899e-06, + "loss": 0.88288808, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.765625, + "step": 2143, + "time_per_iteration": 2.4309492111206055 + }, + { + "auxiliary_loss_clip": 0.01106111, + "auxiliary_loss_mlp": 0.01085694, + "balance_loss_clip": 1.03476739, + "balance_loss_mlp": 1.03029275, + "epoch": 0.12890425372012626, + "flos": 20812737882240.0, + "grad_norm": 1.7238903002284929, + "language_loss": 0.83865738, + "learning_rate": 3.898261712602539e-06, + "loss": 0.86057538, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7578125, + "step": 2144, + "time_per_iteration": 2.4402389526367188 + }, + { + "auxiliary_loss_clip": 0.01102883, + "auxiliary_loss_mlp": 0.01090767, + "balance_loss_clip": 1.03879213, + "balance_loss_mlp": 1.02760077, + "epoch": 0.12896437697279423, + "flos": 22564749329280.0, + "grad_norm": 2.0313800768567836, + "language_loss": 0.81231457, + "learning_rate": 3.898139042173813e-06, + "loss": 0.83425105, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.75390625, + "step": 2145, + "time_per_iteration": 2.3943943977355957 + }, + { + "auxiliary_loss_clip": 0.01104524, + "auxiliary_loss_mlp": 0.010879, + "balance_loss_clip": 1.0311799, + "balance_loss_mlp": 1.02722168, + "epoch": 0.1290245002254622, + "flos": 17492811920640.0, + "grad_norm": 1.9097048490765307, + "language_loss": 0.84887475, + "learning_rate": 3.898016299767465e-06, + "loss": 0.87079901, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.56640625, + "router_z_loss_mlp": 0.7734375, + "step": 2146, + "time_per_iteration": 2.4047343730926514 + }, + { + "auxiliary_loss_clip": 0.01100938, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_clip": 1.03672397, + "balance_loss_mlp": 1.02816844, + "epoch": 0.12908462347813016, + "flos": 36314593810560.0, + "grad_norm": 3.559626873553877, + "language_loss": 0.73351383, + "learning_rate": 3.897893485388149e-06, + "loss": 0.75542092, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7265625, + "step": 2147, + "time_per_iteration": 2.5256307125091553 + }, + { + "auxiliary_loss_clip": 0.01102351, + "auxiliary_loss_mlp": 0.01072922, + "balance_loss_clip": 1.02333093, + "balance_loss_mlp": 1.02784872, + "epoch": 0.12914474673079815, + "flos": 22527601776000.0, + "grad_norm": 2.133264318744652, + "language_loss": 0.73575222, + "learning_rate": 3.897770599040521e-06, + "loss": 0.75750494, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.74609375, + "step": 2148, + "time_per_iteration": 2.42390775680542 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.0106908, + "balance_loss_clip": 1.02153933, + "balance_loss_mlp": 1.02845812, + "epoch": 0.12920486998346611, + "flos": 21470780787840.0, + "grad_norm": 1.6048687876951306, + "language_loss": 0.8061527, + "learning_rate": 3.897647640729242e-06, + "loss": 0.82784426, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.71484375, + "step": 2149, + "time_per_iteration": 2.4270668029785156 + }, + { + "auxiliary_loss_clip": 0.01106387, + "auxiliary_loss_mlp": 0.01078877, + "balance_loss_clip": 1.02482748, + "balance_loss_mlp": 1.03030229, + "epoch": 0.12926499323613408, + "flos": 27307316171520.0, + "grad_norm": 2.037643000542415, + "language_loss": 0.78389686, + "learning_rate": 3.897524610458975e-06, + "loss": 0.80574954, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.76171875, + "step": 2150, + "time_per_iteration": 3.9015908241271973 + }, + { + "auxiliary_loss_clip": 0.01100793, + "auxiliary_loss_mlp": 0.0108682, + "balance_loss_clip": 1.03467822, + "balance_loss_mlp": 1.02674079, + "epoch": 0.12932511648880204, + "flos": 22090035306240.0, + "grad_norm": 2.1873923320628923, + "language_loss": 0.73836011, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.76023626, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7421875, + "step": 2151, + "time_per_iteration": 2.441594123840332 + }, + { + "auxiliary_loss_clip": 0.01101921, + "auxiliary_loss_mlp": 0.01077632, + "balance_loss_clip": 1.0254184, + "balance_loss_mlp": 1.02878642, + "epoch": 0.12938523974147, + "flos": 20301749089920.0, + "grad_norm": 1.983095765708907, + "language_loss": 0.86479634, + "learning_rate": 3.897278334060137e-06, + "loss": 0.88659191, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.734375, + "step": 2152, + "time_per_iteration": 2.384993553161621 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01084994, + "balance_loss_clip": 1.03247035, + "balance_loss_mlp": 1.02654552, + "epoch": 0.12944536299413797, + "flos": 19498956220800.0, + "grad_norm": 1.668044403525724, + "language_loss": 0.80230546, + "learning_rate": 3.897155087940906e-06, + "loss": 0.82417846, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7578125, + "step": 2153, + "time_per_iteration": 5.246873617172241 + }, + { + "auxiliary_loss_clip": 0.01102653, + "auxiliary_loss_mlp": 0.01079191, + "balance_loss_clip": 1.02421165, + "balance_loss_mlp": 1.0266118, + "epoch": 0.12950548624680594, + "flos": 27706722658560.0, + "grad_norm": 1.712649903689431, + "language_loss": 0.823228, + "learning_rate": 3.897031769881364e-06, + "loss": 0.84504646, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.76171875, + "step": 2154, + "time_per_iteration": 3.856257677078247 + }, + { + "auxiliary_loss_clip": 0.01104203, + "auxiliary_loss_mlp": 0.01077627, + "balance_loss_clip": 1.02534175, + "balance_loss_mlp": 1.02897525, + "epoch": 0.12956560949947393, + "flos": 17564802877440.0, + "grad_norm": 1.92020694841077, + "language_loss": 0.85300857, + "learning_rate": 3.896908379886188e-06, + "loss": 0.87482691, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.75390625, + "step": 2155, + "time_per_iteration": 2.434171676635742 + }, + { + "auxiliary_loss_clip": 0.01107911, + "auxiliary_loss_mlp": 0.01080043, + "balance_loss_clip": 1.02442026, + "balance_loss_mlp": 1.03027892, + "epoch": 0.1296257327521419, + "flos": 20739664673280.0, + "grad_norm": 2.4936167385423955, + "language_loss": 0.7763865, + "learning_rate": 3.896784917960055e-06, + "loss": 0.79826605, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.77734375, + "step": 2156, + "time_per_iteration": 2.4074456691741943 + }, + { + "auxiliary_loss_clip": 0.01098624, + "auxiliary_loss_mlp": 0.01072313, + "balance_loss_clip": 1.02288842, + "balance_loss_mlp": 1.02656102, + "epoch": 0.12968585600480986, + "flos": 16394898395520.0, + "grad_norm": 1.9300415881305968, + "language_loss": 0.88281727, + "learning_rate": 3.896661384107648e-06, + "loss": 0.90452671, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.71875, + "step": 2157, + "time_per_iteration": 2.372838258743286 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.02770567, + "balance_loss_mlp": 1.02739358, + "epoch": 0.12974597925747783, + "flos": 28328281326720.0, + "grad_norm": 2.482528184085084, + "language_loss": 0.83481377, + "learning_rate": 3.896537778333651e-06, + "loss": 0.85674363, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 0.78125, + "step": 2158, + "time_per_iteration": 2.482666492462158 + }, + { + "auxiliary_loss_clip": 0.01105827, + "auxiliary_loss_mlp": 0.01090516, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 1.02813244, + "epoch": 0.1298061025101458, + "flos": 9682357288320.0, + "grad_norm": 2.577298082869026, + "language_loss": 0.77145863, + "learning_rate": 3.896414100642752e-06, + "loss": 0.79342204, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.77734375, + "step": 2159, + "time_per_iteration": 2.361314058303833 + }, + { + "auxiliary_loss_clip": 0.01100475, + "auxiliary_loss_mlp": 0.010783, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.02536941, + "epoch": 0.12986622576281376, + "flos": 27708293669760.0, + "grad_norm": 2.138159615476322, + "language_loss": 0.84459567, + "learning_rate": 3.89629035103964e-06, + "loss": 0.86638349, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.75, + "step": 2160, + "time_per_iteration": 2.45601749420166 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01079133, + "balance_loss_clip": 1.02646589, + "balance_loss_mlp": 1.02615285, + "epoch": 0.12992634901548175, + "flos": 18801845637120.0, + "grad_norm": 2.0795491496572556, + "language_loss": 0.83479977, + "learning_rate": 3.896166529529008e-06, + "loss": 0.85657322, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.71875, + "step": 2161, + "time_per_iteration": 2.37758469581604 + }, + { + "auxiliary_loss_clip": 0.01105233, + "auxiliary_loss_mlp": 0.01076326, + "balance_loss_clip": 1.01886749, + "balance_loss_mlp": 1.02896333, + "epoch": 0.12998647226814972, + "flos": 29126430984960.0, + "grad_norm": 1.9394543476831771, + "language_loss": 0.84819478, + "learning_rate": 3.896042636115551e-06, + "loss": 0.87001038, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.765625, + "step": 2162, + "time_per_iteration": 2.462890863418579 + }, + { + "auxiliary_loss_clip": 0.01105679, + "auxiliary_loss_mlp": 0.01069682, + "balance_loss_clip": 1.01484585, + "balance_loss_mlp": 1.02604949, + "epoch": 0.13004659552081768, + "flos": 19572657834240.0, + "grad_norm": 4.3031566290862875, + "language_loss": 0.74970555, + "learning_rate": 3.895918670803968e-06, + "loss": 0.77145922, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.796875, + "step": 2163, + "time_per_iteration": 2.3761167526245117 + }, + { + "auxiliary_loss_clip": 0.01105477, + "auxiliary_loss_mlp": 0.01093623, + "balance_loss_clip": 1.03041792, + "balance_loss_mlp": 1.02618325, + "epoch": 0.13010671877348565, + "flos": 22489651261440.0, + "grad_norm": 1.9680069335514843, + "language_loss": 0.83958429, + "learning_rate": 3.895794633598958e-06, + "loss": 0.86157537, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.79296875, + "step": 2164, + "time_per_iteration": 2.3981714248657227 + }, + { + "auxiliary_loss_clip": 0.01103049, + "auxiliary_loss_mlp": 0.01077544, + "balance_loss_clip": 1.02561641, + "balance_loss_mlp": 1.02615595, + "epoch": 0.1301668420261536, + "flos": 23877099624960.0, + "grad_norm": 1.9581646387317124, + "language_loss": 0.74384749, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.76565349, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.76953125, + "step": 2165, + "time_per_iteration": 2.3984694480895996 + }, + { + "auxiliary_loss_clip": 0.01106716, + "auxiliary_loss_mlp": 0.01080568, + "balance_loss_clip": 1.02177358, + "balance_loss_mlp": 1.02712548, + "epoch": 0.13022696527882158, + "flos": 23148916064640.0, + "grad_norm": 1.757578930794073, + "language_loss": 0.76846951, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.79034233, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.796875, + "step": 2166, + "time_per_iteration": 2.4149773120880127 + }, + { + "auxiliary_loss_clip": 0.01103432, + "auxiliary_loss_mlp": 0.01079853, + "balance_loss_clip": 1.02539849, + "balance_loss_mlp": 1.02670789, + "epoch": 0.13028708853148954, + "flos": 26907281280000.0, + "grad_norm": 1.5480991107658018, + "language_loss": 0.84858656, + "learning_rate": 3.895422090670421e-06, + "loss": 0.87041938, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.765625, + "step": 2167, + "time_per_iteration": 2.4516680240631104 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01088183, + "balance_loss_clip": 1.02953172, + "balance_loss_mlp": 1.02715278, + "epoch": 0.13034721178415754, + "flos": 21250409086080.0, + "grad_norm": 1.593509813888936, + "language_loss": 0.8556115, + "learning_rate": 3.89529776593877e-06, + "loss": 0.87753057, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 0.765625, + "step": 2168, + "time_per_iteration": 2.411665678024292 + }, + { + "auxiliary_loss_clip": 0.01103219, + "auxiliary_loss_mlp": 0.01088589, + "balance_loss_clip": 1.03234661, + "balance_loss_mlp": 1.02483761, + "epoch": 0.1304073350368255, + "flos": 18766338917760.0, + "grad_norm": 2.5277730479883607, + "language_loss": 0.81934512, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.84126323, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.78125, + "step": 2169, + "time_per_iteration": 2.4268789291381836 + }, + { + "auxiliary_loss_clip": 0.0110484, + "auxiliary_loss_mlp": 0.01073994, + "balance_loss_clip": 1.01942003, + "balance_loss_mlp": 1.02718449, + "epoch": 0.13046745828949347, + "flos": 28363438932480.0, + "grad_norm": 1.9945692413261011, + "language_loss": 0.70927185, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.73106027, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.77734375, + "step": 2170, + "time_per_iteration": 2.444035291671753 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.0108378, + "balance_loss_clip": 1.03106558, + "balance_loss_mlp": 1.02672482, + "epoch": 0.13052758154216143, + "flos": 29603798271360.0, + "grad_norm": 1.6597732886289804, + "language_loss": 0.68830538, + "learning_rate": 3.8949243605434e-06, + "loss": 0.71016043, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.75, + "step": 2171, + "time_per_iteration": 2.462592124938965 + }, + { + "auxiliary_loss_clip": 0.01103603, + "auxiliary_loss_mlp": 0.01083711, + "balance_loss_clip": 1.02839756, + "balance_loss_mlp": 1.02755022, + "epoch": 0.1305877047948294, + "flos": 19389852622080.0, + "grad_norm": 1.797303949574266, + "language_loss": 0.74787533, + "learning_rate": 3.894799748360537e-06, + "loss": 0.76974845, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.76171875, + "step": 2172, + "time_per_iteration": 2.4035677909851074 + }, + { + "auxiliary_loss_clip": 0.01096918, + "auxiliary_loss_mlp": 0.01066742, + "balance_loss_clip": 1.01805747, + "balance_loss_mlp": 1.02596641, + "epoch": 0.13064782804749736, + "flos": 16872579884160.0, + "grad_norm": 1.6062312006489599, + "language_loss": 0.77107453, + "learning_rate": 3.894675064326678e-06, + "loss": 0.79271108, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7109375, + "step": 2173, + "time_per_iteration": 2.383920431137085 + }, + { + "auxiliary_loss_clip": 0.01104374, + "auxiliary_loss_mlp": 0.01084473, + "balance_loss_clip": 1.02367592, + "balance_loss_mlp": 1.02684903, + "epoch": 0.13070795130016533, + "flos": 24497925154560.0, + "grad_norm": 2.148086685359615, + "language_loss": 0.73761266, + "learning_rate": 3.894550308446551e-06, + "loss": 0.7595011, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 0.77734375, + "step": 2174, + "time_per_iteration": 2.4106554985046387 + }, + { + "auxiliary_loss_clip": 0.01026133, + "auxiliary_loss_mlp": 0.01023231, + "balance_loss_clip": 1.01588738, + "balance_loss_mlp": 1.00703239, + "epoch": 0.13076807455283332, + "flos": 71051042211840.0, + "grad_norm": 1.3271502743681594, + "language_loss": 0.59081382, + "learning_rate": 3.894425480724886e-06, + "loss": 0.6113075, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.07324219, + "router_z_loss_mlp": 0.19140625, + "step": 2175, + "time_per_iteration": 3.186306953430176 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.01073086, + "balance_loss_clip": 1.01982284, + "balance_loss_mlp": 1.02684224, + "epoch": 0.13082819780550128, + "flos": 20263519284480.0, + "grad_norm": 2.0233089495626286, + "language_loss": 0.82500839, + "learning_rate": 3.894300581166417e-06, + "loss": 0.84675241, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.74609375, + "step": 2176, + "time_per_iteration": 2.392143964767456 + }, + { + "auxiliary_loss_clip": 0.01103056, + "auxiliary_loss_mlp": 0.01079865, + "balance_loss_clip": 1.0249573, + "balance_loss_mlp": 1.02847028, + "epoch": 0.13088832105816925, + "flos": 34202034086400.0, + "grad_norm": 1.8140135000813216, + "language_loss": 0.76003402, + "learning_rate": 3.894175609775881e-06, + "loss": 0.78186327, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.74609375, + "step": 2177, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01098207, + "auxiliary_loss_mlp": 0.01067108, + "balance_loss_clip": 1.0163244, + "balance_loss_mlp": 1.02541482, + "epoch": 0.13094844431083721, + "flos": 17893998887040.0, + "grad_norm": 1.720955826433338, + "language_loss": 0.83740467, + "learning_rate": 3.894050566558015e-06, + "loss": 0.85905778, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7265625, + "step": 2178, + "time_per_iteration": 2.4031450748443604 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01073703, + "balance_loss_clip": 1.02313411, + "balance_loss_mlp": 1.02576351, + "epoch": 0.13100856756350518, + "flos": 17310355822080.0, + "grad_norm": 1.9384048704673258, + "language_loss": 0.76314843, + "learning_rate": 3.893925451517562e-06, + "loss": 0.78489995, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7578125, + "step": 2179, + "time_per_iteration": 2.3660812377929688 + }, + { + "auxiliary_loss_clip": 0.01098221, + "auxiliary_loss_mlp": 0.01072138, + "balance_loss_clip": 1.02371478, + "balance_loss_mlp": 1.02575374, + "epoch": 0.13106869081617314, + "flos": 22199453107200.0, + "grad_norm": 3.884586288962993, + "language_loss": 0.86351562, + "learning_rate": 3.893800264659266e-06, + "loss": 0.88521922, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7265625, + "step": 2180, + "time_per_iteration": 2.4578819274902344 + }, + { + "auxiliary_loss_clip": 0.0110077, + "auxiliary_loss_mlp": 0.01077793, + "balance_loss_clip": 1.02624714, + "balance_loss_mlp": 1.02796233, + "epoch": 0.13112881406884114, + "flos": 21762026282880.0, + "grad_norm": 4.332106160440889, + "language_loss": 0.91461575, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.93640137, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7265625, + "step": 2181, + "time_per_iteration": 2.451793909072876 + }, + { + "auxiliary_loss_clip": 0.01101838, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_clip": 1.02448988, + "balance_loss_mlp": 1.02778435, + "epoch": 0.1311889373215091, + "flos": 23329975708800.0, + "grad_norm": 2.05400055651519, + "language_loss": 0.70575041, + "learning_rate": 3.893549675508137e-06, + "loss": 0.72754347, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7421875, + "step": 2182, + "time_per_iteration": 2.4025819301605225 + }, + { + "auxiliary_loss_clip": 0.01105745, + "auxiliary_loss_mlp": 0.01071517, + "balance_loss_clip": 1.01856422, + "balance_loss_mlp": 1.02907538, + "epoch": 0.13124906057417707, + "flos": 21466381956480.0, + "grad_norm": 1.7763946611080843, + "language_loss": 0.80281943, + "learning_rate": 3.893424273224806e-06, + "loss": 0.82459199, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.765625, + "step": 2183, + "time_per_iteration": 2.4590823650360107 + }, + { + "auxiliary_loss_clip": 0.01099295, + "auxiliary_loss_mlp": 0.01075092, + "balance_loss_clip": 1.02547693, + "balance_loss_mlp": 1.02656484, + "epoch": 0.13130918382684503, + "flos": 23254284147840.0, + "grad_norm": 1.551847241482306, + "language_loss": 0.86526358, + "learning_rate": 3.893298799142636e-06, + "loss": 0.88700747, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.7265625, + "step": 2184, + "time_per_iteration": 2.417428731918335 + }, + { + "auxiliary_loss_clip": 0.0110309, + "auxiliary_loss_mlp": 0.01075245, + "balance_loss_clip": 1.0226016, + "balance_loss_mlp": 1.02787352, + "epoch": 0.131369307079513, + "flos": 20849222119680.0, + "grad_norm": 1.9253980271838569, + "language_loss": 0.83943623, + "learning_rate": 3.893173253266387e-06, + "loss": 0.86121953, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.75390625, + "step": 2185, + "time_per_iteration": 2.4021122455596924 + }, + { + "auxiliary_loss_clip": 0.01102327, + "auxiliary_loss_mlp": 0.01083768, + "balance_loss_clip": 1.0331037, + "balance_loss_mlp": 1.026546, + "epoch": 0.13142943033218096, + "flos": 17857375004160.0, + "grad_norm": 2.0324346868977017, + "language_loss": 0.7581706, + "learning_rate": 3.893047635600818e-06, + "loss": 0.78003156, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7578125, + "step": 2186, + "time_per_iteration": 2.3760697841644287 + }, + { + "auxiliary_loss_clip": 0.01098425, + "auxiliary_loss_mlp": 0.0107473, + "balance_loss_clip": 1.02270675, + "balance_loss_mlp": 1.02595532, + "epoch": 0.13148955358484893, + "flos": 20994984512640.0, + "grad_norm": 2.1810867133613843, + "language_loss": 0.82840842, + "learning_rate": 3.892921946150693e-06, + "loss": 0.85013998, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.72265625, + "step": 2187, + "time_per_iteration": 2.390354633331299 + }, + { + "auxiliary_loss_clip": 0.01026024, + "auxiliary_loss_mlp": 0.01025985, + "balance_loss_clip": 1.01845086, + "balance_loss_mlp": 1.00727439, + "epoch": 0.13154967683751692, + "flos": 70169206291200.0, + "grad_norm": 0.8526040588713295, + "language_loss": 0.59173119, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61225128, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.07519531, + "router_z_loss_mlp": 0.1875, + "step": 2188, + "time_per_iteration": 3.058379650115967 + }, + { + "auxiliary_loss_clip": 0.01098561, + "auxiliary_loss_mlp": 0.01072022, + "balance_loss_clip": 1.02233529, + "balance_loss_mlp": 1.02687764, + "epoch": 0.1316098000901849, + "flos": 20375101589760.0, + "grad_norm": 2.1819189748443946, + "language_loss": 0.75803828, + "learning_rate": 3.892670351915842e-06, + "loss": 0.77974403, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.71875, + "step": 2189, + "time_per_iteration": 2.373556137084961 + }, + { + "auxiliary_loss_clip": 0.01096778, + "auxiliary_loss_mlp": 0.01076097, + "balance_loss_clip": 1.02605259, + "balance_loss_mlp": 1.02622294, + "epoch": 0.13166992334285285, + "flos": 23220034237440.0, + "grad_norm": 1.8170873819931301, + "language_loss": 0.73954928, + "learning_rate": 3.892544447140657e-06, + "loss": 0.76127803, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.703125, + "step": 2190, + "time_per_iteration": 3.866952419281006 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_clip": 1.02997327, + "balance_loss_mlp": 1.02823138, + "epoch": 0.13173004659552082, + "flos": 23329836063360.0, + "grad_norm": 1.7045394228997708, + "language_loss": 0.76233184, + "learning_rate": 3.892418470599996e-06, + "loss": 0.78412086, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.734375, + "step": 2191, + "time_per_iteration": 2.405099630355835 + }, + { + "auxiliary_loss_clip": 0.01100217, + "auxiliary_loss_mlp": 0.01079395, + "balance_loss_clip": 1.03080559, + "balance_loss_mlp": 1.0269568, + "epoch": 0.13179016984818878, + "flos": 21250443997440.0, + "grad_norm": 1.8611697006616352, + "language_loss": 0.80638373, + "learning_rate": 3.892292422298637e-06, + "loss": 0.82817996, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.734375, + "step": 2192, + "time_per_iteration": 3.939382553100586 + }, + { + "auxiliary_loss_clip": 0.01100907, + "auxiliary_loss_mlp": 0.01076242, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.02775931, + "epoch": 0.13185029310085675, + "flos": 17777913016320.0, + "grad_norm": 1.862093829668189, + "language_loss": 0.86937112, + "learning_rate": 3.892166302241361e-06, + "loss": 0.89114261, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.73046875, + "step": 2193, + "time_per_iteration": 3.8463470935821533 + }, + { + "auxiliary_loss_clip": 0.01025657, + "auxiliary_loss_mlp": 0.01015601, + "balance_loss_clip": 1.00787592, + "balance_loss_mlp": 1.00635481, + "epoch": 0.1319104163535247, + "flos": 69848319185280.0, + "grad_norm": 0.7642247581550348, + "language_loss": 0.54157043, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56198299, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.07714844, + "router_z_loss_mlp": 0.19335938, + "step": 2194, + "time_per_iteration": 2.992501974105835 + }, + { + "auxiliary_loss_clip": 0.01100398, + "auxiliary_loss_mlp": 0.01077809, + "balance_loss_clip": 1.02771759, + "balance_loss_mlp": 1.02622354, + "epoch": 0.1319705396061927, + "flos": 25191893715840.0, + "grad_norm": 1.9800781632050022, + "language_loss": 0.73985565, + "learning_rate": 3.891913846878185e-06, + "loss": 0.76163775, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7421875, + "step": 2195, + "time_per_iteration": 2.475264549255371 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.01073103, + "balance_loss_clip": 1.02146173, + "balance_loss_mlp": 1.02726948, + "epoch": 0.13203066285886067, + "flos": 20739420293760.0, + "grad_norm": 1.5897912860088113, + "language_loss": 0.79767597, + "learning_rate": 3.891787511581859e-06, + "loss": 0.81944275, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.76171875, + "step": 2196, + "time_per_iteration": 2.398730516433716 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01068002, + "balance_loss_clip": 1.01838756, + "balance_loss_mlp": 1.02693677, + "epoch": 0.13209078611152864, + "flos": 22053306689280.0, + "grad_norm": 2.087994542189287, + "language_loss": 0.77902067, + "learning_rate": 3.89166110454876e-06, + "loss": 0.80071986, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.75, + "step": 2197, + "time_per_iteration": 2.390437602996826 + }, + { + "auxiliary_loss_clip": 0.01102531, + "auxiliary_loss_mlp": 0.0107752, + "balance_loss_clip": 1.02416193, + "balance_loss_mlp": 1.0267837, + "epoch": 0.1321509093641966, + "flos": 16284153962880.0, + "grad_norm": 2.241333516925777, + "language_loss": 0.8223244, + "learning_rate": 3.891534625783685e-06, + "loss": 0.84412491, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.7578125, + "step": 2198, + "time_per_iteration": 2.380690336227417 + }, + { + "auxiliary_loss_clip": 0.01102473, + "auxiliary_loss_mlp": 0.01072234, + "balance_loss_clip": 1.0224762, + "balance_loss_mlp": 1.02763891, + "epoch": 0.13221103261686457, + "flos": 16982067507840.0, + "grad_norm": 2.280050670343438, + "language_loss": 0.85262024, + "learning_rate": 3.891408075291425e-06, + "loss": 0.8743673, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.75, + "step": 2199, + "time_per_iteration": 2.3616855144500732 + }, + { + "auxiliary_loss_clip": 0.01103017, + "auxiliary_loss_mlp": 0.01075078, + "balance_loss_clip": 1.02469993, + "balance_loss_mlp": 1.02802777, + "epoch": 0.13227115586953253, + "flos": 34232373924480.0, + "grad_norm": 1.8971044008347986, + "language_loss": 0.70888042, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.73066139, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.75, + "step": 2200, + "time_per_iteration": 2.5392343997955322 + }, + { + "auxiliary_loss_clip": 0.01099209, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_clip": 1.02340031, + "balance_loss_mlp": 1.02655399, + "epoch": 0.13233127912220052, + "flos": 20703599372160.0, + "grad_norm": 1.6772671422531187, + "language_loss": 0.8695336, + "learning_rate": 3.891154759144557e-06, + "loss": 0.89125037, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.7265625, + "step": 2201, + "time_per_iteration": 2.399847984313965 + }, + { + "auxiliary_loss_clip": 0.01105383, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_clip": 1.02207947, + "balance_loss_mlp": 1.02817249, + "epoch": 0.1323914023748685, + "flos": 25804061228160.0, + "grad_norm": 1.7626929257480006, + "language_loss": 0.89090186, + "learning_rate": 3.891027993499554e-06, + "loss": 0.91269982, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7734375, + "step": 2202, + "time_per_iteration": 2.4358041286468506 + }, + { + "auxiliary_loss_clip": 0.01101246, + "auxiliary_loss_mlp": 0.01072023, + "balance_loss_clip": 1.02102518, + "balance_loss_mlp": 1.02812624, + "epoch": 0.13245152562753645, + "flos": 21250478908800.0, + "grad_norm": 2.162875163457208, + "language_loss": 0.74754173, + "learning_rate": 3.89090115614658e-06, + "loss": 0.76927441, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.73046875, + "step": 2203, + "time_per_iteration": 2.426265239715576 + }, + { + "auxiliary_loss_clip": 0.01103502, + "auxiliary_loss_mlp": 0.01072753, + "balance_loss_clip": 1.02335298, + "balance_loss_mlp": 1.0271883, + "epoch": 0.13251164888020442, + "flos": 26609856474240.0, + "grad_norm": 22.091661392112414, + "language_loss": 0.76117986, + "learning_rate": 3.890774247090444e-06, + "loss": 0.78294241, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.76171875, + "step": 2204, + "time_per_iteration": 2.428243398666382 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01074333, + "balance_loss_clip": 1.02471769, + "balance_loss_mlp": 1.02933431, + "epoch": 0.13257177213287238, + "flos": 29825217313920.0, + "grad_norm": 1.863317482381587, + "language_loss": 0.80477309, + "learning_rate": 3.89064726633596e-06, + "loss": 0.82654446, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.734375, + "step": 2205, + "time_per_iteration": 2.4821231365203857 + }, + { + "auxiliary_loss_clip": 0.01100521, + "auxiliary_loss_mlp": 0.01073936, + "balance_loss_clip": 1.02308106, + "balance_loss_mlp": 1.02849793, + "epoch": 0.13263189538554035, + "flos": 21287382082560.0, + "grad_norm": 2.059719799170582, + "language_loss": 0.81315291, + "learning_rate": 3.890520213887941e-06, + "loss": 0.83489746, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.71875, + "step": 2206, + "time_per_iteration": 2.3967065811157227 + }, + { + "auxiliary_loss_clip": 0.0110418, + "auxiliary_loss_mlp": 0.01078341, + "balance_loss_clip": 1.02965617, + "balance_loss_mlp": 1.0296495, + "epoch": 0.13269201863820831, + "flos": 16873138465920.0, + "grad_norm": 2.161020196177023, + "language_loss": 0.77088231, + "learning_rate": 3.890393089751208e-06, + "loss": 0.7927075, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.74609375, + "step": 2207, + "time_per_iteration": 2.3862597942352295 + }, + { + "auxiliary_loss_clip": 0.01099054, + "auxiliary_loss_mlp": 0.01069336, + "balance_loss_clip": 1.01740837, + "balance_loss_mlp": 1.02708483, + "epoch": 0.1327521418908763, + "flos": 23767786558080.0, + "grad_norm": 1.603926206442705, + "language_loss": 0.86015475, + "learning_rate": 3.890265893930578e-06, + "loss": 0.88183862, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.71875, + "step": 2208, + "time_per_iteration": 2.411311388015747 + }, + { + "auxiliary_loss_clip": 0.01096619, + "auxiliary_loss_mlp": 0.01076314, + "balance_loss_clip": 1.02791524, + "balance_loss_mlp": 1.02822804, + "epoch": 0.13281226514354427, + "flos": 26504383656960.0, + "grad_norm": 1.6122563237262262, + "language_loss": 0.86913729, + "learning_rate": 3.890138626430876e-06, + "loss": 0.89086658, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.68359375, + "step": 2209, + "time_per_iteration": 2.4462203979492188 + }, + { + "auxiliary_loss_clip": 0.01098641, + "auxiliary_loss_mlp": 0.01073012, + "balance_loss_clip": 1.02511382, + "balance_loss_mlp": 1.02627707, + "epoch": 0.13287238839621224, + "flos": 24497610952320.0, + "grad_norm": 1.8509664704096933, + "language_loss": 0.84244275, + "learning_rate": 3.890011287256929e-06, + "loss": 0.86415929, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.7265625, + "step": 2210, + "time_per_iteration": 2.405325412750244 + }, + { + "auxiliary_loss_clip": 0.01028678, + "auxiliary_loss_mlp": 0.01013493, + "balance_loss_clip": 1.0045284, + "balance_loss_mlp": 1.00929117, + "epoch": 0.1329325116488802, + "flos": 67691071054080.0, + "grad_norm": 0.7644311733127371, + "language_loss": 0.58120221, + "learning_rate": 3.889883876413563e-06, + "loss": 0.60162395, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.08984375, + "router_z_loss_mlp": 0.19433594, + "step": 2211, + "time_per_iteration": 3.157275915145874 + }, + { + "auxiliary_loss_clip": 0.01030107, + "auxiliary_loss_mlp": 0.01008513, + "balance_loss_clip": 1.00016809, + "balance_loss_mlp": 1.00954723, + "epoch": 0.13299263490154817, + "flos": 72258303715200.0, + "grad_norm": 0.8247129396851514, + "language_loss": 0.55416054, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57454669, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.08349609, + "router_z_loss_mlp": 0.20507812, + "step": 2212, + "time_per_iteration": 3.1067214012145996 + }, + { + "auxiliary_loss_clip": 0.01106708, + "auxiliary_loss_mlp": 0.0107704, + "balance_loss_clip": 1.02511203, + "balance_loss_mlp": 1.02860498, + "epoch": 0.13305275815421613, + "flos": 17930308567680.0, + "grad_norm": 2.147444456482937, + "language_loss": 0.77006203, + "learning_rate": 3.889628839737908e-06, + "loss": 0.7918995, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.78125, + "step": 2213, + "time_per_iteration": 2.363706588745117 + }, + { + "auxiliary_loss_clip": 0.01093354, + "auxiliary_loss_mlp": 0.0106555, + "balance_loss_clip": 1.01963019, + "balance_loss_mlp": 1.02437508, + "epoch": 0.13311288140688413, + "flos": 22339943884800.0, + "grad_norm": 1.6570479582484574, + "language_loss": 0.80528772, + "learning_rate": 3.889501213915291e-06, + "loss": 0.82687676, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.69140625, + "step": 2214, + "time_per_iteration": 2.430022716522217 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.0107587, + "balance_loss_clip": 1.02699411, + "balance_loss_mlp": 1.02724886, + "epoch": 0.1331730046595521, + "flos": 31867531649280.0, + "grad_norm": 2.6049125947989578, + "language_loss": 0.71706915, + "learning_rate": 3.889373516442597e-06, + "loss": 0.73882866, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7265625, + "step": 2215, + "time_per_iteration": 2.602797031402588 + }, + { + "auxiliary_loss_clip": 0.01100546, + "auxiliary_loss_mlp": 0.01079967, + "balance_loss_clip": 1.02856398, + "balance_loss_mlp": 1.02581954, + "epoch": 0.13323312791222006, + "flos": 22565447556480.0, + "grad_norm": 1.6504324730825204, + "language_loss": 0.8263526, + "learning_rate": 3.889245747324671e-06, + "loss": 0.8481577, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.74609375, + "step": 2216, + "time_per_iteration": 2.4838573932647705 + }, + { + "auxiliary_loss_clip": 0.01099081, + "auxiliary_loss_mlp": 0.01080751, + "balance_loss_clip": 1.03218544, + "balance_loss_mlp": 1.02684927, + "epoch": 0.13329325116488802, + "flos": 15084433313280.0, + "grad_norm": 2.154954400830231, + "language_loss": 0.89480364, + "learning_rate": 3.889117906566356e-06, + "loss": 0.91660196, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.71875, + "step": 2217, + "time_per_iteration": 2.4196934700012207 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.01069494, + "balance_loss_clip": 1.02114272, + "balance_loss_mlp": 1.02821517, + "epoch": 0.133353374417556, + "flos": 27452450160000.0, + "grad_norm": 2.3070235340726657, + "language_loss": 0.75504792, + "learning_rate": 3.888989994172501e-06, + "loss": 0.77673769, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7109375, + "step": 2218, + "time_per_iteration": 2.4511611461639404 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01072347, + "balance_loss_clip": 1.01979923, + "balance_loss_mlp": 1.02819741, + "epoch": 0.13341349767022395, + "flos": 24093631077120.0, + "grad_norm": 1.6865516504875226, + "language_loss": 0.88894379, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.9106704, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.72265625, + "step": 2219, + "time_per_iteration": 2.419753313064575 + }, + { + "auxiliary_loss_clip": 0.01104086, + "auxiliary_loss_mlp": 0.01074202, + "balance_loss_clip": 1.02556419, + "balance_loss_mlp": 1.03176022, + "epoch": 0.13347362092289192, + "flos": 24132209996160.0, + "grad_norm": 1.9515259778377105, + "language_loss": 0.78933638, + "learning_rate": 3.888733954497574e-06, + "loss": 0.81111932, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.72265625, + "step": 2220, + "time_per_iteration": 2.4327468872070312 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01074012, + "balance_loss_clip": 1.02616096, + "balance_loss_mlp": 1.02821457, + "epoch": 0.1335337441755599, + "flos": 18435711542400.0, + "grad_norm": 2.1120706156893796, + "language_loss": 0.81201619, + "learning_rate": 3.888605827226212e-06, + "loss": 0.83376741, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.73046875, + "step": 2221, + "time_per_iteration": 2.369741678237915 + }, + { + "auxiliary_loss_clip": 0.01030732, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.02020633, + "balance_loss_mlp": 1.0107193, + "epoch": 0.13359386742822787, + "flos": 50609395837440.0, + "grad_norm": 0.9849485220337436, + "language_loss": 0.69057494, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71116537, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.08105469, + "router_z_loss_mlp": 0.20019531, + "step": 2222, + "time_per_iteration": 2.845627784729004 + }, + { + "auxiliary_loss_clip": 0.01102037, + "auxiliary_loss_mlp": 0.01078514, + "balance_loss_clip": 1.03171277, + "balance_loss_mlp": 1.03042388, + "epoch": 0.13365399068089584, + "flos": 22777615088640.0, + "grad_norm": 1.75699260935051, + "language_loss": 0.68620056, + "learning_rate": 3.888349357839982e-06, + "loss": 0.70800608, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.71484375, + "step": 2223, + "time_per_iteration": 2.4104342460632324 + }, + { + "auxiliary_loss_clip": 0.01098854, + "auxiliary_loss_mlp": 0.01069809, + "balance_loss_clip": 1.02074289, + "balance_loss_mlp": 1.02735221, + "epoch": 0.1337141139335638, + "flos": 12530781072000.0, + "grad_norm": 1.9775686222207347, + "language_loss": 0.8480376, + "learning_rate": 3.88822101573484e-06, + "loss": 0.86972427, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.71484375, + "step": 2224, + "time_per_iteration": 2.38739275932312 + }, + { + "auxiliary_loss_clip": 0.01100708, + "auxiliary_loss_mlp": 0.01071646, + "balance_loss_clip": 1.02279413, + "balance_loss_mlp": 1.0275836, + "epoch": 0.13377423718623177, + "flos": 23037857429760.0, + "grad_norm": 1.9255327745820277, + "language_loss": 0.68654615, + "learning_rate": 3.888092602028167e-06, + "loss": 0.70826972, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.73046875, + "step": 2225, + "time_per_iteration": 2.4010989665985107 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.0106855, + "balance_loss_clip": 1.02313101, + "balance_loss_mlp": 1.02782238, + "epoch": 0.13383436043889974, + "flos": 16215479585280.0, + "grad_norm": 2.0355513272661234, + "language_loss": 0.92166185, + "learning_rate": 3.887964116724835e-06, + "loss": 0.94334352, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.71875, + "step": 2226, + "time_per_iteration": 2.3766438961029053 + }, + { + "auxiliary_loss_clip": 0.01098751, + "auxiliary_loss_mlp": 0.01085356, + "balance_loss_clip": 1.03769648, + "balance_loss_mlp": 1.02690625, + "epoch": 0.1338944836915677, + "flos": 24278530970880.0, + "grad_norm": 2.3197974944567985, + "language_loss": 0.76496994, + "learning_rate": 3.887835559829712e-06, + "loss": 0.78681099, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.71875, + "step": 2227, + "time_per_iteration": 2.5395162105560303 + }, + { + "auxiliary_loss_clip": 0.0109695, + "auxiliary_loss_mlp": 0.01069372, + "balance_loss_clip": 1.01923299, + "balance_loss_mlp": 1.02516258, + "epoch": 0.1339546069442357, + "flos": 17597900712960.0, + "grad_norm": 2.045375160076701, + "language_loss": 0.86295033, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.88461351, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.71875, + "step": 2228, + "time_per_iteration": 2.364319324493408 + }, + { + "auxiliary_loss_clip": 0.01094923, + "auxiliary_loss_mlp": 0.01067807, + "balance_loss_clip": 1.02460504, + "balance_loss_mlp": 1.02542627, + "epoch": 0.13401473019690366, + "flos": 18989049680640.0, + "grad_norm": 1.8602125548642618, + "language_loss": 0.83485156, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.85647893, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.6953125, + "step": 2229, + "time_per_iteration": 2.375342607498169 + }, + { + "auxiliary_loss_clip": 0.01097343, + "auxiliary_loss_mlp": 0.01076618, + "balance_loss_clip": 1.03084183, + "balance_loss_mlp": 1.0256443, + "epoch": 0.13407485344957162, + "flos": 26942578531200.0, + "grad_norm": 1.617592929277372, + "language_loss": 0.76911587, + "learning_rate": 3.887449459642378e-06, + "loss": 0.79085553, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.71875, + "step": 2230, + "time_per_iteration": 3.856142997741699 + }, + { + "auxiliary_loss_clip": 0.01097907, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_clip": 1.02967405, + "balance_loss_mlp": 1.02619624, + "epoch": 0.1341349767022396, + "flos": 20338338061440.0, + "grad_norm": 1.7600423149187243, + "language_loss": 0.82524061, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.84697163, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.71875, + "step": 2231, + "time_per_iteration": 2.4314239025115967 + }, + { + "auxiliary_loss_clip": 0.01099182, + "auxiliary_loss_mlp": 0.01074762, + "balance_loss_clip": 1.02421737, + "balance_loss_mlp": 1.02655923, + "epoch": 0.13419509995490755, + "flos": 29860724033280.0, + "grad_norm": 1.5567977257933459, + "language_loss": 0.74192995, + "learning_rate": 3.887191701647992e-06, + "loss": 0.76366937, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7265625, + "step": 2232, + "time_per_iteration": 3.860792398452759 + }, + { + "auxiliary_loss_clip": 0.01102289, + "auxiliary_loss_mlp": 0.010757, + "balance_loss_clip": 1.02386832, + "balance_loss_mlp": 1.02843082, + "epoch": 0.13425522320757552, + "flos": 26941775569920.0, + "grad_norm": 2.5913909199948186, + "language_loss": 0.68207562, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.70385551, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.73828125, + "step": 2233, + "time_per_iteration": 3.8814749717712402 + }, + { + "auxiliary_loss_clip": 0.01098815, + "auxiliary_loss_mlp": 0.01063877, + "balance_loss_clip": 1.01583529, + "balance_loss_mlp": 1.02544498, + "epoch": 0.1343153464602435, + "flos": 15776411927040.0, + "grad_norm": 2.5458297067228233, + "language_loss": 0.8360821, + "learning_rate": 3.886933657403615e-06, + "loss": 0.85770905, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.734375, + "step": 2234, + "time_per_iteration": 2.418581008911133 + }, + { + "auxiliary_loss_clip": 0.01099129, + "auxiliary_loss_mlp": 0.01070572, + "balance_loss_clip": 1.0197413, + "balance_loss_mlp": 1.02733564, + "epoch": 0.13437546971291148, + "flos": 24313653665280.0, + "grad_norm": 1.9024392479551717, + "language_loss": 0.82850933, + "learning_rate": 3.886804527949909e-06, + "loss": 0.85020638, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.71875, + "step": 2235, + "time_per_iteration": 2.4185612201690674 + }, + { + "auxiliary_loss_clip": 0.01098267, + "auxiliary_loss_mlp": 0.0107459, + "balance_loss_clip": 1.02435517, + "balance_loss_mlp": 1.02520013, + "epoch": 0.13443559296557944, + "flos": 26649482734080.0, + "grad_norm": 1.5479806939711218, + "language_loss": 0.87836885, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.90009737, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.734375, + "step": 2236, + "time_per_iteration": 2.4468624591827393 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.01079573, + "balance_loss_clip": 1.02638197, + "balance_loss_mlp": 1.0286299, + "epoch": 0.1344957162182474, + "flos": 21795193941120.0, + "grad_norm": 1.497363164959225, + "language_loss": 0.78802741, + "learning_rate": 3.886546054403946e-06, + "loss": 0.80982798, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.71875, + "step": 2237, + "time_per_iteration": 2.4397060871124268 + }, + { + "auxiliary_loss_clip": 0.01104362, + "auxiliary_loss_mlp": 0.01081905, + "balance_loss_clip": 1.0286901, + "balance_loss_mlp": 1.02831817, + "epoch": 0.13455583947091537, + "flos": 19864531733760.0, + "grad_norm": 1.9403822991825106, + "language_loss": 0.81926149, + "learning_rate": 3.886416710321491e-06, + "loss": 0.84112418, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.76171875, + "step": 2238, + "time_per_iteration": 2.518111228942871 + }, + { + "auxiliary_loss_clip": 0.01097606, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_clip": 1.02162218, + "balance_loss_mlp": 1.02651429, + "epoch": 0.13461596272358334, + "flos": 30845519153280.0, + "grad_norm": 1.9232825845611043, + "language_loss": 0.70116866, + "learning_rate": 3.886287294705924e-06, + "loss": 0.72288048, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.7109375, + "step": 2239, + "time_per_iteration": 2.4701101779937744 + }, + { + "auxiliary_loss_clip": 0.01100847, + "auxiliary_loss_mlp": 0.01071233, + "balance_loss_clip": 1.02083182, + "balance_loss_mlp": 1.02680802, + "epoch": 0.1346760859762513, + "flos": 12493633518720.0, + "grad_norm": 2.533619918983608, + "language_loss": 0.85951328, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.88123411, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7421875, + "step": 2240, + "time_per_iteration": 2.3721256256103516 + }, + { + "auxiliary_loss_clip": 0.01104244, + "auxiliary_loss_mlp": 0.01079318, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.02625155, + "epoch": 0.1347362092289193, + "flos": 21834924935040.0, + "grad_norm": 1.759400868063721, + "language_loss": 0.80302268, + "learning_rate": 3.886028248895093e-06, + "loss": 0.82485831, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.78125, + "step": 2241, + "time_per_iteration": 2.4087882041931152 + }, + { + "auxiliary_loss_clip": 0.01097081, + "auxiliary_loss_mlp": 0.01065632, + "balance_loss_clip": 1.01835346, + "balance_loss_mlp": 1.02690816, + "epoch": 0.13479633248158726, + "flos": 23508451912320.0, + "grad_norm": 1.8099688119627833, + "language_loss": 0.85164362, + "learning_rate": 3.88589861870965e-06, + "loss": 0.87327075, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.703125, + "step": 2242, + "time_per_iteration": 2.4149866104125977 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01083021, + "balance_loss_clip": 1.03257179, + "balance_loss_mlp": 1.02953422, + "epoch": 0.13485645573425523, + "flos": 29343241728000.0, + "grad_norm": 2.581427263831077, + "language_loss": 0.68314719, + "learning_rate": 3.885768917010744e-06, + "loss": 0.70501137, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.7421875, + "step": 2243, + "time_per_iteration": 2.4642670154571533 + }, + { + "auxiliary_loss_clip": 0.01096104, + "auxiliary_loss_mlp": 0.01070603, + "balance_loss_clip": 1.02492189, + "balance_loss_mlp": 1.02598977, + "epoch": 0.1349165789869232, + "flos": 28035883756800.0, + "grad_norm": 1.4770541881657482, + "language_loss": 0.74721408, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.7688812, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.69921875, + "step": 2244, + "time_per_iteration": 2.4423115253448486 + }, + { + "auxiliary_loss_clip": 0.01099045, + "auxiliary_loss_mlp": 0.01077254, + "balance_loss_clip": 1.02575552, + "balance_loss_mlp": 1.02527559, + "epoch": 0.13497670223959116, + "flos": 22852713156480.0, + "grad_norm": 1.5933513245835889, + "language_loss": 0.87576103, + "learning_rate": 3.88550929909221e-06, + "loss": 0.89752406, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.73828125, + "step": 2245, + "time_per_iteration": 2.406095266342163 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_clip": 1.02264965, + "balance_loss_mlp": 1.02639711, + "epoch": 0.13503682549225912, + "flos": 16503757614720.0, + "grad_norm": 1.6263544111086021, + "language_loss": 0.80819583, + "learning_rate": 3.88537938288243e-06, + "loss": 0.82986194, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.703125, + "step": 2246, + "time_per_iteration": 2.403489112854004 + }, + { + "auxiliary_loss_clip": 0.01026472, + "auxiliary_loss_mlp": 0.01015146, + "balance_loss_clip": 1.00718296, + "balance_loss_mlp": 1.00655138, + "epoch": 0.1350969487449271, + "flos": 70753023912960.0, + "grad_norm": 0.757451107081713, + "language_loss": 0.60601932, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62643546, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.07958984, + "router_z_loss_mlp": 0.19921875, + "step": 2247, + "time_per_iteration": 3.1073782444000244 + }, + { + "auxiliary_loss_clip": 0.01103046, + "auxiliary_loss_mlp": 0.01076854, + "balance_loss_clip": 1.02134991, + "balance_loss_mlp": 1.02588606, + "epoch": 0.13515707199759508, + "flos": 23074865337600.0, + "grad_norm": 1.7590742050169108, + "language_loss": 0.82550848, + "learning_rate": 3.885119335986473e-06, + "loss": 0.8473075, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 0.7734375, + "step": 2248, + "time_per_iteration": 2.4199070930480957 + }, + { + "auxiliary_loss_clip": 0.01097229, + "auxiliary_loss_mlp": 0.01066393, + "balance_loss_clip": 1.0205214, + "balance_loss_mlp": 1.02659726, + "epoch": 0.13521719525026304, + "flos": 23185225745280.0, + "grad_norm": 1.7847406000295978, + "language_loss": 0.78590763, + "learning_rate": 3.884989205310157e-06, + "loss": 0.80754375, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.70703125, + "step": 2249, + "time_per_iteration": 2.424166679382324 + }, + { + "auxiliary_loss_clip": 0.01097915, + "auxiliary_loss_mlp": 0.01072005, + "balance_loss_clip": 1.02208006, + "balance_loss_mlp": 1.02584088, + "epoch": 0.135277318502931, + "flos": 24789764142720.0, + "grad_norm": 1.4772714333685675, + "language_loss": 0.85804892, + "learning_rate": 3.884859003154862e-06, + "loss": 0.87974811, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.71875, + "step": 2250, + "time_per_iteration": 2.459350824356079 + }, + { + "auxiliary_loss_clip": 0.01100116, + "auxiliary_loss_mlp": 0.01076168, + "balance_loss_clip": 1.02388275, + "balance_loss_mlp": 1.02707946, + "epoch": 0.13533744175559898, + "flos": 21907439562240.0, + "grad_norm": 1.952702536190691, + "language_loss": 0.84368861, + "learning_rate": 3.884728729525524e-06, + "loss": 0.86545146, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.73046875, + "step": 2251, + "time_per_iteration": 2.4180047512054443 + }, + { + "auxiliary_loss_clip": 0.01098556, + "auxiliary_loss_mlp": 0.01082246, + "balance_loss_clip": 1.03410912, + "balance_loss_mlp": 1.0249269, + "epoch": 0.13539756500826694, + "flos": 21210678092160.0, + "grad_norm": 1.7143504194189876, + "language_loss": 0.86527205, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88708001, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.734375, + "step": 2252, + "time_per_iteration": 2.398433208465576 + }, + { + "auxiliary_loss_clip": 0.01028553, + "auxiliary_loss_mlp": 0.0101601, + "balance_loss_clip": 1.00728357, + "balance_loss_mlp": 1.00885916, + "epoch": 0.1354576882609349, + "flos": 63238981656960.0, + "grad_norm": 0.7733023201636623, + "language_loss": 0.62033314, + "learning_rate": 3.884467967864485e-06, + "loss": 0.64077878, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.08740234, + "router_z_loss_mlp": 0.19726562, + "step": 2253, + "time_per_iteration": 3.136885166168213 + }, + { + "auxiliary_loss_clip": 0.01097439, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.02739191, + "epoch": 0.1355178115136029, + "flos": 25481882401920.0, + "grad_norm": 1.6281869850150181, + "language_loss": 0.91105425, + "learning_rate": 3.884337479842671e-06, + "loss": 0.9327926, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.69921875, + "step": 2254, + "time_per_iteration": 2.457720994949341 + }, + { + "auxiliary_loss_clip": 0.01101147, + "auxiliary_loss_mlp": 0.0108143, + "balance_loss_clip": 1.0323391, + "balance_loss_mlp": 1.02672029, + "epoch": 0.13557793476627086, + "flos": 21615879864960.0, + "grad_norm": 1.8199792501807002, + "language_loss": 0.87098551, + "learning_rate": 3.884206920366591e-06, + "loss": 0.8928113, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.7421875, + "step": 2255, + "time_per_iteration": 2.409785270690918 + }, + { + "auxiliary_loss_clip": 0.01094607, + "auxiliary_loss_mlp": 0.01066914, + "balance_loss_clip": 1.02163863, + "balance_loss_mlp": 1.02453744, + "epoch": 0.13563805801893883, + "flos": 24927322366080.0, + "grad_norm": 2.41359802652145, + "language_loss": 0.77321422, + "learning_rate": 3.884076289441196e-06, + "loss": 0.79482943, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.69921875, + "step": 2256, + "time_per_iteration": 2.4274182319641113 + }, + { + "auxiliary_loss_clip": 0.010961, + "auxiliary_loss_mlp": 0.01068414, + "balance_loss_clip": 1.01841819, + "balance_loss_mlp": 1.02437472, + "epoch": 0.1356981812716068, + "flos": 14749581663360.0, + "grad_norm": 2.1700475601124296, + "language_loss": 0.84923184, + "learning_rate": 3.88394558707144e-06, + "loss": 0.87087703, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.71875, + "step": 2257, + "time_per_iteration": 2.373764753341675 + }, + { + "auxiliary_loss_clip": 0.01103694, + "auxiliary_loss_mlp": 0.01079532, + "balance_loss_clip": 1.02600682, + "balance_loss_mlp": 1.02561045, + "epoch": 0.13575830452427476, + "flos": 11107791077760.0, + "grad_norm": 2.1731821464517953, + "language_loss": 0.84647334, + "learning_rate": 3.883814813262277e-06, + "loss": 0.86830562, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.78125, + "step": 2258, + "time_per_iteration": 2.370626926422119 + }, + { + "auxiliary_loss_clip": 0.01099121, + "auxiliary_loss_mlp": 0.01071684, + "balance_loss_clip": 1.02104437, + "balance_loss_mlp": 1.02624071, + "epoch": 0.13581842777694272, + "flos": 17959531242240.0, + "grad_norm": 2.6373067891920203, + "language_loss": 0.85858792, + "learning_rate": 3.883683968018669e-06, + "loss": 0.88029593, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.73046875, + "step": 2259, + "time_per_iteration": 2.395463705062866 + }, + { + "auxiliary_loss_clip": 0.01097073, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_clip": 1.0283848, + "balance_loss_mlp": 1.02525616, + "epoch": 0.1358785510296107, + "flos": 22856029735680.0, + "grad_norm": 2.0032252664893138, + "language_loss": 0.75659031, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.77831268, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.71875, + "step": 2260, + "time_per_iteration": 2.3941452503204346 + }, + { + "auxiliary_loss_clip": 0.01097665, + "auxiliary_loss_mlp": 0.01071203, + "balance_loss_clip": 1.02318597, + "balance_loss_mlp": 1.02639341, + "epoch": 0.13593867428227868, + "flos": 25738214670720.0, + "grad_norm": 2.292928812605111, + "language_loss": 0.77429187, + "learning_rate": 3.883422063247961e-06, + "loss": 0.79598057, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.71484375, + "step": 2261, + "time_per_iteration": 2.4400699138641357 + }, + { + "auxiliary_loss_clip": 0.01098733, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.0187062, + "balance_loss_mlp": 1.025599, + "epoch": 0.13599879753494665, + "flos": 31247858194560.0, + "grad_norm": 2.604008987517394, + "language_loss": 0.65523297, + "learning_rate": 3.883291003730794e-06, + "loss": 0.67687988, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.734375, + "step": 2262, + "time_per_iteration": 2.57812762260437 + }, + { + "auxiliary_loss_clip": 0.01097153, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.01711595, + "balance_loss_mlp": 1.0269351, + "epoch": 0.1360589207876146, + "flos": 23913898064640.0, + "grad_norm": 2.5597430599023268, + "language_loss": 0.85676247, + "learning_rate": 3.883159872799043e-06, + "loss": 0.87835294, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.703125, + "step": 2263, + "time_per_iteration": 2.4191956520080566 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.01076651, + "balance_loss_clip": 1.02605867, + "balance_loss_mlp": 1.02628708, + "epoch": 0.13611904404028258, + "flos": 19973181484800.0, + "grad_norm": 1.813451160050589, + "language_loss": 0.90408027, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.92584634, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.73828125, + "step": 2264, + "time_per_iteration": 2.3880631923675537 + }, + { + "auxiliary_loss_clip": 0.01101171, + "auxiliary_loss_mlp": 0.01070633, + "balance_loss_clip": 1.01980257, + "balance_loss_mlp": 1.02616441, + "epoch": 0.13617916729295054, + "flos": 15339753152640.0, + "grad_norm": 6.156833009868393, + "language_loss": 0.74880558, + "learning_rate": 3.882897396711683e-06, + "loss": 0.77052367, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2265, + "time_per_iteration": 2.3641650676727295 + }, + { + "auxiliary_loss_clip": 0.01095415, + "auxiliary_loss_mlp": 0.01066369, + "balance_loss_clip": 1.02023554, + "balance_loss_mlp": 1.02640224, + "epoch": 0.1362392905456185, + "flos": 27450285655680.0, + "grad_norm": 1.9548746074036174, + "language_loss": 0.6882602, + "learning_rate": 3.882766051566027e-06, + "loss": 0.70987803, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.69140625, + "step": 2266, + "time_per_iteration": 2.4353957176208496 + }, + { + "auxiliary_loss_clip": 0.010956, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_clip": 1.02888465, + "balance_loss_mlp": 1.02594626, + "epoch": 0.1362994137982865, + "flos": 25007866606080.0, + "grad_norm": 1.539464861056712, + "language_loss": 0.78371, + "learning_rate": 3.882634635025694e-06, + "loss": 0.80544311, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.6953125, + "step": 2267, + "time_per_iteration": 2.422356367111206 + }, + { + "auxiliary_loss_clip": 0.01096752, + "auxiliary_loss_mlp": 0.01074279, + "balance_loss_clip": 1.02852607, + "balance_loss_mlp": 1.02502263, + "epoch": 0.13635953705095447, + "flos": 20301993469440.0, + "grad_norm": 1.819110101552072, + "language_loss": 0.83485019, + "learning_rate": 3.882503147095667e-06, + "loss": 0.85656047, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.71875, + "step": 2268, + "time_per_iteration": 2.3829476833343506 + }, + { + "auxiliary_loss_clip": 0.0109508, + "auxiliary_loss_mlp": 0.01060979, + "balance_loss_clip": 1.01548886, + "balance_loss_mlp": 1.02592158, + "epoch": 0.13641966030362243, + "flos": 31357066527360.0, + "grad_norm": 1.7807704464336256, + "language_loss": 0.7826739, + "learning_rate": 3.882371587780931e-06, + "loss": 0.8042345, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.69140625, + "step": 2269, + "time_per_iteration": 3.877122640609741 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01068464, + "balance_loss_clip": 1.02104235, + "balance_loss_mlp": 1.02821445, + "epoch": 0.1364797835562904, + "flos": 20477257827840.0, + "grad_norm": 3.152170079273674, + "language_loss": 0.82435924, + "learning_rate": 3.882239957086477e-06, + "loss": 0.84605777, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.734375, + "step": 2270, + "time_per_iteration": 2.3710389137268066 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01076521, + "balance_loss_clip": 1.02833652, + "balance_loss_mlp": 1.02480543, + "epoch": 0.13653990680895836, + "flos": 13077520963200.0, + "grad_norm": 2.448385458643169, + "language_loss": 0.77454352, + "learning_rate": 3.882108255017295e-06, + "loss": 0.79629594, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.73828125, + "step": 2271, + "time_per_iteration": 3.860752820968628 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.03009224, + "balance_loss_mlp": 1.02736855, + "epoch": 0.13660003006162633, + "flos": 16945757827200.0, + "grad_norm": 1.8984415281285782, + "language_loss": 0.82231283, + "learning_rate": 3.881976481578379e-06, + "loss": 0.84407252, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.71484375, + "step": 2272, + "time_per_iteration": 5.248999118804932 + }, + { + "auxiliary_loss_clip": 0.01023918, + "auxiliary_loss_mlp": 0.01008438, + "balance_loss_clip": 1.00138044, + "balance_loss_mlp": 1.00426769, + "epoch": 0.1366601533142943, + "flos": 68679357310080.0, + "grad_norm": 0.6972934234469925, + "language_loss": 0.60881793, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62914151, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.07080078, + "router_z_loss_mlp": 0.19726562, + "step": 2273, + "time_per_iteration": 3.117666006088257 + }, + { + "auxiliary_loss_clip": 0.01095842, + "auxiliary_loss_mlp": 0.01070711, + "balance_loss_clip": 1.0235517, + "balance_loss_mlp": 1.02516508, + "epoch": 0.13672027656696228, + "flos": 19243252356480.0, + "grad_norm": 1.6499385615182312, + "language_loss": 0.79421479, + "learning_rate": 3.881712720611336e-06, + "loss": 0.8158803, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.70703125, + "step": 2274, + "time_per_iteration": 2.3902363777160645 + }, + { + "auxiliary_loss_clip": 0.01097658, + "auxiliary_loss_mlp": 0.0107179, + "balance_loss_clip": 1.02637148, + "balance_loss_mlp": 1.02565575, + "epoch": 0.13678039981963025, + "flos": 24533780987520.0, + "grad_norm": 1.7358002679608784, + "language_loss": 0.79964995, + "learning_rate": 3.881580733093211e-06, + "loss": 0.82134438, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.71875, + "step": 2275, + "time_per_iteration": 2.432495594024658 + }, + { + "auxiliary_loss_clip": 0.01095109, + "auxiliary_loss_mlp": 0.01071957, + "balance_loss_clip": 1.02618098, + "balance_loss_mlp": 1.02461791, + "epoch": 0.13684052307229821, + "flos": 15668425491840.0, + "grad_norm": 2.394219708841684, + "language_loss": 0.83864653, + "learning_rate": 3.881448674225356e-06, + "loss": 0.86031723, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.703125, + "step": 2276, + "time_per_iteration": 2.379607915878296 + }, + { + "auxiliary_loss_clip": 0.01103667, + "auxiliary_loss_mlp": 0.01082731, + "balance_loss_clip": 1.02877665, + "balance_loss_mlp": 1.02644181, + "epoch": 0.13690064632496618, + "flos": 28363473843840.0, + "grad_norm": 2.577271704825388, + "language_loss": 0.73588455, + "learning_rate": 3.881316544012779e-06, + "loss": 0.75774848, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7734375, + "step": 2277, + "time_per_iteration": 2.4821271896362305 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01090852, + "balance_loss_clip": 1.04123688, + "balance_loss_mlp": 1.02697539, + "epoch": 0.13696076957763414, + "flos": 23403642410880.0, + "grad_norm": 2.823581413939763, + "language_loss": 0.82793057, + "learning_rate": 3.88118434246049e-06, + "loss": 0.84983957, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.73046875, + "step": 2278, + "time_per_iteration": 2.4047176837921143 + }, + { + "auxiliary_loss_clip": 0.01097557, + "auxiliary_loss_mlp": 0.01078271, + "balance_loss_clip": 1.03199434, + "balance_loss_mlp": 1.02652359, + "epoch": 0.1370208928303021, + "flos": 37195068188160.0, + "grad_norm": 2.0781161923109384, + "language_loss": 0.77574086, + "learning_rate": 3.881052069573502e-06, + "loss": 0.79749906, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.7109375, + "step": 2279, + "time_per_iteration": 2.556952953338623 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.02613068, + "balance_loss_mlp": 1.02540874, + "epoch": 0.13708101608297008, + "flos": 26975187607680.0, + "grad_norm": 10.713343639648075, + "language_loss": 0.78060412, + "learning_rate": 3.880919725356831e-06, + "loss": 0.8023454, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.75390625, + "step": 2280, + "time_per_iteration": 2.43609881401062 + }, + { + "auxiliary_loss_clip": 0.0109492, + "auxiliary_loss_mlp": 0.01072273, + "balance_loss_clip": 1.0250659, + "balance_loss_mlp": 1.02497327, + "epoch": 0.13714113933563807, + "flos": 32555635102080.0, + "grad_norm": 1.727473074379076, + "language_loss": 0.81443465, + "learning_rate": 3.880787309815496e-06, + "loss": 0.8361066, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.69921875, + "step": 2281, + "time_per_iteration": 2.5257480144500732 + }, + { + "auxiliary_loss_clip": 0.01103161, + "auxiliary_loss_mlp": 0.01079185, + "balance_loss_clip": 1.02928364, + "balance_loss_mlp": 1.02799702, + "epoch": 0.13720126258830603, + "flos": 16100510878080.0, + "grad_norm": 1.670013957744066, + "language_loss": 0.85952735, + "learning_rate": 3.880654822954518e-06, + "loss": 0.88135087, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.75, + "step": 2282, + "time_per_iteration": 2.3663246631622314 + }, + { + "auxiliary_loss_clip": 0.01099191, + "auxiliary_loss_mlp": 0.01074217, + "balance_loss_clip": 1.02698636, + "balance_loss_mlp": 1.02656865, + "epoch": 0.137261385840974, + "flos": 18952530531840.0, + "grad_norm": 1.5193065495591382, + "language_loss": 0.75200641, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.77374053, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.7265625, + "step": 2283, + "time_per_iteration": 2.402902126312256 + }, + { + "auxiliary_loss_clip": 0.01102513, + "auxiliary_loss_mlp": 0.01073634, + "balance_loss_clip": 1.02499628, + "balance_loss_mlp": 1.02980363, + "epoch": 0.13732150909364196, + "flos": 23294224609920.0, + "grad_norm": 2.645519710891511, + "language_loss": 0.87478697, + "learning_rate": 3.880389635293729e-06, + "loss": 0.89654839, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.7265625, + "step": 2284, + "time_per_iteration": 2.4220385551452637 + }, + { + "auxiliary_loss_clip": 0.01106307, + "auxiliary_loss_mlp": 0.01086514, + "balance_loss_clip": 1.03158212, + "balance_loss_mlp": 1.0280261, + "epoch": 0.13738163234630993, + "flos": 29349979620480.0, + "grad_norm": 1.7791836999671387, + "language_loss": 0.76967597, + "learning_rate": 3.880256934503974e-06, + "loss": 0.79160422, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.78125, + "step": 2285, + "time_per_iteration": 2.4919214248657227 + }, + { + "auxiliary_loss_clip": 0.01099447, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_clip": 1.02609742, + "balance_loss_mlp": 1.02654481, + "epoch": 0.1374417555989779, + "flos": 26650111138560.0, + "grad_norm": 1.612541184902368, + "language_loss": 0.77256465, + "learning_rate": 3.880124162414689e-06, + "loss": 0.79432893, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7265625, + "step": 2286, + "time_per_iteration": 2.44081974029541 + }, + { + "auxiliary_loss_clip": 0.01102861, + "auxiliary_loss_mlp": 0.01072635, + "balance_loss_clip": 1.0188719, + "balance_loss_mlp": 1.02679563, + "epoch": 0.1375018788516459, + "flos": 28402122585600.0, + "grad_norm": 2.3770547945236316, + "language_loss": 0.88716841, + "learning_rate": 3.879991319030908e-06, + "loss": 0.90892333, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.76171875, + "step": 2287, + "time_per_iteration": 2.458857774734497 + }, + { + "auxiliary_loss_clip": 0.01097854, + "auxiliary_loss_mlp": 0.01075133, + "balance_loss_clip": 1.02506518, + "balance_loss_mlp": 1.02482677, + "epoch": 0.13756200210431385, + "flos": 37412297867520.0, + "grad_norm": 2.601686648892162, + "language_loss": 0.70701313, + "learning_rate": 3.879858404357666e-06, + "loss": 0.72874302, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.73046875, + "step": 2288, + "time_per_iteration": 2.527089834213257 + }, + { + "auxiliary_loss_clip": 0.01102251, + "auxiliary_loss_mlp": 0.01078883, + "balance_loss_clip": 1.026407, + "balance_loss_mlp": 1.02810645, + "epoch": 0.13762212535698182, + "flos": 22709918229120.0, + "grad_norm": 2.190929540402221, + "language_loss": 0.89871919, + "learning_rate": 3.879725418400005e-06, + "loss": 0.9205305, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7421875, + "step": 2289, + "time_per_iteration": 2.410564422607422 + }, + { + "auxiliary_loss_clip": 0.0109637, + "auxiliary_loss_mlp": 0.01072824, + "balance_loss_clip": 1.023543, + "balance_loss_mlp": 1.02444482, + "epoch": 0.13768224860964978, + "flos": 23950975795200.0, + "grad_norm": 1.7688432432085246, + "language_loss": 0.76291549, + "learning_rate": 3.879592361162969e-06, + "loss": 0.78460747, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.71875, + "step": 2290, + "time_per_iteration": 2.4063212871551514 + }, + { + "auxiliary_loss_clip": 0.01025214, + "auxiliary_loss_mlp": 0.01010472, + "balance_loss_clip": 1.00389171, + "balance_loss_mlp": 1.00469732, + "epoch": 0.13774237186231775, + "flos": 63586750510080.0, + "grad_norm": 0.7058457857720316, + "language_loss": 0.5168618, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53721869, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.20507812, + "step": 2291, + "time_per_iteration": 3.065821409225464 + }, + { + "auxiliary_loss_clip": 0.01098689, + "auxiliary_loss_mlp": 0.01067315, + "balance_loss_clip": 1.01688957, + "balance_loss_mlp": 1.02532411, + "epoch": 0.1378024951149857, + "flos": 24278321502720.0, + "grad_norm": 1.997706831738451, + "language_loss": 0.7277174, + "learning_rate": 3.879326032870952e-06, + "loss": 0.74937743, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.734375, + "step": 2292, + "time_per_iteration": 2.4066214561462402 + }, + { + "auxiliary_loss_clip": 0.01099581, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.02524972, + "balance_loss_mlp": 1.0276202, + "epoch": 0.13786261836765368, + "flos": 14020839521280.0, + "grad_norm": 2.7158987104013512, + "language_loss": 0.82024562, + "learning_rate": 3.879192761826071e-06, + "loss": 0.84202534, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.71875, + "step": 2293, + "time_per_iteration": 2.3778576850891113 + }, + { + "auxiliary_loss_clip": 0.0110099, + "auxiliary_loss_mlp": 0.01081193, + "balance_loss_clip": 1.02616608, + "balance_loss_mlp": 1.02490544, + "epoch": 0.13792274162032167, + "flos": 28877360279040.0, + "grad_norm": 2.349710677917006, + "language_loss": 0.80887091, + "learning_rate": 3.879059419522011e-06, + "loss": 0.83069271, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.76171875, + "step": 2294, + "time_per_iteration": 2.466806173324585 + }, + { + "auxiliary_loss_clip": 0.01099237, + "auxiliary_loss_mlp": 0.01068466, + "balance_loss_clip": 1.0192802, + "balance_loss_mlp": 1.02621043, + "epoch": 0.13798286487298964, + "flos": 21140118501120.0, + "grad_norm": 2.1096307442438134, + "language_loss": 0.8194865, + "learning_rate": 3.878926005963831e-06, + "loss": 0.84116352, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.73046875, + "step": 2295, + "time_per_iteration": 2.4102723598480225 + }, + { + "auxiliary_loss_clip": 0.01098162, + "auxiliary_loss_mlp": 0.0108087, + "balance_loss_clip": 1.02603388, + "balance_loss_mlp": 1.02447736, + "epoch": 0.1380429881256576, + "flos": 22486509239040.0, + "grad_norm": 1.6183791427254255, + "language_loss": 0.80288756, + "learning_rate": 3.878792521156588e-06, + "loss": 0.82467788, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.734375, + "step": 2296, + "time_per_iteration": 2.4178006649017334 + }, + { + "auxiliary_loss_clip": 0.01100757, + "auxiliary_loss_mlp": 0.01072975, + "balance_loss_clip": 1.02400398, + "balance_loss_mlp": 1.02733612, + "epoch": 0.13810311137832557, + "flos": 21392715254400.0, + "grad_norm": 1.8487846683645697, + "language_loss": 0.800394, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.82213134, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.734375, + "step": 2297, + "time_per_iteration": 2.413954019546509 + }, + { + "auxiliary_loss_clip": 0.0110143, + "auxiliary_loss_mlp": 0.01079807, + "balance_loss_clip": 1.0299058, + "balance_loss_mlp": 1.02815509, + "epoch": 0.13816323463099353, + "flos": 25988786565120.0, + "grad_norm": 2.0851136503108894, + "language_loss": 0.70543957, + "learning_rate": 3.878525337815164e-06, + "loss": 0.72725189, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.734375, + "step": 2298, + "time_per_iteration": 2.4297866821289062 + }, + { + "auxiliary_loss_clip": 0.01105569, + "auxiliary_loss_mlp": 0.01086277, + "balance_loss_clip": 1.03444457, + "balance_loss_mlp": 1.02774286, + "epoch": 0.1382233578836615, + "flos": 19243322179200.0, + "grad_norm": 2.1961745269843256, + "language_loss": 0.89040697, + "learning_rate": 3.878391639291116e-06, + "loss": 0.91232544, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.78125, + "step": 2299, + "time_per_iteration": 2.3947176933288574 + }, + { + "auxiliary_loss_clip": 0.01096682, + "auxiliary_loss_mlp": 0.01079389, + "balance_loss_clip": 1.0280571, + "balance_loss_mlp": 1.02380145, + "epoch": 0.1382834811363295, + "flos": 25665106550400.0, + "grad_norm": 1.7743553231500588, + "language_loss": 0.78206468, + "learning_rate": 3.878257869538267e-06, + "loss": 0.80382538, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.73046875, + "step": 2300, + "time_per_iteration": 2.44600248336792 + }, + { + "auxiliary_loss_clip": 0.01099037, + "auxiliary_loss_mlp": 0.01080589, + "balance_loss_clip": 1.03202343, + "balance_loss_mlp": 1.02796245, + "epoch": 0.13834360438899745, + "flos": 19783394000640.0, + "grad_norm": 2.3544058251236164, + "language_loss": 0.84997076, + "learning_rate": 3.878124028561692e-06, + "loss": 0.87176704, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7109375, + "step": 2301, + "time_per_iteration": 2.36763072013855 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01075698, + "balance_loss_clip": 1.02698886, + "balance_loss_mlp": 1.0255233, + "epoch": 0.13840372764166542, + "flos": 26650634808960.0, + "grad_norm": 3.0487476019763236, + "language_loss": 0.87715501, + "learning_rate": 3.877990116366466e-06, + "loss": 0.89888656, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.71875, + "step": 2302, + "time_per_iteration": 2.4854891300201416 + }, + { + "auxiliary_loss_clip": 0.01025184, + "auxiliary_loss_mlp": 0.01014759, + "balance_loss_clip": 1.00932288, + "balance_loss_mlp": 1.00514877, + "epoch": 0.13846385089433338, + "flos": 70507444343040.0, + "grad_norm": 0.7706338764867829, + "language_loss": 0.65793359, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67833304, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.05444336, + "router_z_loss_mlp": 0.20117188, + "step": 2303, + "time_per_iteration": 3.156560182571411 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01068781, + "balance_loss_clip": 1.01966667, + "balance_loss_mlp": 1.02554488, + "epoch": 0.13852397414700135, + "flos": 17347747754880.0, + "grad_norm": 3.2309340948674956, + "language_loss": 0.79546964, + "learning_rate": 3.877722078340374e-06, + "loss": 0.81713855, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.7265625, + "step": 2304, + "time_per_iteration": 2.3472673892974854 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01069077, + "balance_loss_clip": 1.01791239, + "balance_loss_mlp": 1.02756572, + "epoch": 0.13858409739966931, + "flos": 21542701921920.0, + "grad_norm": 1.6291551231975308, + "language_loss": 0.79094279, + "learning_rate": 3.877587952519672e-06, + "loss": 0.81265527, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.74609375, + "step": 2305, + "time_per_iteration": 2.406261920928955 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01072228, + "balance_loss_clip": 1.0230664, + "balance_loss_mlp": 1.02455163, + "epoch": 0.13864422065233728, + "flos": 21578837045760.0, + "grad_norm": 1.6828507129835466, + "language_loss": 0.89839327, + "learning_rate": 3.877453755500647e-06, + "loss": 0.92008394, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.72265625, + "step": 2306, + "time_per_iteration": 2.4221625328063965 + }, + { + "auxiliary_loss_clip": 0.01025438, + "auxiliary_loss_mlp": 0.01012873, + "balance_loss_clip": 1.00653148, + "balance_loss_mlp": 1.00638831, + "epoch": 0.13870434390500527, + "flos": 53368861743360.0, + "grad_norm": 0.8895551455015643, + "language_loss": 0.59281766, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61320078, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.06347656, + "router_z_loss_mlp": 0.19042969, + "step": 2307, + "time_per_iteration": 3.0858495235443115 + }, + { + "auxiliary_loss_clip": 0.01101974, + "auxiliary_loss_mlp": 0.01079981, + "balance_loss_clip": 1.02624106, + "balance_loss_mlp": 1.02611279, + "epoch": 0.13876446715767324, + "flos": 22564784240640.0, + "grad_norm": 1.761154039198625, + "language_loss": 0.81950474, + "learning_rate": 3.877185147887984e-06, + "loss": 0.84132427, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7578125, + "step": 2308, + "time_per_iteration": 2.41827654838562 + }, + { + "auxiliary_loss_clip": 0.01095422, + "auxiliary_loss_mlp": 0.01066286, + "balance_loss_clip": 1.01648068, + "balance_loss_mlp": 1.02429426, + "epoch": 0.1388245904103412, + "flos": 20704157953920.0, + "grad_norm": 2.0389684100065417, + "language_loss": 0.80894417, + "learning_rate": 3.877050737304533e-06, + "loss": 0.83056128, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.7109375, + "step": 2309, + "time_per_iteration": 3.811041831970215 + }, + { + "auxiliary_loss_clip": 0.01103964, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_clip": 1.01579809, + "balance_loss_mlp": 1.02735353, + "epoch": 0.13888471366300917, + "flos": 20553787261440.0, + "grad_norm": 1.9623723176277659, + "language_loss": 0.69932956, + "learning_rate": 3.876916255543129e-06, + "loss": 0.72104478, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.765625, + "step": 2310, + "time_per_iteration": 3.8001632690429688 + }, + { + "auxiliary_loss_clip": 0.01097431, + "auxiliary_loss_mlp": 0.01074428, + "balance_loss_clip": 1.02552867, + "balance_loss_mlp": 1.02534401, + "epoch": 0.13894483691567713, + "flos": 13837370993280.0, + "grad_norm": 1.881604913232477, + "language_loss": 0.85570848, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.8774271, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.71875, + "step": 2311, + "time_per_iteration": 2.357818603515625 + }, + { + "auxiliary_loss_clip": 0.01103614, + "auxiliary_loss_mlp": 0.01083903, + "balance_loss_clip": 1.02684927, + "balance_loss_mlp": 1.02703547, + "epoch": 0.1390049601683451, + "flos": 28030123382400.0, + "grad_norm": 2.235906768874491, + "language_loss": 0.83865625, + "learning_rate": 3.876647078506866e-06, + "loss": 0.86053145, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.765625, + "step": 2312, + "time_per_iteration": 3.8730225563049316 + }, + { + "auxiliary_loss_clip": 0.01103618, + "auxiliary_loss_mlp": 0.01080113, + "balance_loss_clip": 1.02773273, + "balance_loss_mlp": 1.02786899, + "epoch": 0.13906508342101306, + "flos": 26755758512640.0, + "grad_norm": 1.672791128299684, + "language_loss": 0.88396561, + "learning_rate": 3.876512383242215e-06, + "loss": 0.90580297, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7578125, + "step": 2313, + "time_per_iteration": 2.4380085468292236 + }, + { + "auxiliary_loss_clip": 0.01099907, + "auxiliary_loss_mlp": 0.01082395, + "balance_loss_clip": 1.03056276, + "balance_loss_mlp": 1.02565956, + "epoch": 0.13912520667368106, + "flos": 24533955544320.0, + "grad_norm": 1.8049775069403353, + "language_loss": 0.81867188, + "learning_rate": 3.876377616820024e-06, + "loss": 0.84049487, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.7421875, + "step": 2314, + "time_per_iteration": 2.4388504028320312 + }, + { + "auxiliary_loss_clip": 0.01100529, + "auxiliary_loss_mlp": 0.01074857, + "balance_loss_clip": 1.02311993, + "balance_loss_mlp": 1.02689791, + "epoch": 0.13918532992634902, + "flos": 19382416502400.0, + "grad_norm": 2.329975764577655, + "language_loss": 0.88713676, + "learning_rate": 3.876242779245409e-06, + "loss": 0.9088906, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.734375, + "step": 2315, + "time_per_iteration": 2.3557851314544678 + }, + { + "auxiliary_loss_clip": 0.01098722, + "auxiliary_loss_mlp": 0.01075573, + "balance_loss_clip": 1.02636361, + "balance_loss_mlp": 1.02541173, + "epoch": 0.139245453179017, + "flos": 21322714245120.0, + "grad_norm": 4.266012427524393, + "language_loss": 0.79235256, + "learning_rate": 3.876107870523477e-06, + "loss": 0.8140955, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.734375, + "step": 2316, + "time_per_iteration": 2.399505376815796 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01077135, + "balance_loss_clip": 1.02554083, + "balance_loss_mlp": 1.02741385, + "epoch": 0.13930557643168495, + "flos": 19499584625280.0, + "grad_norm": 1.6705724494023786, + "language_loss": 0.78873736, + "learning_rate": 3.875972890659349e-06, + "loss": 0.81050324, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.71875, + "step": 2317, + "time_per_iteration": 2.4020607471466064 + }, + { + "auxiliary_loss_clip": 0.01100838, + "auxiliary_loss_mlp": 0.01066524, + "balance_loss_clip": 1.0191021, + "balance_loss_mlp": 1.02721941, + "epoch": 0.13936569968435292, + "flos": 25409647065600.0, + "grad_norm": 2.0362172739515687, + "language_loss": 0.82685184, + "learning_rate": 3.875837839658139e-06, + "loss": 0.84852552, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.734375, + "step": 2318, + "time_per_iteration": 2.433493137359619 + }, + { + "auxiliary_loss_clip": 0.01020803, + "auxiliary_loss_mlp": 0.01015984, + "balance_loss_clip": 1.01021445, + "balance_loss_mlp": 1.00301051, + "epoch": 0.13942582293702088, + "flos": 70767372481920.0, + "grad_norm": 0.8638124709776664, + "language_loss": 0.59128332, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61165118, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.17773438, + "step": 2319, + "time_per_iteration": 3.064567804336548 + }, + { + "auxiliary_loss_clip": 0.01099857, + "auxiliary_loss_mlp": 0.01070451, + "balance_loss_clip": 1.02174187, + "balance_loss_mlp": 1.0253098, + "epoch": 0.13948594618968888, + "flos": 35589412627200.0, + "grad_norm": 2.179385314317167, + "language_loss": 0.68977374, + "learning_rate": 3.875567524264967e-06, + "loss": 0.7114768, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.74609375, + "step": 2320, + "time_per_iteration": 2.5087361335754395 + }, + { + "auxiliary_loss_clip": 0.01096609, + "auxiliary_loss_mlp": 0.01067206, + "balance_loss_clip": 1.02126241, + "balance_loss_mlp": 1.02520037, + "epoch": 0.13954606944235684, + "flos": 21104157934080.0, + "grad_norm": 1.7929222526692348, + "language_loss": 0.72406977, + "learning_rate": 3.875432259883256e-06, + "loss": 0.74570793, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.71484375, + "step": 2321, + "time_per_iteration": 2.3905608654022217 + }, + { + "auxiliary_loss_clip": 0.01098514, + "auxiliary_loss_mlp": 0.01072776, + "balance_loss_clip": 1.0253073, + "balance_loss_mlp": 1.02546048, + "epoch": 0.1396061926950248, + "flos": 25043303502720.0, + "grad_norm": 1.823688269110296, + "language_loss": 0.87624943, + "learning_rate": 3.875296924384965e-06, + "loss": 0.89796227, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.73046875, + "step": 2322, + "time_per_iteration": 2.444798469543457 + }, + { + "auxiliary_loss_clip": 0.01093817, + "auxiliary_loss_mlp": 0.01067472, + "balance_loss_clip": 1.02527153, + "balance_loss_mlp": 1.02503991, + "epoch": 0.13966631594769277, + "flos": 37632495012480.0, + "grad_norm": 1.679641955649151, + "language_loss": 0.68735874, + "learning_rate": 3.875161517775226e-06, + "loss": 0.70897162, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6875, + "step": 2323, + "time_per_iteration": 2.5323569774627686 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.0106864, + "balance_loss_clip": 1.01878691, + "balance_loss_mlp": 1.02811384, + "epoch": 0.13972643920036074, + "flos": 16690053962880.0, + "grad_norm": 2.001042786560693, + "language_loss": 0.92948622, + "learning_rate": 3.875026040059175e-06, + "loss": 0.95123661, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.78125, + "step": 2324, + "time_per_iteration": 2.3752689361572266 + }, + { + "auxiliary_loss_clip": 0.01099865, + "auxiliary_loss_mlp": 0.01078059, + "balance_loss_clip": 1.02706146, + "balance_loss_mlp": 1.02509928, + "epoch": 0.1397865624530287, + "flos": 23329940797440.0, + "grad_norm": 3.020303081202028, + "language_loss": 0.73696572, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.75874496, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2325, + "time_per_iteration": 2.39174485206604 + }, + { + "auxiliary_loss_clip": 0.01098923, + "auxiliary_loss_mlp": 0.01075597, + "balance_loss_clip": 1.02738833, + "balance_loss_mlp": 1.02781367, + "epoch": 0.13984668570569667, + "flos": 22777370709120.0, + "grad_norm": 1.8625049111790701, + "language_loss": 0.83402061, + "learning_rate": 3.874754871328688e-06, + "loss": 0.85576576, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.7109375, + "step": 2326, + "time_per_iteration": 2.4044384956359863 + }, + { + "auxiliary_loss_clip": 0.01096732, + "auxiliary_loss_mlp": 0.01071019, + "balance_loss_clip": 1.02586281, + "balance_loss_mlp": 1.0266856, + "epoch": 0.13990680895836466, + "flos": 19463519324160.0, + "grad_norm": 1.797039911333627, + "language_loss": 0.90511805, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9267956, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.69921875, + "step": 2327, + "time_per_iteration": 2.3899290561676025 + }, + { + "auxiliary_loss_clip": 0.01096659, + "auxiliary_loss_mlp": 0.01070155, + "balance_loss_clip": 1.02347279, + "balance_loss_mlp": 1.02642465, + "epoch": 0.13996693221103262, + "flos": 20302237848960.0, + "grad_norm": 1.9157399043675598, + "language_loss": 0.86463577, + "learning_rate": 3.874483418234632e-06, + "loss": 0.8863039, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.703125, + "step": 2328, + "time_per_iteration": 2.3760221004486084 + }, + { + "auxiliary_loss_clip": 0.01097616, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_clip": 1.02168345, + "balance_loss_mlp": 1.02462769, + "epoch": 0.1400270554637006, + "flos": 26616419809920.0, + "grad_norm": 1.5674418082209531, + "language_loss": 0.75268614, + "learning_rate": 3.874347585064131e-06, + "loss": 0.77435005, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.73046875, + "step": 2329, + "time_per_iteration": 2.4418728351593018 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01065353, + "balance_loss_clip": 1.017097, + "balance_loss_mlp": 1.02543378, + "epoch": 0.14008717871636855, + "flos": 19390446115200.0, + "grad_norm": 1.8090363478201983, + "language_loss": 0.80433857, + "learning_rate": 3.874211680818183e-06, + "loss": 0.82597315, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.7265625, + "step": 2330, + "time_per_iteration": 2.384094715118408 + }, + { + "auxiliary_loss_clip": 0.01095153, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_clip": 1.02355027, + "balance_loss_mlp": 1.0243454, + "epoch": 0.14014730196903652, + "flos": 15303373649280.0, + "grad_norm": 3.1018054971729954, + "language_loss": 0.74080718, + "learning_rate": 3.87407570550194e-06, + "loss": 0.76245099, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.7109375, + "step": 2331, + "time_per_iteration": 2.363744020462036 + }, + { + "auxiliary_loss_clip": 0.01092016, + "auxiliary_loss_mlp": 0.01071282, + "balance_loss_clip": 1.02977347, + "balance_loss_mlp": 1.02559328, + "epoch": 0.14020742522170448, + "flos": 14938810565760.0, + "grad_norm": 1.60475532013002, + "language_loss": 0.74623108, + "learning_rate": 3.873939659120557e-06, + "loss": 0.76786405, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6640625, + "step": 2332, + "time_per_iteration": 2.436201810836792 + }, + { + "auxiliary_loss_clip": 0.01022133, + "auxiliary_loss_mlp": 0.01012344, + "balance_loss_clip": 1.00652611, + "balance_loss_mlp": 1.00351977, + "epoch": 0.14026754847437245, + "flos": 48822017316480.0, + "grad_norm": 0.8487859216876602, + "language_loss": 0.56244385, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58278859, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.05810547, + "router_z_loss_mlp": 0.18554688, + "step": 2333, + "time_per_iteration": 2.8638577461242676 + }, + { + "auxiliary_loss_clip": 0.01096487, + "auxiliary_loss_mlp": 0.01075171, + "balance_loss_clip": 1.02875125, + "balance_loss_mlp": 1.02506387, + "epoch": 0.14032767172704044, + "flos": 25772150378880.0, + "grad_norm": 1.5914010635479372, + "language_loss": 0.8360281, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85774463, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.71484375, + "step": 2334, + "time_per_iteration": 2.4287009239196777 + }, + { + "auxiliary_loss_clip": 0.01094514, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_clip": 1.02914596, + "balance_loss_mlp": 1.02508903, + "epoch": 0.1403877949797084, + "flos": 21215216568960.0, + "grad_norm": 1.6071457497797799, + "language_loss": 0.81850404, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.84016401, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6953125, + "step": 2335, + "time_per_iteration": 2.417670249938965 + }, + { + "auxiliary_loss_clip": 0.01097257, + "auxiliary_loss_mlp": 0.01077199, + "balance_loss_clip": 1.02741706, + "balance_loss_mlp": 1.02569258, + "epoch": 0.14044791823237637, + "flos": 22746856314240.0, + "grad_norm": 1.6337788034437608, + "language_loss": 0.83876276, + "learning_rate": 3.873394763046862e-06, + "loss": 0.86050731, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.71484375, + "step": 2336, + "time_per_iteration": 2.4388797283172607 + }, + { + "auxiliary_loss_clip": 0.01096473, + "auxiliary_loss_mlp": 0.01077751, + "balance_loss_clip": 1.03276134, + "balance_loss_mlp": 1.02563655, + "epoch": 0.14050804148504434, + "flos": 22963387766400.0, + "grad_norm": 1.6690502699344472, + "language_loss": 0.81863654, + "learning_rate": 3.873258361417225e-06, + "loss": 0.84037876, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.70703125, + "step": 2337, + "time_per_iteration": 2.399803400039673 + }, + { + "auxiliary_loss_clip": 0.01096852, + "auxiliary_loss_mlp": 0.01080153, + "balance_loss_clip": 1.03435278, + "balance_loss_mlp": 1.0252521, + "epoch": 0.1405681647377123, + "flos": 22199243639040.0, + "grad_norm": 3.618651933775152, + "language_loss": 0.81271327, + "learning_rate": 3.873121888753442e-06, + "loss": 0.83448327, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.71484375, + "step": 2338, + "time_per_iteration": 2.4098405838012695 + }, + { + "auxiliary_loss_clip": 0.01099849, + "auxiliary_loss_mlp": 0.01087712, + "balance_loss_clip": 1.03735852, + "balance_loss_mlp": 1.02764118, + "epoch": 0.14062828799038027, + "flos": 23731651434240.0, + "grad_norm": 2.298099449912599, + "language_loss": 0.83314776, + "learning_rate": 3.87298534506069e-06, + "loss": 0.85502338, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.72265625, + "step": 2339, + "time_per_iteration": 2.41626238822937 + }, + { + "auxiliary_loss_clip": 0.01095892, + "auxiliary_loss_mlp": 0.01074312, + "balance_loss_clip": 1.02879786, + "balance_loss_mlp": 1.02549624, + "epoch": 0.14068841124304826, + "flos": 39200933197440.0, + "grad_norm": 1.7317404394200744, + "language_loss": 0.67206317, + "learning_rate": 3.872848730344146e-06, + "loss": 0.69376522, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.703125, + "step": 2340, + "time_per_iteration": 2.556863784790039 + }, + { + "auxiliary_loss_clip": 0.01093781, + "auxiliary_loss_mlp": 0.01067871, + "balance_loss_clip": 1.02552819, + "balance_loss_mlp": 1.02686203, + "epoch": 0.14074853449571623, + "flos": 20191283948160.0, + "grad_norm": 2.5793686109431158, + "language_loss": 0.81785798, + "learning_rate": 3.87271204460899e-06, + "loss": 0.83947456, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.66796875, + "step": 2341, + "time_per_iteration": 2.4176712036132812 + }, + { + "auxiliary_loss_clip": 0.01093383, + "auxiliary_loss_mlp": 0.01070137, + "balance_loss_clip": 1.02390778, + "balance_loss_mlp": 1.02490532, + "epoch": 0.1408086577483842, + "flos": 18404882945280.0, + "grad_norm": 1.8573064688021232, + "language_loss": 0.83149493, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.8531301, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.68359375, + "step": 2342, + "time_per_iteration": 2.3970870971679688 + }, + { + "auxiliary_loss_clip": 0.01090687, + "auxiliary_loss_mlp": 0.0106508, + "balance_loss_clip": 1.02297568, + "balance_loss_mlp": 1.02588344, + "epoch": 0.14086878100105216, + "flos": 25263430824960.0, + "grad_norm": 2.3744095516360826, + "language_loss": 0.80615437, + "learning_rate": 3.87243846010358e-06, + "loss": 0.82771206, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6484375, + "step": 2343, + "time_per_iteration": 2.413259506225586 + }, + { + "auxiliary_loss_clip": 0.01021274, + "auxiliary_loss_mlp": 0.01014568, + "balance_loss_clip": 1.00846434, + "balance_loss_mlp": 1.00476551, + "epoch": 0.14092890425372012, + "flos": 65975194730880.0, + "grad_norm": 0.8386368192521199, + "language_loss": 0.61639196, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63675034, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.06103516, + "router_z_loss_mlp": 0.16503906, + "step": 2344, + "time_per_iteration": 2.957770347595215 + }, + { + "auxiliary_loss_clip": 0.01091914, + "auxiliary_loss_mlp": 0.01057357, + "balance_loss_clip": 1.01525259, + "balance_loss_mlp": 1.02395976, + "epoch": 0.1409890275063881, + "flos": 23693875476480.0, + "grad_norm": 1.7217596055612878, + "language_loss": 0.66969234, + "learning_rate": 3.872164591585956e-06, + "loss": 0.691185, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6796875, + "step": 2345, + "time_per_iteration": 2.398142099380493 + }, + { + "auxiliary_loss_clip": 0.01099441, + "auxiliary_loss_mlp": 0.01064012, + "balance_loss_clip": 1.01682913, + "balance_loss_mlp": 1.02614999, + "epoch": 0.14104915075905605, + "flos": 23622024165120.0, + "grad_norm": 2.849842550456899, + "language_loss": 0.77279401, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.79442847, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.73046875, + "step": 2346, + "time_per_iteration": 2.4196252822875977 + }, + { + "auxiliary_loss_clip": 0.01097379, + "auxiliary_loss_mlp": 0.01069531, + "balance_loss_clip": 1.0225383, + "balance_loss_mlp": 1.02661347, + "epoch": 0.14110927401172405, + "flos": 20594111748480.0, + "grad_norm": 1.8907999068220207, + "language_loss": 0.7887587, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.81042778, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.7109375, + "step": 2347, + "time_per_iteration": 2.380185127258301 + }, + { + "auxiliary_loss_clip": 0.01096654, + "auxiliary_loss_mlp": 0.01067499, + "balance_loss_clip": 1.02100706, + "balance_loss_mlp": 1.02524769, + "epoch": 0.141169397264392, + "flos": 28546802726400.0, + "grad_norm": 2.0447175396747257, + "language_loss": 0.78037524, + "learning_rate": 3.8717532563775e-06, + "loss": 0.80201674, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.71484375, + "step": 2348, + "time_per_iteration": 3.87050461769104 + }, + { + "auxiliary_loss_clip": 0.01094176, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.01559746, + "balance_loss_mlp": 1.02532113, + "epoch": 0.14122952051705998, + "flos": 17091310752000.0, + "grad_norm": 1.62365409890781, + "language_loss": 0.88682103, + "learning_rate": 3.871616002680272e-06, + "loss": 0.90837222, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.6875, + "step": 2349, + "time_per_iteration": 3.808807611465454 + }, + { + "auxiliary_loss_clip": 0.01094341, + "auxiliary_loss_mlp": 0.01063383, + "balance_loss_clip": 1.01810718, + "balance_loss_mlp": 1.02666235, + "epoch": 0.14128964376972794, + "flos": 28945615720320.0, + "grad_norm": 1.609767530616754, + "language_loss": 0.90336883, + "learning_rate": 3.871478678011177e-06, + "loss": 0.92494613, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.6796875, + "step": 2350, + "time_per_iteration": 2.4457435607910156 + }, + { + "auxiliary_loss_clip": 0.01099722, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_clip": 1.02263784, + "balance_loss_mlp": 1.02827561, + "epoch": 0.1413497670223959, + "flos": 18988770389760.0, + "grad_norm": 1.7840272272770248, + "language_loss": 0.82614648, + "learning_rate": 3.871341282375423e-06, + "loss": 0.84785342, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.71484375, + "step": 2351, + "time_per_iteration": 3.763456106185913 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01072854, + "balance_loss_clip": 1.02726793, + "balance_loss_mlp": 1.02765095, + "epoch": 0.14140989027506387, + "flos": 29860933501440.0, + "grad_norm": 2.0538363623413867, + "language_loss": 0.86355698, + "learning_rate": 3.871203815778219e-06, + "loss": 0.88527393, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.7109375, + "step": 2352, + "time_per_iteration": 3.950869083404541 + }, + { + "auxiliary_loss_clip": 0.01021319, + "auxiliary_loss_mlp": 0.01018357, + "balance_loss_clip": 1.01215804, + "balance_loss_mlp": 1.00365102, + "epoch": 0.14147001352773186, + "flos": 62076303826560.0, + "grad_norm": 1.0290291430437577, + "language_loss": 0.62007737, + "learning_rate": 3.87106627822478e-06, + "loss": 0.6404742, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.06176758, + "router_z_loss_mlp": 0.17675781, + "step": 2353, + "time_per_iteration": 2.9648189544677734 + }, + { + "auxiliary_loss_clip": 0.01092072, + "auxiliary_loss_mlp": 0.0106999, + "balance_loss_clip": 1.02743196, + "balance_loss_mlp": 1.02592862, + "epoch": 0.14153013678039983, + "flos": 22016438426880.0, + "grad_norm": 2.114365724356601, + "language_loss": 0.88850433, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.9101249, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6640625, + "step": 2354, + "time_per_iteration": 2.4362807273864746 + }, + { + "auxiliary_loss_clip": 0.01095519, + "auxiliary_loss_mlp": 0.01065752, + "balance_loss_clip": 1.01880741, + "balance_loss_mlp": 1.02513134, + "epoch": 0.1415902600330678, + "flos": 19719048631680.0, + "grad_norm": 2.4738266875671293, + "language_loss": 0.76584566, + "learning_rate": 3.870790990270057e-06, + "loss": 0.78745842, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.703125, + "step": 2355, + "time_per_iteration": 2.393272876739502 + }, + { + "auxiliary_loss_clip": 0.01021171, + "auxiliary_loss_mlp": 0.01005469, + "balance_loss_clip": 0.99903131, + "balance_loss_mlp": 1.0038414, + "epoch": 0.14165038328573576, + "flos": 65897862336000.0, + "grad_norm": 0.6905474049446052, + "language_loss": 0.52014923, + "learning_rate": 3.870653239879212e-06, + "loss": 0.54041564, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.06445312, + "router_z_loss_mlp": 0.17382812, + "step": 2356, + "time_per_iteration": 2.9643349647521973 + }, + { + "auxiliary_loss_clip": 0.01095071, + "auxiliary_loss_mlp": 0.01067043, + "balance_loss_clip": 1.02305484, + "balance_loss_mlp": 1.02627206, + "epoch": 0.14171050653840372, + "flos": 12129349726080.0, + "grad_norm": 3.44367815537643, + "language_loss": 0.72895277, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.75057387, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.6875, + "step": 2357, + "time_per_iteration": 2.405557870864868 + }, + { + "auxiliary_loss_clip": 0.01097603, + "auxiliary_loss_mlp": 0.01068252, + "balance_loss_clip": 1.02228522, + "balance_loss_mlp": 1.02484655, + "epoch": 0.1417706297910717, + "flos": 20411446181760.0, + "grad_norm": 1.9540989193968346, + "language_loss": 0.84554189, + "learning_rate": 3.870377526296674e-06, + "loss": 0.86720049, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.7265625, + "step": 2358, + "time_per_iteration": 2.4183664321899414 + }, + { + "auxiliary_loss_clip": 0.0109973, + "auxiliary_loss_mlp": 0.0106999, + "balance_loss_clip": 1.02340364, + "balance_loss_mlp": 1.02662396, + "epoch": 0.14183075304373965, + "flos": 22379570144640.0, + "grad_norm": 2.1284243629906143, + "language_loss": 0.73992664, + "learning_rate": 3.870239563115436e-06, + "loss": 0.76162386, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.73046875, + "step": 2359, + "time_per_iteration": 2.397797107696533 + }, + { + "auxiliary_loss_clip": 0.01096407, + "auxiliary_loss_mlp": 0.0106192, + "balance_loss_clip": 1.02038801, + "balance_loss_mlp": 1.02701902, + "epoch": 0.14189087629640765, + "flos": 21579814563840.0, + "grad_norm": 2.0206623757932056, + "language_loss": 0.78634334, + "learning_rate": 3.870101529014526e-06, + "loss": 0.8079266, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6953125, + "step": 2360, + "time_per_iteration": 2.4997646808624268 + }, + { + "auxiliary_loss_clip": 0.01095137, + "auxiliary_loss_mlp": 0.01065257, + "balance_loss_clip": 1.01785898, + "balance_loss_mlp": 1.02696097, + "epoch": 0.1419509995490756, + "flos": 20007605952000.0, + "grad_norm": 2.0651959770895125, + "language_loss": 0.84261692, + "learning_rate": 3.869963423999178e-06, + "loss": 0.86422086, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6796875, + "step": 2361, + "time_per_iteration": 2.389340400695801 + }, + { + "auxiliary_loss_clip": 0.0109568, + "auxiliary_loss_mlp": 0.01064148, + "balance_loss_clip": 1.0196352, + "balance_loss_mlp": 1.02720356, + "epoch": 0.14201112280174358, + "flos": 31940116099200.0, + "grad_norm": 1.8583325644120083, + "language_loss": 0.7663433, + "learning_rate": 3.86982524807463e-06, + "loss": 0.78794158, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.68359375, + "step": 2362, + "time_per_iteration": 2.5709047317504883 + }, + { + "auxiliary_loss_clip": 0.0109715, + "auxiliary_loss_mlp": 0.01063262, + "balance_loss_clip": 1.01753342, + "balance_loss_mlp": 1.02890205, + "epoch": 0.14207124605441154, + "flos": 41462536982400.0, + "grad_norm": 1.8974042724551525, + "language_loss": 0.75918436, + "learning_rate": 3.869687001246122e-06, + "loss": 0.78078848, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.68359375, + "step": 2363, + "time_per_iteration": 2.5720465183258057 + }, + { + "auxiliary_loss_clip": 0.01093469, + "auxiliary_loss_mlp": 0.01062822, + "balance_loss_clip": 1.02055085, + "balance_loss_mlp": 1.02511299, + "epoch": 0.1421313693070795, + "flos": 31903736595840.0, + "grad_norm": 1.949011831226733, + "language_loss": 0.74746877, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.76903164, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.68359375, + "step": 2364, + "time_per_iteration": 2.5085959434509277 + }, + { + "auxiliary_loss_clip": 0.01092967, + "auxiliary_loss_mlp": 0.0105875, + "balance_loss_clip": 1.01941097, + "balance_loss_mlp": 1.02714086, + "epoch": 0.14219149255974747, + "flos": 26869924258560.0, + "grad_norm": 1.8162505330910113, + "language_loss": 0.92309189, + "learning_rate": 3.869410294898195e-06, + "loss": 0.94460905, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.65625, + "step": 2365, + "time_per_iteration": 2.4344723224639893 + }, + { + "auxiliary_loss_clip": 0.01097296, + "auxiliary_loss_mlp": 0.01068953, + "balance_loss_clip": 1.0199821, + "balance_loss_mlp": 1.02694428, + "epoch": 0.14225161581241544, + "flos": 27453183298560.0, + "grad_norm": 1.6909320163029966, + "language_loss": 0.6732766, + "learning_rate": 3.869271835389268e-06, + "loss": 0.69493914, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.703125, + "step": 2366, + "time_per_iteration": 2.461634874343872 + }, + { + "auxiliary_loss_clip": 0.01093072, + "auxiliary_loss_mlp": 0.01061414, + "balance_loss_clip": 1.01713991, + "balance_loss_mlp": 1.02502322, + "epoch": 0.14231173906508343, + "flos": 10560667161600.0, + "grad_norm": 1.8002024462817556, + "language_loss": 0.83267176, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.85421658, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.6796875, + "step": 2367, + "time_per_iteration": 2.4098618030548096 + }, + { + "auxiliary_loss_clip": 0.01098377, + "auxiliary_loss_mlp": 0.01081762, + "balance_loss_clip": 1.03379166, + "balance_loss_mlp": 1.02722013, + "epoch": 0.1423718623177514, + "flos": 28359773239680.0, + "grad_norm": 1.8686043352427915, + "language_loss": 0.84684205, + "learning_rate": 3.868994703727742e-06, + "loss": 0.8686434, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7109375, + "step": 2368, + "time_per_iteration": 2.4541900157928467 + }, + { + "auxiliary_loss_clip": 0.01094129, + "auxiliary_loss_mlp": 0.01074922, + "balance_loss_clip": 1.02819169, + "balance_loss_mlp": 1.02500486, + "epoch": 0.14243198557041936, + "flos": 19353228739200.0, + "grad_norm": 2.28753091294977, + "language_loss": 0.89359874, + "learning_rate": 3.868856031585652e-06, + "loss": 0.91528922, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.69140625, + "step": 2369, + "time_per_iteration": 2.374861717224121 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.01072857, + "balance_loss_clip": 1.02736664, + "balance_loss_mlp": 1.02430117, + "epoch": 0.14249210882308733, + "flos": 28805508967680.0, + "grad_norm": 1.478099951806457, + "language_loss": 0.76854205, + "learning_rate": 3.868717288576354e-06, + "loss": 0.79024547, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.734375, + "step": 2370, + "time_per_iteration": 2.4790923595428467 + }, + { + "auxiliary_loss_clip": 0.01095028, + "auxiliary_loss_mlp": 0.01063451, + "balance_loss_clip": 1.02017832, + "balance_loss_mlp": 1.02471995, + "epoch": 0.1425522320757553, + "flos": 21833947416960.0, + "grad_norm": 1.5897874487141483, + "language_loss": 0.84612048, + "learning_rate": 3.868578474705109e-06, + "loss": 0.86770523, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.703125, + "step": 2371, + "time_per_iteration": 2.3977749347686768 + }, + { + "auxiliary_loss_clip": 0.01099014, + "auxiliary_loss_mlp": 0.01070489, + "balance_loss_clip": 1.02457011, + "balance_loss_mlp": 1.02698374, + "epoch": 0.14261235532842326, + "flos": 17310495467520.0, + "grad_norm": 2.185422774035855, + "language_loss": 0.84390569, + "learning_rate": 3.868439589977181e-06, + "loss": 0.86560076, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.71875, + "step": 2372, + "time_per_iteration": 2.388528347015381 + }, + { + "auxiliary_loss_clip": 0.01095989, + "auxiliary_loss_mlp": 0.01074076, + "balance_loss_clip": 1.02839494, + "balance_loss_mlp": 1.02698219, + "epoch": 0.14267247858109125, + "flos": 18805755709440.0, + "grad_norm": 2.3927984742990107, + "language_loss": 0.86746681, + "learning_rate": 3.868300634397836e-06, + "loss": 0.88916743, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6875, + "step": 2373, + "time_per_iteration": 2.379897117614746 + }, + { + "auxiliary_loss_clip": 0.01092757, + "auxiliary_loss_mlp": 0.0107488, + "balance_loss_clip": 1.03117752, + "balance_loss_mlp": 1.0239073, + "epoch": 0.14273260183375922, + "flos": 11358258238080.0, + "grad_norm": 1.9870723132583892, + "language_loss": 0.87381363, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.89548993, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6875, + "step": 2374, + "time_per_iteration": 2.3837428092956543 + }, + { + "auxiliary_loss_clip": 0.01098982, + "auxiliary_loss_mlp": 0.01066802, + "balance_loss_clip": 1.01871264, + "balance_loss_mlp": 1.02655721, + "epoch": 0.14279272508642718, + "flos": 27566336615040.0, + "grad_norm": 1.7779997785714894, + "language_loss": 0.80318689, + "learning_rate": 3.868022510705977e-06, + "loss": 0.82484472, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.72265625, + "step": 2375, + "time_per_iteration": 2.462386131286621 + }, + { + "auxiliary_loss_clip": 0.01096006, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.02670288, + "epoch": 0.14285284833909515, + "flos": 16251649620480.0, + "grad_norm": 2.201692122505234, + "language_loss": 0.79072428, + "learning_rate": 3.867883342604009e-06, + "loss": 0.81244844, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.6953125, + "step": 2376, + "time_per_iteration": 2.3885726928710938 + }, + { + "auxiliary_loss_clip": 0.01094216, + "auxiliary_loss_mlp": 0.01066541, + "balance_loss_clip": 1.01940584, + "balance_loss_mlp": 1.02493668, + "epoch": 0.1429129715917631, + "flos": 19754590262400.0, + "grad_norm": 1.74923658613742, + "language_loss": 0.9503786, + "learning_rate": 3.867744103671717e-06, + "loss": 0.97198617, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6953125, + "step": 2377, + "time_per_iteration": 2.4360718727111816 + }, + { + "auxiliary_loss_clip": 0.01095697, + "auxiliary_loss_mlp": 0.01063613, + "balance_loss_clip": 1.0174315, + "balance_loss_mlp": 1.0253408, + "epoch": 0.14297309484443108, + "flos": 21136173517440.0, + "grad_norm": 1.936924493031441, + "language_loss": 0.93021452, + "learning_rate": 3.867604793914382e-06, + "loss": 0.95180768, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.703125, + "step": 2378, + "time_per_iteration": 2.403214693069458 + }, + { + "auxiliary_loss_clip": 0.01095994, + "auxiliary_loss_mlp": 0.01056035, + "balance_loss_clip": 1.01276231, + "balance_loss_mlp": 1.02547026, + "epoch": 0.14303321809709904, + "flos": 23585539927680.0, + "grad_norm": 1.6473067291678978, + "language_loss": 0.76051378, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.78203404, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.703125, + "step": 2379, + "time_per_iteration": 2.405268430709839 + }, + { + "auxiliary_loss_clip": 0.01095794, + "auxiliary_loss_mlp": 0.01065766, + "balance_loss_clip": 1.0201329, + "balance_loss_mlp": 1.02556729, + "epoch": 0.14309334134976703, + "flos": 15887365827840.0, + "grad_norm": 1.9383256615331015, + "language_loss": 0.81375277, + "learning_rate": 3.867325961945714e-06, + "loss": 0.83536839, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.703125, + "step": 2380, + "time_per_iteration": 2.386484384536743 + }, + { + "auxiliary_loss_clip": 0.01097447, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_clip": 1.02341223, + "balance_loss_mlp": 1.02776682, + "epoch": 0.143153464602435, + "flos": 16324687918080.0, + "grad_norm": 2.0694957076498848, + "language_loss": 0.90646529, + "learning_rate": 3.867186439744955e-06, + "loss": 0.92811877, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6953125, + "step": 2381, + "time_per_iteration": 2.3853209018707275 + }, + { + "auxiliary_loss_clip": 0.01093604, + "auxiliary_loss_mlp": 0.01065668, + "balance_loss_clip": 1.02072573, + "balance_loss_mlp": 1.02596939, + "epoch": 0.14321358785510296, + "flos": 17091136195200.0, + "grad_norm": 2.11987394464071, + "language_loss": 0.7805512, + "learning_rate": 3.867046846740299e-06, + "loss": 0.80214387, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.67578125, + "step": 2382, + "time_per_iteration": 2.452282667160034 + }, + { + "auxiliary_loss_clip": 0.01096066, + "auxiliary_loss_mlp": 0.0106552, + "balance_loss_clip": 1.02205586, + "balance_loss_mlp": 1.02564335, + "epoch": 0.14327371110777093, + "flos": 26321718090240.0, + "grad_norm": 2.099294418313181, + "language_loss": 0.79235601, + "learning_rate": 3.866907182937039e-06, + "loss": 0.81397188, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.703125, + "step": 2383, + "time_per_iteration": 2.4532601833343506 + }, + { + "auxiliary_loss_clip": 0.01097324, + "auxiliary_loss_mlp": 0.01069447, + "balance_loss_clip": 1.02076149, + "balance_loss_mlp": 1.02640915, + "epoch": 0.1433338343604389, + "flos": 18075512378880.0, + "grad_norm": 2.117007521699034, + "language_loss": 0.90278947, + "learning_rate": 3.866767448340471e-06, + "loss": 0.92445719, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.7109375, + "step": 2384, + "time_per_iteration": 2.372790813446045 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01069744, + "balance_loss_clip": 1.02213192, + "balance_loss_mlp": 1.02633286, + "epoch": 0.14339395761310686, + "flos": 15521895048960.0, + "grad_norm": 2.0509814875596635, + "language_loss": 0.83245885, + "learning_rate": 3.866627642955895e-06, + "loss": 0.85414255, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.72265625, + "step": 2385, + "time_per_iteration": 2.35886287689209 + }, + { + "auxiliary_loss_clip": 0.01093618, + "auxiliary_loss_mlp": 0.01070739, + "balance_loss_clip": 1.02617836, + "balance_loss_mlp": 1.02287889, + "epoch": 0.14345408086577485, + "flos": 28547500953600.0, + "grad_norm": 1.8978318070862894, + "language_loss": 0.7692064, + "learning_rate": 3.866487766788612e-06, + "loss": 0.79084992, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.70703125, + "step": 2386, + "time_per_iteration": 2.4738097190856934 + }, + { + "auxiliary_loss_clip": 0.01095182, + "auxiliary_loss_mlp": 0.01061859, + "balance_loss_clip": 1.01720345, + "balance_loss_mlp": 1.02543402, + "epoch": 0.14351420411844282, + "flos": 20229024994560.0, + "grad_norm": 2.093115263853797, + "language_loss": 0.79714584, + "learning_rate": 3.866347819843925e-06, + "loss": 0.81871629, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6953125, + "step": 2387, + "time_per_iteration": 2.3994054794311523 + }, + { + "auxiliary_loss_clip": 0.01095333, + "auxiliary_loss_mlp": 0.01068445, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.02590656, + "epoch": 0.14357432737111078, + "flos": 19864008063360.0, + "grad_norm": 1.9367702883266693, + "language_loss": 0.84171307, + "learning_rate": 3.866207802127143e-06, + "loss": 0.86335087, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.6953125, + "step": 2388, + "time_per_iteration": 4.096855878829956 + }, + { + "auxiliary_loss_clip": 0.01096327, + "auxiliary_loss_mlp": 0.01071522, + "balance_loss_clip": 1.0276053, + "balance_loss_mlp": 1.02511561, + "epoch": 0.14363445062377875, + "flos": 28255557231360.0, + "grad_norm": 2.205635866942516, + "language_loss": 0.84409058, + "learning_rate": 3.866067713643573e-06, + "loss": 0.86576909, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.7109375, + "step": 2389, + "time_per_iteration": 4.014849901199341 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01080092, + "balance_loss_clip": 1.03264642, + "balance_loss_mlp": 1.02659345, + "epoch": 0.1436945738764467, + "flos": 18185698229760.0, + "grad_norm": 1.8211244725591191, + "language_loss": 0.85047162, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.87227136, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.734375, + "step": 2390, + "time_per_iteration": 2.44083833694458 + }, + { + "auxiliary_loss_clip": 0.01097296, + "auxiliary_loss_mlp": 0.01065515, + "balance_loss_clip": 1.02164602, + "balance_loss_mlp": 1.02640152, + "epoch": 0.14375469712911468, + "flos": 27306687767040.0, + "grad_norm": 1.640918566031728, + "language_loss": 0.76235723, + "learning_rate": 3.865787324397324e-06, + "loss": 0.78398538, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.7109375, + "step": 2391, + "time_per_iteration": 3.837477922439575 + }, + { + "auxiliary_loss_clip": 0.01026283, + "auxiliary_loss_mlp": 0.01016957, + "balance_loss_clip": 1.0124985, + "balance_loss_mlp": 1.00838256, + "epoch": 0.14381482038178264, + "flos": 56888559838080.0, + "grad_norm": 0.8765737300721885, + "language_loss": 0.6193682, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63980055, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.1796875, + "step": 2392, + "time_per_iteration": 4.335782527923584 + }, + { + "auxiliary_loss_clip": 0.01098616, + "auxiliary_loss_mlp": 0.0107467, + "balance_loss_clip": 1.02760637, + "balance_loss_mlp": 1.02492285, + "epoch": 0.14387494363445064, + "flos": 14281326241920.0, + "grad_norm": 2.2812503661024732, + "language_loss": 0.79401863, + "learning_rate": 3.865506652147709e-06, + "loss": 0.81575143, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.73828125, + "step": 2393, + "time_per_iteration": 2.5161001682281494 + }, + { + "auxiliary_loss_clip": 0.01096954, + "auxiliary_loss_mlp": 0.01068664, + "balance_loss_clip": 1.02212477, + "balance_loss_mlp": 1.0256474, + "epoch": 0.1439350668871186, + "flos": 26760262078080.0, + "grad_norm": 1.775653920820062, + "language_loss": 0.78532535, + "learning_rate": 3.865366209909941e-06, + "loss": 0.80698156, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.7109375, + "step": 2394, + "time_per_iteration": 2.609265089035034 + }, + { + "auxiliary_loss_clip": 0.0109491, + "auxiliary_loss_mlp": 0.01070361, + "balance_loss_clip": 1.02191448, + "balance_loss_mlp": 1.0252701, + "epoch": 0.14399519013978657, + "flos": 40698392855040.0, + "grad_norm": 1.6166734686881281, + "language_loss": 0.883066, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.90471876, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.6953125, + "step": 2395, + "time_per_iteration": 2.7156260013580322 + }, + { + "auxiliary_loss_clip": 0.01095757, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_clip": 1.02300513, + "balance_loss_mlp": 1.02708554, + "epoch": 0.14405531339245453, + "flos": 20556510347520.0, + "grad_norm": 1.5522994844666484, + "language_loss": 0.83613944, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85776454, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6875, + "step": 2396, + "time_per_iteration": 2.3968915939331055 + }, + { + "auxiliary_loss_clip": 0.01096141, + "auxiliary_loss_mlp": 0.0106436, + "balance_loss_clip": 1.0203476, + "balance_loss_mlp": 1.02688456, + "epoch": 0.1441154366451225, + "flos": 19571924695680.0, + "grad_norm": 2.2130350559590872, + "language_loss": 0.8561278, + "learning_rate": 3.864944458808712e-06, + "loss": 0.87773287, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.69140625, + "step": 2397, + "time_per_iteration": 2.391301155090332 + }, + { + "auxiliary_loss_clip": 0.0109947, + "auxiliary_loss_mlp": 0.01072523, + "balance_loss_clip": 1.02398133, + "balance_loss_mlp": 1.02649617, + "epoch": 0.14417555989779046, + "flos": 18514719682560.0, + "grad_norm": 1.8560700660300093, + "language_loss": 0.81835431, + "learning_rate": 3.86480373366343e-06, + "loss": 0.84007424, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.7265625, + "step": 2398, + "time_per_iteration": 2.400973320007324 + }, + { + "auxiliary_loss_clip": 0.01095864, + "auxiliary_loss_mlp": 0.01076245, + "balance_loss_clip": 1.03230476, + "balance_loss_mlp": 1.02710867, + "epoch": 0.14423568315045843, + "flos": 26030472595200.0, + "grad_norm": 2.2714374278289355, + "language_loss": 0.67254782, + "learning_rate": 3.864662937804603e-06, + "loss": 0.69426894, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.6875, + "step": 2399, + "time_per_iteration": 2.454521656036377 + }, + { + "auxiliary_loss_clip": 0.01097593, + "auxiliary_loss_mlp": 0.01068732, + "balance_loss_clip": 1.02126265, + "balance_loss_mlp": 1.02788377, + "epoch": 0.14429580640312642, + "flos": 21287661373440.0, + "grad_norm": 1.7685658290954376, + "language_loss": 0.84367967, + "learning_rate": 3.864522071237571e-06, + "loss": 0.86534292, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.6953125, + "step": 2400, + "time_per_iteration": 2.4384162425994873 + }, + { + "auxiliary_loss_clip": 0.01100274, + "auxiliary_loss_mlp": 0.01075201, + "balance_loss_clip": 1.02246308, + "balance_loss_mlp": 1.02673566, + "epoch": 0.14435592965579438, + "flos": 25626737099520.0, + "grad_norm": 1.517662881564376, + "language_loss": 0.76824874, + "learning_rate": 3.864381133967676e-06, + "loss": 0.79000354, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.734375, + "step": 2401, + "time_per_iteration": 2.475950002670288 + }, + { + "auxiliary_loss_clip": 0.01095731, + "auxiliary_loss_mlp": 0.01065891, + "balance_loss_clip": 1.02490687, + "balance_loss_mlp": 1.0267818, + "epoch": 0.14441605290846235, + "flos": 22963981259520.0, + "grad_norm": 1.562949438128985, + "language_loss": 0.82712728, + "learning_rate": 3.86424012600026e-06, + "loss": 0.8487435, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.69140625, + "step": 2402, + "time_per_iteration": 2.4332962036132812 + }, + { + "auxiliary_loss_clip": 0.01097803, + "auxiliary_loss_mlp": 0.01065153, + "balance_loss_clip": 1.01696825, + "balance_loss_mlp": 1.02635479, + "epoch": 0.14447617616113032, + "flos": 17346700414080.0, + "grad_norm": 2.617185989089007, + "language_loss": 0.8596493, + "learning_rate": 3.864099047340673e-06, + "loss": 0.88127881, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.7109375, + "step": 2403, + "time_per_iteration": 2.3692800998687744 + }, + { + "auxiliary_loss_clip": 0.01096001, + "auxiliary_loss_mlp": 0.01066793, + "balance_loss_clip": 1.02294803, + "balance_loss_mlp": 1.02428436, + "epoch": 0.14453629941379828, + "flos": 24059066964480.0, + "grad_norm": 1.6510926838423856, + "language_loss": 0.71366775, + "learning_rate": 3.863957897994262e-06, + "loss": 0.73529565, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.71875, + "step": 2404, + "time_per_iteration": 2.4166979789733887 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01067374, + "balance_loss_clip": 1.02290916, + "balance_loss_mlp": 1.0269078, + "epoch": 0.14459642266646625, + "flos": 14428659646080.0, + "grad_norm": 2.6499992253887146, + "language_loss": 0.7554329, + "learning_rate": 3.863816677966381e-06, + "loss": 0.77707767, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.703125, + "step": 2405, + "time_per_iteration": 2.3652732372283936 + }, + { + "auxiliary_loss_clip": 0.01096215, + "auxiliary_loss_mlp": 0.01067121, + "balance_loss_clip": 1.02501619, + "balance_loss_mlp": 1.02687156, + "epoch": 0.14465654591913424, + "flos": 9866314575360.0, + "grad_norm": 2.115247133715662, + "language_loss": 0.75162393, + "learning_rate": 3.863675387262386e-06, + "loss": 0.77325732, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.69140625, + "step": 2406, + "time_per_iteration": 2.4027485847473145 + }, + { + "auxiliary_loss_clip": 0.01096694, + "auxiliary_loss_mlp": 0.01071727, + "balance_loss_clip": 1.02313709, + "balance_loss_mlp": 1.02650082, + "epoch": 0.1447166691718022, + "flos": 24971766393600.0, + "grad_norm": 2.3874762183075284, + "language_loss": 0.78308189, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.80476612, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.703125, + "step": 2407, + "time_per_iteration": 2.435927391052246 + }, + { + "auxiliary_loss_clip": 0.01095514, + "auxiliary_loss_mlp": 0.01067331, + "balance_loss_clip": 1.02343774, + "balance_loss_mlp": 1.02509701, + "epoch": 0.14477679242447017, + "flos": 21906950803200.0, + "grad_norm": 1.6790895437525375, + "language_loss": 0.80228579, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.82391429, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.703125, + "step": 2408, + "time_per_iteration": 2.409522533416748 + }, + { + "auxiliary_loss_clip": 0.01098997, + "auxiliary_loss_mlp": 0.0107782, + "balance_loss_clip": 1.03235388, + "balance_loss_mlp": 1.02783346, + "epoch": 0.14483691567713813, + "flos": 20739699584640.0, + "grad_norm": 1.9210554877929276, + "language_loss": 0.84147936, + "learning_rate": 3.863251091147299e-06, + "loss": 0.86324751, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.7109375, + "step": 2409, + "time_per_iteration": 2.391932487487793 + }, + { + "auxiliary_loss_clip": 0.01097086, + "auxiliary_loss_mlp": 0.01067829, + "balance_loss_clip": 1.0230298, + "balance_loss_mlp": 1.02634716, + "epoch": 0.1448970389298061, + "flos": 35406258301440.0, + "grad_norm": 1.8574315057108421, + "language_loss": 0.76954699, + "learning_rate": 3.863109517792446e-06, + "loss": 0.79119611, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.70703125, + "step": 2410, + "time_per_iteration": 2.5106914043426514 + }, + { + "auxiliary_loss_clip": 0.01095143, + "auxiliary_loss_mlp": 0.01070289, + "balance_loss_clip": 1.02162755, + "balance_loss_mlp": 1.02483618, + "epoch": 0.14495716218247406, + "flos": 15413454766080.0, + "grad_norm": 1.655506284675812, + "language_loss": 0.83072007, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.85237432, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.703125, + "step": 2411, + "time_per_iteration": 2.367459297180176 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.01067704, + "balance_loss_clip": 1.02109289, + "balance_loss_mlp": 1.02751815, + "epoch": 0.14501728543514203, + "flos": 33691813344000.0, + "grad_norm": 2.067179888233775, + "language_loss": 0.733679, + "learning_rate": 3.862826159140214e-06, + "loss": 0.75532472, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.6953125, + "step": 2412, + "time_per_iteration": 2.5224807262420654 + }, + { + "auxiliary_loss_clip": 0.01097725, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.02207375, + "balance_loss_mlp": 1.02774489, + "epoch": 0.14507740868781002, + "flos": 15595212637440.0, + "grad_norm": 1.8269151586610555, + "language_loss": 0.78736496, + "learning_rate": 3.862684373853579e-06, + "loss": 0.80899042, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.69921875, + "step": 2413, + "time_per_iteration": 2.365117073059082 + }, + { + "auxiliary_loss_clip": 0.01024629, + "auxiliary_loss_mlp": 0.0100548, + "balance_loss_clip": 1.00006819, + "balance_loss_mlp": 1.00705779, + "epoch": 0.145137531940478, + "flos": 66672026023680.0, + "grad_norm": 0.9179057056763439, + "language_loss": 0.59007514, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.61037624, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.05419922, + "router_z_loss_mlp": 0.17578125, + "step": 2414, + "time_per_iteration": 2.9444878101348877 + }, + { + "auxiliary_loss_clip": 0.01023324, + "auxiliary_loss_mlp": 0.01010993, + "balance_loss_clip": 1.00584292, + "balance_loss_mlp": 1.00568306, + "epoch": 0.14519765519314595, + "flos": 67518041022720.0, + "grad_norm": 0.8526647312986252, + "language_loss": 0.62459564, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64493877, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.17675781, + "step": 2415, + "time_per_iteration": 3.0272014141082764 + }, + { + "auxiliary_loss_clip": 0.01097141, + "auxiliary_loss_mlp": 0.01063959, + "balance_loss_clip": 1.02144909, + "balance_loss_mlp": 1.02709723, + "epoch": 0.14525777844581392, + "flos": 17198040378240.0, + "grad_norm": 2.540199992625111, + "language_loss": 0.74670166, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.76831269, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.69921875, + "step": 2416, + "time_per_iteration": 2.3867125511169434 + }, + { + "auxiliary_loss_clip": 0.01022402, + "auxiliary_loss_mlp": 0.0100817, + "balance_loss_clip": 1.00225699, + "balance_loss_mlp": 1.00475931, + "epoch": 0.14531790169848188, + "flos": 65401152289920.0, + "grad_norm": 0.7370501876049392, + "language_loss": 0.60554767, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62585342, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.05908203, + "router_z_loss_mlp": 0.17675781, + "step": 2417, + "time_per_iteration": 3.0694992542266846 + }, + { + "auxiliary_loss_clip": 0.01099198, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_clip": 1.03403711, + "balance_loss_mlp": 1.02587605, + "epoch": 0.14537802495114985, + "flos": 32561081274240.0, + "grad_norm": 2.5084461822423028, + "language_loss": 0.81821817, + "learning_rate": 3.861974388030356e-06, + "loss": 0.84000069, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.734375, + "step": 2418, + "time_per_iteration": 2.4783363342285156 + }, + { + "auxiliary_loss_clip": 0.01094115, + "auxiliary_loss_mlp": 0.0106488, + "balance_loss_clip": 1.02215552, + "balance_loss_mlp": 1.0263499, + "epoch": 0.1454381482038178, + "flos": 20225743326720.0, + "grad_norm": 2.022417785769775, + "language_loss": 0.7369898, + "learning_rate": 3.861832179025394e-06, + "loss": 0.75857973, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.6796875, + "step": 2419, + "time_per_iteration": 2.4065279960632324 + }, + { + "auxiliary_loss_clip": 0.01096083, + "auxiliary_loss_mlp": 0.010687, + "balance_loss_clip": 1.02354336, + "balance_loss_mlp": 1.02679014, + "epoch": 0.1454982714564858, + "flos": 22892025214080.0, + "grad_norm": 2.17103110613656, + "language_loss": 0.92823929, + "learning_rate": 3.861689899419569e-06, + "loss": 0.9498871, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.6953125, + "step": 2420, + "time_per_iteration": 2.431934118270874 + }, + { + "auxiliary_loss_clip": 0.01100121, + "auxiliary_loss_mlp": 0.01066137, + "balance_loss_clip": 1.02021754, + "balance_loss_mlp": 1.02856088, + "epoch": 0.14555839470915377, + "flos": 20228815526400.0, + "grad_norm": 1.9537472857288558, + "language_loss": 0.83854508, + "learning_rate": 3.861547549218276e-06, + "loss": 0.86020762, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.71484375, + "step": 2421, + "time_per_iteration": 2.3996660709381104 + }, + { + "auxiliary_loss_clip": 0.01099295, + "auxiliary_loss_mlp": 0.01065234, + "balance_loss_clip": 1.02293825, + "balance_loss_mlp": 1.02757502, + "epoch": 0.14561851796182174, + "flos": 22235204206080.0, + "grad_norm": 2.046972095534364, + "language_loss": 0.8280127, + "learning_rate": 3.861405128426914e-06, + "loss": 0.84965795, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.71875, + "step": 2422, + "time_per_iteration": 2.4426403045654297 + }, + { + "auxiliary_loss_clip": 0.01022218, + "auxiliary_loss_mlp": 0.01008294, + "balance_loss_clip": 1.00226223, + "balance_loss_mlp": 1.00443721, + "epoch": 0.1456786412144897, + "flos": 52633624222080.0, + "grad_norm": 0.9043288996944128, + "language_loss": 0.63476884, + "learning_rate": 3.861262637050883e-06, + "loss": 0.655074, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.06030273, + "router_z_loss_mlp": 0.17773438, + "step": 2423, + "time_per_iteration": 3.0056581497192383 + }, + { + "auxiliary_loss_clip": 0.0109746, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_clip": 1.02073765, + "balance_loss_mlp": 1.02752936, + "epoch": 0.14573876446715767, + "flos": 23220557907840.0, + "grad_norm": 1.685930955669371, + "language_loss": 0.83249277, + "learning_rate": 3.861120075095585e-06, + "loss": 0.85407388, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.69921875, + "step": 2424, + "time_per_iteration": 2.433666229248047 + }, + { + "auxiliary_loss_clip": 0.01096648, + "auxiliary_loss_mlp": 0.01061929, + "balance_loss_clip": 1.01889491, + "balance_loss_mlp": 1.02763188, + "epoch": 0.14579888771982563, + "flos": 18113393070720.0, + "grad_norm": 1.890295413044119, + "language_loss": 0.80257285, + "learning_rate": 3.860977442566429e-06, + "loss": 0.82415861, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.69140625, + "step": 2425, + "time_per_iteration": 2.476473331451416 + }, + { + "auxiliary_loss_clip": 0.01096166, + "auxiliary_loss_mlp": 0.01072671, + "balance_loss_clip": 1.02911174, + "balance_loss_mlp": 1.02712667, + "epoch": 0.14585901097249362, + "flos": 23000046560640.0, + "grad_norm": 2.0627694127846348, + "language_loss": 0.84775686, + "learning_rate": 3.860834739468821e-06, + "loss": 0.86944532, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.69140625, + "step": 2426, + "time_per_iteration": 2.4028544425964355 + }, + { + "auxiliary_loss_clip": 0.01098722, + "auxiliary_loss_mlp": 0.01060839, + "balance_loss_clip": 1.01811409, + "balance_loss_mlp": 1.02875614, + "epoch": 0.1459191342251616, + "flos": 21907579207680.0, + "grad_norm": 2.245460876957568, + "language_loss": 0.89481699, + "learning_rate": 3.860691965808173e-06, + "loss": 0.91641259, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.69921875, + "step": 2427, + "time_per_iteration": 2.4195988178253174 + }, + { + "auxiliary_loss_clip": 0.01104713, + "auxiliary_loss_mlp": 0.01070712, + "balance_loss_clip": 1.022241, + "balance_loss_mlp": 1.02835882, + "epoch": 0.14597925747782955, + "flos": 14974631487360.0, + "grad_norm": 1.8941693416139154, + "language_loss": 0.69453251, + "learning_rate": 3.8605491215899e-06, + "loss": 0.71628678, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.765625, + "step": 2428, + "time_per_iteration": 3.9255177974700928 + }, + { + "auxiliary_loss_clip": 0.01095757, + "auxiliary_loss_mlp": 0.01061316, + "balance_loss_clip": 1.01782846, + "balance_loss_mlp": 1.02532625, + "epoch": 0.14603938073049752, + "flos": 21067848253440.0, + "grad_norm": 1.7454189941142433, + "language_loss": 0.85234916, + "learning_rate": 3.860406206819417e-06, + "loss": 0.8739199, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.703125, + "step": 2429, + "time_per_iteration": 3.895122766494751 + }, + { + "auxiliary_loss_clip": 0.0109403, + "auxiliary_loss_mlp": 0.01063588, + "balance_loss_clip": 1.02050531, + "balance_loss_mlp": 1.02502084, + "epoch": 0.14609950398316549, + "flos": 19863763683840.0, + "grad_norm": 1.7741341144285783, + "language_loss": 0.80833554, + "learning_rate": 3.860263221502145e-06, + "loss": 0.82991171, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6875, + "step": 2430, + "time_per_iteration": 3.8319296836853027 + }, + { + "auxiliary_loss_clip": 0.01099297, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.02439785, + "balance_loss_mlp": 1.02754045, + "epoch": 0.14615962723583345, + "flos": 22417765038720.0, + "grad_norm": 2.0607443172223965, + "language_loss": 0.8595466, + "learning_rate": 3.860120165643504e-06, + "loss": 0.88121152, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.71875, + "step": 2431, + "time_per_iteration": 2.4585623741149902 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01068439, + "balance_loss_clip": 1.02337813, + "balance_loss_mlp": 1.02770352, + "epoch": 0.14621975048850142, + "flos": 22345145677440.0, + "grad_norm": 2.1014633246690595, + "language_loss": 0.80919695, + "learning_rate": 3.859977039248921e-06, + "loss": 0.83087844, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.71875, + "step": 2432, + "time_per_iteration": 3.8450517654418945 + }, + { + "auxiliary_loss_clip": 0.01094827, + "auxiliary_loss_mlp": 0.01066237, + "balance_loss_clip": 1.0201745, + "balance_loss_mlp": 1.02483547, + "epoch": 0.1462798737411694, + "flos": 24388018594560.0, + "grad_norm": 1.9275260119718027, + "language_loss": 0.83050263, + "learning_rate": 3.859833842323822e-06, + "loss": 0.85211325, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.703125, + "step": 2433, + "time_per_iteration": 2.414325714111328 + }, + { + "auxiliary_loss_clip": 0.01093178, + "auxiliary_loss_mlp": 0.01064778, + "balance_loss_clip": 1.0227927, + "balance_loss_mlp": 1.02549481, + "epoch": 0.14633999699383737, + "flos": 19243671292800.0, + "grad_norm": 2.1521960261138635, + "language_loss": 0.79915607, + "learning_rate": 3.859690574873638e-06, + "loss": 0.82073563, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.67578125, + "step": 2434, + "time_per_iteration": 2.437758684158325 + }, + { + "auxiliary_loss_clip": 0.01022091, + "auxiliary_loss_mlp": 0.01009071, + "balance_loss_clip": 1.00313425, + "balance_loss_mlp": 1.00463533, + "epoch": 0.14640012024650534, + "flos": 62657468184960.0, + "grad_norm": 0.8692019039032973, + "language_loss": 0.58499002, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60530162, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.05932617, + "router_z_loss_mlp": 0.17480469, + "step": 2435, + "time_per_iteration": 3.0096054077148438 + }, + { + "auxiliary_loss_clip": 0.01090595, + "auxiliary_loss_mlp": 0.01062331, + "balance_loss_clip": 1.01865304, + "balance_loss_mlp": 1.02371049, + "epoch": 0.1464602434991733, + "flos": 12275426321280.0, + "grad_norm": 2.300555102014229, + "language_loss": 0.90603876, + "learning_rate": 3.859403828419744e-06, + "loss": 0.92756808, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.66796875, + "step": 2436, + "time_per_iteration": 2.387996196746826 + }, + { + "auxiliary_loss_clip": 0.01096664, + "auxiliary_loss_mlp": 0.01064815, + "balance_loss_clip": 1.02104187, + "balance_loss_mlp": 1.0266304, + "epoch": 0.14652036675184127, + "flos": 20921282899200.0, + "grad_norm": 1.992716338542552, + "language_loss": 0.7706126, + "learning_rate": 3.85926034942691e-06, + "loss": 0.79222739, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.69921875, + "step": 2437, + "time_per_iteration": 2.415969133377075 + }, + { + "auxiliary_loss_clip": 0.01098862, + "auxiliary_loss_mlp": 0.01072569, + "balance_loss_clip": 1.02269185, + "balance_loss_mlp": 1.02669942, + "epoch": 0.14658049000450923, + "flos": 27702603118080.0, + "grad_norm": 2.0972067205844414, + "language_loss": 0.75825679, + "learning_rate": 3.859116799930736e-06, + "loss": 0.77997112, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.71875, + "step": 2438, + "time_per_iteration": 2.4462890625 + }, + { + "auxiliary_loss_clip": 0.01097581, + "auxiliary_loss_mlp": 0.01068824, + "balance_loss_clip": 1.02531266, + "balance_loss_mlp": 1.02776372, + "epoch": 0.14664061325717723, + "flos": 24935351978880.0, + "grad_norm": 1.9075747824541653, + "language_loss": 0.76332027, + "learning_rate": 3.858973179936668e-06, + "loss": 0.78498435, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.69921875, + "step": 2439, + "time_per_iteration": 2.4122400283813477 + }, + { + "auxiliary_loss_clip": 0.01097006, + "auxiliary_loss_mlp": 0.01071461, + "balance_loss_clip": 1.0265671, + "balance_loss_mlp": 1.02731824, + "epoch": 0.1467007365098452, + "flos": 40296053813760.0, + "grad_norm": 1.9181926336963941, + "language_loss": 0.76293385, + "learning_rate": 3.85882948945015e-06, + "loss": 0.7846185, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6953125, + "step": 2440, + "time_per_iteration": 2.553365468978882 + }, + { + "auxiliary_loss_clip": 0.01095462, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_clip": 1.02999556, + "balance_loss_mlp": 1.02738667, + "epoch": 0.14676085976251316, + "flos": 26539890376320.0, + "grad_norm": 1.5567723605499801, + "language_loss": 0.8403334, + "learning_rate": 3.85868572847663e-06, + "loss": 0.86197901, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.6796875, + "step": 2441, + "time_per_iteration": 2.443652868270874 + }, + { + "auxiliary_loss_clip": 0.0110462, + "auxiliary_loss_mlp": 0.01074934, + "balance_loss_clip": 1.02488971, + "balance_loss_mlp": 1.0287286, + "epoch": 0.14682098301518112, + "flos": 23548985867520.0, + "grad_norm": 1.8919922846489026, + "language_loss": 0.75999266, + "learning_rate": 3.858541897021563e-06, + "loss": 0.78178823, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7578125, + "step": 2442, + "time_per_iteration": 2.3993821144104004 + }, + { + "auxiliary_loss_clip": 0.01101684, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_clip": 1.02566516, + "balance_loss_mlp": 1.02824354, + "epoch": 0.1468811062678491, + "flos": 11650411428480.0, + "grad_norm": 3.5926877895002503, + "language_loss": 0.85209274, + "learning_rate": 3.8583979950904e-06, + "loss": 0.87382305, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.734375, + "step": 2443, + "time_per_iteration": 2.3490679264068604 + }, + { + "auxiliary_loss_clip": 0.01098999, + "auxiliary_loss_mlp": 0.01077543, + "balance_loss_clip": 1.03035998, + "balance_loss_mlp": 1.02698636, + "epoch": 0.14694122952051705, + "flos": 23001512837760.0, + "grad_norm": 1.638147347253433, + "language_loss": 0.84472048, + "learning_rate": 3.858254022688599e-06, + "loss": 0.86648583, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.71875, + "step": 2444, + "time_per_iteration": 2.4034340381622314 + }, + { + "auxiliary_loss_clip": 0.01099359, + "auxiliary_loss_mlp": 0.01061886, + "balance_loss_clip": 1.01825547, + "balance_loss_mlp": 1.02699971, + "epoch": 0.14700135277318502, + "flos": 26501835127680.0, + "grad_norm": 1.5957139567683911, + "language_loss": 0.73091751, + "learning_rate": 3.85810997982162e-06, + "loss": 0.75252998, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.72265625, + "step": 2445, + "time_per_iteration": 2.4295976161956787 + }, + { + "auxiliary_loss_clip": 0.01024787, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.03320765, + "balance_loss_mlp": 1.00679374, + "epoch": 0.147061476025853, + "flos": 59446366531200.0, + "grad_norm": 0.8460990111409177, + "language_loss": 0.63223791, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65286767, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.04980469, + "router_z_loss_mlp": 0.1796875, + "step": 2446, + "time_per_iteration": 2.911855697631836 + }, + { + "auxiliary_loss_clip": 0.01101688, + "auxiliary_loss_mlp": 0.01074412, + "balance_loss_clip": 1.02765775, + "balance_loss_mlp": 1.02844119, + "epoch": 0.14712159927852098, + "flos": 28329607958400.0, + "grad_norm": 1.6272868934955003, + "language_loss": 0.77166736, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7934283, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.734375, + "step": 2447, + "time_per_iteration": 2.4587817192077637 + }, + { + "auxiliary_loss_clip": 0.01099291, + "auxiliary_loss_mlp": 0.0106766, + "balance_loss_clip": 1.02159727, + "balance_loss_mlp": 1.02678084, + "epoch": 0.14718172253118894, + "flos": 27088585303680.0, + "grad_norm": 1.8810746481606395, + "language_loss": 0.87475824, + "learning_rate": 3.857677428484242e-06, + "loss": 0.89642775, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7265625, + "step": 2448, + "time_per_iteration": 2.43615460395813 + }, + { + "auxiliary_loss_clip": 0.01023392, + "auxiliary_loss_mlp": 0.01006158, + "balance_loss_clip": 1.00062668, + "balance_loss_mlp": 1.00605989, + "epoch": 0.1472418457838569, + "flos": 66703587759360.0, + "grad_norm": 0.7686752387624525, + "language_loss": 0.56934094, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58963645, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.05541992, + "router_z_loss_mlp": 0.17382812, + "step": 2449, + "time_per_iteration": 2.974536418914795 + }, + { + "auxiliary_loss_clip": 0.01097666, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_clip": 1.02108848, + "balance_loss_mlp": 1.02722335, + "epoch": 0.14730196903652487, + "flos": 19572553100160.0, + "grad_norm": 1.7620447302795383, + "language_loss": 0.86746895, + "learning_rate": 3.857388708700307e-06, + "loss": 0.88911808, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.703125, + "step": 2450, + "time_per_iteration": 2.3912692070007324 + }, + { + "auxiliary_loss_clip": 0.01099883, + "auxiliary_loss_mlp": 0.01079596, + "balance_loss_clip": 1.03060055, + "balance_loss_mlp": 1.02773046, + "epoch": 0.14736209228919284, + "flos": 16070101217280.0, + "grad_norm": 1.8948975742558394, + "language_loss": 0.77137697, + "learning_rate": 3.857244243157052e-06, + "loss": 0.79317176, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.71875, + "step": 2451, + "time_per_iteration": 2.36212420463562 + }, + { + "auxiliary_loss_clip": 0.01095651, + "auxiliary_loss_mlp": 0.01069599, + "balance_loss_clip": 1.02916348, + "balance_loss_mlp": 1.02660644, + "epoch": 0.1474222155418608, + "flos": 23038346188800.0, + "grad_norm": 1.5699881294284104, + "language_loss": 0.83476424, + "learning_rate": 3.85709970718691e-06, + "loss": 0.85641676, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.69140625, + "step": 2452, + "time_per_iteration": 2.408055543899536 + }, + { + "auxiliary_loss_clip": 0.01097725, + "auxiliary_loss_mlp": 0.01074171, + "balance_loss_clip": 1.02953935, + "balance_loss_mlp": 1.02688432, + "epoch": 0.1474823387945288, + "flos": 17017713872640.0, + "grad_norm": 1.9270339623963557, + "language_loss": 0.75618696, + "learning_rate": 3.856955100795361e-06, + "loss": 0.77790588, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.70703125, + "step": 2453, + "time_per_iteration": 2.351170539855957 + }, + { + "auxiliary_loss_clip": 0.01101718, + "auxiliary_loss_mlp": 0.01073018, + "balance_loss_clip": 1.02390349, + "balance_loss_mlp": 1.02754736, + "epoch": 0.14754246204719676, + "flos": 17894068709760.0, + "grad_norm": 1.9578791037843046, + "language_loss": 0.78440231, + "learning_rate": 3.856810423987889e-06, + "loss": 0.80614966, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.7421875, + "step": 2454, + "time_per_iteration": 2.388763666152954 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.02789593, + "balance_loss_mlp": 1.0262301, + "epoch": 0.14760258529986472, + "flos": 13078254101760.0, + "grad_norm": 1.902442925207913, + "language_loss": 0.84432828, + "learning_rate": 3.856665676769979e-06, + "loss": 0.86604702, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.734375, + "step": 2455, + "time_per_iteration": 2.365283489227295 + }, + { + "auxiliary_loss_clip": 0.01101424, + "auxiliary_loss_mlp": 0.01073822, + "balance_loss_clip": 1.02730632, + "balance_loss_mlp": 1.0260464, + "epoch": 0.1476627085525327, + "flos": 30805194666240.0, + "grad_norm": 1.9731022189781138, + "language_loss": 0.86136854, + "learning_rate": 3.85652085914712e-06, + "loss": 0.88312101, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.75390625, + "step": 2456, + "time_per_iteration": 2.467848300933838 + }, + { + "auxiliary_loss_clip": 0.01097719, + "auxiliary_loss_mlp": 0.01067471, + "balance_loss_clip": 1.02391195, + "balance_loss_mlp": 1.02812886, + "epoch": 0.14772283180520066, + "flos": 21688359580800.0, + "grad_norm": 1.6333746364611896, + "language_loss": 0.85959697, + "learning_rate": 3.856375971124805e-06, + "loss": 0.88124883, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.6953125, + "step": 2457, + "time_per_iteration": 2.3873815536499023 + }, + { + "auxiliary_loss_clip": 0.01093992, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.01853633, + "balance_loss_mlp": 1.02582431, + "epoch": 0.14778295505786862, + "flos": 18769411117440.0, + "grad_norm": 2.2157566975761385, + "language_loss": 0.77142531, + "learning_rate": 3.856231012708527e-06, + "loss": 0.79300809, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6796875, + "step": 2458, + "time_per_iteration": 2.3928511142730713 + }, + { + "auxiliary_loss_clip": 0.01106035, + "auxiliary_loss_mlp": 0.01074209, + "balance_loss_clip": 1.02304411, + "balance_loss_mlp": 1.02870023, + "epoch": 0.1478430783105366, + "flos": 22892444150400.0, + "grad_norm": 1.9505714856299643, + "language_loss": 0.85122252, + "learning_rate": 3.856085983903782e-06, + "loss": 0.87302494, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.7734375, + "step": 2459, + "time_per_iteration": 2.4292595386505127 + }, + { + "auxiliary_loss_clip": 0.01093187, + "auxiliary_loss_mlp": 0.0106228, + "balance_loss_clip": 1.02012777, + "balance_loss_mlp": 1.02594709, + "epoch": 0.14790320156320458, + "flos": 15084433313280.0, + "grad_norm": 2.023544266713236, + "language_loss": 0.76772064, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78927529, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.671875, + "step": 2460, + "time_per_iteration": 2.379777193069458 + }, + { + "auxiliary_loss_clip": 0.01099786, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_clip": 1.03162885, + "balance_loss_mlp": 1.02636433, + "epoch": 0.14796332481587254, + "flos": 26503580695680.0, + "grad_norm": 1.6709260765579441, + "language_loss": 0.82930833, + "learning_rate": 3.855795715150896e-06, + "loss": 0.85107929, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.734375, + "step": 2461, + "time_per_iteration": 2.4474596977233887 + }, + { + "auxiliary_loss_clip": 0.01097535, + "auxiliary_loss_mlp": 0.0107697, + "balance_loss_clip": 1.02876163, + "balance_loss_mlp": 1.02629185, + "epoch": 0.1480234480685405, + "flos": 17562324170880.0, + "grad_norm": 2.6040231332709873, + "language_loss": 0.68725038, + "learning_rate": 3.855650475213761e-06, + "loss": 0.70899546, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.7109375, + "step": 2462, + "time_per_iteration": 2.3660292625427246 + }, + { + "auxiliary_loss_clip": 0.01097023, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_clip": 1.02457881, + "balance_loss_mlp": 1.0268321, + "epoch": 0.14808357132120847, + "flos": 53580121580160.0, + "grad_norm": 2.2729480708219922, + "language_loss": 0.68763828, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.70929277, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.703125, + "step": 2463, + "time_per_iteration": 2.7324020862579346 + }, + { + "auxiliary_loss_clip": 0.01098132, + "auxiliary_loss_mlp": 0.01077114, + "balance_loss_clip": 1.02990699, + "balance_loss_mlp": 1.02659583, + "epoch": 0.14814369457387644, + "flos": 19828152230400.0, + "grad_norm": 1.6701447464541475, + "language_loss": 0.78808916, + "learning_rate": 3.855359784245646e-06, + "loss": 0.80984163, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.71484375, + "step": 2464, + "time_per_iteration": 2.391052007675171 + }, + { + "auxiliary_loss_clip": 0.01098688, + "auxiliary_loss_mlp": 0.01075794, + "balance_loss_clip": 1.03476238, + "balance_loss_mlp": 1.02810156, + "epoch": 0.1482038178265444, + "flos": 23913828241920.0, + "grad_norm": 1.6285918339432055, + "language_loss": 0.81520182, + "learning_rate": 3.855214333225688e-06, + "loss": 0.83694661, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.70703125, + "step": 2465, + "time_per_iteration": 2.416149139404297 + }, + { + "auxiliary_loss_clip": 0.01102655, + "auxiliary_loss_mlp": 0.01082639, + "balance_loss_clip": 1.03288102, + "balance_loss_mlp": 1.0280782, + "epoch": 0.1482639410792124, + "flos": 24169357549440.0, + "grad_norm": 1.5797553479385578, + "language_loss": 0.781461, + "learning_rate": 3.855068811855817e-06, + "loss": 0.80331397, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.74609375, + "step": 2466, + "time_per_iteration": 2.4337644577026367 + }, + { + "auxiliary_loss_clip": 0.01023507, + "auxiliary_loss_mlp": 0.01021416, + "balance_loss_clip": 1.01674354, + "balance_loss_mlp": 1.00532091, + "epoch": 0.14832406433188036, + "flos": 66188025578880.0, + "grad_norm": 0.7930868803681371, + "language_loss": 0.60151768, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62196696, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.04663086, + "router_z_loss_mlp": 0.18164062, + "step": 2467, + "time_per_iteration": 3.0348620414733887 + }, + { + "auxiliary_loss_clip": 0.0109461, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_clip": 1.02436399, + "balance_loss_mlp": 1.02526033, + "epoch": 0.14838418758454833, + "flos": 25410066001920.0, + "grad_norm": 2.0654667690431916, + "language_loss": 0.8914237, + "learning_rate": 3.85477755808841e-06, + "loss": 0.91305459, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.69140625, + "step": 2468, + "time_per_iteration": 3.8651061058044434 + }, + { + "auxiliary_loss_clip": 0.0109801, + "auxiliary_loss_mlp": 0.01070615, + "balance_loss_clip": 1.01933122, + "balance_loss_mlp": 1.0260278, + "epoch": 0.1484443108372163, + "flos": 23288918083200.0, + "grad_norm": 2.1231896995533526, + "language_loss": 0.78280437, + "learning_rate": 3.854631825701919e-06, + "loss": 0.80449057, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.71875, + "step": 2469, + "time_per_iteration": 2.404529571533203 + }, + { + "auxiliary_loss_clip": 0.01097149, + "auxiliary_loss_mlp": 0.01068847, + "balance_loss_clip": 1.02268934, + "balance_loss_mlp": 1.026824, + "epoch": 0.14850443408988426, + "flos": 14646797020800.0, + "grad_norm": 1.897251362223043, + "language_loss": 0.77403468, + "learning_rate": 3.854486022987603e-06, + "loss": 0.79569459, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.703125, + "step": 2470, + "time_per_iteration": 3.8456714153289795 + }, + { + "auxiliary_loss_clip": 0.01094599, + "auxiliary_loss_mlp": 0.01067411, + "balance_loss_clip": 1.02180183, + "balance_loss_mlp": 1.02603734, + "epoch": 0.14856455734255222, + "flos": 23547240299520.0, + "grad_norm": 1.6641754455455549, + "language_loss": 0.7405529, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.76217306, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.68359375, + "step": 2471, + "time_per_iteration": 3.8998091220855713 + }, + { + "auxiliary_loss_clip": 0.01099473, + "auxiliary_loss_mlp": 0.0107561, + "balance_loss_clip": 1.02551806, + "balance_loss_mlp": 1.02659035, + "epoch": 0.1486246805952202, + "flos": 18076315340160.0, + "grad_norm": 1.8348858223601816, + "language_loss": 0.91545963, + "learning_rate": 3.854194206597615e-06, + "loss": 0.93721044, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7265625, + "step": 2472, + "time_per_iteration": 2.4353387355804443 + }, + { + "auxiliary_loss_clip": 0.01098581, + "auxiliary_loss_mlp": 0.01072525, + "balance_loss_clip": 1.02321994, + "balance_loss_mlp": 1.02615833, + "epoch": 0.14868480384788818, + "flos": 19352635246080.0, + "grad_norm": 2.654187182109426, + "language_loss": 0.83452082, + "learning_rate": 3.854048192933008e-06, + "loss": 0.85623193, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.72265625, + "step": 2473, + "time_per_iteration": 2.3959057331085205 + }, + { + "auxiliary_loss_clip": 0.01098773, + "auxiliary_loss_mlp": 0.01078367, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.02574706, + "epoch": 0.14874492710055615, + "flos": 22199103993600.0, + "grad_norm": 2.1473805610447743, + "language_loss": 0.79949987, + "learning_rate": 3.853902108962709e-06, + "loss": 0.8212713, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.73046875, + "step": 2474, + "time_per_iteration": 2.396589517593384 + }, + { + "auxiliary_loss_clip": 0.01099843, + "auxiliary_loss_mlp": 0.01084397, + "balance_loss_clip": 1.03180194, + "balance_loss_mlp": 1.02602172, + "epoch": 0.1488050503532241, + "flos": 21102447277440.0, + "grad_norm": 1.8112436225554511, + "language_loss": 0.83936793, + "learning_rate": 3.853755954692255e-06, + "loss": 0.86121035, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.73828125, + "step": 2475, + "time_per_iteration": 2.4137630462646484 + }, + { + "auxiliary_loss_clip": 0.01098062, + "auxiliary_loss_mlp": 0.01074189, + "balance_loss_clip": 1.02350092, + "balance_loss_mlp": 1.0284934, + "epoch": 0.14886517360589208, + "flos": 12785751797760.0, + "grad_norm": 1.7990510082692523, + "language_loss": 0.82762033, + "learning_rate": 3.85360973012719e-06, + "loss": 0.84934282, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.6953125, + "step": 2476, + "time_per_iteration": 2.4145026206970215 + }, + { + "auxiliary_loss_clip": 0.01093167, + "auxiliary_loss_mlp": 0.01071103, + "balance_loss_clip": 1.02618492, + "balance_loss_mlp": 1.02607024, + "epoch": 0.14892529685856004, + "flos": 29021586572160.0, + "grad_norm": 1.644229651248555, + "language_loss": 0.79497421, + "learning_rate": 3.853463435273058e-06, + "loss": 0.81661701, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.671875, + "step": 2477, + "time_per_iteration": 2.4482643604278564 + }, + { + "auxiliary_loss_clip": 0.01023411, + "auxiliary_loss_mlp": 0.01073949, + "balance_loss_clip": 1.06829834, + "balance_loss_mlp": 1.00721741, + "epoch": 0.148985420111228, + "flos": 61923105313920.0, + "grad_norm": 0.8666176257297775, + "language_loss": 0.60206616, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62303978, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.16210938, + "step": 2478, + "time_per_iteration": 3.043100595474243 + }, + { + "auxiliary_loss_clip": 0.01097796, + "auxiliary_loss_mlp": 0.01071474, + "balance_loss_clip": 1.02524483, + "balance_loss_mlp": 1.02678895, + "epoch": 0.149045543363896, + "flos": 23913967887360.0, + "grad_norm": 2.6932144822480364, + "language_loss": 0.72641551, + "learning_rate": 3.853170634719787e-06, + "loss": 0.74810821, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7109375, + "step": 2479, + "time_per_iteration": 2.445418119430542 + }, + { + "auxiliary_loss_clip": 0.01095998, + "auxiliary_loss_mlp": 0.01074929, + "balance_loss_clip": 1.03141725, + "balance_loss_mlp": 1.0252521, + "epoch": 0.14910566661656396, + "flos": 23653411344000.0, + "grad_norm": 1.5988557082960875, + "language_loss": 0.82706577, + "learning_rate": 3.853024129031751e-06, + "loss": 0.84877503, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.70703125, + "step": 2480, + "time_per_iteration": 2.4252500534057617 + }, + { + "auxiliary_loss_clip": 0.01098516, + "auxiliary_loss_mlp": 0.01099534, + "balance_loss_clip": 1.05287576, + "balance_loss_mlp": 1.02562797, + "epoch": 0.14916578986923193, + "flos": 20514440292480.0, + "grad_norm": 2.0008817013092943, + "language_loss": 0.85698044, + "learning_rate": 3.852877553076854e-06, + "loss": 0.87896097, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.7265625, + "step": 2481, + "time_per_iteration": 2.4969537258148193 + }, + { + "auxiliary_loss_clip": 0.01099869, + "auxiliary_loss_mlp": 0.01101485, + "balance_loss_clip": 1.05151248, + "balance_loss_mlp": 1.0264293, + "epoch": 0.1492259131218999, + "flos": 22490733513600.0, + "grad_norm": 1.9794383528494857, + "language_loss": 0.79101324, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.81302673, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.734375, + "step": 2482, + "time_per_iteration": 2.5329623222351074 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.0110767, + "balance_loss_clip": 1.06303775, + "balance_loss_mlp": 1.02724206, + "epoch": 0.14928603637456786, + "flos": 23184736986240.0, + "grad_norm": 2.2893834210362014, + "language_loss": 0.82406473, + "learning_rate": 3.852584190388713e-06, + "loss": 0.84616947, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.7578125, + "step": 2483, + "time_per_iteration": 2.4188201427459717 + }, + { + "auxiliary_loss_clip": 0.01094475, + "auxiliary_loss_mlp": 0.01106103, + "balance_loss_clip": 1.06650198, + "balance_loss_mlp": 1.02571392, + "epoch": 0.14934615962723582, + "flos": 21652154634240.0, + "grad_norm": 1.5442775377177154, + "language_loss": 0.71927452, + "learning_rate": 3.852437403666595e-06, + "loss": 0.74128032, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.6875, + "step": 2484, + "time_per_iteration": 2.4173951148986816 + }, + { + "auxiliary_loss_clip": 0.01099546, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_clip": 1.05285001, + "balance_loss_mlp": 1.02653694, + "epoch": 0.1494062828799038, + "flos": 27009018581760.0, + "grad_norm": 1.7672985154264305, + "language_loss": 0.86294454, + "learning_rate": 3.852290546699863e-06, + "loss": 0.88495821, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.73046875, + "step": 2485, + "time_per_iteration": 2.4422507286071777 + }, + { + "auxiliary_loss_clip": 0.01099463, + "auxiliary_loss_mlp": 0.01100674, + "balance_loss_clip": 1.05446887, + "balance_loss_mlp": 1.02831745, + "epoch": 0.14946640613257178, + "flos": 21213889937280.0, + "grad_norm": 2.4061015982206544, + "language_loss": 0.8727873, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.89478868, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7109375, + "step": 2486, + "time_per_iteration": 2.4031965732574463 + }, + { + "auxiliary_loss_clip": 0.01096548, + "auxiliary_loss_mlp": 0.01091768, + "balance_loss_clip": 1.05202317, + "balance_loss_mlp": 1.02681255, + "epoch": 0.14952652938523975, + "flos": 13370023267200.0, + "grad_norm": 2.480372794657575, + "language_loss": 0.77013361, + "learning_rate": 3.851996622054842e-06, + "loss": 0.79201674, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6953125, + "step": 2487, + "time_per_iteration": 2.3689892292022705 + }, + { + "auxiliary_loss_clip": 0.01095101, + "auxiliary_loss_mlp": 0.0110138, + "balance_loss_clip": 1.05615211, + "balance_loss_mlp": 1.0265764, + "epoch": 0.1495866526379077, + "flos": 35516234684160.0, + "grad_norm": 2.466388873361637, + "language_loss": 0.72862023, + "learning_rate": 3.8518495543877e-06, + "loss": 0.75058508, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.68359375, + "step": 2488, + "time_per_iteration": 2.517245054244995 + }, + { + "auxiliary_loss_clip": 0.01100736, + "auxiliary_loss_mlp": 0.01086923, + "balance_loss_clip": 1.04133773, + "balance_loss_mlp": 1.027812, + "epoch": 0.14964677589057568, + "flos": 17631976066560.0, + "grad_norm": 2.65741708783581, + "language_loss": 0.72282994, + "learning_rate": 3.851702416498235e-06, + "loss": 0.74470651, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.73046875, + "step": 2489, + "time_per_iteration": 2.338932514190674 + }, + { + "auxiliary_loss_clip": 0.0109884, + "auxiliary_loss_mlp": 0.01078459, + "balance_loss_clip": 1.03275418, + "balance_loss_mlp": 1.02613616, + "epoch": 0.14970689914324364, + "flos": 20184476232960.0, + "grad_norm": 2.6920391628997637, + "language_loss": 0.84391975, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.86569279, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.7265625, + "step": 2490, + "time_per_iteration": 2.3517861366271973 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.01075755, + "balance_loss_clip": 1.03088498, + "balance_loss_mlp": 1.02728963, + "epoch": 0.1497670223959116, + "flos": 37227293239680.0, + "grad_norm": 2.492486266060026, + "language_loss": 0.81758022, + "learning_rate": 3.851407930074666e-06, + "loss": 0.83933401, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.72265625, + "step": 2491, + "time_per_iteration": 2.521962881088257 + }, + { + "auxiliary_loss_clip": 0.01098674, + "auxiliary_loss_mlp": 0.01064337, + "balance_loss_clip": 1.01813138, + "balance_loss_mlp": 1.02630544, + "epoch": 0.1498271456485796, + "flos": 24454877581440.0, + "grad_norm": 2.074963728153601, + "language_loss": 0.91819882, + "learning_rate": 3.851260581551727e-06, + "loss": 0.93982893, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.72265625, + "step": 2492, + "time_per_iteration": 2.412959575653076 + }, + { + "auxiliary_loss_clip": 0.01096167, + "auxiliary_loss_mlp": 0.01072636, + "balance_loss_clip": 1.02862382, + "balance_loss_mlp": 1.02655816, + "epoch": 0.14988726890124757, + "flos": 16252662049920.0, + "grad_norm": 2.8398881463730405, + "language_loss": 0.83351183, + "learning_rate": 3.851113162828802e-06, + "loss": 0.85519987, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.6953125, + "step": 2493, + "time_per_iteration": 2.382564067840576 + }, + { + "auxiliary_loss_clip": 0.01098172, + "auxiliary_loss_mlp": 0.01066212, + "balance_loss_clip": 1.02296281, + "balance_loss_mlp": 1.02753878, + "epoch": 0.14994739215391553, + "flos": 20665544123520.0, + "grad_norm": 1.73868949817116, + "language_loss": 0.82704777, + "learning_rate": 3.85096567391148e-06, + "loss": 0.84869158, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.70703125, + "step": 2494, + "time_per_iteration": 2.4043378829956055 + }, + { + "auxiliary_loss_clip": 0.01091939, + "auxiliary_loss_mlp": 0.01068915, + "balance_loss_clip": 1.02194691, + "balance_loss_mlp": 1.02556372, + "epoch": 0.1500075154065835, + "flos": 70649961845760.0, + "grad_norm": 1.9611367575343288, + "language_loss": 0.68126768, + "learning_rate": 3.850818114805354e-06, + "loss": 0.70287621, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.6640625, + "step": 2495, + "time_per_iteration": 2.825629949569702 + }, + { + "auxiliary_loss_clip": 0.01027743, + "auxiliary_loss_mlp": 0.01016543, + "balance_loss_clip": 1.0122273, + "balance_loss_mlp": 1.01094174, + "epoch": 0.15006763865925146, + "flos": 68008955783040.0, + "grad_norm": 0.9021422311471574, + "language_loss": 0.59555101, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61599392, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.04321289, + "router_z_loss_mlp": 0.16796875, + "step": 2496, + "time_per_iteration": 3.0454862117767334 + }, + { + "auxiliary_loss_clip": 0.01094093, + "auxiliary_loss_mlp": 0.01072911, + "balance_loss_clip": 1.02722991, + "balance_loss_mlp": 1.02446198, + "epoch": 0.15012776191191943, + "flos": 18915278244480.0, + "grad_norm": 2.047809941665339, + "language_loss": 0.68437803, + "learning_rate": 3.850522786049075e-06, + "loss": 0.70604801, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6953125, + "step": 2497, + "time_per_iteration": 2.3804757595062256 + }, + { + "auxiliary_loss_clip": 0.01095766, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_clip": 1.04129052, + "balance_loss_mlp": 1.02748537, + "epoch": 0.1501878851645874, + "flos": 23700054787200.0, + "grad_norm": 1.4754583020147116, + "language_loss": 0.76348162, + "learning_rate": 3.850375016410121e-06, + "loss": 0.78526086, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.68359375, + "step": 2498, + "time_per_iteration": 2.4788625240325928 + }, + { + "auxiliary_loss_clip": 0.01101004, + "auxiliary_loss_mlp": 0.01088418, + "balance_loss_clip": 1.04171205, + "balance_loss_mlp": 1.02797115, + "epoch": 0.15024800841725539, + "flos": 20411481093120.0, + "grad_norm": 2.271934057075471, + "language_loss": 0.73164815, + "learning_rate": 3.850227176604761e-06, + "loss": 0.75354236, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.73046875, + "step": 2499, + "time_per_iteration": 2.3873672485351562 + }, + { + "auxiliary_loss_clip": 0.01094149, + "auxiliary_loss_mlp": 0.01074328, + "balance_loss_clip": 1.03031552, + "balance_loss_mlp": 1.02604723, + "epoch": 0.15030813166992335, + "flos": 31829685868800.0, + "grad_norm": 2.2560546851762195, + "language_loss": 0.74265355, + "learning_rate": 3.850079266638601e-06, + "loss": 0.76433831, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.6796875, + "step": 2500, + "time_per_iteration": 2.495368480682373 + }, + { + "auxiliary_loss_clip": 0.01093771, + "auxiliary_loss_mlp": 0.01077357, + "balance_loss_clip": 1.03084135, + "balance_loss_mlp": 1.02628386, + "epoch": 0.15036825492259132, + "flos": 35656515993600.0, + "grad_norm": 2.215772391043712, + "language_loss": 0.67097712, + "learning_rate": 3.849931286517249e-06, + "loss": 0.69268835, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.67578125, + "step": 2501, + "time_per_iteration": 2.5124545097351074 + }, + { + "auxiliary_loss_clip": 0.0109413, + "auxiliary_loss_mlp": 0.01072506, + "balance_loss_clip": 1.02773142, + "balance_loss_mlp": 1.02629566, + "epoch": 0.15042837817525928, + "flos": 18837317445120.0, + "grad_norm": 2.1300337737716935, + "language_loss": 0.86138809, + "learning_rate": 3.849783236246318e-06, + "loss": 0.88305449, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.67578125, + "step": 2502, + "time_per_iteration": 2.375136137008667 + }, + { + "auxiliary_loss_clip": 0.01091678, + "auxiliary_loss_mlp": 0.01070329, + "balance_loss_clip": 1.02805758, + "balance_loss_mlp": 1.02404237, + "epoch": 0.15048850142792725, + "flos": 19534567674240.0, + "grad_norm": 2.144282051528111, + "language_loss": 0.79522771, + "learning_rate": 3.849635115831421e-06, + "loss": 0.81684774, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6796875, + "step": 2503, + "time_per_iteration": 2.38259220123291 + }, + { + "auxiliary_loss_clip": 0.01092682, + "auxiliary_loss_mlp": 0.01066483, + "balance_loss_clip": 1.02156496, + "balance_loss_mlp": 1.02475059, + "epoch": 0.1505486246805952, + "flos": 22016473338240.0, + "grad_norm": 2.434187261231111, + "language_loss": 0.87555397, + "learning_rate": 3.849486925278176e-06, + "loss": 0.89714551, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6796875, + "step": 2504, + "time_per_iteration": 2.419578790664673 + }, + { + "auxiliary_loss_clip": 0.01092609, + "auxiliary_loss_mlp": 0.01065769, + "balance_loss_clip": 1.02542877, + "balance_loss_mlp": 1.02682245, + "epoch": 0.15060874793326318, + "flos": 20742038645760.0, + "grad_norm": 1.5612602549669992, + "language_loss": 0.83585477, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85743856, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.65625, + "step": 2505, + "time_per_iteration": 2.45501708984375 + }, + { + "auxiliary_loss_clip": 0.01093467, + "auxiliary_loss_mlp": 0.01063675, + "balance_loss_clip": 1.02042592, + "balance_loss_mlp": 1.02515697, + "epoch": 0.15066887118593117, + "flos": 16470973981440.0, + "grad_norm": 2.5012837575028084, + "language_loss": 0.7779249, + "learning_rate": 3.849190333779117e-06, + "loss": 0.79949629, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.68359375, + "step": 2506, + "time_per_iteration": 2.3906748294830322 + }, + { + "auxiliary_loss_clip": 0.01097539, + "auxiliary_loss_mlp": 0.01065713, + "balance_loss_clip": 1.0229404, + "balance_loss_mlp": 1.02731264, + "epoch": 0.15072899443859913, + "flos": 19858457157120.0, + "grad_norm": 2.6371719139540675, + "language_loss": 0.80366504, + "learning_rate": 3.849041932844552e-06, + "loss": 0.82529759, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.703125, + "step": 2507, + "time_per_iteration": 5.239079475402832 + }, + { + "auxiliary_loss_clip": 0.01093988, + "auxiliary_loss_mlp": 0.01056863, + "balance_loss_clip": 1.01656985, + "balance_loss_mlp": 1.0266223, + "epoch": 0.1507891176912671, + "flos": 20775206304000.0, + "grad_norm": 2.465007941890069, + "language_loss": 0.70571911, + "learning_rate": 3.848893461794131e-06, + "loss": 0.72722763, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.67578125, + "step": 2508, + "time_per_iteration": 2.390188217163086 + }, + { + "auxiliary_loss_clip": 0.01100176, + "auxiliary_loss_mlp": 0.01066198, + "balance_loss_clip": 1.02192402, + "balance_loss_mlp": 1.0284121, + "epoch": 0.15084924094393506, + "flos": 23585505016320.0, + "grad_norm": 1.7538201526249657, + "language_loss": 0.79751909, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.81918287, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.71875, + "step": 2509, + "time_per_iteration": 2.4125492572784424 + }, + { + "auxiliary_loss_clip": 0.01100566, + "auxiliary_loss_mlp": 0.01072605, + "balance_loss_clip": 1.02549314, + "balance_loss_mlp": 1.02777267, + "epoch": 0.15090936419660303, + "flos": 18910460476800.0, + "grad_norm": 3.9251387380211877, + "language_loss": 0.84335577, + "learning_rate": 3.848596309368246e-06, + "loss": 0.86508751, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.7265625, + "step": 2510, + "time_per_iteration": 3.75276255607605 + }, + { + "auxiliary_loss_clip": 0.01098107, + "auxiliary_loss_mlp": 0.01076069, + "balance_loss_clip": 1.02972007, + "balance_loss_mlp": 1.02757215, + "epoch": 0.150969487449271, + "flos": 17927341102080.0, + "grad_norm": 1.7600087110432463, + "language_loss": 0.75543225, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.77717406, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.703125, + "step": 2511, + "time_per_iteration": 3.790982723236084 + }, + { + "auxiliary_loss_clip": 0.01093534, + "auxiliary_loss_mlp": 0.01070108, + "balance_loss_clip": 1.02876663, + "balance_loss_mlp": 1.02660429, + "epoch": 0.151029610701939, + "flos": 24241941999360.0, + "grad_norm": 2.155248054891697, + "language_loss": 0.71194518, + "learning_rate": 3.848298876546534e-06, + "loss": 0.73358154, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.66796875, + "step": 2512, + "time_per_iteration": 2.4149928092956543 + }, + { + "auxiliary_loss_clip": 0.01098788, + "auxiliary_loss_mlp": 0.01071113, + "balance_loss_clip": 1.02793515, + "balance_loss_mlp": 1.02983439, + "epoch": 0.15108973395460695, + "flos": 30261212772480.0, + "grad_norm": 2.190306031423565, + "language_loss": 0.75713086, + "learning_rate": 3.84815005500134e-06, + "loss": 0.77882993, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.69140625, + "step": 2513, + "time_per_iteration": 2.486478567123413 + }, + { + "auxiliary_loss_clip": 0.01024724, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.02709091, + "balance_loss_mlp": 1.00602555, + "epoch": 0.15114985720727492, + "flos": 60434443319040.0, + "grad_norm": 0.8884858023483032, + "language_loss": 0.64962852, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6701901, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.04345703, + "router_z_loss_mlp": 0.1875, + "step": 2514, + "time_per_iteration": 2.9855165481567383 + }, + { + "auxiliary_loss_clip": 0.01099107, + "auxiliary_loss_mlp": 0.0107579, + "balance_loss_clip": 1.02710509, + "balance_loss_mlp": 1.02955222, + "epoch": 0.15120998045994288, + "flos": 20520654514560.0, + "grad_norm": 2.0703687169247695, + "language_loss": 0.75318503, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.77493405, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.6953125, + "step": 2515, + "time_per_iteration": 2.3908915519714355 + }, + { + "auxiliary_loss_clip": 0.0109604, + "auxiliary_loss_mlp": 0.01076667, + "balance_loss_clip": 1.03151011, + "balance_loss_mlp": 1.02622676, + "epoch": 0.15127010371261085, + "flos": 21177824636160.0, + "grad_norm": 1.6872791922004229, + "language_loss": 0.79323679, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.81496382, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.69921875, + "step": 2516, + "time_per_iteration": 2.410939931869507 + }, + { + "auxiliary_loss_clip": 0.01022552, + "auxiliary_loss_mlp": 0.01005508, + "balance_loss_clip": 1.00009632, + "balance_loss_mlp": 1.00476396, + "epoch": 0.1513302269652788, + "flos": 65317500938880.0, + "grad_norm": 0.7290160840294603, + "language_loss": 0.54699725, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56727785, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.05419922, + "router_z_loss_mlp": 0.17773438, + "step": 2517, + "time_per_iteration": 3.055333137512207 + }, + { + "auxiliary_loss_clip": 0.01097866, + "auxiliary_loss_mlp": 0.01075375, + "balance_loss_clip": 1.02845407, + "balance_loss_mlp": 1.02681875, + "epoch": 0.15139035021794678, + "flos": 19134812073600.0, + "grad_norm": 1.8817959344641373, + "language_loss": 0.7996822, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.82141459, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.7109375, + "step": 2518, + "time_per_iteration": 2.421412944793701 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01078742, + "balance_loss_clip": 1.02934194, + "balance_loss_mlp": 1.02605844, + "epoch": 0.15145047347061477, + "flos": 26577352131840.0, + "grad_norm": 2.062727654951096, + "language_loss": 0.72276735, + "learning_rate": 3.847255654205137e-06, + "loss": 0.74455762, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.7421875, + "step": 2519, + "time_per_iteration": 2.4406862258911133 + }, + { + "auxiliary_loss_clip": 0.01098385, + "auxiliary_loss_mlp": 0.01087486, + "balance_loss_clip": 1.04008865, + "balance_loss_mlp": 1.02673721, + "epoch": 0.15151059672328274, + "flos": 20301923646720.0, + "grad_norm": 2.3345630663774353, + "language_loss": 0.80948764, + "learning_rate": 3.847106342204354e-06, + "loss": 0.83134639, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.71875, + "step": 2520, + "time_per_iteration": 2.407841920852661 + }, + { + "auxiliary_loss_clip": 0.01100066, + "auxiliary_loss_mlp": 0.01083717, + "balance_loss_clip": 1.0338881, + "balance_loss_mlp": 1.02749395, + "epoch": 0.1515707199759507, + "flos": 27227330513280.0, + "grad_norm": 1.872046612520587, + "language_loss": 0.76894939, + "learning_rate": 3.846956960161114e-06, + "loss": 0.79078716, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.7265625, + "step": 2521, + "time_per_iteration": 2.457338333129883 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01075213, + "balance_loss_clip": 1.02419186, + "balance_loss_mlp": 1.02622497, + "epoch": 0.15163084322861867, + "flos": 23586203243520.0, + "grad_norm": 2.135534199209432, + "language_loss": 0.8412987, + "learning_rate": 3.84680750808108e-06, + "loss": 0.86305714, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.74609375, + "step": 2522, + "time_per_iteration": 2.427781105041504 + }, + { + "auxiliary_loss_clip": 0.01022689, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.02304602, + "balance_loss_mlp": 1.00400174, + "epoch": 0.15169096648128663, + "flos": 66886427882880.0, + "grad_norm": 0.8399761433835947, + "language_loss": 0.57936323, + "learning_rate": 3.846657985969922e-06, + "loss": 0.5998764, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.1875, + "step": 2523, + "time_per_iteration": 2.9662187099456787 + }, + { + "auxiliary_loss_clip": 0.01096955, + "auxiliary_loss_mlp": 0.01075797, + "balance_loss_clip": 1.02634931, + "balance_loss_mlp": 1.02684617, + "epoch": 0.1517510897339546, + "flos": 29094171022080.0, + "grad_norm": 1.5002728619740164, + "language_loss": 0.76528943, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.78701699, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.703125, + "step": 2524, + "time_per_iteration": 2.4852726459503174 + }, + { + "auxiliary_loss_clip": 0.0109892, + "auxiliary_loss_mlp": 0.01085202, + "balance_loss_clip": 1.03573036, + "balance_loss_mlp": 1.02613258, + "epoch": 0.1518112129866226, + "flos": 18405546261120.0, + "grad_norm": 1.7776221562925887, + "language_loss": 0.76452565, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.78636694, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.7265625, + "step": 2525, + "time_per_iteration": 2.3836252689361572 + }, + { + "auxiliary_loss_clip": 0.0110516, + "auxiliary_loss_mlp": 0.01084778, + "balance_loss_clip": 1.02951288, + "balance_loss_mlp": 1.03054094, + "epoch": 0.15187133623929056, + "flos": 19424451646080.0, + "grad_norm": 1.8969240245464323, + "language_loss": 0.81745791, + "learning_rate": 3.846208999506402e-06, + "loss": 0.83935726, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.74609375, + "step": 2526, + "time_per_iteration": 2.405791997909546 + }, + { + "auxiliary_loss_clip": 0.01097633, + "auxiliary_loss_mlp": 0.01065167, + "balance_loss_clip": 1.02043939, + "balance_loss_mlp": 1.02802122, + "epoch": 0.15193145949195852, + "flos": 17565256725120.0, + "grad_norm": 1.8035210069683476, + "language_loss": 0.86383271, + "learning_rate": 3.846059197327466e-06, + "loss": 0.88546067, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6953125, + "step": 2527, + "time_per_iteration": 2.382150650024414 + }, + { + "auxiliary_loss_clip": 0.01099964, + "auxiliary_loss_mlp": 0.01074429, + "balance_loss_clip": 1.02831864, + "balance_loss_mlp": 1.02859974, + "epoch": 0.15199158274462649, + "flos": 36174731437440.0, + "grad_norm": 1.5994046406836089, + "language_loss": 0.71308672, + "learning_rate": 3.845909325145779e-06, + "loss": 0.73483062, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.71484375, + "step": 2528, + "time_per_iteration": 2.5499250888824463 + }, + { + "auxiliary_loss_clip": 0.01098732, + "auxiliary_loss_mlp": 0.01074785, + "balance_loss_clip": 1.02810311, + "balance_loss_mlp": 1.02697706, + "epoch": 0.15205170599729445, + "flos": 23072980124160.0, + "grad_norm": 1.7772628479897812, + "language_loss": 0.8875289, + "learning_rate": 3.845759382967026e-06, + "loss": 0.90926409, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.71875, + "step": 2529, + "time_per_iteration": 2.4078006744384766 + }, + { + "auxiliary_loss_clip": 0.01099326, + "auxiliary_loss_mlp": 0.01076162, + "balance_loss_clip": 1.02914584, + "balance_loss_mlp": 1.02898145, + "epoch": 0.15211182924996242, + "flos": 21907299916800.0, + "grad_norm": 1.7981590331376214, + "language_loss": 0.84938741, + "learning_rate": 3.845609370796893e-06, + "loss": 0.87114227, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.703125, + "step": 2530, + "time_per_iteration": 2.425597667694092 + }, + { + "auxiliary_loss_clip": 0.01098963, + "auxiliary_loss_mlp": 0.01079499, + "balance_loss_clip": 1.03591609, + "balance_loss_mlp": 1.02711892, + "epoch": 0.15217195250263038, + "flos": 13880662945920.0, + "grad_norm": 2.719446557287777, + "language_loss": 0.82987899, + "learning_rate": 3.845459288641066e-06, + "loss": 0.85166359, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.71875, + "step": 2531, + "time_per_iteration": 2.377437114715576 + }, + { + "auxiliary_loss_clip": 0.01097254, + "auxiliary_loss_mlp": 0.01078266, + "balance_loss_clip": 1.03394413, + "balance_loss_mlp": 1.02813625, + "epoch": 0.15223207575529837, + "flos": 24534165012480.0, + "grad_norm": 1.7052328572682445, + "language_loss": 0.80166292, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.82341814, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.69140625, + "step": 2532, + "time_per_iteration": 2.4589648246765137 + }, + { + "auxiliary_loss_clip": 0.01096223, + "auxiliary_loss_mlp": 0.0107674, + "balance_loss_clip": 1.0337292, + "balance_loss_mlp": 1.02731657, + "epoch": 0.15229219900796634, + "flos": 25555618926720.0, + "grad_norm": 1.8842302485160907, + "language_loss": 0.89500463, + "learning_rate": 3.845158914395105e-06, + "loss": 0.91673428, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6875, + "step": 2533, + "time_per_iteration": 2.4251551628112793 + }, + { + "auxiliary_loss_clip": 0.01099628, + "auxiliary_loss_mlp": 0.01081829, + "balance_loss_clip": 1.03500319, + "balance_loss_mlp": 1.02836061, + "epoch": 0.1523523222606343, + "flos": 18216980674560.0, + "grad_norm": 2.6803366057171853, + "language_loss": 0.80958569, + "learning_rate": 3.84500862231636e-06, + "loss": 0.83140028, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.7109375, + "step": 2534, + "time_per_iteration": 2.3794875144958496 + }, + { + "auxiliary_loss_clip": 0.01100741, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_clip": 1.02714491, + "balance_loss_mlp": 1.02767408, + "epoch": 0.15241244551330227, + "flos": 13259278834560.0, + "grad_norm": 2.472772633228721, + "language_loss": 0.79112452, + "learning_rate": 3.844858260274702e-06, + "loss": 0.81289101, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.73046875, + "step": 2535, + "time_per_iteration": 2.389275312423706 + }, + { + "auxiliary_loss_clip": 0.01102043, + "auxiliary_loss_mlp": 0.01073849, + "balance_loss_clip": 1.02680922, + "balance_loss_mlp": 1.02880085, + "epoch": 0.15247256876597023, + "flos": 19714649800320.0, + "grad_norm": 2.0403229171505113, + "language_loss": 0.79499602, + "learning_rate": 3.844707828275835e-06, + "loss": 0.816755, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.734375, + "step": 2536, + "time_per_iteration": 2.4379541873931885 + }, + { + "auxiliary_loss_clip": 0.01097123, + "auxiliary_loss_mlp": 0.01063776, + "balance_loss_clip": 1.02021682, + "balance_loss_mlp": 1.02906179, + "epoch": 0.1525326920186382, + "flos": 20374822298880.0, + "grad_norm": 2.3188945937776513, + "language_loss": 0.77777088, + "learning_rate": 3.844557326325461e-06, + "loss": 0.79937983, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.6796875, + "step": 2537, + "time_per_iteration": 2.4232356548309326 + }, + { + "auxiliary_loss_clip": 0.01099923, + "auxiliary_loss_mlp": 0.01073122, + "balance_loss_clip": 1.02415073, + "balance_loss_mlp": 1.02896285, + "epoch": 0.15259281527130616, + "flos": 13589103248640.0, + "grad_norm": 2.3592252554996143, + "language_loss": 0.79057246, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.81230295, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.70703125, + "step": 2538, + "time_per_iteration": 2.400545597076416 + }, + { + "auxiliary_loss_clip": 0.0109451, + "auxiliary_loss_mlp": 0.01055938, + "balance_loss_clip": 1.01433396, + "balance_loss_mlp": 1.02702236, + "epoch": 0.15265293852397416, + "flos": 22859171758080.0, + "grad_norm": 1.562961096304252, + "language_loss": 0.91274017, + "learning_rate": 3.844256112593029e-06, + "loss": 0.93424469, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.67578125, + "step": 2539, + "time_per_iteration": 2.502378225326538 + }, + { + "auxiliary_loss_clip": 0.01097021, + "auxiliary_loss_mlp": 0.01067757, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.02690458, + "epoch": 0.15271306177664212, + "flos": 29236931038080.0, + "grad_norm": 2.06925833340324, + "language_loss": 0.94694918, + "learning_rate": 3.844105400822391e-06, + "loss": 0.96859694, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.703125, + "step": 2540, + "time_per_iteration": 2.459963798522949 + }, + { + "auxiliary_loss_clip": 0.01094573, + "auxiliary_loss_mlp": 0.01062582, + "balance_loss_clip": 1.01823568, + "balance_loss_mlp": 1.02596366, + "epoch": 0.1527731850293101, + "flos": 31244995463040.0, + "grad_norm": 1.7116432977712255, + "language_loss": 0.764305, + "learning_rate": 3.843954619123092e-06, + "loss": 0.78587657, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6875, + "step": 2541, + "time_per_iteration": 2.5024309158325195 + }, + { + "auxiliary_loss_clip": 0.01095473, + "auxiliary_loss_mlp": 0.01069237, + "balance_loss_clip": 1.02555835, + "balance_loss_mlp": 1.0263083, + "epoch": 0.15283330828197805, + "flos": 22381001510400.0, + "grad_norm": 1.5473365294735217, + "language_loss": 0.82768631, + "learning_rate": 3.84380376750085e-06, + "loss": 0.84933335, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.69140625, + "step": 2542, + "time_per_iteration": 2.3986380100250244 + }, + { + "auxiliary_loss_clip": 0.01097486, + "auxiliary_loss_mlp": 0.01070007, + "balance_loss_clip": 1.02477884, + "balance_loss_mlp": 1.0267849, + "epoch": 0.15289343153464602, + "flos": 25518960132480.0, + "grad_norm": 2.4328903536694884, + "language_loss": 0.79360616, + "learning_rate": 3.843652845961383e-06, + "loss": 0.81528109, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.70703125, + "step": 2543, + "time_per_iteration": 2.4708242416381836 + }, + { + "auxiliary_loss_clip": 0.0109462, + "auxiliary_loss_mlp": 0.01059643, + "balance_loss_clip": 1.01725245, + "balance_loss_mlp": 1.02587903, + "epoch": 0.15295355478731398, + "flos": 22708940711040.0, + "grad_norm": 1.8985773567350754, + "language_loss": 0.88735342, + "learning_rate": 3.843501854510416e-06, + "loss": 0.90889609, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.6875, + "step": 2544, + "time_per_iteration": 2.3975024223327637 + }, + { + "auxiliary_loss_clip": 0.01101356, + "auxiliary_loss_mlp": 0.01066471, + "balance_loss_clip": 1.0177145, + "balance_loss_mlp": 1.02765787, + "epoch": 0.15301367803998198, + "flos": 23250967568640.0, + "grad_norm": 1.956342701724328, + "language_loss": 0.83747399, + "learning_rate": 3.843350793153673e-06, + "loss": 0.8591522, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.734375, + "step": 2545, + "time_per_iteration": 2.4199275970458984 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.01067211, + "balance_loss_clip": 1.02117181, + "balance_loss_mlp": 1.02882612, + "epoch": 0.15307380129264994, + "flos": 25885059315840.0, + "grad_norm": 2.3184733391423045, + "language_loss": 0.72719479, + "learning_rate": 3.843199661896884e-06, + "loss": 0.74884808, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.6953125, + "step": 2546, + "time_per_iteration": 3.8704402446746826 + }, + { + "auxiliary_loss_clip": 0.01098132, + "auxiliary_loss_mlp": 0.01069904, + "balance_loss_clip": 1.02105236, + "balance_loss_mlp": 1.02642655, + "epoch": 0.1531339245453179, + "flos": 46971482279040.0, + "grad_norm": 1.6817684212124877, + "language_loss": 0.80030125, + "learning_rate": 3.843048460745779e-06, + "loss": 0.82198161, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.71875, + "step": 2547, + "time_per_iteration": 4.031328439712524 + }, + { + "auxiliary_loss_clip": 0.01099036, + "auxiliary_loss_mlp": 0.01075053, + "balance_loss_clip": 1.02722621, + "balance_loss_mlp": 1.02752709, + "epoch": 0.15319404779798587, + "flos": 35880588299520.0, + "grad_norm": 1.978955515397656, + "language_loss": 0.75919974, + "learning_rate": 3.842897189706092e-06, + "loss": 0.78094065, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.71875, + "step": 2548, + "time_per_iteration": 2.523813009262085 + }, + { + "auxiliary_loss_clip": 0.01095194, + "auxiliary_loss_mlp": 0.01067977, + "balance_loss_clip": 1.02348804, + "balance_loss_mlp": 1.02574301, + "epoch": 0.15325417105065384, + "flos": 25663500627840.0, + "grad_norm": 1.4432942664705732, + "language_loss": 0.81832165, + "learning_rate": 3.842745848783558e-06, + "loss": 0.83995336, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6953125, + "step": 2549, + "time_per_iteration": 2.45752215385437 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.01065502, + "balance_loss_clip": 1.02032161, + "balance_loss_mlp": 1.02622664, + "epoch": 0.1533142943033218, + "flos": 18769830053760.0, + "grad_norm": 1.6653555769892043, + "language_loss": 0.75929648, + "learning_rate": 3.842594437983917e-06, + "loss": 0.78093523, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.71875, + "step": 2550, + "time_per_iteration": 5.203188419342041 + }, + { + "auxiliary_loss_clip": 0.01100453, + "auxiliary_loss_mlp": 0.01068569, + "balance_loss_clip": 1.01981199, + "balance_loss_mlp": 1.02835107, + "epoch": 0.15337441755598977, + "flos": 23106392161920.0, + "grad_norm": 2.279158254129376, + "language_loss": 0.79346585, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.8151561, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.72265625, + "step": 2551, + "time_per_iteration": 2.416642665863037 + }, + { + "auxiliary_loss_clip": 0.01024596, + "auxiliary_loss_mlp": 0.01046489, + "balance_loss_clip": 1.03967071, + "balance_loss_mlp": 1.00470138, + "epoch": 0.15343454080865776, + "flos": 59857712703360.0, + "grad_norm": 1.0364194003386662, + "language_loss": 0.56765676, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58836764, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.06835938, + "router_z_loss_mlp": 0.19921875, + "step": 2552, + "time_per_iteration": 2.96636700630188 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.0107233, + "balance_loss_clip": 1.02500379, + "balance_loss_mlp": 1.0277127, + "epoch": 0.15349466406132573, + "flos": 11910095187840.0, + "grad_norm": 2.4014388423634045, + "language_loss": 0.9034788, + "learning_rate": 3.84213978637978e-06, + "loss": 0.92519987, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.71875, + "step": 2553, + "time_per_iteration": 2.3910927772521973 + }, + { + "auxiliary_loss_clip": 0.01101591, + "auxiliary_loss_mlp": 0.01072463, + "balance_loss_clip": 1.02895141, + "balance_loss_mlp": 1.02820694, + "epoch": 0.1535547873139937, + "flos": 24095795581440.0, + "grad_norm": 2.1363456086996937, + "language_loss": 0.80224639, + "learning_rate": 3.841988096129152e-06, + "loss": 0.82398689, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.734375, + "step": 2554, + "time_per_iteration": 2.427932024002075 + }, + { + "auxiliary_loss_clip": 0.01103302, + "auxiliary_loss_mlp": 0.010858, + "balance_loss_clip": 1.03763938, + "balance_loss_mlp": 1.02909565, + "epoch": 0.15361491056666166, + "flos": 17565501104640.0, + "grad_norm": 2.3367972699744013, + "language_loss": 0.81216413, + "learning_rate": 3.841836336030151e-06, + "loss": 0.83405519, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.7421875, + "step": 2555, + "time_per_iteration": 2.3645737171173096 + }, + { + "auxiliary_loss_clip": 0.01096604, + "auxiliary_loss_mlp": 0.01079506, + "balance_loss_clip": 1.03747249, + "balance_loss_mlp": 1.02723527, + "epoch": 0.15367503381932962, + "flos": 25044874513920.0, + "grad_norm": 1.5878386172858978, + "language_loss": 0.79242194, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.81418312, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6953125, + "step": 2556, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01094195, + "auxiliary_loss_mlp": 0.01067409, + "balance_loss_clip": 1.02678275, + "balance_loss_mlp": 1.02621567, + "epoch": 0.15373515707199759, + "flos": 21506252595840.0, + "grad_norm": 1.7834831595398932, + "language_loss": 0.92716771, + "learning_rate": 3.84153260631005e-06, + "loss": 0.94878376, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6796875, + "step": 2557, + "time_per_iteration": 2.40327525138855 + }, + { + "auxiliary_loss_clip": 0.010992, + "auxiliary_loss_mlp": 0.01069665, + "balance_loss_clip": 1.02112246, + "balance_loss_mlp": 1.02693439, + "epoch": 0.15379528032466555, + "flos": 25993534510080.0, + "grad_norm": 3.622183165240118, + "language_loss": 0.72086704, + "learning_rate": 3.841380636700468e-06, + "loss": 0.74255562, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.72265625, + "step": 2558, + "time_per_iteration": 2.4307589530944824 + }, + { + "auxiliary_loss_clip": 0.01098714, + "auxiliary_loss_mlp": 0.01072301, + "balance_loss_clip": 1.02614355, + "balance_loss_mlp": 1.02772546, + "epoch": 0.15385540357733354, + "flos": 19276420014720.0, + "grad_norm": 1.9605255719640426, + "language_loss": 0.93350697, + "learning_rate": 3.841228597265548e-06, + "loss": 0.95521712, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7109375, + "step": 2559, + "time_per_iteration": 2.3948278427124023 + }, + { + "auxiliary_loss_clip": 0.01096545, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_clip": 1.02042961, + "balance_loss_mlp": 1.02639329, + "epoch": 0.1539155268300015, + "flos": 28547850067200.0, + "grad_norm": 2.453192870595979, + "language_loss": 0.6731596, + "learning_rate": 3.841076488011055e-06, + "loss": 0.69479567, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.703125, + "step": 2560, + "time_per_iteration": 2.446274995803833 + }, + { + "auxiliary_loss_clip": 0.0109657, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_clip": 1.02691865, + "balance_loss_mlp": 1.02596188, + "epoch": 0.15397565008266947, + "flos": 23546821363200.0, + "grad_norm": 1.6250910385109087, + "language_loss": 0.90009302, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.92181712, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.703125, + "step": 2561, + "time_per_iteration": 2.481677532196045 + }, + { + "auxiliary_loss_clip": 0.01093649, + "auxiliary_loss_mlp": 0.01059406, + "balance_loss_clip": 1.01930368, + "balance_loss_mlp": 1.02710235, + "epoch": 0.15403577333533744, + "flos": 17128842330240.0, + "grad_norm": 1.9682293613477904, + "language_loss": 0.84695685, + "learning_rate": 3.840772060066425e-06, + "loss": 0.86848736, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.6640625, + "step": 2562, + "time_per_iteration": 2.3705010414123535 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.01076273, + "balance_loss_clip": 1.02572799, + "balance_loss_mlp": 1.02979231, + "epoch": 0.1540958965880054, + "flos": 17893545039360.0, + "grad_norm": 1.776985284325818, + "language_loss": 0.76830292, + "learning_rate": 3.840619741387832e-06, + "loss": 0.790115, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.75, + "step": 2563, + "time_per_iteration": 2.3822598457336426 + }, + { + "auxiliary_loss_clip": 0.01096761, + "auxiliary_loss_mlp": 0.01061446, + "balance_loss_clip": 1.01709998, + "balance_loss_mlp": 1.025455, + "epoch": 0.15415601984067337, + "flos": 32159684839680.0, + "grad_norm": 1.9113635556428057, + "language_loss": 0.79161245, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.81319451, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.7109375, + "step": 2564, + "time_per_iteration": 2.5182526111602783 + }, + { + "auxiliary_loss_clip": 0.01096085, + "auxiliary_loss_mlp": 0.0106898, + "balance_loss_clip": 1.02584982, + "balance_loss_mlp": 1.02684999, + "epoch": 0.15421614309334136, + "flos": 24023280954240.0, + "grad_norm": 2.183516921216014, + "language_loss": 0.72840607, + "learning_rate": 3.840314894646969e-06, + "loss": 0.75005674, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.69140625, + "step": 2565, + "time_per_iteration": 2.44225811958313 + }, + { + "auxiliary_loss_clip": 0.01097313, + "auxiliary_loss_mlp": 0.01072384, + "balance_loss_clip": 1.02744198, + "balance_loss_mlp": 1.02783966, + "epoch": 0.15427626634600933, + "flos": 24385225685760.0, + "grad_norm": 2.034402558322712, + "language_loss": 0.7373898, + "learning_rate": 3.840162366596259e-06, + "loss": 0.75908673, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6953125, + "step": 2566, + "time_per_iteration": 2.426607131958008 + }, + { + "auxiliary_loss_clip": 0.01093254, + "auxiliary_loss_mlp": 0.01058925, + "balance_loss_clip": 1.01982427, + "balance_loss_mlp": 1.02594876, + "epoch": 0.1543363895986773, + "flos": 23330394645120.0, + "grad_norm": 1.820940303541678, + "language_loss": 0.86442018, + "learning_rate": 3.840009768766408e-06, + "loss": 0.88594198, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.671875, + "step": 2567, + "time_per_iteration": 2.411522626876831 + }, + { + "auxiliary_loss_clip": 0.01097132, + "auxiliary_loss_mlp": 0.01063508, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.02830207, + "epoch": 0.15439651285134526, + "flos": 24273294266880.0, + "grad_norm": 1.9144231075705551, + "language_loss": 0.80204099, + "learning_rate": 3.839857101163202e-06, + "loss": 0.82364738, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.6875, + "step": 2568, + "time_per_iteration": 2.4023633003234863 + }, + { + "auxiliary_loss_clip": 0.0109785, + "auxiliary_loss_mlp": 0.01064498, + "balance_loss_clip": 1.02024746, + "balance_loss_mlp": 1.02825642, + "epoch": 0.15445663610401322, + "flos": 22455052237440.0, + "grad_norm": 1.8937998798109774, + "language_loss": 0.71557164, + "learning_rate": 3.83970436379243e-06, + "loss": 0.73719513, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.6953125, + "step": 2569, + "time_per_iteration": 2.4714925289154053 + }, + { + "auxiliary_loss_clip": 0.01094335, + "auxiliary_loss_mlp": 0.01065552, + "balance_loss_clip": 1.02297068, + "balance_loss_mlp": 1.02727473, + "epoch": 0.1545167593566812, + "flos": 22048558744320.0, + "grad_norm": 1.7500689344432683, + "language_loss": 0.78234422, + "learning_rate": 3.839551556659884e-06, + "loss": 0.80394304, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.671875, + "step": 2570, + "time_per_iteration": 2.3910834789276123 + }, + { + "auxiliary_loss_clip": 0.01096604, + "auxiliary_loss_mlp": 0.01064898, + "balance_loss_clip": 1.02360368, + "balance_loss_mlp": 1.0279547, + "epoch": 0.15457688260934915, + "flos": 19317233260800.0, + "grad_norm": 2.0716643593386217, + "language_loss": 0.79481471, + "learning_rate": 3.839398679771359e-06, + "loss": 0.81642973, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6875, + "step": 2571, + "time_per_iteration": 2.3895623683929443 + }, + { + "auxiliary_loss_clip": 0.01096258, + "auxiliary_loss_mlp": 0.01070475, + "balance_loss_clip": 1.02786994, + "balance_loss_mlp": 1.02694273, + "epoch": 0.15463700586201715, + "flos": 24132838400640.0, + "grad_norm": 2.186632878312773, + "language_loss": 0.84615272, + "learning_rate": 3.839245733132652e-06, + "loss": 0.86782002, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.69140625, + "step": 2572, + "time_per_iteration": 2.4396603107452393 + }, + { + "auxiliary_loss_clip": 0.0109753, + "auxiliary_loss_mlp": 0.01075659, + "balance_loss_clip": 1.0338167, + "balance_loss_mlp": 1.02662945, + "epoch": 0.1546971291146851, + "flos": 22419789897600.0, + "grad_norm": 1.555799664525543, + "language_loss": 0.91988909, + "learning_rate": 3.839092716749563e-06, + "loss": 0.94162095, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.7109375, + "step": 2573, + "time_per_iteration": 2.4209787845611572 + }, + { + "auxiliary_loss_clip": 0.01097817, + "auxiliary_loss_mlp": 0.0106894, + "balance_loss_clip": 1.0257144, + "balance_loss_mlp": 1.02670562, + "epoch": 0.15475725236735308, + "flos": 17529261246720.0, + "grad_norm": 3.0262617267426966, + "language_loss": 0.72338963, + "learning_rate": 3.838939630627893e-06, + "loss": 0.74505711, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.7109375, + "step": 2574, + "time_per_iteration": 2.38788104057312 + }, + { + "auxiliary_loss_clip": 0.01097943, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.023193, + "balance_loss_mlp": 1.02711785, + "epoch": 0.15481737562002104, + "flos": 22560734522880.0, + "grad_norm": 1.5998806438172986, + "language_loss": 0.8411774, + "learning_rate": 3.838786474773448e-06, + "loss": 0.86285156, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.70703125, + "step": 2575, + "time_per_iteration": 2.4068105220794678 + }, + { + "auxiliary_loss_clip": 0.0109555, + "auxiliary_loss_mlp": 0.01077061, + "balance_loss_clip": 1.03281069, + "balance_loss_mlp": 1.02546036, + "epoch": 0.154877498872689, + "flos": 24899391411840.0, + "grad_norm": 1.8159503566104984, + "language_loss": 0.86419511, + "learning_rate": 3.838633249192036e-06, + "loss": 0.88592124, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.703125, + "step": 2576, + "time_per_iteration": 2.4594900608062744 + }, + { + "auxiliary_loss_clip": 0.01098122, + "auxiliary_loss_mlp": 0.01066833, + "balance_loss_clip": 1.02391768, + "balance_loss_mlp": 1.02649486, + "epoch": 0.15493762212535697, + "flos": 28146244164480.0, + "grad_norm": 1.622452302752956, + "language_loss": 0.84554434, + "learning_rate": 3.838479953889465e-06, + "loss": 0.86719388, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.71875, + "step": 2577, + "time_per_iteration": 2.4852817058563232 + }, + { + "auxiliary_loss_clip": 0.01097698, + "auxiliary_loss_mlp": 0.01081294, + "balance_loss_clip": 1.03375316, + "balance_loss_mlp": 1.02753055, + "epoch": 0.15499774537802496, + "flos": 25409891445120.0, + "grad_norm": 2.1785750619947892, + "language_loss": 0.7869215, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.80871141, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.69921875, + "step": 2578, + "time_per_iteration": 2.4510843753814697 + }, + { + "auxiliary_loss_clip": 0.01097543, + "auxiliary_loss_mlp": 0.01076175, + "balance_loss_clip": 1.02670348, + "balance_loss_mlp": 1.02702403, + "epoch": 0.15505786863069293, + "flos": 22090454242560.0, + "grad_norm": 1.7573737442840023, + "language_loss": 0.83660018, + "learning_rate": 3.83817315414411e-06, + "loss": 0.85833734, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.70703125, + "step": 2579, + "time_per_iteration": 2.430471658706665 + }, + { + "auxiliary_loss_clip": 0.01098023, + "auxiliary_loss_mlp": 0.01078904, + "balance_loss_clip": 1.0338428, + "balance_loss_mlp": 1.02726936, + "epoch": 0.1551179918833609, + "flos": 18916116117120.0, + "grad_norm": 1.5607441086364207, + "language_loss": 0.82495558, + "learning_rate": 3.838019649712958e-06, + "loss": 0.84672493, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.7109375, + "step": 2580, + "time_per_iteration": 2.3770833015441895 + }, + { + "auxiliary_loss_clip": 0.01042176, + "auxiliary_loss_mlp": 0.01008939, + "balance_loss_clip": 1.0023582, + "balance_loss_mlp": 1.01274061, + "epoch": 0.15517811513602886, + "flos": 66235821096960.0, + "grad_norm": 0.8458406064497984, + "language_loss": 0.58942306, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60993421, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.29492188, + "step": 2581, + "time_per_iteration": 3.1661159992218018 + }, + { + "auxiliary_loss_clip": 0.01100546, + "auxiliary_loss_mlp": 0.01076247, + "balance_loss_clip": 1.02870584, + "balance_loss_mlp": 1.02673602, + "epoch": 0.15523823838869683, + "flos": 24020034197760.0, + "grad_norm": 1.7861447766695535, + "language_loss": 0.86469716, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.88646507, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.73828125, + "step": 2582, + "time_per_iteration": 2.434481382369995 + }, + { + "auxiliary_loss_clip": 0.01099081, + "auxiliary_loss_mlp": 0.0107589, + "balance_loss_clip": 1.02491593, + "balance_loss_mlp": 1.02664554, + "epoch": 0.1552983616413648, + "flos": 20484030631680.0, + "grad_norm": 2.320409293554752, + "language_loss": 0.80251288, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.82426262, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7265625, + "step": 2583, + "time_per_iteration": 2.4756081104278564 + }, + { + "auxiliary_loss_clip": 0.01100372, + "auxiliary_loss_mlp": 0.01065919, + "balance_loss_clip": 1.01983285, + "balance_loss_mlp": 1.0280323, + "epoch": 0.15535848489403276, + "flos": 32122362729600.0, + "grad_norm": 1.7238357292521107, + "language_loss": 0.77733183, + "learning_rate": 3.837404935067705e-06, + "loss": 0.79899478, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.72265625, + "step": 2584, + "time_per_iteration": 2.509094476699829 + }, + { + "auxiliary_loss_clip": 0.01098478, + "auxiliary_loss_mlp": 0.01071619, + "balance_loss_clip": 1.02229059, + "balance_loss_mlp": 1.02719021, + "epoch": 0.15541860814670075, + "flos": 19097455052160.0, + "grad_norm": 1.8437334872258608, + "language_loss": 0.77844614, + "learning_rate": 3.837251082205368e-06, + "loss": 0.80014712, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.7109375, + "step": 2585, + "time_per_iteration": 3.8687052726745605 + }, + { + "auxiliary_loss_clip": 0.01096942, + "auxiliary_loss_mlp": 0.01072587, + "balance_loss_clip": 1.02754974, + "balance_loss_mlp": 1.02739859, + "epoch": 0.1554787313993687, + "flos": 19171086842880.0, + "grad_norm": 3.2636896408155587, + "language_loss": 0.63598025, + "learning_rate": 3.837097159674286e-06, + "loss": 0.65767562, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6953125, + "step": 2586, + "time_per_iteration": 2.423546314239502 + }, + { + "auxiliary_loss_clip": 0.01098077, + "auxiliary_loss_mlp": 0.01073369, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.02625418, + "epoch": 0.15553885465203668, + "flos": 16142895135360.0, + "grad_norm": 1.7942491504368887, + "language_loss": 0.83843333, + "learning_rate": 3.836943167480296e-06, + "loss": 0.86014783, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.71875, + "step": 2587, + "time_per_iteration": 3.7980542182922363 + }, + { + "auxiliary_loss_clip": 0.01099563, + "auxiliary_loss_mlp": 0.01071335, + "balance_loss_clip": 1.02427125, + "balance_loss_mlp": 1.02707887, + "epoch": 0.15559897790470464, + "flos": 25336608768000.0, + "grad_norm": 1.8160141890613328, + "language_loss": 0.9063884, + "learning_rate": 3.836789105629236e-06, + "loss": 0.92809737, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.7265625, + "step": 2588, + "time_per_iteration": 2.4508678913116455 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01078093, + "balance_loss_clip": 1.02828765, + "balance_loss_mlp": 1.0269134, + "epoch": 0.1556591011573726, + "flos": 23147659255680.0, + "grad_norm": 2.4615164940096617, + "language_loss": 0.66682875, + "learning_rate": 3.83663497412695e-06, + "loss": 0.68858874, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.7109375, + "step": 2589, + "time_per_iteration": 3.8832924365997314 + }, + { + "auxiliary_loss_clip": 0.01095306, + "auxiliary_loss_mlp": 0.01077039, + "balance_loss_clip": 1.02747214, + "balance_loss_mlp": 1.02510238, + "epoch": 0.15571922441004057, + "flos": 25369811337600.0, + "grad_norm": 2.003641474081145, + "language_loss": 0.8438338, + "learning_rate": 3.836480772979281e-06, + "loss": 0.86555719, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.703125, + "step": 2590, + "time_per_iteration": 3.873076915740967 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.0107168, + "balance_loss_clip": 1.02394867, + "balance_loss_mlp": 1.02709472, + "epoch": 0.15577934766270854, + "flos": 14500510957440.0, + "grad_norm": 40.3573028737164, + "language_loss": 0.81230736, + "learning_rate": 3.836326502192077e-06, + "loss": 0.83400339, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.7109375, + "step": 2591, + "time_per_iteration": 2.4202780723571777 + }, + { + "auxiliary_loss_clip": 0.01096003, + "auxiliary_loss_mlp": 0.01075992, + "balance_loss_clip": 1.0332675, + "balance_loss_mlp": 1.02646804, + "epoch": 0.15583947091537653, + "flos": 37413031006080.0, + "grad_norm": 2.4946715793412793, + "language_loss": 0.68006468, + "learning_rate": 3.836172161771189e-06, + "loss": 0.70178461, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6953125, + "step": 2592, + "time_per_iteration": 2.5637879371643066 + }, + { + "auxiliary_loss_clip": 0.01100202, + "auxiliary_loss_mlp": 0.01079081, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.02854741, + "epoch": 0.1558995941680445, + "flos": 21833668126080.0, + "grad_norm": 2.1995989804263982, + "language_loss": 0.84717542, + "learning_rate": 3.836017751722467e-06, + "loss": 0.86896825, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.71484375, + "step": 2593, + "time_per_iteration": 2.4200408458709717 + }, + { + "auxiliary_loss_clip": 0.01095047, + "auxiliary_loss_mlp": 0.01078224, + "balance_loss_clip": 1.02963424, + "balance_loss_mlp": 1.02588654, + "epoch": 0.15595971742071246, + "flos": 19791598170240.0, + "grad_norm": 1.9928576033483099, + "language_loss": 0.7483117, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.77004445, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.69140625, + "step": 2594, + "time_per_iteration": 2.417465925216675 + }, + { + "auxiliary_loss_clip": 0.01093761, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.02505457, + "epoch": 0.15601984067338043, + "flos": 26720984931840.0, + "grad_norm": 1.9794451969064943, + "language_loss": 0.83469123, + "learning_rate": 3.835708722764952e-06, + "loss": 0.85637188, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.6875, + "step": 2595, + "time_per_iteration": 2.5143094062805176 + }, + { + "auxiliary_loss_clip": 0.01098907, + "auxiliary_loss_mlp": 0.01082905, + "balance_loss_clip": 1.03171659, + "balance_loss_mlp": 1.02644813, + "epoch": 0.1560799639260484, + "flos": 18368293973760.0, + "grad_norm": 2.5570768471939864, + "language_loss": 0.88448894, + "learning_rate": 3.835554103867876e-06, + "loss": 0.90630698, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.7265625, + "step": 2596, + "time_per_iteration": 2.5827157497406006 + }, + { + "auxiliary_loss_clip": 0.01094468, + "auxiliary_loss_mlp": 0.01067886, + "balance_loss_clip": 1.02244306, + "balance_loss_mlp": 1.02636695, + "epoch": 0.15614008717871636, + "flos": 22597951898880.0, + "grad_norm": 2.269797878176051, + "language_loss": 0.70113981, + "learning_rate": 3.835399415366404e-06, + "loss": 0.7227633, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.6796875, + "step": 2597, + "time_per_iteration": 2.6604318618774414 + }, + { + "auxiliary_loss_clip": 0.01093514, + "auxiliary_loss_mlp": 0.01069119, + "balance_loss_clip": 1.02448761, + "balance_loss_mlp": 1.02565885, + "epoch": 0.15620021043138435, + "flos": 22745774062080.0, + "grad_norm": 1.7647168158664732, + "language_loss": 0.80912727, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.83075362, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6796875, + "step": 2598, + "time_per_iteration": 2.4874603748321533 + }, + { + "auxiliary_loss_clip": 0.01093876, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_clip": 1.02070904, + "balance_loss_mlp": 1.02676916, + "epoch": 0.15626033368405232, + "flos": 13114109934720.0, + "grad_norm": 1.960505862975146, + "language_loss": 0.83797705, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85955822, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.671875, + "step": 2599, + "time_per_iteration": 2.3877322673797607 + }, + { + "auxiliary_loss_clip": 0.01104908, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_clip": 1.02594745, + "balance_loss_mlp": 1.02978063, + "epoch": 0.15632045693672028, + "flos": 16471358006400.0, + "grad_norm": 1.962282710291464, + "language_loss": 0.84407914, + "learning_rate": 3.834934932294287e-06, + "loss": 0.86589766, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2600, + "time_per_iteration": 2.3838112354278564 + }, + { + "auxiliary_loss_clip": 0.01099673, + "auxiliary_loss_mlp": 0.01082818, + "balance_loss_clip": 1.03353667, + "balance_loss_mlp": 1.02814841, + "epoch": 0.15638058018938825, + "flos": 20849291942400.0, + "grad_norm": 1.829847356188403, + "language_loss": 0.90162408, + "learning_rate": 3.834779965433917e-06, + "loss": 0.92344898, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.71484375, + "step": 2601, + "time_per_iteration": 2.4428181648254395 + }, + { + "auxiliary_loss_clip": 0.01101377, + "auxiliary_loss_mlp": 0.01089257, + "balance_loss_clip": 1.03685224, + "balance_loss_mlp": 1.02728939, + "epoch": 0.1564407034420562, + "flos": 21871129881600.0, + "grad_norm": 1.896950716803312, + "language_loss": 0.8066892, + "learning_rate": 3.834624928998508e-06, + "loss": 0.82859552, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7421875, + "step": 2602, + "time_per_iteration": 2.4765241146087646 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01074248, + "balance_loss_clip": 1.02718413, + "balance_loss_mlp": 1.02735484, + "epoch": 0.15650082669472418, + "flos": 21833493569280.0, + "grad_norm": 2.188821973294658, + "language_loss": 0.75559092, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.7773155, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.70703125, + "step": 2603, + "time_per_iteration": 2.4652647972106934 + }, + { + "auxiliary_loss_clip": 0.01097853, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.02672815, + "balance_loss_mlp": 1.02665043, + "epoch": 0.15656094994739214, + "flos": 13799909237760.0, + "grad_norm": 4.021296249830068, + "language_loss": 0.90792179, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.92962825, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7109375, + "step": 2604, + "time_per_iteration": 2.4414494037628174 + }, + { + "auxiliary_loss_clip": 0.0109881, + "auxiliary_loss_mlp": 0.01079018, + "balance_loss_clip": 1.02904534, + "balance_loss_mlp": 1.02626705, + "epoch": 0.15662107320006013, + "flos": 27306967057920.0, + "grad_norm": 2.0282952524323554, + "language_loss": 0.86962706, + "learning_rate": 3.834159402300841e-06, + "loss": 0.89140534, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7265625, + "step": 2605, + "time_per_iteration": 2.505866765975952 + }, + { + "auxiliary_loss_clip": 0.01102049, + "auxiliary_loss_mlp": 0.01073818, + "balance_loss_clip": 1.02048373, + "balance_loss_mlp": 1.02715468, + "epoch": 0.1566811964527281, + "flos": 26683942112640.0, + "grad_norm": 2.088027107330147, + "language_loss": 0.75297964, + "learning_rate": 3.834004087624087e-06, + "loss": 0.77473825, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.75, + "step": 2606, + "time_per_iteration": 2.498234510421753 + }, + { + "auxiliary_loss_clip": 0.0110058, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_clip": 1.02268052, + "balance_loss_mlp": 1.03020382, + "epoch": 0.15674131970539606, + "flos": 16102605559680.0, + "grad_norm": 2.2071101273572737, + "language_loss": 0.78361607, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.80532575, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.703125, + "step": 2607, + "time_per_iteration": 2.39552640914917 + }, + { + "auxiliary_loss_clip": 0.01095934, + "auxiliary_loss_mlp": 0.01076469, + "balance_loss_clip": 1.02997768, + "balance_loss_mlp": 1.02711856, + "epoch": 0.15680144295806403, + "flos": 19168747781760.0, + "grad_norm": 1.726709307908285, + "language_loss": 0.83852005, + "learning_rate": 3.833693249639615e-06, + "loss": 0.86024415, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.6875, + "step": 2608, + "time_per_iteration": 2.3945152759552 + }, + { + "auxiliary_loss_clip": 0.01101603, + "auxiliary_loss_mlp": 0.0107635, + "balance_loss_clip": 1.02232409, + "balance_loss_mlp": 1.02746892, + "epoch": 0.156861566210732, + "flos": 20812388768640.0, + "grad_norm": 1.719671566334231, + "language_loss": 0.74420464, + "learning_rate": 3.833537726343684e-06, + "loss": 0.76598418, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7421875, + "step": 2609, + "time_per_iteration": 2.413965940475464 + }, + { + "auxiliary_loss_clip": 0.01098867, + "auxiliary_loss_mlp": 0.01075632, + "balance_loss_clip": 1.02666104, + "balance_loss_mlp": 1.02619791, + "epoch": 0.15692168946339996, + "flos": 20046883098240.0, + "grad_norm": 1.851824119612525, + "language_loss": 0.74521577, + "learning_rate": 3.833382133519818e-06, + "loss": 0.76696068, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7265625, + "step": 2610, + "time_per_iteration": 2.4074745178222656 + }, + { + "auxiliary_loss_clip": 0.01098668, + "auxiliary_loss_mlp": 0.010764, + "balance_loss_clip": 1.02437758, + "balance_loss_mlp": 1.02639592, + "epoch": 0.15698181271606793, + "flos": 21396939528960.0, + "grad_norm": 1.8556924020672763, + "language_loss": 0.744973, + "learning_rate": 3.833226471173919e-06, + "loss": 0.76672369, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 0.72265625, + "step": 2611, + "time_per_iteration": 2.4300589561462402 + }, + { + "auxiliary_loss_clip": 0.01096871, + "auxiliary_loss_mlp": 0.01076701, + "balance_loss_clip": 1.02765834, + "balance_loss_mlp": 1.02564597, + "epoch": 0.15704193596873592, + "flos": 20844858199680.0, + "grad_norm": 2.0675315764762137, + "language_loss": 0.72220063, + "learning_rate": 3.833070739311887e-06, + "loss": 0.74393636, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.7109375, + "step": 2612, + "time_per_iteration": 2.4165761470794678 + }, + { + "auxiliary_loss_clip": 0.01098907, + "auxiliary_loss_mlp": 0.01072652, + "balance_loss_clip": 1.02451539, + "balance_loss_mlp": 1.02693939, + "epoch": 0.15710205922140388, + "flos": 21761816814720.0, + "grad_norm": 1.9217376755711606, + "language_loss": 0.78542399, + "learning_rate": 3.83291493793963e-06, + "loss": 0.80713964, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 0.71875, + "step": 2613, + "time_per_iteration": 2.459002733230591 + }, + { + "auxiliary_loss_clip": 0.01100126, + "auxiliary_loss_mlp": 0.01087867, + "balance_loss_clip": 1.03431821, + "balance_loss_mlp": 1.02644718, + "epoch": 0.15716218247407185, + "flos": 25006644708480.0, + "grad_norm": 1.7780168945260628, + "language_loss": 0.68102163, + "learning_rate": 3.832759067063055e-06, + "loss": 0.7029016, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.73828125, + "step": 2614, + "time_per_iteration": 2.4510717391967773 + }, + { + "auxiliary_loss_clip": 0.01102961, + "auxiliary_loss_mlp": 0.01080912, + "balance_loss_clip": 1.02674294, + "balance_loss_mlp": 1.02846146, + "epoch": 0.1572223057267398, + "flos": 20190795189120.0, + "grad_norm": 2.3061352287113133, + "language_loss": 0.77515626, + "learning_rate": 3.832603126688072e-06, + "loss": 0.79699504, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.74609375, + "step": 2615, + "time_per_iteration": 2.424018383026123 + }, + { + "auxiliary_loss_clip": 0.01095458, + "auxiliary_loss_mlp": 0.01081897, + "balance_loss_clip": 1.03421307, + "balance_loss_mlp": 1.02550828, + "epoch": 0.15728242897940778, + "flos": 20958465363840.0, + "grad_norm": 1.616655039620417, + "language_loss": 0.74437958, + "learning_rate": 3.832447116820594e-06, + "loss": 0.7661531, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.69921875, + "step": 2616, + "time_per_iteration": 2.482408285140991 + }, + { + "auxiliary_loss_clip": 0.01098058, + "auxiliary_loss_mlp": 0.01079693, + "balance_loss_clip": 1.03105593, + "balance_loss_mlp": 1.02611196, + "epoch": 0.15734255223207574, + "flos": 23037194113920.0, + "grad_norm": 2.8715599155733815, + "language_loss": 0.74388766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.76566517, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.71875, + "step": 2617, + "time_per_iteration": 2.4293770790100098 + }, + { + "auxiliary_loss_clip": 0.01097798, + "auxiliary_loss_mlp": 0.01073883, + "balance_loss_clip": 1.02634239, + "balance_loss_mlp": 1.02641308, + "epoch": 0.15740267548474374, + "flos": 20550435770880.0, + "grad_norm": 2.037959970366465, + "language_loss": 0.76729262, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.78900945, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.71484375, + "step": 2618, + "time_per_iteration": 2.395024538040161 + }, + { + "auxiliary_loss_clip": 0.01102677, + "auxiliary_loss_mlp": 0.01073903, + "balance_loss_clip": 1.02049708, + "balance_loss_mlp": 1.02666175, + "epoch": 0.1574627987374117, + "flos": 22666032783360.0, + "grad_norm": 1.879084371129764, + "language_loss": 0.80936933, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.83113503, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 0.76171875, + "step": 2619, + "time_per_iteration": 2.4555461406707764 + }, + { + "auxiliary_loss_clip": 0.01099022, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_clip": 1.02477384, + "balance_loss_mlp": 1.02842295, + "epoch": 0.15752292199007967, + "flos": 16799716143360.0, + "grad_norm": 2.748983273564901, + "language_loss": 0.78752613, + "learning_rate": 3.831822382544101e-06, + "loss": 0.80923355, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.70703125, + "step": 2620, + "time_per_iteration": 2.426804542541504 + }, + { + "auxiliary_loss_clip": 0.01101798, + "auxiliary_loss_mlp": 0.01078507, + "balance_loss_clip": 1.02348006, + "balance_loss_mlp": 1.02762103, + "epoch": 0.15758304524274763, + "flos": 29824693643520.0, + "grad_norm": 1.7943109274642832, + "language_loss": 0.72698474, + "learning_rate": 3.831666025302944e-06, + "loss": 0.74878782, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 0.7421875, + "step": 2621, + "time_per_iteration": 2.5202910900115967 + }, + { + "auxiliary_loss_clip": 0.0110304, + "auxiliary_loss_mlp": 0.01080069, + "balance_loss_clip": 1.02928591, + "balance_loss_mlp": 1.02807271, + "epoch": 0.1576431684954156, + "flos": 53575478369280.0, + "grad_norm": 1.900484935239961, + "language_loss": 0.74166363, + "learning_rate": 3.831509598604828e-06, + "loss": 0.76349473, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.75, + "step": 2622, + "time_per_iteration": 2.789544105529785 + }, + { + "auxiliary_loss_clip": 0.01100957, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_clip": 1.02104998, + "balance_loss_mlp": 1.02849829, + "epoch": 0.15770329174808356, + "flos": 20812563325440.0, + "grad_norm": 1.8235500441917967, + "language_loss": 0.89836836, + "learning_rate": 3.831353102455684e-06, + "loss": 0.92005765, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.72265625, + "step": 2623, + "time_per_iteration": 2.437532663345337 + }, + { + "auxiliary_loss_clip": 0.01096214, + "auxiliary_loss_mlp": 0.01073358, + "balance_loss_clip": 1.02670002, + "balance_loss_mlp": 1.02628982, + "epoch": 0.15776341500075153, + "flos": 24972813734400.0, + "grad_norm": 1.666040177322077, + "language_loss": 0.83315217, + "learning_rate": 3.831196536861448e-06, + "loss": 0.85484785, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.69921875, + "step": 2624, + "time_per_iteration": 2.446948289871216 + }, + { + "auxiliary_loss_clip": 0.01101544, + "auxiliary_loss_mlp": 0.0108034, + "balance_loss_clip": 1.02402544, + "balance_loss_mlp": 1.02624452, + "epoch": 0.15782353825341952, + "flos": 21906846069120.0, + "grad_norm": 2.0986777501414937, + "language_loss": 0.83725595, + "learning_rate": 3.831039901828054e-06, + "loss": 0.85907477, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.75390625, + "step": 2625, + "time_per_iteration": 3.9176290035247803 + }, + { + "auxiliary_loss_clip": 0.01100739, + "auxiliary_loss_mlp": 0.01072548, + "balance_loss_clip": 1.02233672, + "balance_loss_mlp": 1.02728963, + "epoch": 0.15788366150608749, + "flos": 26175990608640.0, + "grad_norm": 2.3956552405422746, + "language_loss": 0.8162238, + "learning_rate": 3.830883197361445e-06, + "loss": 0.83795667, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 0.734375, + "step": 2626, + "time_per_iteration": 2.4363486766815186 + }, + { + "auxiliary_loss_clip": 0.01101219, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.01810384, + "balance_loss_mlp": 1.02912283, + "epoch": 0.15794378475875545, + "flos": 27708572960640.0, + "grad_norm": 1.7190558571701282, + "language_loss": 0.76267004, + "learning_rate": 3.830726423467561e-06, + "loss": 0.78435415, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 0.72265625, + "step": 2627, + "time_per_iteration": 3.8809659481048584 + }, + { + "auxiliary_loss_clip": 0.01101071, + "auxiliary_loss_mlp": 0.01081705, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 1.02695453, + "epoch": 0.15800390801142342, + "flos": 12129349726080.0, + "grad_norm": 2.124684347500375, + "language_loss": 0.87497663, + "learning_rate": 3.830569580152348e-06, + "loss": 0.89680433, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 0.7421875, + "step": 2628, + "time_per_iteration": 2.4015252590179443 + }, + { + "auxiliary_loss_clip": 0.01096136, + "auxiliary_loss_mlp": 0.01064989, + "balance_loss_clip": 1.01928425, + "balance_loss_mlp": 1.02456725, + "epoch": 0.15806403126409138, + "flos": 20703669194880.0, + "grad_norm": 3.607243813166436, + "language_loss": 0.7866255, + "learning_rate": 3.830412667421752e-06, + "loss": 0.80823678, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.71875, + "step": 2629, + "time_per_iteration": 5.244709730148315 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01098151, + "balance_loss_clip": 1.04219413, + "balance_loss_mlp": 1.02702641, + "epoch": 0.15812415451675935, + "flos": 17820751121280.0, + "grad_norm": 2.5815700806940867, + "language_loss": 0.76315856, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.78514016, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 0.73046875, + "step": 2630, + "time_per_iteration": 2.4157791137695312 + }, + { + "auxiliary_loss_clip": 0.01100199, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_clip": 1.02495754, + "balance_loss_mlp": 1.02617908, + "epoch": 0.15818427776942734, + "flos": 20083018222080.0, + "grad_norm": 2.1344526814547753, + "language_loss": 0.86044872, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.88221359, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.7421875, + "step": 2631, + "time_per_iteration": 2.472343683242798 + }, + { + "auxiliary_loss_clip": 0.01099275, + "auxiliary_loss_mlp": 0.01075192, + "balance_loss_clip": 1.02226305, + "balance_loss_mlp": 1.0258863, + "epoch": 0.1582444010220953, + "flos": 21213855025920.0, + "grad_norm": 1.6551893914505127, + "language_loss": 0.81332576, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.83507037, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.734375, + "step": 2632, + "time_per_iteration": 2.442218065261841 + }, + { + "auxiliary_loss_clip": 0.01102179, + "auxiliary_loss_mlp": 0.01083126, + "balance_loss_clip": 1.02883828, + "balance_loss_mlp": 1.0264976, + "epoch": 0.15830452427476327, + "flos": 17857375004160.0, + "grad_norm": 1.8736910905682398, + "language_loss": 0.84556162, + "learning_rate": 3.829784322464594e-06, + "loss": 0.86741471, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 0.7578125, + "step": 2633, + "time_per_iteration": 2.387376308441162 + }, + { + "auxiliary_loss_clip": 0.01101787, + "auxiliary_loss_mlp": 0.01072932, + "balance_loss_clip": 1.02365136, + "balance_loss_mlp": 1.02760816, + "epoch": 0.15836464752743123, + "flos": 24533815898880.0, + "grad_norm": 1.7359865077593086, + "language_loss": 0.79173571, + "learning_rate": 3.829627062746394e-06, + "loss": 0.81348288, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.7421875, + "step": 2634, + "time_per_iteration": 2.438629388809204 + }, + { + "auxiliary_loss_clip": 0.01102009, + "auxiliary_loss_mlp": 0.01083705, + "balance_loss_clip": 1.02626967, + "balance_loss_mlp": 1.02485025, + "epoch": 0.1584247707800992, + "flos": 20119781750400.0, + "grad_norm": 2.039671432380225, + "language_loss": 0.90512645, + "learning_rate": 3.829469733648552e-06, + "loss": 0.92698354, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 0.7734375, + "step": 2635, + "time_per_iteration": 2.4059805870056152 + }, + { + "auxiliary_loss_clip": 0.0110019, + "auxiliary_loss_mlp": 0.01072048, + "balance_loss_clip": 1.02114594, + "balance_loss_mlp": 1.02440238, + "epoch": 0.15848489403276717, + "flos": 20374927032960.0, + "grad_norm": 1.8074476311024006, + "language_loss": 0.78704494, + "learning_rate": 3.829312335177034e-06, + "loss": 0.80876732, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.7578125, + "step": 2636, + "time_per_iteration": 2.427769184112549 + }, + { + "auxiliary_loss_clip": 0.01103503, + "auxiliary_loss_mlp": 0.01078265, + "balance_loss_clip": 1.02192688, + "balance_loss_mlp": 1.02711892, + "epoch": 0.15854501728543513, + "flos": 39345368958720.0, + "grad_norm": 2.3115574486844093, + "language_loss": 0.74972999, + "learning_rate": 3.82915486733781e-06, + "loss": 0.77154768, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 0.765625, + "step": 2637, + "time_per_iteration": 2.57893967628479 + }, + { + "auxiliary_loss_clip": 0.01097539, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_clip": 1.02657592, + "balance_loss_mlp": 1.02479827, + "epoch": 0.15860514053810312, + "flos": 24863046819840.0, + "grad_norm": 1.9576004329653567, + "language_loss": 0.79636997, + "learning_rate": 3.82899733013685e-06, + "loss": 0.81810462, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.7265625, + "step": 2638, + "time_per_iteration": 2.4371321201324463 + }, + { + "auxiliary_loss_clip": 0.0109894, + "auxiliary_loss_mlp": 0.01082489, + "balance_loss_clip": 1.02927375, + "balance_loss_mlp": 1.02595854, + "epoch": 0.1586652637907711, + "flos": 26176479367680.0, + "grad_norm": 3.104627863596063, + "language_loss": 0.76812345, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78993773, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.73046875, + "step": 2639, + "time_per_iteration": 2.4586029052734375 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_clip": 1.03331232, + "balance_loss_mlp": 1.02687442, + "epoch": 0.15872538704343905, + "flos": 19791039588480.0, + "grad_norm": 1.8040361415833037, + "language_loss": 0.83456886, + "learning_rate": 3.82868204767362e-06, + "loss": 0.85643703, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7421875, + "step": 2640, + "time_per_iteration": 2.41383695602417 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.0106839, + "balance_loss_clip": 1.02318597, + "balance_loss_mlp": 1.02549183, + "epoch": 0.15878551029610702, + "flos": 28474113542400.0, + "grad_norm": 1.424396915022408, + "language_loss": 0.6830225, + "learning_rate": 3.828524302423306e-06, + "loss": 0.70468652, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.7265625, + "step": 2641, + "time_per_iteration": 2.4977784156799316 + }, + { + "auxiliary_loss_clip": 0.01105077, + "auxiliary_loss_mlp": 0.01080039, + "balance_loss_clip": 1.03056765, + "balance_loss_mlp": 1.02823091, + "epoch": 0.15884563354877498, + "flos": 24205562496000.0, + "grad_norm": 2.371099570402696, + "language_loss": 0.78178591, + "learning_rate": 3.828366487835167e-06, + "loss": 0.80363709, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.76953125, + "step": 2642, + "time_per_iteration": 2.442441701889038 + }, + { + "auxiliary_loss_clip": 0.01094477, + "auxiliary_loss_mlp": 0.01069461, + "balance_loss_clip": 1.02478147, + "balance_loss_mlp": 1.02654958, + "epoch": 0.15890575680144295, + "flos": 23948706556800.0, + "grad_norm": 1.8333506951786191, + "language_loss": 0.71590292, + "learning_rate": 3.828208603915186e-06, + "loss": 0.73754227, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6796875, + "step": 2643, + "time_per_iteration": 2.4454565048217773 + }, + { + "auxiliary_loss_clip": 0.01095029, + "auxiliary_loss_mlp": 0.01061837, + "balance_loss_clip": 1.01880217, + "balance_loss_mlp": 1.02706289, + "epoch": 0.15896588005411091, + "flos": 21213959760000.0, + "grad_norm": 2.058746810669545, + "language_loss": 0.80580717, + "learning_rate": 3.828050650669353e-06, + "loss": 0.82737583, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6796875, + "step": 2644, + "time_per_iteration": 2.425658941268921 + }, + { + "auxiliary_loss_clip": 0.01093061, + "auxiliary_loss_mlp": 0.0105943, + "balance_loss_clip": 1.01811218, + "balance_loss_mlp": 1.02446723, + "epoch": 0.1590260033067789, + "flos": 24351255066240.0, + "grad_norm": 2.0787927131270343, + "language_loss": 0.83960801, + "learning_rate": 3.827892628103657e-06, + "loss": 0.86113292, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.68359375, + "step": 2645, + "time_per_iteration": 2.439939022064209 + }, + { + "auxiliary_loss_clip": 0.01095936, + "auxiliary_loss_mlp": 0.01068466, + "balance_loss_clip": 1.02273715, + "balance_loss_mlp": 1.02590823, + "epoch": 0.15908612655944687, + "flos": 32047648686720.0, + "grad_norm": 1.9431893168702854, + "language_loss": 0.71780741, + "learning_rate": 3.827734536224087e-06, + "loss": 0.73945141, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.69921875, + "step": 2646, + "time_per_iteration": 2.4849064350128174 + }, + { + "auxiliary_loss_clip": 0.0109283, + "auxiliary_loss_mlp": 0.01064616, + "balance_loss_clip": 1.01855385, + "balance_loss_mlp": 1.02574337, + "epoch": 0.15914624981211484, + "flos": 17784406529280.0, + "grad_norm": 14.303537823184502, + "language_loss": 0.65621138, + "learning_rate": 3.827576375036642e-06, + "loss": 0.67778587, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.671875, + "step": 2647, + "time_per_iteration": 2.377608299255371 + }, + { + "auxiliary_loss_clip": 0.01095679, + "auxiliary_loss_mlp": 0.01061849, + "balance_loss_clip": 1.01716948, + "balance_loss_mlp": 1.02752876, + "epoch": 0.1592063730647828, + "flos": 17711542788480.0, + "grad_norm": 2.1753340263319854, + "language_loss": 0.91483533, + "learning_rate": 3.827418144547318e-06, + "loss": 0.93641061, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6796875, + "step": 2648, + "time_per_iteration": 2.407356023788452 + }, + { + "auxiliary_loss_clip": 0.01091264, + "auxiliary_loss_mlp": 0.01063141, + "balance_loss_clip": 1.02003467, + "balance_loss_mlp": 1.02510774, + "epoch": 0.15926649631745077, + "flos": 18802648598400.0, + "grad_norm": 1.7810082400528613, + "language_loss": 0.92982066, + "learning_rate": 3.827259844762114e-06, + "loss": 0.9513647, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.6640625, + "step": 2649, + "time_per_iteration": 2.3983843326568604 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01071355, + "balance_loss_clip": 1.02543569, + "balance_loss_mlp": 1.02592516, + "epoch": 0.15932661957011873, + "flos": 17565291636480.0, + "grad_norm": 2.2682701147948516, + "language_loss": 0.7460053, + "learning_rate": 3.827101475687033e-06, + "loss": 0.76771855, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.7421875, + "step": 2650, + "time_per_iteration": 2.375493049621582 + }, + { + "auxiliary_loss_clip": 0.01091331, + "auxiliary_loss_mlp": 0.01055527, + "balance_loss_clip": 1.01919222, + "balance_loss_mlp": 1.0263145, + "epoch": 0.15938674282278673, + "flos": 13333504118400.0, + "grad_norm": 1.971952551312726, + "language_loss": 0.73300397, + "learning_rate": 3.826943037328082e-06, + "loss": 0.75447249, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.6484375, + "step": 2651, + "time_per_iteration": 2.3750319480895996 + }, + { + "auxiliary_loss_clip": 0.01096327, + "auxiliary_loss_mlp": 0.01066337, + "balance_loss_clip": 1.02091849, + "balance_loss_mlp": 1.02716303, + "epoch": 0.1594468660754547, + "flos": 22487835870720.0, + "grad_norm": 1.8996291003317622, + "language_loss": 0.81735992, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.83898652, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.69140625, + "step": 2652, + "time_per_iteration": 2.4049973487854004 + }, + { + "auxiliary_loss_clip": 0.01089914, + "auxiliary_loss_mlp": 0.01063354, + "balance_loss_clip": 1.02232242, + "balance_loss_mlp": 1.02589393, + "epoch": 0.15950698932812266, + "flos": 15006577248000.0, + "grad_norm": 4.170650128181991, + "language_loss": 0.72224247, + "learning_rate": 3.826625952782601e-06, + "loss": 0.74377519, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.640625, + "step": 2653, + "time_per_iteration": 2.3639533519744873 + }, + { + "auxiliary_loss_clip": 0.01093727, + "auxiliary_loss_mlp": 0.01065346, + "balance_loss_clip": 1.02343178, + "balance_loss_mlp": 1.02631128, + "epoch": 0.15956711258079062, + "flos": 30153715096320.0, + "grad_norm": 4.575782056338166, + "language_loss": 0.80152208, + "learning_rate": 3.826467306608095e-06, + "loss": 0.82311285, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.67578125, + "step": 2654, + "time_per_iteration": 2.4609246253967285 + }, + { + "auxiliary_loss_clip": 0.01089253, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.01812387, + "balance_loss_mlp": 1.02335906, + "epoch": 0.1596272358334586, + "flos": 21031643306880.0, + "grad_norm": 1.7388208505243963, + "language_loss": 0.83851707, + "learning_rate": 3.826308591173765e-06, + "loss": 0.86001211, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.66015625, + "step": 2655, + "time_per_iteration": 2.403367280960083 + }, + { + "auxiliary_loss_clip": 0.01091779, + "auxiliary_loss_mlp": 0.01055803, + "balance_loss_clip": 1.01620209, + "balance_loss_mlp": 1.02450609, + "epoch": 0.15968735908612655, + "flos": 15267133791360.0, + "grad_norm": 3.5150680980185114, + "language_loss": 0.7570827, + "learning_rate": 3.826149806485631e-06, + "loss": 0.77855849, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.671875, + "step": 2656, + "time_per_iteration": 2.3946263790130615 + }, + { + "auxiliary_loss_clip": 0.01089642, + "auxiliary_loss_mlp": 0.01054335, + "balance_loss_clip": 1.01680779, + "balance_loss_mlp": 1.02524066, + "epoch": 0.15974748233879452, + "flos": 52663791369600.0, + "grad_norm": 1.7684650600710579, + "language_loss": 0.79660714, + "learning_rate": 3.825990952549713e-06, + "loss": 0.81804693, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.64453125, + "step": 2657, + "time_per_iteration": 2.674722671508789 + }, + { + "auxiliary_loss_clip": 0.01092017, + "auxiliary_loss_mlp": 0.01060128, + "balance_loss_clip": 1.02262449, + "balance_loss_mlp": 1.02665114, + "epoch": 0.1598076055914625, + "flos": 18732263564160.0, + "grad_norm": 1.841096095247121, + "language_loss": 0.76384759, + "learning_rate": 3.825832029372035e-06, + "loss": 0.78536898, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.65234375, + "step": 2658, + "time_per_iteration": 2.3875274658203125 + }, + { + "auxiliary_loss_clip": 0.01091214, + "auxiliary_loss_mlp": 0.0106498, + "balance_loss_clip": 1.02232742, + "balance_loss_mlp": 1.02546239, + "epoch": 0.15986772884413047, + "flos": 34347831390720.0, + "grad_norm": 1.739787938833924, + "language_loss": 0.76943326, + "learning_rate": 3.825673036958624e-06, + "loss": 0.79099524, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.65625, + "step": 2659, + "time_per_iteration": 2.501847267150879 + }, + { + "auxiliary_loss_clip": 0.01090822, + "auxiliary_loss_mlp": 0.01064629, + "balance_loss_clip": 1.02431226, + "balance_loss_mlp": 1.02535343, + "epoch": 0.15992785209679844, + "flos": 22053865271040.0, + "grad_norm": 1.9660373099506043, + "language_loss": 0.92782974, + "learning_rate": 3.825513975315508e-06, + "loss": 0.94938421, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.65625, + "step": 2660, + "time_per_iteration": 2.396108627319336 + }, + { + "auxiliary_loss_clip": 0.01094898, + "auxiliary_loss_mlp": 0.01068581, + "balance_loss_clip": 1.02695322, + "balance_loss_mlp": 1.02805352, + "epoch": 0.1599879753494664, + "flos": 33065436908160.0, + "grad_norm": 2.5568746089322074, + "language_loss": 0.79423976, + "learning_rate": 3.82535484444872e-06, + "loss": 0.81587458, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.66796875, + "step": 2661, + "time_per_iteration": 2.5061111450195312 + }, + { + "auxiliary_loss_clip": 0.01089635, + "auxiliary_loss_mlp": 0.01056961, + "balance_loss_clip": 1.0154283, + "balance_loss_mlp": 1.02363503, + "epoch": 0.16004809860213437, + "flos": 28036756540800.0, + "grad_norm": 1.7401096682432593, + "language_loss": 0.75250876, + "learning_rate": 3.825195644364292e-06, + "loss": 0.77397478, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.66015625, + "step": 2662, + "time_per_iteration": 2.466242551803589 + }, + { + "auxiliary_loss_clip": 0.01092225, + "auxiliary_loss_mlp": 0.01064052, + "balance_loss_clip": 1.02383065, + "balance_loss_mlp": 1.02566361, + "epoch": 0.16010822185480234, + "flos": 22779116277120.0, + "grad_norm": 1.7482026492216716, + "language_loss": 0.84508014, + "learning_rate": 3.825036375068263e-06, + "loss": 0.86664289, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.6640625, + "step": 2663, + "time_per_iteration": 2.3994972705841064 + }, + { + "auxiliary_loss_clip": 0.0109474, + "auxiliary_loss_mlp": 0.0106391, + "balance_loss_clip": 1.024333, + "balance_loss_mlp": 1.0273664, + "epoch": 0.16016834510747033, + "flos": 20082983310720.0, + "grad_norm": 1.988743138826804, + "language_loss": 0.8231287, + "learning_rate": 3.824877036566672e-06, + "loss": 0.84471524, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.671875, + "step": 2664, + "time_per_iteration": 3.961005926132202 + }, + { + "auxiliary_loss_clip": 0.01088999, + "auxiliary_loss_mlp": 0.01060717, + "balance_loss_clip": 1.01992345, + "balance_loss_mlp": 1.02447975, + "epoch": 0.1602284683601383, + "flos": 21172902134400.0, + "grad_norm": 1.6194735878548034, + "language_loss": 0.95316195, + "learning_rate": 3.824717628865561e-06, + "loss": 0.97465909, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.64453125, + "step": 2665, + "time_per_iteration": 2.4029390811920166 + }, + { + "auxiliary_loss_clip": 0.01092966, + "auxiliary_loss_mlp": 0.01066202, + "balance_loss_clip": 1.02359629, + "balance_loss_mlp": 1.0243938, + "epoch": 0.16028859161280626, + "flos": 14646692286720.0, + "grad_norm": 2.0840561325913423, + "language_loss": 0.86957467, + "learning_rate": 3.824558151970974e-06, + "loss": 0.89116639, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6875, + "step": 2666, + "time_per_iteration": 3.8050687313079834 + }, + { + "auxiliary_loss_clip": 0.0109248, + "auxiliary_loss_mlp": 0.01064362, + "balance_loss_clip": 1.0246172, + "balance_loss_mlp": 1.02415252, + "epoch": 0.16034871486547422, + "flos": 20989433606400.0, + "grad_norm": 1.895193303777103, + "language_loss": 0.83652955, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.85809797, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.68359375, + "step": 2667, + "time_per_iteration": 2.399412155151367 + }, + { + "auxiliary_loss_clip": 0.01091101, + "auxiliary_loss_mlp": 0.01071254, + "balance_loss_clip": 1.02700377, + "balance_loss_mlp": 1.02534235, + "epoch": 0.1604088381181422, + "flos": 21396660238080.0, + "grad_norm": 1.7256139119295297, + "language_loss": 0.75124538, + "learning_rate": 3.824238990625567e-06, + "loss": 0.77286899, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.65625, + "step": 2668, + "time_per_iteration": 3.9083492755889893 + }, + { + "auxiliary_loss_clip": 0.01090756, + "auxiliary_loss_mlp": 0.01063395, + "balance_loss_clip": 1.0208137, + "balance_loss_mlp": 1.02353835, + "epoch": 0.16046896137081015, + "flos": 23875947550080.0, + "grad_norm": 1.7063421584559342, + "language_loss": 0.79000604, + "learning_rate": 3.824079306186848e-06, + "loss": 0.81154758, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.671875, + "step": 2669, + "time_per_iteration": 3.7820682525634766 + }, + { + "auxiliary_loss_clip": 0.01036176, + "auxiliary_loss_mlp": 0.01065617, + "balance_loss_clip": 1.05741537, + "balance_loss_mlp": 1.01335704, + "epoch": 0.16052908462347812, + "flos": 59803842608640.0, + "grad_norm": 0.8401215530012947, + "language_loss": 0.55672359, + "learning_rate": 3.823919552578861e-06, + "loss": 0.5777415, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.08203125, + "router_z_loss_mlp": 0.22851562, + "step": 2670, + "time_per_iteration": 2.9293465614318848 + }, + { + "auxiliary_loss_clip": 0.01092176, + "auxiliary_loss_mlp": 0.01071234, + "balance_loss_clip": 1.02767467, + "balance_loss_mlp": 1.02424431, + "epoch": 0.1605892078761461, + "flos": 18295569878400.0, + "grad_norm": 2.928476199966725, + "language_loss": 0.8002286, + "learning_rate": 3.82375972980766e-06, + "loss": 0.8218627, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.6796875, + "step": 2671, + "time_per_iteration": 2.392563819885254 + }, + { + "auxiliary_loss_clip": 0.01093085, + "auxiliary_loss_mlp": 0.01077763, + "balance_loss_clip": 1.03539658, + "balance_loss_mlp": 1.02554178, + "epoch": 0.16064933112881408, + "flos": 32159370637440.0, + "grad_norm": 1.9396272889947388, + "language_loss": 0.67251039, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.69421887, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.67578125, + "step": 2672, + "time_per_iteration": 2.477816343307495 + }, + { + "auxiliary_loss_clip": 0.01091999, + "auxiliary_loss_mlp": 0.0108174, + "balance_loss_clip": 1.03324556, + "balance_loss_mlp": 1.02331531, + "epoch": 0.16070945438148204, + "flos": 19827768205440.0, + "grad_norm": 1.6955721135931157, + "language_loss": 0.88085216, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.90258956, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.6875, + "step": 2673, + "time_per_iteration": 2.378833055496216 + }, + { + "auxiliary_loss_clip": 0.01094984, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_clip": 1.04409325, + "balance_loss_mlp": 1.02678657, + "epoch": 0.16076957763415, + "flos": 18912240956160.0, + "grad_norm": 4.18110689604754, + "language_loss": 0.75048435, + "learning_rate": 3.823279846575403e-06, + "loss": 0.77232331, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6796875, + "step": 2674, + "time_per_iteration": 2.3886334896087646 + }, + { + "auxiliary_loss_clip": 0.01092921, + "auxiliary_loss_mlp": 0.0108617, + "balance_loss_clip": 1.04044092, + "balance_loss_mlp": 1.02506042, + "epoch": 0.16082970088681797, + "flos": 16763406462720.0, + "grad_norm": 1.5426172647324543, + "language_loss": 0.86076713, + "learning_rate": 3.823119747211986e-06, + "loss": 0.88255799, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6796875, + "step": 2675, + "time_per_iteration": 2.3749051094055176 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01083353, + "balance_loss_clip": 1.03888774, + "balance_loss_mlp": 1.02745247, + "epoch": 0.16088982413948594, + "flos": 35148878691840.0, + "grad_norm": 1.871075866949604, + "language_loss": 0.83937752, + "learning_rate": 3.822959578715685e-06, + "loss": 0.86118412, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.69921875, + "step": 2676, + "time_per_iteration": 2.500680446624756 + }, + { + "auxiliary_loss_clip": 0.01091704, + "auxiliary_loss_mlp": 0.01083272, + "balance_loss_clip": 1.04328895, + "balance_loss_mlp": 1.02494872, + "epoch": 0.1609499473921539, + "flos": 18624102572160.0, + "grad_norm": 1.7043712648772171, + "language_loss": 0.74623424, + "learning_rate": 3.822799341092573e-06, + "loss": 0.76798403, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6640625, + "step": 2677, + "time_per_iteration": 2.3728644847869873 + }, + { + "auxiliary_loss_clip": 0.01092592, + "auxiliary_loss_mlp": 0.01081683, + "balance_loss_clip": 1.0403173, + "balance_loss_mlp": 1.0246048, + "epoch": 0.1610100706448219, + "flos": 33144340314240.0, + "grad_norm": 1.8535376131159786, + "language_loss": 0.7829237, + "learning_rate": 3.822639034348728e-06, + "loss": 0.8046664, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6796875, + "step": 2678, + "time_per_iteration": 2.491743803024292 + }, + { + "auxiliary_loss_clip": 0.01096118, + "auxiliary_loss_mlp": 0.01073535, + "balance_loss_clip": 1.02599466, + "balance_loss_mlp": 1.02593994, + "epoch": 0.16107019389748986, + "flos": 34675316743680.0, + "grad_norm": 1.844125136316917, + "language_loss": 0.71860206, + "learning_rate": 3.822478658490228e-06, + "loss": 0.74029863, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.703125, + "step": 2679, + "time_per_iteration": 2.490894317626953 + }, + { + "auxiliary_loss_clip": 0.01033289, + "auxiliary_loss_mlp": 0.01017011, + "balance_loss_clip": 1.01066899, + "balance_loss_mlp": 1.00841951, + "epoch": 0.16113031715015783, + "flos": 65710483735680.0, + "grad_norm": 0.7798423525406406, + "language_loss": 0.51964855, + "learning_rate": 3.822318213523154e-06, + "loss": 0.54015154, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.06347656, + "router_z_loss_mlp": 0.24804688, + "step": 2680, + "time_per_iteration": 3.0766689777374268 + }, + { + "auxiliary_loss_clip": 0.01095016, + "auxiliary_loss_mlp": 0.01070524, + "balance_loss_clip": 1.01938295, + "balance_loss_mlp": 1.02400446, + "epoch": 0.1611904404028258, + "flos": 20809456214400.0, + "grad_norm": 1.6961277487523598, + "language_loss": 0.8174001, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.83905542, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.7109375, + "step": 2681, + "time_per_iteration": 2.40366792678833 + }, + { + "auxiliary_loss_clip": 0.01096598, + "auxiliary_loss_mlp": 0.01073435, + "balance_loss_clip": 1.02777815, + "balance_loss_mlp": 1.02774262, + "epoch": 0.16125056365549376, + "flos": 27012195515520.0, + "grad_norm": 1.76514982554947, + "language_loss": 0.71643353, + "learning_rate": 3.821997116287627e-06, + "loss": 0.73813379, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6875, + "step": 2682, + "time_per_iteration": 2.462615966796875 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01077098, + "balance_loss_clip": 1.02853251, + "balance_loss_mlp": 1.02876449, + "epoch": 0.16131068690816172, + "flos": 19275651964800.0, + "grad_norm": 1.7792556206152477, + "language_loss": 0.89227724, + "learning_rate": 3.821836464031348e-06, + "loss": 0.91403449, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.69921875, + "step": 2683, + "time_per_iteration": 2.4210846424102783 + }, + { + "auxiliary_loss_clip": 0.01095054, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_clip": 1.02569413, + "balance_loss_mlp": 1.02488112, + "epoch": 0.16137081016082971, + "flos": 35336396937600.0, + "grad_norm": 1.829267015230841, + "language_loss": 0.75662398, + "learning_rate": 3.821675742690849e-06, + "loss": 0.7783078, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.703125, + "step": 2684, + "time_per_iteration": 2.5550172328948975 + }, + { + "auxiliary_loss_clip": 0.01100929, + "auxiliary_loss_mlp": 0.01075257, + "balance_loss_clip": 1.02630973, + "balance_loss_mlp": 1.02815008, + "epoch": 0.16143093341349768, + "flos": 34233979847040.0, + "grad_norm": 1.7119750818041055, + "language_loss": 0.72189063, + "learning_rate": 3.821514952272223e-06, + "loss": 0.74365252, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.7265625, + "step": 2685, + "time_per_iteration": 2.4954161643981934 + }, + { + "auxiliary_loss_clip": 0.01094183, + "auxiliary_loss_mlp": 0.01077578, + "balance_loss_clip": 1.02970314, + "balance_loss_mlp": 1.02591825, + "epoch": 0.16149105666616564, + "flos": 27998072887680.0, + "grad_norm": 1.9662251542675646, + "language_loss": 0.73176676, + "learning_rate": 3.821354092781567e-06, + "loss": 0.75348437, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.68359375, + "step": 2686, + "time_per_iteration": 2.443716526031494 + }, + { + "auxiliary_loss_clip": 0.01096042, + "auxiliary_loss_mlp": 0.01076242, + "balance_loss_clip": 1.02603102, + "balance_loss_mlp": 1.02602363, + "epoch": 0.1615511799188336, + "flos": 19421344535040.0, + "grad_norm": 1.8048812145806639, + "language_loss": 0.83775449, + "learning_rate": 3.821193164224981e-06, + "loss": 0.85947728, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.69921875, + "step": 2687, + "time_per_iteration": 2.4030957221984863 + }, + { + "auxiliary_loss_clip": 0.01100241, + "auxiliary_loss_mlp": 0.01070098, + "balance_loss_clip": 1.02012563, + "balance_loss_mlp": 1.02686024, + "epoch": 0.16161130317150157, + "flos": 22853865231360.0, + "grad_norm": 1.594619486430567, + "language_loss": 0.73399842, + "learning_rate": 3.821032166608568e-06, + "loss": 0.75570178, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.734375, + "step": 2688, + "time_per_iteration": 2.4123518466949463 + }, + { + "auxiliary_loss_clip": 0.01095559, + "auxiliary_loss_mlp": 0.01084705, + "balance_loss_clip": 1.03733099, + "balance_loss_mlp": 1.02610958, + "epoch": 0.16167142642416954, + "flos": 26109201444480.0, + "grad_norm": 1.606633674882936, + "language_loss": 0.7699343, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.7917369, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6953125, + "step": 2689, + "time_per_iteration": 2.487297773361206 + }, + { + "auxiliary_loss_clip": 0.01097188, + "auxiliary_loss_mlp": 0.01077935, + "balance_loss_clip": 1.03299272, + "balance_loss_mlp": 1.02742696, + "epoch": 0.1617315496768375, + "flos": 22778662429440.0, + "grad_norm": 1.9566340475995379, + "language_loss": 0.89395744, + "learning_rate": 3.820709964220683e-06, + "loss": 0.91570866, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.69921875, + "step": 2690, + "time_per_iteration": 2.4246702194213867 + }, + { + "auxiliary_loss_clip": 0.01093319, + "auxiliary_loss_mlp": 0.01066033, + "balance_loss_clip": 1.02457178, + "balance_loss_mlp": 1.02544236, + "epoch": 0.1617916729295055, + "flos": 22016228958720.0, + "grad_norm": 2.0138945528375545, + "language_loss": 0.89286566, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.91445923, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6796875, + "step": 2691, + "time_per_iteration": 2.4003124237060547 + }, + { + "auxiliary_loss_clip": 0.01100069, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_clip": 1.02694786, + "balance_loss_mlp": 1.02610683, + "epoch": 0.16185179618217346, + "flos": 23437194094080.0, + "grad_norm": 2.0439464153757734, + "language_loss": 0.83840066, + "learning_rate": 3.820387485666784e-06, + "loss": 0.86019731, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.7421875, + "step": 2692, + "time_per_iteration": 2.4256579875946045 + }, + { + "auxiliary_loss_clip": 0.01098212, + "auxiliary_loss_mlp": 0.01074407, + "balance_loss_clip": 1.02209759, + "balance_loss_mlp": 1.02692103, + "epoch": 0.16191191943484143, + "flos": 25664931993600.0, + "grad_norm": 2.009129307953413, + "language_loss": 0.83523935, + "learning_rate": 3.820226142842862e-06, + "loss": 0.85696554, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.7109375, + "step": 2693, + "time_per_iteration": 2.425150156021118 + }, + { + "auxiliary_loss_clip": 0.01091878, + "auxiliary_loss_mlp": 0.01067944, + "balance_loss_clip": 1.02533901, + "balance_loss_mlp": 1.02436769, + "epoch": 0.1619720426875094, + "flos": 23476226860800.0, + "grad_norm": 1.6721366556593724, + "language_loss": 0.86105806, + "learning_rate": 3.820064730995783e-06, + "loss": 0.88265622, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.67578125, + "step": 2694, + "time_per_iteration": 2.4386985301971436 + }, + { + "auxiliary_loss_clip": 0.0109783, + "auxiliary_loss_mlp": 0.01068059, + "balance_loss_clip": 1.02314138, + "balance_loss_mlp": 1.02694428, + "epoch": 0.16203216594017736, + "flos": 24132524198400.0, + "grad_norm": 3.6482335622896427, + "language_loss": 0.70934826, + "learning_rate": 3.819903250131667e-06, + "loss": 0.73100716, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.7109375, + "step": 2695, + "time_per_iteration": 2.4154298305511475 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_clip": 1.02206576, + "balance_loss_mlp": 1.02723885, + "epoch": 0.16209228919284532, + "flos": 22339943884800.0, + "grad_norm": 1.9694572645686819, + "language_loss": 0.84639549, + "learning_rate": 3.819741700256637e-06, + "loss": 0.86803901, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.69140625, + "step": 2696, + "time_per_iteration": 2.398836851119995 + }, + { + "auxiliary_loss_clip": 0.01100517, + "auxiliary_loss_mlp": 0.01072032, + "balance_loss_clip": 1.02084398, + "balance_loss_mlp": 1.02690983, + "epoch": 0.1621524124455133, + "flos": 15814222796160.0, + "grad_norm": 2.2551522185021042, + "language_loss": 0.91134334, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.93306887, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.734375, + "step": 2697, + "time_per_iteration": 2.3839991092681885 + }, + { + "auxiliary_loss_clip": 0.01089268, + "auxiliary_loss_mlp": 0.01059873, + "balance_loss_clip": 1.02048683, + "balance_loss_mlp": 1.02399898, + "epoch": 0.16221253569818128, + "flos": 30185486300160.0, + "grad_norm": 1.7710440362427138, + "language_loss": 0.8216778, + "learning_rate": 3.819418393498343e-06, + "loss": 0.84316921, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.65234375, + "step": 2698, + "time_per_iteration": 2.4925849437713623 + }, + { + "auxiliary_loss_clip": 0.01093283, + "auxiliary_loss_mlp": 0.01067636, + "balance_loss_clip": 1.02674747, + "balance_loss_mlp": 1.02655768, + "epoch": 0.16227265895084925, + "flos": 24604899160320.0, + "grad_norm": 1.8181549553829663, + "language_loss": 0.78505689, + "learning_rate": 3.819256636627339e-06, + "loss": 0.80666608, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.66796875, + "step": 2699, + "time_per_iteration": 2.426504135131836 + }, + { + "auxiliary_loss_clip": 0.0109156, + "auxiliary_loss_mlp": 0.01058394, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.02600098, + "epoch": 0.1623327822035172, + "flos": 19572308720640.0, + "grad_norm": 1.8290886490304923, + "language_loss": 0.88572347, + "learning_rate": 3.81909481076994e-06, + "loss": 0.90722299, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.65625, + "step": 2700, + "time_per_iteration": 2.3980765342712402 + }, + { + "auxiliary_loss_clip": 0.01093135, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_clip": 1.01938093, + "balance_loss_mlp": 1.02603936, + "epoch": 0.16239290545618518, + "flos": 26467271015040.0, + "grad_norm": 1.8836592541042572, + "language_loss": 0.81904012, + "learning_rate": 3.818932915932284e-06, + "loss": 0.84065002, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.671875, + "step": 2701, + "time_per_iteration": 2.4483413696289062 + }, + { + "auxiliary_loss_clip": 0.0109367, + "auxiliary_loss_mlp": 0.01064515, + "balance_loss_clip": 1.02353108, + "balance_loss_mlp": 1.0270319, + "epoch": 0.16245302870885314, + "flos": 15851021235840.0, + "grad_norm": 2.0513919141649675, + "language_loss": 0.75530303, + "learning_rate": 3.818770952120511e-06, + "loss": 0.77688491, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.6640625, + "step": 2702, + "time_per_iteration": 2.3906679153442383 + }, + { + "auxiliary_loss_clip": 0.01091956, + "auxiliary_loss_mlp": 0.01065225, + "balance_loss_clip": 1.02192795, + "balance_loss_mlp": 1.02434945, + "epoch": 0.1625131519615211, + "flos": 14755656240000.0, + "grad_norm": 2.094674775972685, + "language_loss": 0.74669278, + "learning_rate": 3.81860891934076e-06, + "loss": 0.76826453, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.67578125, + "step": 2703, + "time_per_iteration": 2.367050886154175 + }, + { + "auxiliary_loss_clip": 0.0109315, + "auxiliary_loss_mlp": 0.01064639, + "balance_loss_clip": 1.02243876, + "balance_loss_mlp": 1.02501714, + "epoch": 0.1625732752141891, + "flos": 28219247550720.0, + "grad_norm": 1.8282801240020046, + "language_loss": 0.72640473, + "learning_rate": 3.818446817599176e-06, + "loss": 0.74798262, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6796875, + "step": 2704, + "time_per_iteration": 4.011623859405518 + }, + { + "auxiliary_loss_clip": 0.01047862, + "auxiliary_loss_mlp": 0.01016953, + "balance_loss_clip": 1.00746441, + "balance_loss_mlp": 1.01578522, + "epoch": 0.16263339846685707, + "flos": 67324727491200.0, + "grad_norm": 0.791335893067402, + "language_loss": 0.534006, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55465412, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.09472656, + "router_z_loss_mlp": 0.3203125, + "step": 2705, + "time_per_iteration": 3.0511341094970703 + }, + { + "auxiliary_loss_clip": 0.01094152, + "auxiliary_loss_mlp": 0.01074007, + "balance_loss_clip": 1.03240299, + "balance_loss_mlp": 1.02474213, + "epoch": 0.16269352171952503, + "flos": 14318299238400.0, + "grad_norm": 2.277006740979859, + "language_loss": 0.79052103, + "learning_rate": 3.818122407255102e-06, + "loss": 0.81220257, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.6953125, + "step": 2706, + "time_per_iteration": 3.846130132675171 + }, + { + "auxiliary_loss_clip": 0.01095976, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_clip": 1.0270884, + "balance_loss_mlp": 1.02802014, + "epoch": 0.162753644972193, + "flos": 28360087441920.0, + "grad_norm": 2.296022043326329, + "language_loss": 0.75572413, + "learning_rate": 3.817960098664914e-06, + "loss": 0.77734888, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.6796875, + "step": 2707, + "time_per_iteration": 2.4815258979797363 + }, + { + "auxiliary_loss_clip": 0.01093241, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.02884722, + "balance_loss_mlp": 1.02569687, + "epoch": 0.16281376822486096, + "flos": 19936836892800.0, + "grad_norm": 2.6570295027676174, + "language_loss": 0.8553896, + "learning_rate": 3.817797721137495e-06, + "loss": 0.87703842, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.67578125, + "step": 2708, + "time_per_iteration": 3.861713409423828 + }, + { + "auxiliary_loss_clip": 0.01096238, + "auxiliary_loss_mlp": 0.01074846, + "balance_loss_clip": 1.02909303, + "balance_loss_mlp": 1.0269208, + "epoch": 0.16287389147752893, + "flos": 21250653465600.0, + "grad_norm": 2.1182491484523673, + "language_loss": 0.88173187, + "learning_rate": 3.817635274679006e-06, + "loss": 0.90344274, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6953125, + "step": 2709, + "time_per_iteration": 3.882869005203247 + }, + { + "auxiliary_loss_clip": 0.01091914, + "auxiliary_loss_mlp": 0.01070059, + "balance_loss_clip": 1.02838314, + "balance_loss_mlp": 1.02477336, + "epoch": 0.1629340147301969, + "flos": 19243671292800.0, + "grad_norm": 1.592941797544818, + "language_loss": 0.92725307, + "learning_rate": 3.817472759295605e-06, + "loss": 0.9488728, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.671875, + "step": 2710, + "time_per_iteration": 2.444967746734619 + }, + { + "auxiliary_loss_clip": 0.01094023, + "auxiliary_loss_mlp": 0.01069983, + "balance_loss_clip": 1.02966619, + "balance_loss_mlp": 1.0276798, + "epoch": 0.16299413798286488, + "flos": 21248803163520.0, + "grad_norm": 2.016995762090991, + "language_loss": 0.836146, + "learning_rate": 3.817310174993453e-06, + "loss": 0.85778606, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.6640625, + "step": 2711, + "time_per_iteration": 2.3803019523620605 + }, + { + "auxiliary_loss_clip": 0.01095846, + "auxiliary_loss_mlp": 0.01063663, + "balance_loss_clip": 1.01960301, + "balance_loss_mlp": 1.02494705, + "epoch": 0.16305426123553285, + "flos": 18769585674240.0, + "grad_norm": 2.089185651215556, + "language_loss": 0.82852042, + "learning_rate": 3.817147521778719e-06, + "loss": 0.85011548, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.7109375, + "step": 2712, + "time_per_iteration": 2.3931591510772705 + }, + { + "auxiliary_loss_clip": 0.01093462, + "auxiliary_loss_mlp": 0.01070878, + "balance_loss_clip": 1.02419567, + "balance_loss_mlp": 1.02365398, + "epoch": 0.16311438448820081, + "flos": 22086648904320.0, + "grad_norm": 2.047178917649759, + "language_loss": 0.7931484, + "learning_rate": 3.816984799657568e-06, + "loss": 0.8147918, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.6953125, + "step": 2713, + "time_per_iteration": 2.424274444580078 + }, + { + "auxiliary_loss_clip": 0.01092187, + "auxiliary_loss_mlp": 0.01067558, + "balance_loss_clip": 1.02867174, + "balance_loss_mlp": 1.02757418, + "epoch": 0.16317450774086878, + "flos": 16466889352320.0, + "grad_norm": 3.2163196711627666, + "language_loss": 0.8069948, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.82859224, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.64453125, + "step": 2714, + "time_per_iteration": 2.376124858856201 + }, + { + "auxiliary_loss_clip": 0.01093781, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.02166629, + "balance_loss_mlp": 1.02637553, + "epoch": 0.16323463099353674, + "flos": 24351778736640.0, + "grad_norm": 1.5659686658865972, + "language_loss": 0.80030835, + "learning_rate": 3.816659148720702e-06, + "loss": 0.82184476, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.67578125, + "step": 2715, + "time_per_iteration": 2.443765163421631 + }, + { + "auxiliary_loss_clip": 0.0109333, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.02277958, + "balance_loss_mlp": 1.02614319, + "epoch": 0.1632947542462047, + "flos": 24899600880000.0, + "grad_norm": 1.9264488581064292, + "language_loss": 0.83345109, + "learning_rate": 3.816496219917336e-06, + "loss": 0.85501873, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.671875, + "step": 2716, + "time_per_iteration": 2.435058832168579 + }, + { + "auxiliary_loss_clip": 0.01095454, + "auxiliary_loss_mlp": 0.01072794, + "balance_loss_clip": 1.0284481, + "balance_loss_mlp": 1.02735472, + "epoch": 0.1633548774988727, + "flos": 24899112120960.0, + "grad_norm": 1.9990177246392262, + "language_loss": 0.87963068, + "learning_rate": 3.816333222232251e-06, + "loss": 0.90131319, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.6796875, + "step": 2717, + "time_per_iteration": 2.4388387203216553 + }, + { + "auxiliary_loss_clip": 0.01092309, + "auxiliary_loss_mlp": 0.01065505, + "balance_loss_clip": 1.02402067, + "balance_loss_mlp": 1.02613974, + "epoch": 0.16341500075154067, + "flos": 30440596671360.0, + "grad_norm": 1.6857768496793424, + "language_loss": 0.79198837, + "learning_rate": 3.816170155671629e-06, + "loss": 0.81356645, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6640625, + "step": 2718, + "time_per_iteration": 2.487579822540283 + }, + { + "auxiliary_loss_clip": 0.01095227, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.02410674, + "balance_loss_mlp": 1.02628195, + "epoch": 0.16347512400420863, + "flos": 22783410374400.0, + "grad_norm": 1.9430913493701927, + "language_loss": 0.76186013, + "learning_rate": 3.816007020241652e-06, + "loss": 0.78347719, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.6875, + "step": 2719, + "time_per_iteration": 2.4324374198913574 + }, + { + "auxiliary_loss_clip": 0.01093016, + "auxiliary_loss_mlp": 0.01066867, + "balance_loss_clip": 1.02459526, + "balance_loss_mlp": 1.02502549, + "epoch": 0.1635352472568766, + "flos": 22632306543360.0, + "grad_norm": 1.9287140311994697, + "language_loss": 0.7308557, + "learning_rate": 3.815843815948507e-06, + "loss": 0.75245452, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6796875, + "step": 2720, + "time_per_iteration": 2.4097049236297607 + }, + { + "auxiliary_loss_clip": 0.01091363, + "auxiliary_loss_mlp": 0.01056681, + "balance_loss_clip": 1.01598346, + "balance_loss_mlp": 1.02644634, + "epoch": 0.16359537050954456, + "flos": 15522104517120.0, + "grad_norm": 1.8715118993343218, + "language_loss": 0.77581632, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.79729676, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6484375, + "step": 2721, + "time_per_iteration": 2.399419069290161 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01064944, + "balance_loss_clip": 1.01888156, + "balance_loss_mlp": 1.02543783, + "epoch": 0.16365549376221253, + "flos": 22089092699520.0, + "grad_norm": 1.8081238244143578, + "language_loss": 0.81009877, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.8317107, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7109375, + "step": 2722, + "time_per_iteration": 2.4425203800201416 + }, + { + "auxiliary_loss_clip": 0.01097422, + "auxiliary_loss_mlp": 0.01078767, + "balance_loss_clip": 1.02612448, + "balance_loss_mlp": 1.02561784, + "epoch": 0.1637156170148805, + "flos": 24059276432640.0, + "grad_norm": 2.2473288268122626, + "language_loss": 0.86193991, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.8837018, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.71875, + "step": 2723, + "time_per_iteration": 2.4318931102752686 + }, + { + "auxiliary_loss_clip": 0.01092102, + "auxiliary_loss_mlp": 0.01061835, + "balance_loss_clip": 1.01968253, + "balance_loss_mlp": 1.02643514, + "epoch": 0.1637757402675485, + "flos": 26684221403520.0, + "grad_norm": 1.7797690807441746, + "language_loss": 0.72865003, + "learning_rate": 3.815190310268058e-06, + "loss": 0.75018942, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.65625, + "step": 2724, + "time_per_iteration": 2.455733299255371 + }, + { + "auxiliary_loss_clip": 0.01091655, + "auxiliary_loss_mlp": 0.01065202, + "balance_loss_clip": 1.02316856, + "balance_loss_mlp": 1.02520752, + "epoch": 0.16383586352021645, + "flos": 16106026872960.0, + "grad_norm": 1.9212610056570587, + "language_loss": 0.72683203, + "learning_rate": 3.815026761751955e-06, + "loss": 0.74840057, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6640625, + "step": 2725, + "time_per_iteration": 2.394416093826294 + }, + { + "auxiliary_loss_clip": 0.01090978, + "auxiliary_loss_mlp": 0.01062947, + "balance_loss_clip": 1.0201031, + "balance_loss_mlp": 1.02531576, + "epoch": 0.16389598677288442, + "flos": 19165151911680.0, + "grad_norm": 1.8530783611217188, + "language_loss": 0.89870691, + "learning_rate": 3.814863144409855e-06, + "loss": 0.92024612, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.65625, + "step": 2726, + "time_per_iteration": 2.3902058601379395 + }, + { + "auxiliary_loss_clip": 0.01098999, + "auxiliary_loss_mlp": 0.01069757, + "balance_loss_clip": 1.02653182, + "balance_loss_mlp": 1.02823138, + "epoch": 0.16395611002555238, + "flos": 21505938393600.0, + "grad_norm": 2.0577769687017278, + "language_loss": 0.75856721, + "learning_rate": 3.814699458247963e-06, + "loss": 0.78025484, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.70703125, + "step": 2727, + "time_per_iteration": 2.404475450515747 + }, + { + "auxiliary_loss_clip": 0.01091338, + "auxiliary_loss_mlp": 0.01066496, + "balance_loss_clip": 1.02570248, + "balance_loss_mlp": 1.0252198, + "epoch": 0.16401623327822035, + "flos": 21469838181120.0, + "grad_norm": 1.5698083685685307, + "language_loss": 0.84929699, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.8708753, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.66015625, + "step": 2728, + "time_per_iteration": 2.4072928428649902 + }, + { + "auxiliary_loss_clip": 0.01097676, + "auxiliary_loss_mlp": 0.01071508, + "balance_loss_clip": 1.02594614, + "balance_loss_mlp": 1.02689338, + "epoch": 0.1640763565308883, + "flos": 13625378017920.0, + "grad_norm": 2.292422380052449, + "language_loss": 0.87333423, + "learning_rate": 3.814371879489633e-06, + "loss": 0.89502603, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.70703125, + "step": 2729, + "time_per_iteration": 2.381352186203003 + }, + { + "auxiliary_loss_clip": 0.0109542, + "auxiliary_loss_mlp": 0.01078385, + "balance_loss_clip": 1.03244233, + "balance_loss_mlp": 1.02495241, + "epoch": 0.16413647978355628, + "flos": 15450532496640.0, + "grad_norm": 1.9198768415741962, + "language_loss": 0.75433809, + "learning_rate": 3.814207986905616e-06, + "loss": 0.7760762, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.703125, + "step": 2730, + "time_per_iteration": 2.367783784866333 + }, + { + "auxiliary_loss_clip": 0.01099118, + "auxiliary_loss_mlp": 0.0107143, + "balance_loss_clip": 1.02362716, + "balance_loss_mlp": 1.02615309, + "epoch": 0.16419660303622427, + "flos": 45876955155840.0, + "grad_norm": 1.5729551036239653, + "language_loss": 0.76675683, + "learning_rate": 3.814044025526651e-06, + "loss": 0.78846228, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.73046875, + "step": 2731, + "time_per_iteration": 2.640993118286133 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01072276, + "balance_loss_clip": 1.02430594, + "balance_loss_mlp": 1.02763271, + "epoch": 0.16425672628889224, + "flos": 18951832304640.0, + "grad_norm": 2.259870318660352, + "language_loss": 0.8128016, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.8345226, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.72265625, + "step": 2732, + "time_per_iteration": 2.3673460483551025 + }, + { + "auxiliary_loss_clip": 0.01095517, + "auxiliary_loss_mlp": 0.01076865, + "balance_loss_clip": 1.02748895, + "balance_loss_mlp": 1.02508545, + "epoch": 0.1643168495415602, + "flos": 24311943008640.0, + "grad_norm": 2.2321730246807596, + "language_loss": 0.70915186, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.73087573, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.703125, + "step": 2733, + "time_per_iteration": 2.4428422451019287 + }, + { + "auxiliary_loss_clip": 0.01095502, + "auxiliary_loss_mlp": 0.01066617, + "balance_loss_clip": 1.0217464, + "balance_loss_mlp": 1.02665818, + "epoch": 0.16437697279422817, + "flos": 26427330552960.0, + "grad_norm": 1.8790922844191156, + "language_loss": 0.82300675, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.84462798, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6875, + "step": 2734, + "time_per_iteration": 2.4976723194122314 + }, + { + "auxiliary_loss_clip": 0.01094099, + "auxiliary_loss_mlp": 0.01065247, + "balance_loss_clip": 1.01951873, + "balance_loss_mlp": 1.02500129, + "epoch": 0.16443709604689613, + "flos": 34530811159680.0, + "grad_norm": 2.1005593638275877, + "language_loss": 0.85012686, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.87172031, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.69140625, + "step": 2735, + "time_per_iteration": 2.5512137413024902 + }, + { + "auxiliary_loss_clip": 0.01090602, + "auxiliary_loss_mlp": 0.01062742, + "balance_loss_clip": 1.02049398, + "balance_loss_mlp": 1.02420712, + "epoch": 0.1644972192995641, + "flos": 23256937411200.0, + "grad_norm": 2.3911799844136263, + "language_loss": 0.80560231, + "learning_rate": 3.813223186925296e-06, + "loss": 0.82713568, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6640625, + "step": 2736, + "time_per_iteration": 2.391968011856079 + }, + { + "auxiliary_loss_clip": 0.01098731, + "auxiliary_loss_mlp": 0.01069731, + "balance_loss_clip": 1.02598119, + "balance_loss_mlp": 1.02872121, + "epoch": 0.1645573425522321, + "flos": 26978329630080.0, + "grad_norm": 1.9642517434762736, + "language_loss": 0.82283413, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.84451872, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.69921875, + "step": 2737, + "time_per_iteration": 2.4580259323120117 + }, + { + "auxiliary_loss_clip": 0.01092928, + "auxiliary_loss_mlp": 0.01061738, + "balance_loss_clip": 1.01939499, + "balance_loss_mlp": 1.02387822, + "epoch": 0.16461746580490005, + "flos": 28730480722560.0, + "grad_norm": 1.768092522224841, + "language_loss": 0.89176226, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.91330886, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.6875, + "step": 2738, + "time_per_iteration": 2.4888288974761963 + }, + { + "auxiliary_loss_clip": 0.01093143, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_clip": 1.0315026, + "balance_loss_mlp": 1.02412128, + "epoch": 0.16467758905756802, + "flos": 24929172668160.0, + "grad_norm": 1.6955088842711679, + "language_loss": 0.73338234, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.75508618, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6875, + "step": 2739, + "time_per_iteration": 2.447483539581299 + }, + { + "auxiliary_loss_clip": 0.01093085, + "auxiliary_loss_mlp": 0.01070486, + "balance_loss_clip": 1.02575827, + "balance_loss_mlp": 1.02491307, + "epoch": 0.16473771231023598, + "flos": 24825375596160.0, + "grad_norm": 13.003842748907921, + "language_loss": 0.82771897, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.84935462, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.68359375, + "step": 2740, + "time_per_iteration": 2.4232521057128906 + }, + { + "auxiliary_loss_clip": 0.01096083, + "auxiliary_loss_mlp": 0.01074049, + "balance_loss_clip": 1.02741444, + "balance_loss_mlp": 1.02525282, + "epoch": 0.16479783556290395, + "flos": 39894482822400.0, + "grad_norm": 1.9566560366360048, + "language_loss": 0.71405935, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.73576069, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.70703125, + "step": 2741, + "time_per_iteration": 2.5660853385925293 + }, + { + "auxiliary_loss_clip": 0.01092452, + "auxiliary_loss_mlp": 0.01061293, + "balance_loss_clip": 1.01725733, + "balance_loss_mlp": 1.02567506, + "epoch": 0.16485795881557191, + "flos": 19896163292160.0, + "grad_norm": 1.7787717677893216, + "language_loss": 0.81296003, + "learning_rate": 3.812235911671472e-06, + "loss": 0.83449745, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.66796875, + "step": 2742, + "time_per_iteration": 2.379420518875122 + }, + { + "auxiliary_loss_clip": 0.01093336, + "auxiliary_loss_mlp": 0.01068493, + "balance_loss_clip": 1.02591193, + "balance_loss_mlp": 1.02672958, + "epoch": 0.16491808206823988, + "flos": 20555148804480.0, + "grad_norm": 1.7240146547881183, + "language_loss": 0.86191559, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.88353384, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6640625, + "step": 2743, + "time_per_iteration": 2.4186954498291016 + }, + { + "auxiliary_loss_clip": 0.01089318, + "auxiliary_loss_mlp": 0.01064581, + "balance_loss_clip": 1.02204752, + "balance_loss_mlp": 1.02435291, + "epoch": 0.16497820532090787, + "flos": 23799802141440.0, + "grad_norm": 1.5270042900874738, + "language_loss": 0.87049669, + "learning_rate": 3.811906270092265e-06, + "loss": 0.89203572, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6484375, + "step": 2744, + "time_per_iteration": 3.916055917739868 + }, + { + "auxiliary_loss_clip": 0.01087792, + "auxiliary_loss_mlp": 0.01065176, + "balance_loss_clip": 1.02223635, + "balance_loss_mlp": 1.02456534, + "epoch": 0.16503832857357584, + "flos": 25481498376960.0, + "grad_norm": 2.3486178426489257, + "language_loss": 0.83970082, + "learning_rate": 3.811741346238036e-06, + "loss": 0.86123049, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6328125, + "step": 2745, + "time_per_iteration": 2.4591195583343506 + }, + { + "auxiliary_loss_clip": 0.01095044, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_clip": 1.02546191, + "balance_loss_mlp": 1.02557325, + "epoch": 0.1650984518262438, + "flos": 17675093462400.0, + "grad_norm": 1.9075349393238235, + "language_loss": 0.78004473, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.80169493, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6953125, + "step": 2746, + "time_per_iteration": 3.819058895111084 + }, + { + "auxiliary_loss_clip": 0.01090119, + "auxiliary_loss_mlp": 0.01071521, + "balance_loss_clip": 1.02550578, + "balance_loss_mlp": 1.0242486, + "epoch": 0.16515857507891177, + "flos": 18697315426560.0, + "grad_norm": 1.5162653944322744, + "language_loss": 0.81500322, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83661962, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.66015625, + "step": 2747, + "time_per_iteration": 3.82015323638916 + }, + { + "auxiliary_loss_clip": 0.0109425, + "auxiliary_loss_mlp": 0.01064368, + "balance_loss_clip": 1.01942587, + "balance_loss_mlp": 1.02555263, + "epoch": 0.16521869833157973, + "flos": 15009649447680.0, + "grad_norm": 2.174902745316553, + "language_loss": 0.71604735, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.73763359, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6875, + "step": 2748, + "time_per_iteration": 3.751910924911499 + }, + { + "auxiliary_loss_clip": 0.0109039, + "auxiliary_loss_mlp": 0.01068612, + "balance_loss_clip": 1.02469552, + "balance_loss_mlp": 1.02422309, + "epoch": 0.1652788215842477, + "flos": 22120235498880.0, + "grad_norm": 2.0742631307424975, + "language_loss": 0.90456176, + "learning_rate": 3.811080963869561e-06, + "loss": 0.92615175, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.66015625, + "step": 2749, + "time_per_iteration": 2.408053398132324 + }, + { + "auxiliary_loss_clip": 0.01091405, + "auxiliary_loss_mlp": 0.01059867, + "balance_loss_clip": 1.01621294, + "balance_loss_mlp": 1.02384591, + "epoch": 0.16533894483691566, + "flos": 18332089027200.0, + "grad_norm": 2.081618946180685, + "language_loss": 0.80768466, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.82919741, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.67578125, + "step": 2750, + "time_per_iteration": 2.401859998703003 + }, + { + "auxiliary_loss_clip": 0.01089643, + "auxiliary_loss_mlp": 0.01066797, + "balance_loss_clip": 1.02385771, + "balance_loss_mlp": 1.02401328, + "epoch": 0.16539906808958366, + "flos": 22381036421760.0, + "grad_norm": 3.383606732749116, + "language_loss": 0.96704721, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.98861158, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.65625, + "step": 2751, + "time_per_iteration": 2.4410171508789062 + }, + { + "auxiliary_loss_clip": 0.01090882, + "auxiliary_loss_mlp": 0.01063173, + "balance_loss_clip": 1.02054405, + "balance_loss_mlp": 1.02675104, + "epoch": 0.16545919134225162, + "flos": 22709988051840.0, + "grad_norm": 1.8899083623984048, + "language_loss": 0.72576171, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.74730229, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.640625, + "step": 2752, + "time_per_iteration": 2.4002676010131836 + }, + { + "auxiliary_loss_clip": 0.01028962, + "auxiliary_loss_mlp": 0.01017949, + "balance_loss_clip": 1.01036727, + "balance_loss_mlp": 1.00957465, + "epoch": 0.1655193145949196, + "flos": 67799720805120.0, + "grad_norm": 0.7712634080014775, + "language_loss": 0.5423367, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56280577, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.07568359, + "router_z_loss_mlp": 0.19335938, + "step": 2753, + "time_per_iteration": 3.1309733390808105 + }, + { + "auxiliary_loss_clip": 0.01088824, + "auxiliary_loss_mlp": 0.01063184, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.02385747, + "epoch": 0.16557943784758755, + "flos": 24279229198080.0, + "grad_norm": 1.6258996643962718, + "language_loss": 0.77334481, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.79486489, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6484375, + "step": 2754, + "time_per_iteration": 2.4204041957855225 + }, + { + "auxiliary_loss_clip": 0.01100078, + "auxiliary_loss_mlp": 0.01077234, + "balance_loss_clip": 1.02924025, + "balance_loss_mlp": 1.02935445, + "epoch": 0.16563956110025552, + "flos": 20082599285760.0, + "grad_norm": 2.1398829892761286, + "language_loss": 0.89349544, + "learning_rate": 3.810088330151188e-06, + "loss": 0.91526854, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.70703125, + "step": 2755, + "time_per_iteration": 2.390040636062622 + }, + { + "auxiliary_loss_clip": 0.01091965, + "auxiliary_loss_mlp": 0.01070943, + "balance_loss_clip": 1.02671599, + "balance_loss_mlp": 1.02611148, + "epoch": 0.16569968435292348, + "flos": 28033300316160.0, + "grad_norm": 1.8221852890069372, + "language_loss": 0.74945003, + "learning_rate": 3.80992265092595e-06, + "loss": 0.77107906, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.66015625, + "step": 2756, + "time_per_iteration": 2.4387753009796143 + }, + { + "auxiliary_loss_clip": 0.01090842, + "auxiliary_loss_mlp": 0.01060902, + "balance_loss_clip": 1.02034688, + "balance_loss_mlp": 1.02738094, + "epoch": 0.16575980760559147, + "flos": 26249028906240.0, + "grad_norm": 1.5835441168216, + "language_loss": 0.76725578, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.78877318, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.63671875, + "step": 2757, + "time_per_iteration": 2.481924533843994 + }, + { + "auxiliary_loss_clip": 0.0109289, + "auxiliary_loss_mlp": 0.01062927, + "balance_loss_clip": 1.02203822, + "balance_loss_mlp": 1.02745128, + "epoch": 0.16581993085825944, + "flos": 26942718176640.0, + "grad_norm": 2.582196966652395, + "language_loss": 0.86347389, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.88503206, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.65234375, + "step": 2758, + "time_per_iteration": 2.4430432319641113 + }, + { + "auxiliary_loss_clip": 0.01093548, + "auxiliary_loss_mlp": 0.01066322, + "balance_loss_clip": 1.02641082, + "balance_loss_mlp": 1.02721334, + "epoch": 0.1658800541109274, + "flos": 21652538659200.0, + "grad_norm": 2.3870797543162143, + "language_loss": 0.81196606, + "learning_rate": 3.809425201480689e-06, + "loss": 0.83356476, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6640625, + "step": 2759, + "time_per_iteration": 2.4006094932556152 + }, + { + "auxiliary_loss_clip": 0.01090035, + "auxiliary_loss_mlp": 0.01068631, + "balance_loss_clip": 1.02404726, + "balance_loss_mlp": 1.02448022, + "epoch": 0.16594017736359537, + "flos": 16434559566720.0, + "grad_norm": 2.1172870503428505, + "language_loss": 0.77690071, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.79848742, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.65625, + "step": 2760, + "time_per_iteration": 2.4081296920776367 + }, + { + "auxiliary_loss_clip": 0.01096653, + "auxiliary_loss_mlp": 0.01069722, + "balance_loss_clip": 1.02508974, + "balance_loss_mlp": 1.0270468, + "epoch": 0.16600030061626334, + "flos": 22636216615680.0, + "grad_norm": 1.8200067825628155, + "language_loss": 0.75873214, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.78039593, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6953125, + "step": 2761, + "time_per_iteration": 2.4225738048553467 + }, + { + "auxiliary_loss_clip": 0.01091461, + "auxiliary_loss_mlp": 0.0106588, + "balance_loss_clip": 1.02384698, + "balance_loss_mlp": 1.02562535, + "epoch": 0.1660604238689313, + "flos": 26395349880960.0, + "grad_norm": 1.885966839764211, + "language_loss": 0.90257275, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.92414618, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.66015625, + "step": 2762, + "time_per_iteration": 2.450077772140503 + }, + { + "auxiliary_loss_clip": 0.01093222, + "auxiliary_loss_mlp": 0.01069601, + "balance_loss_clip": 1.02778196, + "balance_loss_mlp": 1.02628696, + "epoch": 0.16612054712159927, + "flos": 23038869859200.0, + "grad_norm": 1.7725303276699687, + "language_loss": 0.9017812, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.92340946, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.671875, + "step": 2763, + "time_per_iteration": 2.4360148906707764 + }, + { + "auxiliary_loss_clip": 0.01026298, + "auxiliary_loss_mlp": 0.01011992, + "balance_loss_clip": 1.00069058, + "balance_loss_mlp": 1.0072217, + "epoch": 0.16618067037426726, + "flos": 59237864691840.0, + "grad_norm": 0.7810601386425962, + "language_loss": 0.59933674, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61971962, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.19140625, + "step": 2764, + "time_per_iteration": 3.0757365226745605 + }, + { + "auxiliary_loss_clip": 0.01093638, + "auxiliary_loss_mlp": 0.01071255, + "balance_loss_clip": 1.02400064, + "balance_loss_mlp": 1.02620268, + "epoch": 0.16624079362693522, + "flos": 27197584168320.0, + "grad_norm": 1.7044008716848251, + "language_loss": 0.83426261, + "learning_rate": 3.808428450193401e-06, + "loss": 0.85591155, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.671875, + "step": 2765, + "time_per_iteration": 2.4910948276519775 + }, + { + "auxiliary_loss_clip": 0.0109763, + "auxiliary_loss_mlp": 0.01067257, + "balance_loss_clip": 1.0206219, + "balance_loss_mlp": 1.02670562, + "epoch": 0.1663009168796032, + "flos": 10924322549760.0, + "grad_norm": 2.477884376384354, + "language_loss": 0.71547925, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.73712802, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.7109375, + "step": 2766, + "time_per_iteration": 2.3629746437072754 + }, + { + "auxiliary_loss_clip": 0.01089683, + "auxiliary_loss_mlp": 0.01062297, + "balance_loss_clip": 1.01864231, + "balance_loss_mlp": 1.02584291, + "epoch": 0.16636104013227115, + "flos": 17893475216640.0, + "grad_norm": 2.792617140085992, + "language_loss": 0.90197724, + "learning_rate": 3.808095651090769e-06, + "loss": 0.92349696, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.63671875, + "step": 2767, + "time_per_iteration": 2.393547296524048 + }, + { + "auxiliary_loss_clip": 0.01025557, + "auxiliary_loss_mlp": 0.01017246, + "balance_loss_clip": 1.00880623, + "balance_loss_mlp": 1.00772238, + "epoch": 0.16642116338493912, + "flos": 66722335159680.0, + "grad_norm": 0.6532341541765665, + "language_loss": 0.52998656, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.55041462, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.08447266, + "router_z_loss_mlp": 0.17773438, + "step": 2768, + "time_per_iteration": 3.1636197566986084 + }, + { + "auxiliary_loss_clip": 0.01093826, + "auxiliary_loss_mlp": 0.01067203, + "balance_loss_clip": 1.02187955, + "balance_loss_mlp": 1.02571464, + "epoch": 0.16648128663760708, + "flos": 19025045159040.0, + "grad_norm": 2.2753863533059158, + "language_loss": 0.88566965, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.90727997, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.6796875, + "step": 2769, + "time_per_iteration": 2.3921074867248535 + }, + { + "auxiliary_loss_clip": 0.01023739, + "auxiliary_loss_mlp": 0.01012871, + "balance_loss_clip": 1.00462151, + "balance_loss_mlp": 1.00586343, + "epoch": 0.16654140989027508, + "flos": 70131744535680.0, + "grad_norm": 0.814591491707449, + "language_loss": 0.57643324, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59679937, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.08251953, + "router_z_loss_mlp": 0.17871094, + "step": 2770, + "time_per_iteration": 2.901458501815796 + }, + { + "auxiliary_loss_clip": 0.01021962, + "auxiliary_loss_mlp": 0.01013024, + "balance_loss_clip": 1.00506115, + "balance_loss_mlp": 1.0043304, + "epoch": 0.16660153314294304, + "flos": 70269407493120.0, + "grad_norm": 0.8685549315047858, + "language_loss": 0.56272244, + "learning_rate": 3.807429230178015e-06, + "loss": 0.5830723, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.07958984, + "router_z_loss_mlp": 0.17578125, + "step": 2771, + "time_per_iteration": 2.896737575531006 + }, + { + "auxiliary_loss_clip": 0.01091314, + "auxiliary_loss_mlp": 0.01075012, + "balance_loss_clip": 1.02704263, + "balance_loss_mlp": 1.02591109, + "epoch": 0.166661656395611, + "flos": 23073957642240.0, + "grad_norm": 2.0332154926987855, + "language_loss": 0.72690505, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.7485683, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.65625, + "step": 2772, + "time_per_iteration": 2.4210855960845947 + }, + { + "auxiliary_loss_clip": 0.01089599, + "auxiliary_loss_mlp": 0.01076538, + "balance_loss_clip": 1.03080952, + "balance_loss_mlp": 1.02465582, + "epoch": 0.16672177964827897, + "flos": 28365079766400.0, + "grad_norm": 2.029207375792823, + "language_loss": 0.8746177, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8962791, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6484375, + "step": 2773, + "time_per_iteration": 2.439413070678711 + }, + { + "auxiliary_loss_clip": 0.01091207, + "auxiliary_loss_mlp": 0.01059769, + "balance_loss_clip": 1.0193094, + "balance_loss_mlp": 1.02734828, + "epoch": 0.16678190290094694, + "flos": 19090228400640.0, + "grad_norm": 2.3064932494663783, + "language_loss": 0.83679879, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.85830855, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.63671875, + "step": 2774, + "time_per_iteration": 2.4070403575897217 + }, + { + "auxiliary_loss_clip": 0.01094846, + "auxiliary_loss_mlp": 0.01060216, + "balance_loss_clip": 1.01830184, + "balance_loss_mlp": 1.02864218, + "epoch": 0.1668420261536149, + "flos": 21798021761280.0, + "grad_norm": 2.0684483057720993, + "language_loss": 0.85034013, + "learning_rate": 3.806761712658952e-06, + "loss": 0.87189078, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6640625, + "step": 2775, + "time_per_iteration": 2.40207576751709 + }, + { + "auxiliary_loss_clip": 0.01091657, + "auxiliary_loss_mlp": 0.01063227, + "balance_loss_clip": 1.02155209, + "balance_loss_mlp": 1.02725029, + "epoch": 0.16690214940628287, + "flos": 19061529396480.0, + "grad_norm": 2.0821496837326947, + "language_loss": 0.82717186, + "learning_rate": 3.806594661981897e-06, + "loss": 0.84872073, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.64453125, + "step": 2776, + "time_per_iteration": 2.3937885761260986 + }, + { + "auxiliary_loss_clip": 0.01087707, + "auxiliary_loss_mlp": 0.01069525, + "balance_loss_clip": 1.02868366, + "balance_loss_mlp": 1.02592278, + "epoch": 0.16696227265895086, + "flos": 18587548512000.0, + "grad_norm": 2.136259099147055, + "language_loss": 0.8116473, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.83321959, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.6171875, + "step": 2777, + "time_per_iteration": 2.4002158641815186 + }, + { + "auxiliary_loss_clip": 0.0109262, + "auxiliary_loss_mlp": 0.01062301, + "balance_loss_clip": 1.01976669, + "balance_loss_mlp": 1.0271982, + "epoch": 0.16702239591161883, + "flos": 23293037623680.0, + "grad_norm": 3.3325808203982556, + "language_loss": 0.86160427, + "learning_rate": 3.806260355115371e-06, + "loss": 0.88315356, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.65625, + "step": 2778, + "time_per_iteration": 2.4349942207336426 + }, + { + "auxiliary_loss_clip": 0.01095508, + "auxiliary_loss_mlp": 0.01068136, + "balance_loss_clip": 1.02317047, + "balance_loss_mlp": 1.02934504, + "epoch": 0.1670825191642868, + "flos": 24424502832000.0, + "grad_norm": 2.04750349274943, + "language_loss": 0.75941694, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.78105336, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6640625, + "step": 2779, + "time_per_iteration": 2.468214750289917 + }, + { + "auxiliary_loss_clip": 0.0109291, + "auxiliary_loss_mlp": 0.01068536, + "balance_loss_clip": 1.02490544, + "balance_loss_mlp": 1.02726984, + "epoch": 0.16714264241695476, + "flos": 26796292467840.0, + "grad_norm": 2.7949213821781527, + "language_loss": 0.67962003, + "learning_rate": 3.805925774274554e-06, + "loss": 0.70123452, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.65625, + "step": 2780, + "time_per_iteration": 2.4597458839416504 + }, + { + "auxiliary_loss_clip": 0.01095954, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.01856315, + "balance_loss_mlp": 1.02972507, + "epoch": 0.16720276566962272, + "flos": 21834226707840.0, + "grad_norm": 2.037810793045841, + "language_loss": 0.80807018, + "learning_rate": 3.805758381129643e-06, + "loss": 0.82968938, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.6640625, + "step": 2781, + "time_per_iteration": 2.4106314182281494 + }, + { + "auxiliary_loss_clip": 0.01094679, + "auxiliary_loss_mlp": 0.01060651, + "balance_loss_clip": 1.0164485, + "balance_loss_mlp": 1.02724838, + "epoch": 0.1672628889222907, + "flos": 21469349422080.0, + "grad_norm": 1.4387507377797755, + "language_loss": 0.76775312, + "learning_rate": 3.805590919510193e-06, + "loss": 0.7893064, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.67578125, + "step": 2782, + "time_per_iteration": 2.443311929702759 + }, + { + "auxiliary_loss_clip": 0.010983, + "auxiliary_loss_mlp": 0.01073267, + "balance_loss_clip": 1.02756262, + "balance_loss_mlp": 1.02845621, + "epoch": 0.16732301217495865, + "flos": 30772690323840.0, + "grad_norm": 2.189455448213355, + "language_loss": 0.69594377, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.71765947, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.69921875, + "step": 2783, + "time_per_iteration": 3.917137622833252 + }, + { + "auxiliary_loss_clip": 0.01093422, + "auxiliary_loss_mlp": 0.01072136, + "balance_loss_clip": 1.02669358, + "balance_loss_mlp": 1.02789831, + "epoch": 0.16738313542762664, + "flos": 23473573597440.0, + "grad_norm": 1.6714100616770964, + "language_loss": 0.71307266, + "learning_rate": 3.805255790873081e-06, + "loss": 0.73472822, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.65625, + "step": 2784, + "time_per_iteration": 2.445460319519043 + }, + { + "auxiliary_loss_clip": 0.0109254, + "auxiliary_loss_mlp": 0.0107582, + "balance_loss_clip": 1.02582383, + "balance_loss_mlp": 1.02506375, + "epoch": 0.1674432586802946, + "flos": 29787790469760.0, + "grad_norm": 1.8435815162394726, + "language_loss": 0.63103914, + "learning_rate": 3.805088123868126e-06, + "loss": 0.65272272, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.67578125, + "step": 2785, + "time_per_iteration": 3.964520215988159 + }, + { + "auxiliary_loss_clip": 0.01024513, + "auxiliary_loss_mlp": 0.01012005, + "balance_loss_clip": 1.00323153, + "balance_loss_mlp": 1.00698185, + "epoch": 0.16750338193296258, + "flos": 66132547695360.0, + "grad_norm": 0.7825227151365929, + "language_loss": 0.58961439, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60997951, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.17578125, + "step": 2786, + "time_per_iteration": 3.0586705207824707 + }, + { + "auxiliary_loss_clip": 0.0109328, + "auxiliary_loss_mlp": 0.01069216, + "balance_loss_clip": 1.02255785, + "balance_loss_mlp": 1.02573967, + "epoch": 0.16756350518563054, + "flos": 25695760590720.0, + "grad_norm": 2.2494316508196097, + "language_loss": 0.78478754, + "learning_rate": 3.80475258451721e-06, + "loss": 0.80641246, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.67578125, + "step": 2787, + "time_per_iteration": 5.32574725151062 + }, + { + "auxiliary_loss_clip": 0.01095044, + "auxiliary_loss_mlp": 0.01069856, + "balance_loss_clip": 1.02489054, + "balance_loss_mlp": 1.02803338, + "epoch": 0.1676236284382985, + "flos": 23835134304000.0, + "grad_norm": 1.6677961699951511, + "language_loss": 0.79439384, + "learning_rate": 3.804584712183972e-06, + "loss": 0.81604278, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.671875, + "step": 2788, + "time_per_iteration": 2.4263904094696045 + }, + { + "auxiliary_loss_clip": 0.01024282, + "auxiliary_loss_mlp": 0.01018732, + "balance_loss_clip": 1.0111984, + "balance_loss_mlp": 1.00728643, + "epoch": 0.16768375169096647, + "flos": 59872167872640.0, + "grad_norm": 0.867902365954242, + "language_loss": 0.594733, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61516315, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.07519531, + "router_z_loss_mlp": 0.16992188, + "step": 2789, + "time_per_iteration": 2.900134563446045 + }, + { + "auxiliary_loss_clip": 0.01091549, + "auxiliary_loss_mlp": 0.0107826, + "balance_loss_clip": 1.0288837, + "balance_loss_mlp": 1.02462959, + "epoch": 0.16774387494363446, + "flos": 38434135806720.0, + "grad_norm": 1.525805535185926, + "language_loss": 0.71776664, + "learning_rate": 3.804248762233765e-06, + "loss": 0.73946476, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.66796875, + "step": 2790, + "time_per_iteration": 2.5661370754241943 + }, + { + "auxiliary_loss_clip": 0.01090705, + "auxiliary_loss_mlp": 0.01074322, + "balance_loss_clip": 1.02890301, + "balance_loss_mlp": 1.02492142, + "epoch": 0.16780399819630243, + "flos": 22636530817920.0, + "grad_norm": 1.5365242704082918, + "language_loss": 0.80947655, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.83112681, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.65625, + "step": 2791, + "time_per_iteration": 2.401360034942627 + }, + { + "auxiliary_loss_clip": 0.01094044, + "auxiliary_loss_mlp": 0.01077105, + "balance_loss_clip": 1.03223491, + "balance_loss_mlp": 1.02581787, + "epoch": 0.1678641214489704, + "flos": 32890102727040.0, + "grad_norm": 2.461871611199629, + "language_loss": 0.73667097, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.75838244, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.68359375, + "step": 2792, + "time_per_iteration": 2.5113272666931152 + }, + { + "auxiliary_loss_clip": 0.01096819, + "auxiliary_loss_mlp": 0.01065061, + "balance_loss_clip": 1.0216217, + "balance_loss_mlp": 1.02692533, + "epoch": 0.16792424470163836, + "flos": 19973879712000.0, + "grad_norm": 1.9149757130230727, + "language_loss": 0.73280507, + "learning_rate": 3.803744324194691e-06, + "loss": 0.75442392, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.69921875, + "step": 2793, + "time_per_iteration": 2.3997018337249756 + }, + { + "auxiliary_loss_clip": 0.01095139, + "auxiliary_loss_mlp": 0.01060778, + "balance_loss_clip": 1.01543045, + "balance_loss_mlp": 1.02732563, + "epoch": 0.16798436795430632, + "flos": 19718839163520.0, + "grad_norm": 2.8224003523386365, + "language_loss": 0.79046798, + "learning_rate": 3.803576041376831e-06, + "loss": 0.81202716, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.6796875, + "step": 2794, + "time_per_iteration": 2.4007251262664795 + }, + { + "auxiliary_loss_clip": 0.01092973, + "auxiliary_loss_mlp": 0.01068821, + "balance_loss_clip": 1.02387929, + "balance_loss_mlp": 1.02664173, + "epoch": 0.1680444912069743, + "flos": 28103755173120.0, + "grad_norm": 5.099615268587083, + "language_loss": 0.73345435, + "learning_rate": 3.803407690167187e-06, + "loss": 0.75507236, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6640625, + "step": 2795, + "time_per_iteration": 2.439584255218506 + }, + { + "auxiliary_loss_clip": 0.01091668, + "auxiliary_loss_mlp": 0.01063246, + "balance_loss_clip": 1.01873302, + "balance_loss_mlp": 1.02502477, + "epoch": 0.16810461445964225, + "flos": 18074290481280.0, + "grad_norm": 1.775111465305368, + "language_loss": 0.8533355, + "learning_rate": 3.803239270572142e-06, + "loss": 0.87488467, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6640625, + "step": 2796, + "time_per_iteration": 2.3776962757110596 + }, + { + "auxiliary_loss_clip": 0.01095268, + "auxiliary_loss_mlp": 0.01068788, + "balance_loss_clip": 1.02127147, + "balance_loss_mlp": 1.02758515, + "epoch": 0.16816473771231025, + "flos": 23877518561280.0, + "grad_norm": 1.706767131724614, + "language_loss": 0.83410811, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.85574871, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 0.67578125, + "step": 2797, + "time_per_iteration": 2.420983076095581 + }, + { + "auxiliary_loss_clip": 0.01088298, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.01515734, + "balance_loss_mlp": 1.02574921, + "epoch": 0.1682248609649782, + "flos": 22782502679040.0, + "grad_norm": 1.382584802726107, + "language_loss": 0.76569343, + "learning_rate": 3.802902226251401e-06, + "loss": 0.78710943, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.625, + "step": 2798, + "time_per_iteration": 2.4325733184814453 + }, + { + "auxiliary_loss_clip": 0.01094637, + "auxiliary_loss_mlp": 0.01064014, + "balance_loss_clip": 1.01973939, + "balance_loss_mlp": 1.0278883, + "epoch": 0.16828498421764618, + "flos": 20704053219840.0, + "grad_norm": 1.5284822655969978, + "language_loss": 0.81118733, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.83277386, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.66796875, + "step": 2799, + "time_per_iteration": 2.4079232215881348 + }, + { + "auxiliary_loss_clip": 0.01095266, + "auxiliary_loss_mlp": 0.01078377, + "balance_loss_clip": 1.02978754, + "balance_loss_mlp": 1.02584505, + "epoch": 0.16834510747031414, + "flos": 29419422048000.0, + "grad_norm": 2.04990122905547, + "language_loss": 0.72826451, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.75000095, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.6953125, + "step": 2800, + "time_per_iteration": 2.4630932807922363 + }, + { + "auxiliary_loss_clip": 0.01092803, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.01781678, + "balance_loss_mlp": 1.02688348, + "epoch": 0.1684052307229821, + "flos": 18144535870080.0, + "grad_norm": 1.7474245568222944, + "language_loss": 0.8524878, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.87404794, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.66015625, + "step": 2801, + "time_per_iteration": 2.3590078353881836 + }, + { + "auxiliary_loss_clip": 0.01092891, + "auxiliary_loss_mlp": 0.01067795, + "balance_loss_clip": 1.02333045, + "balance_loss_mlp": 1.02633595, + "epoch": 0.16846535397565007, + "flos": 16574177560320.0, + "grad_norm": 2.2529465228494354, + "language_loss": 0.86171067, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.88331759, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6640625, + "step": 2802, + "time_per_iteration": 2.3782804012298584 + }, + { + "auxiliary_loss_clip": 0.01097434, + "auxiliary_loss_mlp": 0.01066881, + "balance_loss_clip": 1.01962686, + "balance_loss_mlp": 1.02692091, + "epoch": 0.16852547722831807, + "flos": 30407568658560.0, + "grad_norm": 1.5950719908345647, + "language_loss": 0.83168805, + "learning_rate": 3.802058419152413e-06, + "loss": 0.85333121, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.703125, + "step": 2803, + "time_per_iteration": 2.4855194091796875 + }, + { + "auxiliary_loss_clip": 0.01092785, + "auxiliary_loss_mlp": 0.01064887, + "balance_loss_clip": 1.0205884, + "balance_loss_mlp": 1.02781117, + "epoch": 0.16858560048098603, + "flos": 33506110488960.0, + "grad_norm": 2.356662869916808, + "language_loss": 0.79995322, + "learning_rate": 3.801889452704297e-06, + "loss": 0.82152992, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.6484375, + "step": 2804, + "time_per_iteration": 2.5529115200042725 + }, + { + "auxiliary_loss_clip": 0.01023339, + "auxiliary_loss_mlp": 0.01008624, + "balance_loss_clip": 1.00089884, + "balance_loss_mlp": 1.00572586, + "epoch": 0.168645723733654, + "flos": 67367111748480.0, + "grad_norm": 0.8294134436018508, + "language_loss": 0.55504894, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57536858, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.07714844, + "router_z_loss_mlp": 0.17578125, + "step": 2805, + "time_per_iteration": 2.975346326828003 + }, + { + "auxiliary_loss_clip": 0.0109067, + "auxiliary_loss_mlp": 0.01060328, + "balance_loss_clip": 1.01996338, + "balance_loss_mlp": 1.02566147, + "epoch": 0.16870584698632196, + "flos": 21323552117760.0, + "grad_norm": 1.8883964826912536, + "language_loss": 0.73888111, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.76039106, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.6484375, + "step": 2806, + "time_per_iteration": 2.417182207107544 + }, + { + "auxiliary_loss_clip": 0.01090625, + "auxiliary_loss_mlp": 0.01063053, + "balance_loss_clip": 1.02082884, + "balance_loss_mlp": 1.02597785, + "epoch": 0.16876597023898993, + "flos": 20739699584640.0, + "grad_norm": 1.7346409261006361, + "language_loss": 0.71404123, + "learning_rate": 3.80138214341862e-06, + "loss": 0.735578, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6484375, + "step": 2807, + "time_per_iteration": 2.405721664428711 + }, + { + "auxiliary_loss_clip": 0.01091665, + "auxiliary_loss_mlp": 0.01066247, + "balance_loss_clip": 1.01992249, + "balance_loss_mlp": 1.02509046, + "epoch": 0.1688260934916579, + "flos": 20302447317120.0, + "grad_norm": 2.377249928402245, + "language_loss": 0.7285319, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.75011098, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.6640625, + "step": 2808, + "time_per_iteration": 2.411302328109741 + }, + { + "auxiliary_loss_clip": 0.01091956, + "auxiliary_loss_mlp": 0.01065891, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.02544224, + "epoch": 0.16888621674432586, + "flos": 20339629781760.0, + "grad_norm": 2.3646287421784695, + "language_loss": 0.81897187, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.84055036, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6640625, + "step": 2809, + "time_per_iteration": 2.400893211364746 + }, + { + "auxiliary_loss_clip": 0.01095708, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_clip": 1.01944363, + "balance_loss_mlp": 1.02665591, + "epoch": 0.16894633999699385, + "flos": 16244108766720.0, + "grad_norm": 2.0378585544721473, + "language_loss": 0.9052316, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.92681563, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.69140625, + "step": 2810, + "time_per_iteration": 2.3855783939361572 + }, + { + "auxiliary_loss_clip": 0.010969, + "auxiliary_loss_mlp": 0.010714, + "balance_loss_clip": 1.02574301, + "balance_loss_mlp": 1.02747667, + "epoch": 0.16900646324966181, + "flos": 19609142071680.0, + "grad_norm": 1.8785568446537682, + "language_loss": 0.94044548, + "learning_rate": 3.800704774747416e-06, + "loss": 0.96212852, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6953125, + "step": 2811, + "time_per_iteration": 2.393899917602539 + }, + { + "auxiliary_loss_clip": 0.0109317, + "auxiliary_loss_mlp": 0.01064237, + "balance_loss_clip": 1.02079678, + "balance_loss_mlp": 1.02594984, + "epoch": 0.16906658650232978, + "flos": 22016997008640.0, + "grad_norm": 2.0360706760987233, + "language_loss": 0.80971998, + "learning_rate": 3.800535261856291e-06, + "loss": 0.83129406, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.671875, + "step": 2812, + "time_per_iteration": 2.423426866531372 + }, + { + "auxiliary_loss_clip": 0.01092422, + "auxiliary_loss_mlp": 0.01065571, + "balance_loss_clip": 1.02346683, + "balance_loss_mlp": 1.02685845, + "epoch": 0.16912670975499774, + "flos": 11762936340480.0, + "grad_norm": 2.325365374136123, + "language_loss": 0.77186131, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.79344118, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.65625, + "step": 2813, + "time_per_iteration": 2.361151933670044 + }, + { + "auxiliary_loss_clip": 0.0109578, + "auxiliary_loss_mlp": 0.01069475, + "balance_loss_clip": 1.02398443, + "balance_loss_mlp": 1.0260067, + "epoch": 0.1691868330076657, + "flos": 17160543711360.0, + "grad_norm": 2.3636441629704663, + "language_loss": 0.71642256, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.73807508, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.69921875, + "step": 2814, + "time_per_iteration": 2.3779735565185547 + }, + { + "auxiliary_loss_clip": 0.01091392, + "auxiliary_loss_mlp": 0.01061905, + "balance_loss_clip": 1.01577103, + "balance_loss_mlp": 1.0256629, + "epoch": 0.16924695626033368, + "flos": 22415530711680.0, + "grad_norm": 3.1133592841790625, + "language_loss": 0.6339041, + "learning_rate": 3.800026313549776e-06, + "loss": 0.65543711, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.65625, + "step": 2815, + "time_per_iteration": 2.399751663208008 + }, + { + "auxiliary_loss_clip": 0.0109106, + "auxiliary_loss_mlp": 0.01060372, + "balance_loss_clip": 1.02007937, + "balance_loss_mlp": 1.02477145, + "epoch": 0.16930707951300164, + "flos": 25738459050240.0, + "grad_norm": 1.5544173376299129, + "language_loss": 0.83343053, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.85494483, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.6640625, + "step": 2816, + "time_per_iteration": 2.4539413452148438 + }, + { + "auxiliary_loss_clip": 0.01093484, + "auxiliary_loss_mlp": 0.01067237, + "balance_loss_clip": 1.02212799, + "balance_loss_mlp": 1.02522147, + "epoch": 0.16936720276566963, + "flos": 22745948618880.0, + "grad_norm": 1.9986814531228778, + "language_loss": 0.89108241, + "learning_rate": 3.799686673382153e-06, + "loss": 0.91268969, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.6796875, + "step": 2817, + "time_per_iteration": 2.4144058227539062 + }, + { + "auxiliary_loss_clip": 0.01093106, + "auxiliary_loss_mlp": 0.01068669, + "balance_loss_clip": 1.02084184, + "balance_loss_mlp": 1.02678072, + "epoch": 0.1694273260183376, + "flos": 19572937125120.0, + "grad_norm": 1.6312176305661494, + "language_loss": 0.82893324, + "learning_rate": 3.799516750928672e-06, + "loss": 0.85055101, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.6640625, + "step": 2818, + "time_per_iteration": 2.4073991775512695 + }, + { + "auxiliary_loss_clip": 0.01091673, + "auxiliary_loss_mlp": 0.01073756, + "balance_loss_clip": 1.02635825, + "balance_loss_mlp": 1.02404082, + "epoch": 0.16948744927100556, + "flos": 12457044547200.0, + "grad_norm": 3.2134526476407332, + "language_loss": 0.83228564, + "learning_rate": 3.799346760237336e-06, + "loss": 0.85393989, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.67578125, + "step": 2819, + "time_per_iteration": 2.383348226547241 + }, + { + "auxiliary_loss_clip": 0.01024571, + "auxiliary_loss_mlp": 0.01022505, + "balance_loss_clip": 1.01382637, + "balance_loss_mlp": 1.00536227, + "epoch": 0.16954757252367353, + "flos": 71288731814400.0, + "grad_norm": 0.9578316048820912, + "language_loss": 0.61405838, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63452911, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.08691406, + "router_z_loss_mlp": 0.19140625, + "step": 2820, + "time_per_iteration": 2.918581247329712 + }, + { + "auxiliary_loss_clip": 0.01093271, + "auxiliary_loss_mlp": 0.01073771, + "balance_loss_clip": 1.02828026, + "balance_loss_mlp": 1.02458072, + "epoch": 0.1696076957763415, + "flos": 29605229637120.0, + "grad_norm": 1.865177375554526, + "language_loss": 0.80895323, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.83062363, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.6875, + "step": 2821, + "time_per_iteration": 2.476318597793579 + }, + { + "auxiliary_loss_clip": 0.01092765, + "auxiliary_loss_mlp": 0.01069327, + "balance_loss_clip": 1.02228749, + "balance_loss_mlp": 1.02621293, + "epoch": 0.16966781902900946, + "flos": 24387460012800.0, + "grad_norm": 2.0926119293000447, + "language_loss": 0.80663157, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.82825243, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.6640625, + "step": 2822, + "time_per_iteration": 3.856463670730591 + }, + { + "auxiliary_loss_clip": 0.01090651, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_clip": 1.02118349, + "balance_loss_mlp": 1.02448606, + "epoch": 0.16972794228167745, + "flos": 23037717784320.0, + "grad_norm": 1.6736904869094582, + "language_loss": 0.77210093, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.79367185, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.66015625, + "step": 2823, + "time_per_iteration": 2.4150795936584473 + }, + { + "auxiliary_loss_clip": 0.01094506, + "auxiliary_loss_mlp": 0.01073399, + "balance_loss_clip": 1.02717018, + "balance_loss_mlp": 1.0256331, + "epoch": 0.16978806553434542, + "flos": 35227153693440.0, + "grad_norm": 1.751131267386468, + "language_loss": 0.60404825, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.6257273, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.6875, + "step": 2824, + "time_per_iteration": 2.5323829650878906 + }, + { + "auxiliary_loss_clip": 0.010927, + "auxiliary_loss_mlp": 0.01075737, + "balance_loss_clip": 1.03062844, + "balance_loss_mlp": 1.02600312, + "epoch": 0.16984818878701338, + "flos": 32012944928640.0, + "grad_norm": 1.6840524864540858, + "language_loss": 0.74705148, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.76873583, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.6640625, + "step": 2825, + "time_per_iteration": 3.881988525390625 + }, + { + "auxiliary_loss_clip": 0.01095974, + "auxiliary_loss_mlp": 0.01083982, + "balance_loss_clip": 1.03386664, + "balance_loss_mlp": 1.0247283, + "epoch": 0.16990831203968135, + "flos": 22817555550720.0, + "grad_norm": 1.9377285476898294, + "language_loss": 0.87846041, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.90025997, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.7109375, + "step": 2826, + "time_per_iteration": 3.8416035175323486 + }, + { + "auxiliary_loss_clip": 0.01095079, + "auxiliary_loss_mlp": 0.01078206, + "balance_loss_clip": 1.03145254, + "balance_loss_mlp": 1.02447474, + "epoch": 0.1699684352923493, + "flos": 23038485834240.0, + "grad_norm": 1.626655203869992, + "language_loss": 0.83590013, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.85763299, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.70703125, + "step": 2827, + "time_per_iteration": 3.8422253131866455 + }, + { + "auxiliary_loss_clip": 0.01095027, + "auxiliary_loss_mlp": 0.01073493, + "balance_loss_clip": 1.02333021, + "balance_loss_mlp": 1.02551496, + "epoch": 0.17002855854501728, + "flos": 21433039741440.0, + "grad_norm": 2.128925568326487, + "language_loss": 0.75779223, + "learning_rate": 3.797813774376267e-06, + "loss": 0.77947748, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 0.6953125, + "step": 2828, + "time_per_iteration": 2.4118363857269287 + }, + { + "auxiliary_loss_clip": 0.01025202, + "auxiliary_loss_mlp": 0.01015542, + "balance_loss_clip": 1.00762641, + "balance_loss_mlp": 1.00500631, + "epoch": 0.17008868179768524, + "flos": 71450099585280.0, + "grad_norm": 0.7917697809161098, + "language_loss": 0.56626511, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58667254, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.20214844, + "step": 2829, + "time_per_iteration": 3.067166566848755 + }, + { + "auxiliary_loss_clip": 0.01091603, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_clip": 1.02482343, + "balance_loss_mlp": 1.02419639, + "epoch": 0.17014880505035324, + "flos": 24899147032320.0, + "grad_norm": 1.7196391893583634, + "language_loss": 0.85071129, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.87229133, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.671875, + "step": 2830, + "time_per_iteration": 2.4402270317077637 + }, + { + "auxiliary_loss_clip": 0.0109456, + "auxiliary_loss_mlp": 0.01065927, + "balance_loss_clip": 1.01924455, + "balance_loss_mlp": 1.02663517, + "epoch": 0.1702089283030212, + "flos": 29861108058240.0, + "grad_norm": 1.9270029980710022, + "language_loss": 0.81109655, + "learning_rate": 3.797301551737529e-06, + "loss": 0.83270144, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.6796875, + "step": 2831, + "time_per_iteration": 2.5193099975585938 + }, + { + "auxiliary_loss_clip": 0.0109337, + "auxiliary_loss_mlp": 0.01068804, + "balance_loss_clip": 1.02276587, + "balance_loss_mlp": 1.02478743, + "epoch": 0.17026905155568917, + "flos": 17743348903680.0, + "grad_norm": 1.6812705303883722, + "language_loss": 0.81523043, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.83685213, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.6875, + "step": 2832, + "time_per_iteration": 2.4307305812835693 + }, + { + "auxiliary_loss_clip": 0.01092446, + "auxiliary_loss_mlp": 0.0107651, + "balance_loss_clip": 1.03087664, + "balance_loss_mlp": 1.02484059, + "epoch": 0.17032917480835713, + "flos": 23147554521600.0, + "grad_norm": 1.6856733648519924, + "language_loss": 0.90962052, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.93131006, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.67578125, + "step": 2833, + "time_per_iteration": 2.421616792678833 + }, + { + "auxiliary_loss_clip": 0.01092926, + "auxiliary_loss_mlp": 0.01069102, + "balance_loss_clip": 1.02399349, + "balance_loss_mlp": 1.02570987, + "epoch": 0.1703892980610251, + "flos": 39201003020160.0, + "grad_norm": 2.2873112119177934, + "language_loss": 0.74799186, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.76961219, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.671875, + "step": 2834, + "time_per_iteration": 2.5521669387817383 + }, + { + "auxiliary_loss_clip": 0.01095933, + "auxiliary_loss_mlp": 0.01067311, + "balance_loss_clip": 1.02241635, + "balance_loss_mlp": 1.02685952, + "epoch": 0.17044942131369306, + "flos": 23037997075200.0, + "grad_norm": 2.2584313265656175, + "language_loss": 0.888789, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.91042137, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.69140625, + "step": 2835, + "time_per_iteration": 2.406376838684082 + }, + { + "auxiliary_loss_clip": 0.01097036, + "auxiliary_loss_mlp": 0.01077636, + "balance_loss_clip": 1.0244211, + "balance_loss_mlp": 1.02696443, + "epoch": 0.17050954456636103, + "flos": 17054058464640.0, + "grad_norm": 2.8093964956035515, + "language_loss": 0.771348, + "learning_rate": 3.796446484348989e-06, + "loss": 0.79309475, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 0.703125, + "step": 2836, + "time_per_iteration": 2.4281187057495117 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_clip": 1.02592754, + "balance_loss_mlp": 1.02751422, + "epoch": 0.17056966781902902, + "flos": 16836025824000.0, + "grad_norm": 2.3312670562003843, + "language_loss": 0.821051, + "learning_rate": 3.796275266481036e-06, + "loss": 0.8428095, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.70703125, + "step": 2837, + "time_per_iteration": 2.458406448364258 + }, + { + "auxiliary_loss_clip": 0.01091271, + "auxiliary_loss_mlp": 0.01072088, + "balance_loss_clip": 1.02829099, + "balance_loss_mlp": 1.0270071, + "epoch": 0.17062979107169698, + "flos": 17711577699840.0, + "grad_norm": 1.7072664403600089, + "language_loss": 0.8567369, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.87837046, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.64453125, + "step": 2838, + "time_per_iteration": 2.469435453414917 + }, + { + "auxiliary_loss_clip": 0.01090102, + "auxiliary_loss_mlp": 0.01059607, + "balance_loss_clip": 1.01816952, + "balance_loss_mlp": 1.02592552, + "epoch": 0.17068991432436495, + "flos": 22524040817280.0, + "grad_norm": 1.6544007514271024, + "language_loss": 0.95071912, + "learning_rate": 3.795932626406812e-06, + "loss": 0.97221625, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.640625, + "step": 2839, + "time_per_iteration": 2.43489670753479 + }, + { + "auxiliary_loss_clip": 0.01092947, + "auxiliary_loss_mlp": 0.01068756, + "balance_loss_clip": 1.02088141, + "balance_loss_mlp": 1.02543569, + "epoch": 0.17075003757703291, + "flos": 25881812559360.0, + "grad_norm": 2.0410307943300716, + "language_loss": 0.85349417, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.87511122, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.67578125, + "step": 2840, + "time_per_iteration": 2.4728140830993652 + }, + { + "auxiliary_loss_clip": 0.01091487, + "auxiliary_loss_mlp": 0.01065869, + "balance_loss_clip": 1.0204978, + "balance_loss_mlp": 1.02458382, + "epoch": 0.17081016082970088, + "flos": 20119677016320.0, + "grad_norm": 1.7944528870396306, + "language_loss": 0.78457022, + "learning_rate": 3.79558971392481e-06, + "loss": 0.8061437, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.66796875, + "step": 2841, + "time_per_iteration": 2.455179452896118 + }, + { + "auxiliary_loss_clip": 0.01090766, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.01975513, + "balance_loss_mlp": 1.02391315, + "epoch": 0.17087028408236885, + "flos": 24935317067520.0, + "grad_norm": 1.8311591088719055, + "language_loss": 0.78259361, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.80416536, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.66796875, + "step": 2842, + "time_per_iteration": 2.453768014907837 + }, + { + "auxiliary_loss_clip": 0.01088348, + "auxiliary_loss_mlp": 0.01064896, + "balance_loss_clip": 1.01973987, + "balance_loss_mlp": 1.02416992, + "epoch": 0.17093040733503684, + "flos": 19056990919680.0, + "grad_norm": 2.347448659326227, + "language_loss": 0.8680855, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88961798, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.640625, + "step": 2843, + "time_per_iteration": 2.430903196334839 + }, + { + "auxiliary_loss_clip": 0.01089378, + "auxiliary_loss_mlp": 0.01064965, + "balance_loss_clip": 1.02264547, + "balance_loss_mlp": 1.02520728, + "epoch": 0.1709905305877048, + "flos": 13078114456320.0, + "grad_norm": 1.757724554712279, + "language_loss": 0.70827973, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.72982317, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.640625, + "step": 2844, + "time_per_iteration": 2.3674488067626953 + }, + { + "auxiliary_loss_clip": 0.01092279, + "auxiliary_loss_mlp": 0.01060944, + "balance_loss_clip": 1.0175997, + "balance_loss_mlp": 1.02771211, + "epoch": 0.17105065384037277, + "flos": 19208304218880.0, + "grad_norm": 1.6626378599259997, + "language_loss": 0.79371041, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.81524265, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.6484375, + "step": 2845, + "time_per_iteration": 2.37955379486084 + }, + { + "auxiliary_loss_clip": 0.01090959, + "auxiliary_loss_mlp": 0.01055603, + "balance_loss_clip": 1.01449919, + "balance_loss_mlp": 1.02574313, + "epoch": 0.17111077709304073, + "flos": 18514196012160.0, + "grad_norm": 2.3811232664417163, + "language_loss": 0.80395436, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.82542002, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.65234375, + "step": 2846, + "time_per_iteration": 2.3786847591400146 + }, + { + "auxiliary_loss_clip": 0.01089206, + "auxiliary_loss_mlp": 0.01069906, + "balance_loss_clip": 1.0286119, + "balance_loss_mlp": 1.02437866, + "epoch": 0.1711709003457087, + "flos": 25081498396800.0, + "grad_norm": 1.9916801296997746, + "language_loss": 0.81481111, + "learning_rate": 3.794559342552472e-06, + "loss": 0.83640224, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6484375, + "step": 2847, + "time_per_iteration": 2.4318573474884033 + }, + { + "auxiliary_loss_clip": 0.01088987, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_clip": 1.02640545, + "balance_loss_mlp": 1.02371383, + "epoch": 0.17123102359837666, + "flos": 17565431281920.0, + "grad_norm": 2.322745653189173, + "language_loss": 0.89611942, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.91768175, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.65234375, + "step": 2848, + "time_per_iteration": 2.371429681777954 + }, + { + "auxiliary_loss_clip": 0.01090049, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_clip": 1.01415241, + "balance_loss_mlp": 1.02399111, + "epoch": 0.17129114685104463, + "flos": 26172534384000.0, + "grad_norm": 2.014456980993039, + "language_loss": 0.77098888, + "learning_rate": 3.794215340959902e-06, + "loss": 0.79247415, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.66015625, + "step": 2849, + "time_per_iteration": 2.4634225368499756 + }, + { + "auxiliary_loss_clip": 0.01025478, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.01898992, + "balance_loss_mlp": 1.00735235, + "epoch": 0.17135127010371262, + "flos": 69266212220160.0, + "grad_norm": 0.8014429701659052, + "language_loss": 0.57607508, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59658229, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.0625, + "router_z_loss_mlp": 0.18164062, + "step": 2850, + "time_per_iteration": 3.016446590423584 + }, + { + "auxiliary_loss_clip": 0.01088152, + "auxiliary_loss_mlp": 0.01061125, + "balance_loss_clip": 1.02071261, + "balance_loss_mlp": 1.02571094, + "epoch": 0.1714113933563806, + "flos": 23548985867520.0, + "grad_norm": 2.08154463439929, + "language_loss": 0.82437003, + "learning_rate": 3.793871067220031e-06, + "loss": 0.84586281, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.625, + "step": 2851, + "time_per_iteration": 2.412357807159424 + }, + { + "auxiliary_loss_clip": 0.01086579, + "auxiliary_loss_mlp": 0.01054264, + "balance_loss_clip": 1.01406705, + "balance_loss_mlp": 1.02386236, + "epoch": 0.17147151660904855, + "flos": 21141375310080.0, + "grad_norm": 2.943319179369485, + "language_loss": 0.9570325, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.97844088, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.625, + "step": 2852, + "time_per_iteration": 2.443964719772339 + }, + { + "auxiliary_loss_clip": 0.01090196, + "auxiliary_loss_mlp": 0.01068709, + "balance_loss_clip": 1.02686667, + "balance_loss_mlp": 1.02409673, + "epoch": 0.17153163986171652, + "flos": 18623893104000.0, + "grad_norm": 1.8171999551438784, + "language_loss": 0.70537043, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.72695947, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.66015625, + "step": 2853, + "time_per_iteration": 2.401047706604004 + }, + { + "auxiliary_loss_clip": 0.01094033, + "auxiliary_loss_mlp": 0.01062486, + "balance_loss_clip": 1.01971376, + "balance_loss_mlp": 1.02656257, + "epoch": 0.17159176311438448, + "flos": 18222287201280.0, + "grad_norm": 2.0516479640448533, + "language_loss": 0.69866461, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.7202298, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.67578125, + "step": 2854, + "time_per_iteration": 2.400339126586914 + }, + { + "auxiliary_loss_clip": 0.01086326, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.01795685, + "balance_loss_mlp": 1.02289236, + "epoch": 0.17165188636705245, + "flos": 20737988928000.0, + "grad_norm": 1.6499920789907063, + "language_loss": 0.90296578, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.92441845, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.6328125, + "step": 2855, + "time_per_iteration": 2.3948287963867188 + }, + { + "auxiliary_loss_clip": 0.01092296, + "auxiliary_loss_mlp": 0.01063254, + "balance_loss_clip": 1.0206964, + "balance_loss_mlp": 1.02629983, + "epoch": 0.17171200961972044, + "flos": 24898728096000.0, + "grad_norm": 2.126830905089671, + "language_loss": 0.86010611, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.88166165, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.66015625, + "step": 2856, + "time_per_iteration": 2.4358468055725098 + }, + { + "auxiliary_loss_clip": 0.01090769, + "auxiliary_loss_mlp": 0.01068782, + "balance_loss_clip": 1.02715468, + "balance_loss_mlp": 1.02634645, + "epoch": 0.1717721328723884, + "flos": 20156196165120.0, + "grad_norm": 2.0773448047747047, + "language_loss": 0.87851989, + "learning_rate": 3.792836613639026e-06, + "loss": 0.90011537, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.64453125, + "step": 2857, + "time_per_iteration": 2.401097536087036 + }, + { + "auxiliary_loss_clip": 0.01090152, + "auxiliary_loss_mlp": 0.01069214, + "balance_loss_clip": 1.02803922, + "balance_loss_mlp": 1.02598619, + "epoch": 0.17183225612505637, + "flos": 23360699571840.0, + "grad_norm": 2.056305273230322, + "language_loss": 0.79866272, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.82025635, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.640625, + "step": 2858, + "time_per_iteration": 2.396958827972412 + }, + { + "auxiliary_loss_clip": 0.01098051, + "auxiliary_loss_mlp": 0.01070496, + "balance_loss_clip": 1.02450514, + "balance_loss_mlp": 1.02694249, + "epoch": 0.17189237937772434, + "flos": 18113253425280.0, + "grad_norm": 1.763734713206636, + "language_loss": 0.78848308, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.81016856, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.7109375, + "step": 2859, + "time_per_iteration": 2.3705272674560547 + }, + { + "auxiliary_loss_clip": 0.01087962, + "auxiliary_loss_mlp": 0.01055665, + "balance_loss_clip": 1.01449049, + "balance_loss_mlp": 1.02440035, + "epoch": 0.1719525026303923, + "flos": 23257286524800.0, + "grad_norm": 1.8610753537587248, + "language_loss": 0.776173, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79760921, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.6328125, + "step": 2860, + "time_per_iteration": 2.4063658714294434 + }, + { + "auxiliary_loss_clip": 0.0109193, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.01706851, + "balance_loss_mlp": 1.02594209, + "epoch": 0.17201262588306027, + "flos": 20809456214400.0, + "grad_norm": 2.1402244413536726, + "language_loss": 0.82587159, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84741068, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.66015625, + "step": 2861, + "time_per_iteration": 2.4642581939697266 + }, + { + "auxiliary_loss_clip": 0.01090158, + "auxiliary_loss_mlp": 0.01064441, + "balance_loss_clip": 1.02135921, + "balance_loss_mlp": 1.02513468, + "epoch": 0.17207274913572823, + "flos": 20374822298880.0, + "grad_norm": 2.332865779112885, + "language_loss": 0.88171089, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.90325689, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6484375, + "step": 2862, + "time_per_iteration": 3.8884477615356445 + }, + { + "auxiliary_loss_clip": 0.01085168, + "auxiliary_loss_mlp": 0.0105586, + "balance_loss_clip": 1.01597285, + "balance_loss_mlp": 1.02435887, + "epoch": 0.17213287238839622, + "flos": 26796501936000.0, + "grad_norm": 1.842860367672586, + "language_loss": 0.79576755, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.81717777, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.609375, + "step": 2863, + "time_per_iteration": 2.443203926086426 + }, + { + "auxiliary_loss_clip": 0.01089886, + "auxiliary_loss_mlp": 0.01059486, + "balance_loss_clip": 1.01761961, + "balance_loss_mlp": 1.02480996, + "epoch": 0.1721929956410642, + "flos": 26029634722560.0, + "grad_norm": 1.8480385034423394, + "language_loss": 0.74562615, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.76711988, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.65234375, + "step": 2864, + "time_per_iteration": 3.865826368331909 + }, + { + "auxiliary_loss_clip": 0.01091071, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_clip": 1.02795434, + "balance_loss_mlp": 1.02619588, + "epoch": 0.17225311889373215, + "flos": 22272002645760.0, + "grad_norm": 1.6351578351136464, + "language_loss": 0.74111384, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.7626853, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.6484375, + "step": 2865, + "time_per_iteration": 2.3896028995513916 + }, + { + "auxiliary_loss_clip": 0.01089291, + "auxiliary_loss_mlp": 0.010688, + "balance_loss_clip": 1.02624273, + "balance_loss_mlp": 1.02543771, + "epoch": 0.17231324214640012, + "flos": 21286718766720.0, + "grad_norm": 2.043121400930538, + "language_loss": 0.80341017, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.82499111, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.640625, + "step": 2866, + "time_per_iteration": 3.809830665588379 + }, + { + "auxiliary_loss_clip": 0.01092415, + "auxiliary_loss_mlp": 0.01063541, + "balance_loss_clip": 1.02081633, + "balance_loss_mlp": 1.0259279, + "epoch": 0.17237336539906808, + "flos": 19679771485440.0, + "grad_norm": 2.001604475815061, + "language_loss": 0.81287777, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.83443731, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6640625, + "step": 2867, + "time_per_iteration": 3.8203063011169434 + }, + { + "auxiliary_loss_clip": 0.0108853, + "auxiliary_loss_mlp": 0.01060792, + "balance_loss_clip": 1.0195694, + "balance_loss_mlp": 1.02425981, + "epoch": 0.17243348865173605, + "flos": 17528702664960.0, + "grad_norm": 2.130211316937863, + "language_loss": 0.80520242, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.82669562, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.640625, + "step": 2868, + "time_per_iteration": 2.39717435836792 + }, + { + "auxiliary_loss_clip": 0.01095225, + "auxiliary_loss_mlp": 0.01063995, + "balance_loss_clip": 1.02069783, + "balance_loss_mlp": 1.02827501, + "epoch": 0.17249361190440402, + "flos": 18258876172800.0, + "grad_norm": 2.253349580675669, + "language_loss": 0.85268378, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.87427604, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.66796875, + "step": 2869, + "time_per_iteration": 2.4184956550598145 + }, + { + "auxiliary_loss_clip": 0.01091667, + "auxiliary_loss_mlp": 0.01072226, + "balance_loss_clip": 1.02833319, + "balance_loss_mlp": 1.02560723, + "epoch": 0.172553735157072, + "flos": 21173425804800.0, + "grad_norm": 2.0158062909707986, + "language_loss": 0.79092187, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.81256074, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.66015625, + "step": 2870, + "time_per_iteration": 2.423748731613159 + }, + { + "auxiliary_loss_clip": 0.01086343, + "auxiliary_loss_mlp": 0.01060011, + "balance_loss_clip": 1.02143502, + "balance_loss_mlp": 1.02345431, + "epoch": 0.17261385840973997, + "flos": 22272177202560.0, + "grad_norm": 2.5248432630015873, + "language_loss": 0.79267114, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.81413472, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.625, + "step": 2871, + "time_per_iteration": 2.4080138206481934 + }, + { + "auxiliary_loss_clip": 0.01092304, + "auxiliary_loss_mlp": 0.01068253, + "balance_loss_clip": 1.02488458, + "balance_loss_mlp": 1.0263412, + "epoch": 0.17267398166240794, + "flos": 27921159429120.0, + "grad_norm": 2.4889666753116795, + "language_loss": 0.76858968, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.79019523, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.66015625, + "step": 2872, + "time_per_iteration": 2.4459519386291504 + }, + { + "auxiliary_loss_clip": 0.0108538, + "auxiliary_loss_mlp": 0.01058044, + "balance_loss_clip": 1.01691675, + "balance_loss_mlp": 1.02305019, + "epoch": 0.1727341049150759, + "flos": 21944028533760.0, + "grad_norm": 1.7133838906470993, + "language_loss": 0.8329708, + "learning_rate": 3.790066109323988e-06, + "loss": 0.85440505, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.625, + "step": 2873, + "time_per_iteration": 2.4300153255462646 + }, + { + "auxiliary_loss_clip": 0.01088716, + "auxiliary_loss_mlp": 0.01063228, + "balance_loss_clip": 1.0189774, + "balance_loss_mlp": 1.02495396, + "epoch": 0.17279422816774387, + "flos": 18107074114560.0, + "grad_norm": 1.9751021867256773, + "language_loss": 0.76727509, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.78879452, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.640625, + "step": 2874, + "time_per_iteration": 2.3731637001037598 + }, + { + "auxiliary_loss_clip": 0.01090574, + "auxiliary_loss_mlp": 0.01074873, + "balance_loss_clip": 1.02530599, + "balance_loss_mlp": 1.02485073, + "epoch": 0.17285435142041183, + "flos": 21834366353280.0, + "grad_norm": 2.7186990155197464, + "language_loss": 0.82897699, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.85063148, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 0.65625, + "step": 2875, + "time_per_iteration": 2.4122555255889893 + }, + { + "auxiliary_loss_clip": 0.01095177, + "auxiliary_loss_mlp": 0.01073274, + "balance_loss_clip": 1.02430272, + "balance_loss_mlp": 1.02667856, + "epoch": 0.17291447467307983, + "flos": 18367491012480.0, + "grad_norm": 2.3318906605914753, + "language_loss": 0.91021979, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.93190426, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.6875, + "step": 2876, + "time_per_iteration": 2.353543281555176 + }, + { + "auxiliary_loss_clip": 0.0108946, + "auxiliary_loss_mlp": 0.01064435, + "balance_loss_clip": 1.02247381, + "balance_loss_mlp": 1.02576244, + "epoch": 0.1729745979257478, + "flos": 18623648724480.0, + "grad_norm": 1.725976610517751, + "language_loss": 0.8641333, + "learning_rate": 3.789370767013681e-06, + "loss": 0.88567227, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.63671875, + "step": 2877, + "time_per_iteration": 2.3814995288848877 + }, + { + "auxiliary_loss_clip": 0.01093803, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_clip": 1.02364922, + "balance_loss_mlp": 1.02731514, + "epoch": 0.17303472117841576, + "flos": 22997253651840.0, + "grad_norm": 2.1444974179637284, + "language_loss": 0.81142867, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.83307648, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6640625, + "step": 2878, + "time_per_iteration": 2.3977184295654297 + }, + { + "auxiliary_loss_clip": 0.01089788, + "auxiliary_loss_mlp": 0.01066502, + "balance_loss_clip": 1.02136946, + "balance_loss_mlp": 1.0251019, + "epoch": 0.17309484443108372, + "flos": 25663256248320.0, + "grad_norm": 1.6555079331606155, + "language_loss": 0.7190547, + "learning_rate": 3.78902268871344e-06, + "loss": 0.74061757, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.6484375, + "step": 2879, + "time_per_iteration": 2.4285013675689697 + }, + { + "auxiliary_loss_clip": 0.01090438, + "auxiliary_loss_mlp": 0.01067652, + "balance_loss_clip": 1.02039778, + "balance_loss_mlp": 1.02314425, + "epoch": 0.1731549676837517, + "flos": 13552060429440.0, + "grad_norm": 2.0589374673783594, + "language_loss": 0.85347188, + "learning_rate": 3.78884854780014e-06, + "loss": 0.87505287, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.671875, + "step": 2880, + "time_per_iteration": 2.3559110164642334 + }, + { + "auxiliary_loss_clip": 0.01093063, + "auxiliary_loss_mlp": 0.01065678, + "balance_loss_clip": 1.01684988, + "balance_loss_mlp": 1.02497661, + "epoch": 0.17321509093641965, + "flos": 22855959912960.0, + "grad_norm": 1.9413701568488353, + "language_loss": 0.83029056, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.85187805, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.48828125, + "router_z_loss_mlp": 0.6796875, + "step": 2881, + "time_per_iteration": 2.4011948108673096 + }, + { + "auxiliary_loss_clip": 0.01091507, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.01926541, + "balance_loss_mlp": 1.02626657, + "epoch": 0.17327521418908762, + "flos": 24351639091200.0, + "grad_norm": 1.959092305494506, + "language_loss": 0.78930008, + "learning_rate": 3.788500062480197e-06, + "loss": 0.81083977, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.65234375, + "step": 2882, + "time_per_iteration": 2.4336893558502197 + }, + { + "auxiliary_loss_clip": 0.01091673, + "auxiliary_loss_mlp": 0.01065719, + "balance_loss_clip": 1.02080059, + "balance_loss_mlp": 1.02672148, + "epoch": 0.1733353374417556, + "flos": 33104364940800.0, + "grad_norm": 2.06214303358596, + "language_loss": 0.77585185, + "learning_rate": 3.788325718086769e-06, + "loss": 0.79742575, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6484375, + "step": 2883, + "time_per_iteration": 2.531148910522461 + }, + { + "auxiliary_loss_clip": 0.01089575, + "auxiliary_loss_mlp": 0.01062584, + "balance_loss_clip": 1.02107573, + "balance_loss_mlp": 1.02517271, + "epoch": 0.17339546069442358, + "flos": 24387809126400.0, + "grad_norm": 2.219275878996124, + "language_loss": 0.87246788, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.89398956, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.64453125, + "step": 2884, + "time_per_iteration": 2.4422266483306885 + }, + { + "auxiliary_loss_clip": 0.01093311, + "auxiliary_loss_mlp": 0.01080063, + "balance_loss_clip": 1.0334518, + "balance_loss_mlp": 1.02620149, + "epoch": 0.17345558394709154, + "flos": 27452938919040.0, + "grad_norm": 1.7148885603740223, + "language_loss": 0.76168227, + "learning_rate": 3.787976825866055e-06, + "loss": 0.78341603, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.671875, + "step": 2885, + "time_per_iteration": 2.439279079437256 + }, + { + "auxiliary_loss_clip": 0.01086862, + "auxiliary_loss_mlp": 0.01065454, + "balance_loss_clip": 1.02525687, + "balance_loss_mlp": 1.02570796, + "epoch": 0.1735157071997595, + "flos": 24680974746240.0, + "grad_norm": 1.5842489108610747, + "language_loss": 0.72438949, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.74591267, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.609375, + "step": 2886, + "time_per_iteration": 2.4342730045318604 + }, + { + "auxiliary_loss_clip": 0.01091524, + "auxiliary_loss_mlp": 0.01066169, + "balance_loss_clip": 1.01939154, + "balance_loss_mlp": 1.02485204, + "epoch": 0.17357583045242747, + "flos": 21687870821760.0, + "grad_norm": 2.147681637425942, + "language_loss": 0.71361846, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7351954, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 0.6640625, + "step": 2887, + "time_per_iteration": 2.437958240509033 + }, + { + "auxiliary_loss_clip": 0.01091698, + "auxiliary_loss_mlp": 0.01073901, + "balance_loss_clip": 1.02683735, + "balance_loss_mlp": 1.02610159, + "epoch": 0.17363595370509544, + "flos": 15374875847040.0, + "grad_norm": 1.9358917692345845, + "language_loss": 0.8657006, + "learning_rate": 3.787452979049585e-06, + "loss": 0.88735664, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.65625, + "step": 2888, + "time_per_iteration": 2.421739339828491 + }, + { + "auxiliary_loss_clip": 0.01091755, + "auxiliary_loss_mlp": 0.01065567, + "balance_loss_clip": 1.01993394, + "balance_loss_mlp": 1.02645934, + "epoch": 0.1736960769577634, + "flos": 23439812446080.0, + "grad_norm": 2.2776633270140545, + "language_loss": 0.80712545, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.82869864, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.65234375, + "step": 2889, + "time_per_iteration": 2.4313101768493652 + }, + { + "auxiliary_loss_clip": 0.01086424, + "auxiliary_loss_mlp": 0.01065279, + "balance_loss_clip": 1.02243495, + "balance_loss_mlp": 1.0244174, + "epoch": 0.1737562002104314, + "flos": 18586850284800.0, + "grad_norm": 2.33987683777248, + "language_loss": 0.8695032, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.89102018, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6171875, + "step": 2890, + "time_per_iteration": 2.370270252227783 + }, + { + "auxiliary_loss_clip": 0.01092103, + "auxiliary_loss_mlp": 0.01063365, + "balance_loss_clip": 1.01968646, + "balance_loss_mlp": 1.02601039, + "epoch": 0.17381632346309936, + "flos": 15997132742400.0, + "grad_norm": 2.0173457717488796, + "language_loss": 0.84001702, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.86157167, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.66015625, + "step": 2891, + "time_per_iteration": 2.3745503425598145 + }, + { + "auxiliary_loss_clip": 0.01092097, + "auxiliary_loss_mlp": 0.01070298, + "balance_loss_clip": 1.02411628, + "balance_loss_mlp": 1.02544606, + "epoch": 0.17387644671576732, + "flos": 13369010837760.0, + "grad_norm": 2.079217038439262, + "language_loss": 0.83039296, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.85201693, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.6640625, + "step": 2892, + "time_per_iteration": 2.347869396209717 + }, + { + "auxiliary_loss_clip": 0.0109466, + "auxiliary_loss_mlp": 0.01077507, + "balance_loss_clip": 1.02534091, + "balance_loss_mlp": 1.02787924, + "epoch": 0.1739365699684353, + "flos": 26614290216960.0, + "grad_norm": 1.9204135060563383, + "language_loss": 0.76770782, + "learning_rate": 3.786578545502627e-06, + "loss": 0.78942955, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.66796875, + "step": 2893, + "time_per_iteration": 2.4391705989837646 + }, + { + "auxiliary_loss_clip": 0.01092058, + "auxiliary_loss_mlp": 0.01072547, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.02635539, + "epoch": 0.17399669322110325, + "flos": 23366843971200.0, + "grad_norm": 2.305887795467626, + "language_loss": 0.83406007, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.8557061, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 0.65625, + "step": 2894, + "time_per_iteration": 2.400878429412842 + }, + { + "auxiliary_loss_clip": 0.01089722, + "auxiliary_loss_mlp": 0.01066506, + "balance_loss_clip": 1.01984727, + "balance_loss_mlp": 1.02459931, + "epoch": 0.17405681647377122, + "flos": 22053027398400.0, + "grad_norm": 2.1427415387874214, + "language_loss": 0.75733209, + "learning_rate": 3.786228297806741e-06, + "loss": 0.77889431, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.65234375, + "step": 2895, + "time_per_iteration": 2.4011738300323486 + }, + { + "auxiliary_loss_clip": 0.01025027, + "auxiliary_loss_mlp": 0.01008268, + "balance_loss_clip": 1.00154495, + "balance_loss_mlp": 1.00728679, + "epoch": 0.1741169397264392, + "flos": 61454396044800.0, + "grad_norm": 0.8902278389024223, + "language_loss": 0.62882751, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64916044, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.06738281, + "router_z_loss_mlp": 0.17773438, + "step": 2896, + "time_per_iteration": 3.0961220264434814 + }, + { + "auxiliary_loss_clip": 0.01088869, + "auxiliary_loss_mlp": 0.01062283, + "balance_loss_clip": 1.01896286, + "balance_loss_mlp": 1.02338195, + "epoch": 0.17417706297910718, + "flos": 27016419790080.0, + "grad_norm": 1.8453671968280196, + "language_loss": 0.77417445, + "learning_rate": 3.785877779175034e-06, + "loss": 0.79568589, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.65625, + "step": 2897, + "time_per_iteration": 2.4469895362854004 + }, + { + "auxiliary_loss_clip": 0.01088662, + "auxiliary_loss_mlp": 0.01071551, + "balance_loss_clip": 1.02744365, + "balance_loss_mlp": 1.02574635, + "epoch": 0.17423718623177514, + "flos": 33507506943360.0, + "grad_norm": 2.125949905240734, + "language_loss": 0.70511973, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.72672188, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.62890625, + "step": 2898, + "time_per_iteration": 2.4773776531219482 + }, + { + "auxiliary_loss_clip": 0.01091993, + "auxiliary_loss_mlp": 0.01072233, + "balance_loss_clip": 1.02447808, + "balance_loss_mlp": 1.02548468, + "epoch": 0.1742973094844431, + "flos": 27197409611520.0, + "grad_norm": 2.309683355679883, + "language_loss": 0.78572631, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.80736858, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.6640625, + "step": 2899, + "time_per_iteration": 2.435570478439331 + }, + { + "auxiliary_loss_clip": 0.01087748, + "auxiliary_loss_mlp": 0.01067096, + "balance_loss_clip": 1.02277434, + "balance_loss_mlp": 1.02474427, + "epoch": 0.17435743273711107, + "flos": 22709638938240.0, + "grad_norm": 2.211621101480241, + "language_loss": 0.74694598, + "learning_rate": 3.785351493339121e-06, + "loss": 0.76849449, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.6328125, + "step": 2900, + "time_per_iteration": 2.4122655391693115 + }, + { + "auxiliary_loss_clip": 0.01093288, + "auxiliary_loss_mlp": 0.01071154, + "balance_loss_clip": 1.02687955, + "balance_loss_mlp": 1.02783132, + "epoch": 0.17441755598977904, + "flos": 41644853435520.0, + "grad_norm": 1.8231338749612178, + "language_loss": 0.7149592, + "learning_rate": 3.785175929316863e-06, + "loss": 0.73660362, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.65625, + "step": 2901, + "time_per_iteration": 4.0488200187683105 + }, + { + "auxiliary_loss_clip": 0.0109378, + "auxiliary_loss_mlp": 0.01070795, + "balance_loss_clip": 1.02695036, + "balance_loss_mlp": 1.02655005, + "epoch": 0.174477679242447, + "flos": 26285862257280.0, + "grad_norm": 1.696730553977846, + "language_loss": 0.77205765, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.79370344, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.671875, + "step": 2902, + "time_per_iteration": 2.431731700897217 + }, + { + "auxiliary_loss_clip": 0.01092196, + "auxiliary_loss_mlp": 0.0108078, + "balance_loss_clip": 1.03688669, + "balance_loss_mlp": 1.02548313, + "epoch": 0.174537802495115, + "flos": 17857444826880.0, + "grad_norm": 2.325862379307491, + "language_loss": 0.83158028, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.85331005, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.66796875, + "step": 2903, + "time_per_iteration": 3.811264991760254 + }, + { + "auxiliary_loss_clip": 0.01087772, + "auxiliary_loss_mlp": 0.01065544, + "balance_loss_clip": 1.02212834, + "balance_loss_mlp": 1.02493596, + "epoch": 0.17459792574778296, + "flos": 16939927630080.0, + "grad_norm": 1.778342493653196, + "language_loss": 0.74906021, + "learning_rate": 3.784648831112429e-06, + "loss": 0.7705934, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.62890625, + "step": 2904, + "time_per_iteration": 2.3795006275177 + }, + { + "auxiliary_loss_clip": 0.01089669, + "auxiliary_loss_mlp": 0.01060206, + "balance_loss_clip": 1.01647973, + "balance_loss_mlp": 1.02560961, + "epoch": 0.17465804900045093, + "flos": 25518855398400.0, + "grad_norm": 3.361783911647681, + "language_loss": 0.66569066, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6871894, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.640625, + "step": 2905, + "time_per_iteration": 2.4227616786956787 + }, + { + "auxiliary_loss_clip": 0.01095816, + "auxiliary_loss_mlp": 0.01071252, + "balance_loss_clip": 1.0260005, + "balance_loss_mlp": 1.02737045, + "epoch": 0.1747181722531189, + "flos": 24128683948800.0, + "grad_norm": 2.0351061643150565, + "language_loss": 0.81126302, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.83293366, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.68359375, + "step": 2906, + "time_per_iteration": 3.9045958518981934 + }, + { + "auxiliary_loss_clip": 0.01091531, + "auxiliary_loss_mlp": 0.01075309, + "balance_loss_clip": 1.03368115, + "balance_loss_mlp": 1.02680981, + "epoch": 0.17477829550578686, + "flos": 17747852469120.0, + "grad_norm": 1.7452977449841942, + "language_loss": 0.83257782, + "learning_rate": 3.784121123841449e-06, + "loss": 0.85424626, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6484375, + "step": 2907, + "time_per_iteration": 3.767719030380249 + }, + { + "auxiliary_loss_clip": 0.01088334, + "auxiliary_loss_mlp": 0.01073022, + "balance_loss_clip": 1.02994001, + "balance_loss_mlp": 1.02476311, + "epoch": 0.17483841875845482, + "flos": 15376446858240.0, + "grad_norm": 1.9210044062239655, + "language_loss": 0.83407009, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.85568357, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.63671875, + "step": 2908, + "time_per_iteration": 2.384033203125 + }, + { + "auxiliary_loss_clip": 0.01090446, + "auxiliary_loss_mlp": 0.01073959, + "balance_loss_clip": 1.02780151, + "balance_loss_mlp": 1.02590287, + "epoch": 0.17489854201112282, + "flos": 17162359102080.0, + "grad_norm": 2.5856077688131824, + "language_loss": 0.83098608, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.85263014, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.6484375, + "step": 2909, + "time_per_iteration": 2.36572265625 + }, + { + "auxiliary_loss_clip": 0.01092034, + "auxiliary_loss_mlp": 0.01066829, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.02703702, + "epoch": 0.17495866526379078, + "flos": 19754276060160.0, + "grad_norm": 1.9003662677130055, + "language_loss": 0.78629875, + "learning_rate": 3.783592807684017e-06, + "loss": 0.80788743, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6484375, + "step": 2910, + "time_per_iteration": 2.434605360031128 + }, + { + "auxiliary_loss_clip": 0.0108969, + "auxiliary_loss_mlp": 0.0106356, + "balance_loss_clip": 1.0205493, + "balance_loss_mlp": 1.02534711, + "epoch": 0.17501878851645875, + "flos": 28509899552640.0, + "grad_norm": 1.83108964243515, + "language_loss": 0.87899262, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.90052509, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.64453125, + "step": 2911, + "time_per_iteration": 2.4561171531677246 + }, + { + "auxiliary_loss_clip": 0.01086353, + "auxiliary_loss_mlp": 0.01062566, + "balance_loss_clip": 1.02050877, + "balance_loss_mlp": 1.02299929, + "epoch": 0.1750789117691267, + "flos": 17930238744960.0, + "grad_norm": 5.3857464009912634, + "language_loss": 0.92041099, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.94190013, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6328125, + "step": 2912, + "time_per_iteration": 2.3590400218963623 + }, + { + "auxiliary_loss_clip": 0.01091721, + "auxiliary_loss_mlp": 0.01068017, + "balance_loss_clip": 1.02317047, + "balance_loss_mlp": 1.02568412, + "epoch": 0.17513903502179468, + "flos": 18258457236480.0, + "grad_norm": 2.3782396807398385, + "language_loss": 0.74478644, + "learning_rate": 3.783063882820439e-06, + "loss": 0.76638377, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.66015625, + "step": 2913, + "time_per_iteration": 2.3830840587615967 + }, + { + "auxiliary_loss_clip": 0.01089038, + "auxiliary_loss_mlp": 0.01062526, + "balance_loss_clip": 1.01818037, + "balance_loss_mlp": 1.02552581, + "epoch": 0.17519915827446264, + "flos": 20703669194880.0, + "grad_norm": 2.2455793451257215, + "language_loss": 0.70798314, + "learning_rate": 3.782887439295741e-06, + "loss": 0.72949874, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.6328125, + "step": 2914, + "time_per_iteration": 2.4031383991241455 + }, + { + "auxiliary_loss_clip": 0.01088876, + "auxiliary_loss_mlp": 0.01066116, + "balance_loss_clip": 1.0230819, + "balance_loss_mlp": 1.02584743, + "epoch": 0.1752592815271306, + "flos": 20522330259840.0, + "grad_norm": 1.7611092682813587, + "language_loss": 0.94816107, + "learning_rate": 3.782710928163772e-06, + "loss": 0.96971101, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.62890625, + "step": 2915, + "time_per_iteration": 2.4056267738342285 + }, + { + "auxiliary_loss_clip": 0.0108432, + "auxiliary_loss_mlp": 0.01060703, + "balance_loss_clip": 1.02083921, + "balance_loss_mlp": 1.02452826, + "epoch": 0.1753194047797986, + "flos": 21798091584000.0, + "grad_norm": 1.667851734575182, + "language_loss": 0.82568735, + "learning_rate": 3.782534349431226e-06, + "loss": 0.84713757, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.59765625, + "step": 2916, + "time_per_iteration": 2.429841995239258 + }, + { + "auxiliary_loss_clip": 0.01089989, + "auxiliary_loss_mlp": 0.01077711, + "balance_loss_clip": 1.03336525, + "balance_loss_mlp": 1.02554369, + "epoch": 0.17537952803246656, + "flos": 20667289691520.0, + "grad_norm": 1.6106207047547538, + "language_loss": 0.74966395, + "learning_rate": 3.782357703104799e-06, + "loss": 0.77134097, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.64453125, + "step": 2917, + "time_per_iteration": 2.377786159515381 + }, + { + "auxiliary_loss_clip": 0.01086022, + "auxiliary_loss_mlp": 0.01063227, + "balance_loss_clip": 1.02343524, + "balance_loss_mlp": 1.02559149, + "epoch": 0.17543965128513453, + "flos": 23293945319040.0, + "grad_norm": 2.637318788006816, + "language_loss": 0.78584701, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.80733949, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.60546875, + "step": 2918, + "time_per_iteration": 2.3992128372192383 + }, + { + "auxiliary_loss_clip": 0.01090813, + "auxiliary_loss_mlp": 0.01058827, + "balance_loss_clip": 1.01622128, + "balance_loss_mlp": 1.02608585, + "epoch": 0.1754997745378025, + "flos": 29094345578880.0, + "grad_norm": 2.087091039587337, + "language_loss": 0.75596237, + "learning_rate": 3.782004207697098e-06, + "loss": 0.77745879, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6484375, + "step": 2919, + "time_per_iteration": 2.4571003913879395 + }, + { + "auxiliary_loss_clip": 0.01090791, + "auxiliary_loss_mlp": 0.01067772, + "balance_loss_clip": 1.02447534, + "balance_loss_mlp": 1.02365923, + "epoch": 0.17555989779047046, + "flos": 30370560750720.0, + "grad_norm": 1.6886228336384057, + "language_loss": 0.75645745, + "learning_rate": 3.781827358629228e-06, + "loss": 0.77804309, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.671875, + "step": 2920, + "time_per_iteration": 2.4785592555999756 + }, + { + "auxiliary_loss_clip": 0.01084765, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.02383161, + "balance_loss_mlp": 1.0235281, + "epoch": 0.17562002104313842, + "flos": 23286823401600.0, + "grad_norm": 2.4105591010198006, + "language_loss": 0.8089667, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.83042955, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.61328125, + "step": 2921, + "time_per_iteration": 2.374786853790283 + }, + { + "auxiliary_loss_clip": 0.01091367, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_clip": 1.02351213, + "balance_loss_mlp": 1.02525771, + "epoch": 0.1756801442958064, + "flos": 24789345206400.0, + "grad_norm": 1.6960201527840404, + "language_loss": 0.89165568, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.91323435, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.66015625, + "step": 2922, + "time_per_iteration": 2.439666986465454 + }, + { + "auxiliary_loss_clip": 0.0108881, + "auxiliary_loss_mlp": 0.0106564, + "balance_loss_clip": 1.02031636, + "balance_loss_mlp": 1.02369809, + "epoch": 0.17574026754847438, + "flos": 25770579367680.0, + "grad_norm": 2.6182608140079617, + "language_loss": 0.65145898, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.67300355, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.65234375, + "step": 2923, + "time_per_iteration": 2.448112726211548 + }, + { + "auxiliary_loss_clip": 0.01091541, + "auxiliary_loss_mlp": 0.01062206, + "balance_loss_clip": 1.01821792, + "balance_loss_mlp": 1.02611911, + "epoch": 0.17580039080114235, + "flos": 17455664367360.0, + "grad_norm": 4.211161474440613, + "language_loss": 0.83717859, + "learning_rate": 3.78111928675413e-06, + "loss": 0.85871601, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.65234375, + "step": 2924, + "time_per_iteration": 2.3846991062164307 + }, + { + "auxiliary_loss_clip": 0.01092443, + "auxiliary_loss_mlp": 0.01064805, + "balance_loss_clip": 1.01976728, + "balance_loss_mlp": 1.02559555, + "epoch": 0.1758605140538103, + "flos": 14863817232000.0, + "grad_norm": 3.912727636095897, + "language_loss": 0.74349821, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.76507074, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.66796875, + "step": 2925, + "time_per_iteration": 2.3375930786132812 + }, + { + "auxiliary_loss_clip": 0.01087223, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_clip": 1.02384627, + "balance_loss_mlp": 1.02541661, + "epoch": 0.17592063730647828, + "flos": 23003118760320.0, + "grad_norm": 1.6388124988300197, + "language_loss": 0.7239027, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74540389, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.6171875, + "step": 2926, + "time_per_iteration": 2.433352470397949 + }, + { + "auxiliary_loss_clip": 0.01092121, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.02301896, + "balance_loss_mlp": 1.02733636, + "epoch": 0.17598076055914624, + "flos": 20740432723200.0, + "grad_norm": 2.1085767856671405, + "language_loss": 0.8708483, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.89244175, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.6484375, + "step": 2927, + "time_per_iteration": 2.3931381702423096 + }, + { + "auxiliary_loss_clip": 0.01090264, + "auxiliary_loss_mlp": 0.01059403, + "balance_loss_clip": 1.01913428, + "balance_loss_mlp": 1.02754247, + "epoch": 0.1760408838118142, + "flos": 34091080185600.0, + "grad_norm": 2.277042296122084, + "language_loss": 0.73026568, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.75176233, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.62890625, + "step": 2928, + "time_per_iteration": 2.519291877746582 + }, + { + "auxiliary_loss_clip": 0.01087588, + "auxiliary_loss_mlp": 0.01053662, + "balance_loss_clip": 1.015944, + "balance_loss_mlp": 1.02680743, + "epoch": 0.1761010070644822, + "flos": 24167297779200.0, + "grad_norm": 1.98724188239387, + "language_loss": 0.8368383, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85825086, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.609375, + "step": 2929, + "time_per_iteration": 2.4175868034362793 + }, + { + "auxiliary_loss_clip": 0.01089639, + "auxiliary_loss_mlp": 0.01060852, + "balance_loss_clip": 1.02282453, + "balance_loss_mlp": 1.02553225, + "epoch": 0.17616113031715017, + "flos": 26575536741120.0, + "grad_norm": 1.5652907653744115, + "language_loss": 0.80871391, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.83021879, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.640625, + "step": 2930, + "time_per_iteration": 2.4472079277038574 + }, + { + "auxiliary_loss_clip": 0.01090927, + "auxiliary_loss_mlp": 0.0105445, + "balance_loss_clip": 1.01368046, + "balance_loss_mlp": 1.02758789, + "epoch": 0.17622125356981813, + "flos": 25665490575360.0, + "grad_norm": 2.966483888353363, + "language_loss": 0.79167688, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.81313062, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.6328125, + "step": 2931, + "time_per_iteration": 2.4499385356903076 + }, + { + "auxiliary_loss_clip": 0.01089269, + "auxiliary_loss_mlp": 0.01062005, + "balance_loss_clip": 1.02016222, + "balance_loss_mlp": 1.02585924, + "epoch": 0.1762813768224861, + "flos": 16507597864320.0, + "grad_norm": 2.5413528061024735, + "language_loss": 0.77786922, + "learning_rate": 3.779699901503696e-06, + "loss": 0.79938197, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6328125, + "step": 2932, + "time_per_iteration": 2.38240122795105 + }, + { + "auxiliary_loss_clip": 0.01095052, + "auxiliary_loss_mlp": 0.01063066, + "balance_loss_clip": 1.01872039, + "balance_loss_mlp": 1.0274384, + "epoch": 0.17634150007515406, + "flos": 11211239036160.0, + "grad_norm": 2.284665674463721, + "language_loss": 0.91705918, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.93864036, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.67578125, + "step": 2933, + "time_per_iteration": 2.4355058670043945 + }, + { + "auxiliary_loss_clip": 0.01089914, + "auxiliary_loss_mlp": 0.01065629, + "balance_loss_clip": 1.0268141, + "balance_loss_mlp": 1.02732658, + "epoch": 0.17640162332782203, + "flos": 23658787693440.0, + "grad_norm": 1.7608745400203607, + "language_loss": 0.89507747, + "learning_rate": 3.779344380192448e-06, + "loss": 0.91663289, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.625, + "step": 2934, + "time_per_iteration": 2.446197032928467 + }, + { + "auxiliary_loss_clip": 0.01084472, + "auxiliary_loss_mlp": 0.01065314, + "balance_loss_clip": 1.0243299, + "balance_loss_mlp": 1.02436435, + "epoch": 0.17646174658049, + "flos": 53795012198400.0, + "grad_norm": 1.598324683849239, + "language_loss": 0.72590935, + "learning_rate": 3.779166518324077e-06, + "loss": 0.7474072, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.6015625, + "step": 2935, + "time_per_iteration": 2.7311477661132812 + }, + { + "auxiliary_loss_clip": 0.01094954, + "auxiliary_loss_mlp": 0.01064281, + "balance_loss_clip": 1.01843357, + "balance_loss_mlp": 1.02664304, + "epoch": 0.17652186983315798, + "flos": 24242710049280.0, + "grad_norm": 2.018301567654352, + "language_loss": 0.72943503, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.75102735, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.68359375, + "step": 2936, + "time_per_iteration": 2.4620087146759033 + }, + { + "auxiliary_loss_clip": 0.01092576, + "auxiliary_loss_mlp": 0.010671, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.0292021, + "epoch": 0.17658199308582595, + "flos": 27453043653120.0, + "grad_norm": 3.402756259469312, + "language_loss": 0.72766805, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.74926484, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6328125, + "step": 2937, + "time_per_iteration": 2.471367359161377 + }, + { + "auxiliary_loss_clip": 0.01092993, + "auxiliary_loss_mlp": 0.01061944, + "balance_loss_clip": 1.01879013, + "balance_loss_mlp": 1.02625847, + "epoch": 0.17664211633849392, + "flos": 22417590481920.0, + "grad_norm": 2.2535563903697224, + "language_loss": 0.78291547, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.80446482, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.66796875, + "step": 2938, + "time_per_iteration": 2.4107861518859863 + }, + { + "auxiliary_loss_clip": 0.01090456, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.01874328, + "balance_loss_mlp": 1.02509427, + "epoch": 0.17670223959116188, + "flos": 24714037670400.0, + "grad_norm": 2.090379864234691, + "language_loss": 0.7373234, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.75882578, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.65625, + "step": 2939, + "time_per_iteration": 2.414764404296875 + }, + { + "auxiliary_loss_clip": 0.0108966, + "auxiliary_loss_mlp": 0.01063165, + "balance_loss_clip": 1.02265763, + "balance_loss_mlp": 1.02658463, + "epoch": 0.17676236284382985, + "flos": 22525995853440.0, + "grad_norm": 3.3454685088918388, + "language_loss": 0.7668162, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.78834438, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.6328125, + "step": 2940, + "time_per_iteration": 2.409230947494507 + }, + { + "auxiliary_loss_clip": 0.01091008, + "auxiliary_loss_mlp": 0.0106132, + "balance_loss_clip": 1.02055073, + "balance_loss_mlp": 1.02596807, + "epoch": 0.1768224860964978, + "flos": 12384355363200.0, + "grad_norm": 2.119221690239612, + "language_loss": 0.88552141, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.90704465, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6484375, + "step": 2941, + "time_per_iteration": 3.802500009536743 + }, + { + "auxiliary_loss_clip": 0.01091089, + "auxiliary_loss_mlp": 0.01060538, + "balance_loss_clip": 1.01931596, + "balance_loss_mlp": 1.0262109, + "epoch": 0.1768826093491658, + "flos": 24352197672960.0, + "grad_norm": 1.9127458396190964, + "language_loss": 0.78635901, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.80787528, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.6484375, + "step": 2942, + "time_per_iteration": 2.4250519275665283 + }, + { + "auxiliary_loss_clip": 0.01091681, + "auxiliary_loss_mlp": 0.01065552, + "balance_loss_clip": 1.02015686, + "balance_loss_mlp": 1.02609038, + "epoch": 0.17694273260183377, + "flos": 23585923952640.0, + "grad_norm": 2.4871662247255313, + "language_loss": 0.81709087, + "learning_rate": 3.77774119516197e-06, + "loss": 0.83866316, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.65625, + "step": 2943, + "time_per_iteration": 3.832476854324341 + }, + { + "auxiliary_loss_clip": 0.01090813, + "auxiliary_loss_mlp": 0.01068929, + "balance_loss_clip": 1.022843, + "balance_loss_mlp": 1.02410316, + "epoch": 0.17700285585450173, + "flos": 26759773319040.0, + "grad_norm": 1.6811395293676255, + "language_loss": 0.82449412, + "learning_rate": 3.777562726341155e-06, + "loss": 0.84609151, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.66796875, + "step": 2944, + "time_per_iteration": 2.4378952980041504 + }, + { + "auxiliary_loss_clip": 0.01088668, + "auxiliary_loss_mlp": 0.01063426, + "balance_loss_clip": 1.02220368, + "balance_loss_mlp": 1.02404368, + "epoch": 0.1770629791071697, + "flos": 42774712721280.0, + "grad_norm": 1.8146150788316255, + "language_loss": 0.74577105, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.76729202, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.6484375, + "step": 2945, + "time_per_iteration": 2.5761895179748535 + }, + { + "auxiliary_loss_clip": 0.01088469, + "auxiliary_loss_mlp": 0.01063855, + "balance_loss_clip": 1.02082014, + "balance_loss_mlp": 1.02520621, + "epoch": 0.17712310235983766, + "flos": 17344675555200.0, + "grad_norm": 2.9731231695945817, + "language_loss": 0.80659848, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.82812172, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6328125, + "step": 2946, + "time_per_iteration": 3.7661190032958984 + }, + { + "auxiliary_loss_clip": 0.01086977, + "auxiliary_loss_mlp": 0.01057965, + "balance_loss_clip": 1.01872134, + "balance_loss_mlp": 1.02483714, + "epoch": 0.17718322561250563, + "flos": 23877344004480.0, + "grad_norm": 1.6438459014118192, + "language_loss": 0.77965319, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.80110264, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.62109375, + "step": 2947, + "time_per_iteration": 3.812652349472046 + }, + { + "auxiliary_loss_clip": 0.01087952, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_clip": 1.02284384, + "balance_loss_mlp": 1.02438092, + "epoch": 0.1772433488651736, + "flos": 36464859768960.0, + "grad_norm": 1.991526034089991, + "language_loss": 0.74439216, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.76591945, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.63671875, + "step": 2948, + "time_per_iteration": 2.5272905826568604 + }, + { + "auxiliary_loss_clip": 0.01090615, + "auxiliary_loss_mlp": 0.01065633, + "balance_loss_clip": 1.02302742, + "balance_loss_mlp": 1.02630484, + "epoch": 0.1773034721178416, + "flos": 26683592999040.0, + "grad_norm": 1.9244414683092006, + "language_loss": 0.82702535, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84858781, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.64453125, + "step": 2949, + "time_per_iteration": 2.443025588989258 + }, + { + "auxiliary_loss_clip": 0.01028988, + "auxiliary_loss_mlp": 0.01012618, + "balance_loss_clip": 1.0064187, + "balance_loss_mlp": 1.00949144, + "epoch": 0.17736359537050955, + "flos": 57114377712000.0, + "grad_norm": 0.7857559367825304, + "language_loss": 0.64979935, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67021549, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.06176758, + "router_z_loss_mlp": 0.1953125, + "step": 2950, + "time_per_iteration": 3.1087162494659424 + }, + { + "auxiliary_loss_clip": 0.01088878, + "auxiliary_loss_mlp": 0.01061852, + "balance_loss_clip": 1.01712513, + "balance_loss_mlp": 1.02598345, + "epoch": 0.17742371862317752, + "flos": 27196990675200.0, + "grad_norm": 1.6928622531178008, + "language_loss": 0.86279452, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.88430178, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.62890625, + "step": 2951, + "time_per_iteration": 2.4450535774230957 + }, + { + "auxiliary_loss_clip": 0.01092478, + "auxiliary_loss_mlp": 0.01072355, + "balance_loss_clip": 1.02631664, + "balance_loss_mlp": 1.02539968, + "epoch": 0.17748384187584548, + "flos": 20958639920640.0, + "grad_norm": 2.0502827851310466, + "language_loss": 0.82256985, + "learning_rate": 3.776132549750806e-06, + "loss": 0.84421819, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.671875, + "step": 2952, + "time_per_iteration": 2.3991332054138184 + }, + { + "auxiliary_loss_clip": 0.0109136, + "auxiliary_loss_mlp": 0.01069049, + "balance_loss_clip": 1.02751613, + "balance_loss_mlp": 1.02606821, + "epoch": 0.17754396512851345, + "flos": 25008809212800.0, + "grad_norm": 2.109505861018574, + "language_loss": 0.82029462, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.84189868, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.65625, + "step": 2953, + "time_per_iteration": 2.423686981201172 + }, + { + "auxiliary_loss_clip": 0.01092565, + "auxiliary_loss_mlp": 0.01068753, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.02600431, + "epoch": 0.1776040883811814, + "flos": 32050197216000.0, + "grad_norm": 1.9120741323726649, + "language_loss": 0.90183246, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.9234457, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6640625, + "step": 2954, + "time_per_iteration": 2.4903390407562256 + }, + { + "auxiliary_loss_clip": 0.01091948, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.02483177, + "epoch": 0.17766421163384938, + "flos": 21573216316800.0, + "grad_norm": 1.8223407953926485, + "language_loss": 0.86054671, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.88213354, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.671875, + "step": 2955, + "time_per_iteration": 2.4123291969299316 + }, + { + "auxiliary_loss_clip": 0.01088498, + "auxiliary_loss_mlp": 0.01068142, + "balance_loss_clip": 1.02515531, + "balance_loss_mlp": 1.02532196, + "epoch": 0.17772433488651737, + "flos": 22418218886400.0, + "grad_norm": 2.024614206975815, + "language_loss": 0.73179841, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.7533648, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6328125, + "step": 2956, + "time_per_iteration": 2.417614459991455 + }, + { + "auxiliary_loss_clip": 0.01089355, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.02379906, + "balance_loss_mlp": 1.02462697, + "epoch": 0.17778445813918534, + "flos": 25628273199360.0, + "grad_norm": 2.323884608757457, + "language_loss": 0.84253979, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.8640933, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6484375, + "step": 2957, + "time_per_iteration": 2.431140899658203 + }, + { + "auxiliary_loss_clip": 0.01089182, + "auxiliary_loss_mlp": 0.01071633, + "balance_loss_clip": 1.02804971, + "balance_loss_mlp": 1.02612519, + "epoch": 0.1778445813918533, + "flos": 25627714617600.0, + "grad_norm": 1.5552382787778225, + "language_loss": 0.75916815, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.78077626, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.62890625, + "step": 2958, + "time_per_iteration": 2.433487892150879 + }, + { + "auxiliary_loss_clip": 0.01093099, + "auxiliary_loss_mlp": 0.01069043, + "balance_loss_clip": 1.02579391, + "balance_loss_mlp": 1.0282073, + "epoch": 0.17790470464452127, + "flos": 22344447450240.0, + "grad_norm": 2.1738270680083502, + "language_loss": 0.82757676, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.84919822, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.6484375, + "step": 2959, + "time_per_iteration": 2.425863027572632 + }, + { + "auxiliary_loss_clip": 0.0109603, + "auxiliary_loss_mlp": 0.01069011, + "balance_loss_clip": 1.01765597, + "balance_loss_mlp": 1.02738857, + "epoch": 0.17796482789718923, + "flos": 18765012286080.0, + "grad_norm": 2.329095013415127, + "language_loss": 0.54422832, + "learning_rate": 3.774698062689362e-06, + "loss": 0.56587869, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.6875, + "step": 2960, + "time_per_iteration": 2.396094560623169 + }, + { + "auxiliary_loss_clip": 0.01095425, + "auxiliary_loss_mlp": 0.01071421, + "balance_loss_clip": 1.02731347, + "balance_loss_mlp": 1.02805972, + "epoch": 0.1780249511498572, + "flos": 23439812446080.0, + "grad_norm": 1.7721663445494245, + "language_loss": 0.90405816, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.92572659, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.671875, + "step": 2961, + "time_per_iteration": 2.4950878620147705 + }, + { + "auxiliary_loss_clip": 0.01095457, + "auxiliary_loss_mlp": 0.01071767, + "balance_loss_clip": 1.02370191, + "balance_loss_mlp": 1.0271039, + "epoch": 0.1780850744025252, + "flos": 23366355212160.0, + "grad_norm": 2.45789156140795, + "language_loss": 0.8134445, + "learning_rate": 3.774338767820631e-06, + "loss": 0.83511674, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.68359375, + "step": 2962, + "time_per_iteration": 2.4359676837921143 + }, + { + "auxiliary_loss_clip": 0.01094236, + "auxiliary_loss_mlp": 0.01070934, + "balance_loss_clip": 1.02475214, + "balance_loss_mlp": 1.02859378, + "epoch": 0.17814519765519315, + "flos": 13771140410880.0, + "grad_norm": 5.2537481995066715, + "language_loss": 0.7657671, + "learning_rate": 3.774159019458203e-06, + "loss": 0.78741872, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 0.65625, + "step": 2963, + "time_per_iteration": 2.3878188133239746 + }, + { + "auxiliary_loss_clip": 0.01096869, + "auxiliary_loss_mlp": 0.01066645, + "balance_loss_clip": 1.021083, + "balance_loss_mlp": 1.02868414, + "epoch": 0.17820532090786112, + "flos": 21975450624000.0, + "grad_norm": 1.7425773186894244, + "language_loss": 0.80399209, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.82562721, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.6796875, + "step": 2964, + "time_per_iteration": 2.4181792736053467 + }, + { + "auxiliary_loss_clip": 0.01094541, + "auxiliary_loss_mlp": 0.01070385, + "balance_loss_clip": 1.02611053, + "balance_loss_mlp": 1.02925324, + "epoch": 0.17826544416052909, + "flos": 24789589585920.0, + "grad_norm": 1.521725165563962, + "language_loss": 0.82486302, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.84651226, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.65234375, + "step": 2965, + "time_per_iteration": 2.4763569831848145 + }, + { + "auxiliary_loss_clip": 0.01092371, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_clip": 1.02315831, + "balance_loss_mlp": 1.02773428, + "epoch": 0.17832556741319705, + "flos": 13878777732480.0, + "grad_norm": 2.2779903016109446, + "language_loss": 0.97151577, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.99308956, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6484375, + "step": 2966, + "time_per_iteration": 2.360368490219116 + }, + { + "auxiliary_loss_clip": 0.01092327, + "auxiliary_loss_mlp": 0.01069858, + "balance_loss_clip": 1.02424836, + "balance_loss_mlp": 1.0269804, + "epoch": 0.17838569066586502, + "flos": 36640403418240.0, + "grad_norm": 2.039276864827948, + "language_loss": 0.74423856, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.76586044, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.65234375, + "step": 2967, + "time_per_iteration": 2.5491385459899902 + }, + { + "auxiliary_loss_clip": 0.01089274, + "auxiliary_loss_mlp": 0.01065407, + "balance_loss_clip": 1.02425599, + "balance_loss_mlp": 1.02705884, + "epoch": 0.17844581391853298, + "flos": 18726468278400.0, + "grad_norm": 1.9861208247607782, + "language_loss": 0.78608954, + "learning_rate": 3.773259268638157e-06, + "loss": 0.80763638, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.625, + "step": 2968, + "time_per_iteration": 2.3777480125427246 + }, + { + "auxiliary_loss_clip": 0.01094463, + "auxiliary_loss_mlp": 0.01065998, + "balance_loss_clip": 1.02577662, + "balance_loss_mlp": 1.03026736, + "epoch": 0.17850593717120097, + "flos": 27377107712640.0, + "grad_norm": 1.7800022303088707, + "language_loss": 0.76929861, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.79090321, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.640625, + "step": 2969, + "time_per_iteration": 2.477607488632202 + }, + { + "auxiliary_loss_clip": 0.01031344, + "auxiliary_loss_mlp": 0.01019625, + "balance_loss_clip": 1.01411796, + "balance_loss_mlp": 1.01111662, + "epoch": 0.17856606042386894, + "flos": 66992913129600.0, + "grad_norm": 0.8496502416146613, + "language_loss": 0.69044411, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71095383, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.203125, + "step": 2970, + "time_per_iteration": 3.11696457862854 + }, + { + "auxiliary_loss_clip": 0.0109688, + "auxiliary_loss_mlp": 0.01062834, + "balance_loss_clip": 1.01417327, + "balance_loss_mlp": 1.0267489, + "epoch": 0.1786261836765369, + "flos": 36975499447680.0, + "grad_norm": 1.799057981902391, + "language_loss": 0.69209933, + "learning_rate": 3.772718611185505e-06, + "loss": 0.71369648, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.703125, + "step": 2971, + "time_per_iteration": 2.5535476207733154 + }, + { + "auxiliary_loss_clip": 0.01092677, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.02252221, + "balance_loss_mlp": 1.02562046, + "epoch": 0.17868630692920487, + "flos": 24824328255360.0, + "grad_norm": 1.6689932514066017, + "language_loss": 0.91488957, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.93653607, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.671875, + "step": 2972, + "time_per_iteration": 2.446415424346924 + }, + { + "auxiliary_loss_clip": 0.01093351, + "auxiliary_loss_mlp": 0.01069817, + "balance_loss_clip": 1.02501774, + "balance_loss_mlp": 1.02773273, + "epoch": 0.17874643018187283, + "flos": 16981055078400.0, + "grad_norm": 2.350224869438406, + "language_loss": 0.8992734, + "learning_rate": 3.77235783676401e-06, + "loss": 0.92090511, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.65625, + "step": 2973, + "time_per_iteration": 2.376587390899658 + }, + { + "auxiliary_loss_clip": 0.01092104, + "auxiliary_loss_mlp": 0.01065348, + "balance_loss_clip": 1.02043009, + "balance_loss_mlp": 1.0280683, + "epoch": 0.1788065534345408, + "flos": 21031189459200.0, + "grad_norm": 1.8451531708926765, + "language_loss": 0.78170973, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.80328429, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.640625, + "step": 2974, + "time_per_iteration": 2.418952465057373 + }, + { + "auxiliary_loss_clip": 0.01095703, + "auxiliary_loss_mlp": 0.01075507, + "balance_loss_clip": 1.0274415, + "balance_loss_mlp": 1.02945495, + "epoch": 0.17886667668720876, + "flos": 23986587248640.0, + "grad_norm": 2.6096923945924995, + "language_loss": 0.77947247, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.80118459, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 0.6640625, + "step": 2975, + "time_per_iteration": 2.415820837020874 + }, + { + "auxiliary_loss_clip": 0.010915, + "auxiliary_loss_mlp": 0.01062939, + "balance_loss_clip": 1.0219785, + "balance_loss_mlp": 1.02681684, + "epoch": 0.17892679993987676, + "flos": 25738284493440.0, + "grad_norm": 1.5519971780822637, + "language_loss": 0.74373519, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.76527959, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.6484375, + "step": 2976, + "time_per_iteration": 2.461374521255493 + }, + { + "auxiliary_loss_clip": 0.01087426, + "auxiliary_loss_mlp": 0.01061995, + "balance_loss_clip": 1.02372909, + "balance_loss_mlp": 1.02813351, + "epoch": 0.17898692319254472, + "flos": 25698588410880.0, + "grad_norm": 1.7576418524856383, + "language_loss": 0.78353024, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.8050245, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.59375, + "step": 2977, + "time_per_iteration": 2.4743964672088623 + }, + { + "auxiliary_loss_clip": 0.01094827, + "auxiliary_loss_mlp": 0.01083284, + "balance_loss_clip": 1.03705442, + "balance_loss_mlp": 1.0310663, + "epoch": 0.1790470464452127, + "flos": 19316779413120.0, + "grad_norm": 2.003891850634519, + "language_loss": 0.8177827, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.83956379, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 0.63671875, + "step": 2978, + "time_per_iteration": 2.4579994678497314 + }, + { + "auxiliary_loss_clip": 0.01096647, + "auxiliary_loss_mlp": 0.01076371, + "balance_loss_clip": 1.02914, + "balance_loss_mlp": 1.02882838, + "epoch": 0.17910716969788065, + "flos": 30042970663680.0, + "grad_norm": 1.4988797704832337, + "language_loss": 0.77510154, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.79683173, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6796875, + "step": 2979, + "time_per_iteration": 2.497911214828491 + }, + { + "auxiliary_loss_clip": 0.01092926, + "auxiliary_loss_mlp": 0.01071382, + "balance_loss_clip": 1.03085089, + "balance_loss_mlp": 1.02933168, + "epoch": 0.17916729295054862, + "flos": 19426685973120.0, + "grad_norm": 2.034246159665966, + "language_loss": 0.70731312, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.72895616, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.63671875, + "step": 2980, + "time_per_iteration": 2.4042341709136963 + }, + { + "auxiliary_loss_clip": 0.01094946, + "auxiliary_loss_mlp": 0.01066934, + "balance_loss_clip": 1.02144325, + "balance_loss_mlp": 1.02829623, + "epoch": 0.17922741620321658, + "flos": 14610661896960.0, + "grad_norm": 1.9576351353776122, + "language_loss": 0.72229278, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.7439115, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.6640625, + "step": 2981, + "time_per_iteration": 3.8065884113311768 + }, + { + "auxiliary_loss_clip": 0.01093691, + "auxiliary_loss_mlp": 0.0107369, + "balance_loss_clip": 1.02924895, + "balance_loss_mlp": 1.02799845, + "epoch": 0.17928753945588458, + "flos": 17164349049600.0, + "grad_norm": 2.068811821014254, + "language_loss": 0.8369323, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.8586061, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.66015625, + "step": 2982, + "time_per_iteration": 3.7967631816864014 + }, + { + "auxiliary_loss_clip": 0.01088146, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.02145147, + "balance_loss_mlp": 1.02565253, + "epoch": 0.17934766270855254, + "flos": 31394248992000.0, + "grad_norm": 1.612073895844637, + "language_loss": 0.84339297, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.86488688, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.625, + "step": 2983, + "time_per_iteration": 2.4897921085357666 + }, + { + "auxiliary_loss_clip": 0.01093153, + "auxiliary_loss_mlp": 0.01068123, + "balance_loss_clip": 1.02411067, + "balance_loss_mlp": 1.02636325, + "epoch": 0.1794077859612205, + "flos": 20813121907200.0, + "grad_norm": 1.812398969735257, + "language_loss": 0.88168931, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.90330207, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.671875, + "step": 2984, + "time_per_iteration": 2.3961474895477295 + }, + { + "auxiliary_loss_clip": 0.01090928, + "auxiliary_loss_mlp": 0.01063725, + "balance_loss_clip": 1.01878297, + "balance_loss_mlp": 1.02599335, + "epoch": 0.17946790921388847, + "flos": 28985172157440.0, + "grad_norm": 1.3938349388431412, + "language_loss": 0.90504766, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.9265942, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6484375, + "step": 2985, + "time_per_iteration": 3.89841628074646 + }, + { + "auxiliary_loss_clip": 0.01086745, + "auxiliary_loss_mlp": 0.01057283, + "balance_loss_clip": 1.01794398, + "balance_loss_mlp": 1.0259825, + "epoch": 0.17952803246655644, + "flos": 20736452828160.0, + "grad_norm": 2.210879630029339, + "language_loss": 0.72706532, + "learning_rate": 3.770006252694922e-06, + "loss": 0.74850559, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.609375, + "step": 2986, + "time_per_iteration": 3.8518753051757812 + }, + { + "auxiliary_loss_clip": 0.01088958, + "auxiliary_loss_mlp": 0.01061264, + "balance_loss_clip": 1.0199461, + "balance_loss_mlp": 1.02669168, + "epoch": 0.1795881557192244, + "flos": 28254754270080.0, + "grad_norm": 2.4020663756793645, + "language_loss": 0.79779857, + "learning_rate": 3.769824891588688e-06, + "loss": 0.81930083, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.625, + "step": 2987, + "time_per_iteration": 2.443889617919922 + }, + { + "auxiliary_loss_clip": 0.01093734, + "auxiliary_loss_mlp": 0.01070551, + "balance_loss_clip": 1.02315319, + "balance_loss_mlp": 1.0276202, + "epoch": 0.17964827897189237, + "flos": 18551029363200.0, + "grad_norm": 1.9215821726544553, + "language_loss": 0.80457699, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.82621986, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.66015625, + "step": 2988, + "time_per_iteration": 2.3856709003448486 + }, + { + "auxiliary_loss_clip": 0.01031617, + "auxiliary_loss_mlp": 0.01010996, + "balance_loss_clip": 1.00489283, + "balance_loss_mlp": 1.00945616, + "epoch": 0.17970840222456036, + "flos": 58162261392000.0, + "grad_norm": 0.7716294168185285, + "language_loss": 0.62746322, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64788938, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.06103516, + "router_z_loss_mlp": 0.22167969, + "step": 2989, + "time_per_iteration": 2.973705768585205 + }, + { + "auxiliary_loss_clip": 0.01090386, + "auxiliary_loss_mlp": 0.01054903, + "balance_loss_clip": 1.01542068, + "balance_loss_mlp": 1.02716446, + "epoch": 0.17976852547722832, + "flos": 20299828965120.0, + "grad_norm": 11.068882089740727, + "language_loss": 0.7305361, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.75198901, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.6328125, + "step": 2990, + "time_per_iteration": 2.4334096908569336 + }, + { + "auxiliary_loss_clip": 0.01092797, + "auxiliary_loss_mlp": 0.01065512, + "balance_loss_clip": 1.02531433, + "balance_loss_mlp": 1.02941871, + "epoch": 0.1798286487298963, + "flos": 39668001632640.0, + "grad_norm": 2.0823771517741494, + "language_loss": 0.70580608, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.72738922, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.6328125, + "step": 2991, + "time_per_iteration": 2.5683889389038086 + }, + { + "auxiliary_loss_clip": 0.01095467, + "auxiliary_loss_mlp": 0.01068133, + "balance_loss_clip": 1.02629066, + "balance_loss_mlp": 1.03146887, + "epoch": 0.17988877198256426, + "flos": 25519134689280.0, + "grad_norm": 1.493029457158248, + "language_loss": 0.84229404, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.86393005, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.640625, + "step": 2992, + "time_per_iteration": 2.4548990726470947 + }, + { + "auxiliary_loss_clip": 0.01091343, + "auxiliary_loss_mlp": 0.01061855, + "balance_loss_clip": 1.02451801, + "balance_loss_mlp": 1.03033054, + "epoch": 0.17994889523523222, + "flos": 18806488848000.0, + "grad_norm": 1.898143793989187, + "language_loss": 0.83772552, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.85925752, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.609375, + "step": 2993, + "time_per_iteration": 2.382178544998169 + }, + { + "auxiliary_loss_clip": 0.01092293, + "auxiliary_loss_mlp": 0.01057194, + "balance_loss_clip": 1.01651978, + "balance_loss_mlp": 1.02893877, + "epoch": 0.18000901848790019, + "flos": 21103424795520.0, + "grad_norm": 1.6764172116583114, + "language_loss": 0.80338109, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.82487595, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6328125, + "step": 2994, + "time_per_iteration": 2.455782890319824 + }, + { + "auxiliary_loss_clip": 0.01094398, + "auxiliary_loss_mlp": 0.01062825, + "balance_loss_clip": 1.02241313, + "balance_loss_mlp": 1.02984941, + "epoch": 0.18006914174056818, + "flos": 19645416840960.0, + "grad_norm": 1.984292348680541, + "language_loss": 0.82541978, + "learning_rate": 3.768371587287296e-06, + "loss": 0.84699202, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.64453125, + "step": 2995, + "time_per_iteration": 2.4314863681793213 + }, + { + "auxiliary_loss_clip": 0.01093516, + "auxiliary_loss_mlp": 0.01068033, + "balance_loss_clip": 1.03014851, + "balance_loss_mlp": 1.02956164, + "epoch": 0.18012926499323614, + "flos": 19498886398080.0, + "grad_norm": 1.5848528473635575, + "language_loss": 0.85656714, + "learning_rate": 3.768189622421512e-06, + "loss": 0.87818265, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.640625, + "step": 2996, + "time_per_iteration": 2.3924291133880615 + }, + { + "auxiliary_loss_clip": 0.01087095, + "auxiliary_loss_mlp": 0.01069513, + "balance_loss_clip": 1.03279662, + "balance_loss_mlp": 1.02600503, + "epoch": 0.1801893882459041, + "flos": 19463519324160.0, + "grad_norm": 3.484491301737472, + "language_loss": 0.89038014, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.91194624, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.609375, + "step": 2997, + "time_per_iteration": 2.467494010925293 + }, + { + "auxiliary_loss_clip": 0.01093891, + "auxiliary_loss_mlp": 0.01069424, + "balance_loss_clip": 1.02557898, + "balance_loss_mlp": 1.02730846, + "epoch": 0.18024951149857207, + "flos": 26869365676800.0, + "grad_norm": 1.7501927550630594, + "language_loss": 0.87256241, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.8941955, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6640625, + "step": 2998, + "time_per_iteration": 2.4533181190490723 + }, + { + "auxiliary_loss_clip": 0.01088712, + "auxiliary_loss_mlp": 0.0106378, + "balance_loss_clip": 1.02458358, + "balance_loss_mlp": 1.02592683, + "epoch": 0.18030963475124004, + "flos": 30225322028160.0, + "grad_norm": 1.6512911327092155, + "language_loss": 0.85836935, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.87989426, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.62890625, + "step": 2999, + "time_per_iteration": 2.4637019634246826 + }, + { + "auxiliary_loss_clip": 0.01089146, + "auxiliary_loss_mlp": 0.01068798, + "balance_loss_clip": 1.02650213, + "balance_loss_mlp": 1.0255034, + "epoch": 0.180369758003908, + "flos": 22306462024320.0, + "grad_norm": 1.6408172871852538, + "language_loss": 0.76792109, + "learning_rate": 3.76746109252814e-06, + "loss": 0.78950053, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.63671875, + "step": 3000, + "time_per_iteration": 2.422412872314453 + }, + { + "auxiliary_loss_clip": 0.01088006, + "auxiliary_loss_mlp": 0.01068695, + "balance_loss_clip": 1.02978468, + "balance_loss_mlp": 1.02558672, + "epoch": 0.18042988125657597, + "flos": 23730918295680.0, + "grad_norm": 1.727632371656516, + "language_loss": 0.73275137, + "learning_rate": 3.76727879248177e-06, + "loss": 0.75431836, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.625, + "step": 3001, + "time_per_iteration": 2.412972927093506 + }, + { + "auxiliary_loss_clip": 0.01091345, + "auxiliary_loss_mlp": 0.01063401, + "balance_loss_clip": 1.02298892, + "balance_loss_mlp": 1.02721977, + "epoch": 0.18049000450924396, + "flos": 24092548824960.0, + "grad_norm": 2.074750306761446, + "language_loss": 0.89982629, + "learning_rate": 3.767096425420011e-06, + "loss": 0.92137372, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.640625, + "step": 3002, + "time_per_iteration": 2.41715407371521 + }, + { + "auxiliary_loss_clip": 0.01090443, + "auxiliary_loss_mlp": 0.01059452, + "balance_loss_clip": 1.02106702, + "balance_loss_mlp": 1.02664602, + "epoch": 0.18055012776191193, + "flos": 22162096085760.0, + "grad_norm": 2.321027853246211, + "language_loss": 0.8299185, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.85141742, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.640625, + "step": 3003, + "time_per_iteration": 2.3861923217773438 + }, + { + "auxiliary_loss_clip": 0.01090547, + "auxiliary_loss_mlp": 0.0106701, + "balance_loss_clip": 1.02717066, + "balance_loss_mlp": 1.0262897, + "epoch": 0.1806102510145799, + "flos": 28912238593920.0, + "grad_norm": 2.579181695905576, + "language_loss": 0.70442009, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.72599566, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.640625, + "step": 3004, + "time_per_iteration": 2.445986032485962 + }, + { + "auxiliary_loss_clip": 0.01090146, + "auxiliary_loss_mlp": 0.01066086, + "balance_loss_clip": 1.02634132, + "balance_loss_mlp": 1.02550173, + "epoch": 0.18067037426724786, + "flos": 19024696045440.0, + "grad_norm": 1.5999047364249168, + "language_loss": 0.86246699, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.88402927, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.6484375, + "step": 3005, + "time_per_iteration": 2.3730576038360596 + }, + { + "auxiliary_loss_clip": 0.01087625, + "auxiliary_loss_mlp": 0.01059012, + "balance_loss_clip": 1.02045989, + "balance_loss_mlp": 1.02589869, + "epoch": 0.18073049751991582, + "flos": 27452415248640.0, + "grad_norm": 1.7864600540557356, + "language_loss": 0.84757876, + "learning_rate": 3.766366287157432e-06, + "loss": 0.86904514, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.6171875, + "step": 3006, + "time_per_iteration": 2.51257061958313 + }, + { + "auxiliary_loss_clip": 0.01088623, + "auxiliary_loss_mlp": 0.01067121, + "balance_loss_clip": 1.02334762, + "balance_loss_mlp": 1.02553558, + "epoch": 0.1807906207725838, + "flos": 28727827459200.0, + "grad_norm": 2.52094147332136, + "language_loss": 0.78411305, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.8056705, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.62890625, + "step": 3007, + "time_per_iteration": 2.5056376457214355 + }, + { + "auxiliary_loss_clip": 0.01040732, + "auxiliary_loss_mlp": 0.01006524, + "balance_loss_clip": 1.00104082, + "balance_loss_mlp": 1.01898336, + "epoch": 0.18085074402525175, + "flos": 64462791144960.0, + "grad_norm": 0.8091906867313444, + "language_loss": 0.56967551, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59014809, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.05493164, + "router_z_loss_mlp": 0.21679688, + "step": 3008, + "time_per_iteration": 3.161937952041626 + }, + { + "auxiliary_loss_clip": 0.01096332, + "auxiliary_loss_mlp": 0.01069931, + "balance_loss_clip": 1.02613401, + "balance_loss_mlp": 1.03095663, + "epoch": 0.18091086727791975, + "flos": 23475842835840.0, + "grad_norm": 1.7788124089352244, + "language_loss": 0.69621408, + "learning_rate": 3.765817980138021e-06, + "loss": 0.71787679, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.65625, + "step": 3009, + "time_per_iteration": 2.4074559211730957 + }, + { + "auxiliary_loss_clip": 0.01092909, + "auxiliary_loss_mlp": 0.01061818, + "balance_loss_clip": 1.02109647, + "balance_loss_mlp": 1.02826762, + "epoch": 0.1809709905305877, + "flos": 24169322638080.0, + "grad_norm": 1.691234575417803, + "language_loss": 0.77881837, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.80036569, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6484375, + "step": 3010, + "time_per_iteration": 2.414494037628174 + }, + { + "auxiliary_loss_clip": 0.01084425, + "auxiliary_loss_mlp": 0.01054079, + "balance_loss_clip": 1.01774454, + "balance_loss_mlp": 1.0245012, + "epoch": 0.18103111378325568, + "flos": 21649885395840.0, + "grad_norm": 1.5264142413068367, + "language_loss": 0.68939781, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.71078283, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.59765625, + "step": 3011, + "time_per_iteration": 2.3955702781677246 + }, + { + "auxiliary_loss_clip": 0.01085409, + "auxiliary_loss_mlp": 0.01053434, + "balance_loss_clip": 1.01564503, + "balance_loss_mlp": 1.02373314, + "epoch": 0.18109123703592364, + "flos": 53684965992960.0, + "grad_norm": 1.6117654015582308, + "language_loss": 0.72109449, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.7424829, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.6171875, + "step": 3012, + "time_per_iteration": 2.755882501602173 + }, + { + "auxiliary_loss_clip": 0.01085025, + "auxiliary_loss_mlp": 0.0106104, + "balance_loss_clip": 1.02000809, + "balance_loss_mlp": 1.02561986, + "epoch": 0.1811513602885916, + "flos": 35844104062080.0, + "grad_norm": 3.4148078725983937, + "language_loss": 0.64396024, + "learning_rate": 3.765085966704609e-06, + "loss": 0.66542089, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.59375, + "step": 3013, + "time_per_iteration": 2.5151026248931885 + }, + { + "auxiliary_loss_clip": 0.0109226, + "auxiliary_loss_mlp": 0.01067016, + "balance_loss_clip": 1.02522123, + "balance_loss_mlp": 1.02759814, + "epoch": 0.18121148354125957, + "flos": 23731441966080.0, + "grad_norm": 2.2100178441995864, + "language_loss": 0.77964616, + "learning_rate": 3.764902795998309e-06, + "loss": 0.80123895, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6484375, + "step": 3014, + "time_per_iteration": 2.441596269607544 + }, + { + "auxiliary_loss_clip": 0.01091703, + "auxiliary_loss_mlp": 0.01068425, + "balance_loss_clip": 1.02419853, + "balance_loss_mlp": 1.02573228, + "epoch": 0.18127160679392756, + "flos": 28727129232000.0, + "grad_norm": 1.7915866865287473, + "language_loss": 0.66944313, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.69104439, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.66015625, + "step": 3015, + "time_per_iteration": 2.451900005340576 + }, + { + "auxiliary_loss_clip": 0.01085785, + "auxiliary_loss_mlp": 0.01068173, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.02382159, + "epoch": 0.18133173004659553, + "flos": 20484030631680.0, + "grad_norm": 1.7734391708851638, + "language_loss": 0.79522479, + "learning_rate": 3.764536253816785e-06, + "loss": 0.81676435, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.62109375, + "step": 3016, + "time_per_iteration": 2.418789863586426 + }, + { + "auxiliary_loss_clip": 0.01092395, + "auxiliary_loss_mlp": 0.01064556, + "balance_loss_clip": 1.01932764, + "balance_loss_mlp": 1.02633011, + "epoch": 0.1813918532992635, + "flos": 22851107233920.0, + "grad_norm": 1.66438416750243, + "language_loss": 0.84907389, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.8706435, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.66015625, + "step": 3017, + "time_per_iteration": 2.3997199535369873 + }, + { + "auxiliary_loss_clip": 0.01089102, + "auxiliary_loss_mlp": 0.01052532, + "balance_loss_clip": 1.01474237, + "balance_loss_mlp": 1.02585483, + "epoch": 0.18145197655193146, + "flos": 36063637891200.0, + "grad_norm": 1.9694538540421964, + "language_loss": 0.69048989, + "learning_rate": 3.764169443989697e-06, + "loss": 0.71190619, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.6328125, + "step": 3018, + "time_per_iteration": 2.5227150917053223 + }, + { + "auxiliary_loss_clip": 0.01091851, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.01235199, + "balance_loss_mlp": 1.02666652, + "epoch": 0.18151209980459942, + "flos": 24022827106560.0, + "grad_norm": 1.886675963168429, + "language_loss": 0.78554004, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.8069948, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.65234375, + "step": 3019, + "time_per_iteration": 2.4942259788513184 + }, + { + "auxiliary_loss_clip": 0.01091267, + "auxiliary_loss_mlp": 0.01063034, + "balance_loss_clip": 1.01992834, + "balance_loss_mlp": 1.02615094, + "epoch": 0.1815722230572674, + "flos": 23950487036160.0, + "grad_norm": 1.996404597655128, + "language_loss": 0.83049488, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.85203791, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.65234375, + "step": 3020, + "time_per_iteration": 3.8534700870513916 + }, + { + "auxiliary_loss_clip": 0.01089978, + "auxiliary_loss_mlp": 0.0105933, + "balance_loss_clip": 1.01929998, + "balance_loss_mlp": 1.02613926, + "epoch": 0.18163234630993536, + "flos": 24385400242560.0, + "grad_norm": 2.835304414127524, + "language_loss": 0.78807652, + "learning_rate": 3.763618727535352e-06, + "loss": 0.8095696, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.63671875, + "step": 3021, + "time_per_iteration": 2.406572103500366 + }, + { + "auxiliary_loss_clip": 0.01085675, + "auxiliary_loss_mlp": 0.0106004, + "balance_loss_clip": 1.01988983, + "balance_loss_mlp": 1.0239979, + "epoch": 0.18169246956260335, + "flos": 24680171784960.0, + "grad_norm": 1.6531190617352767, + "language_loss": 0.85688484, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87834197, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.6171875, + "step": 3022, + "time_per_iteration": 3.8180861473083496 + }, + { + "auxiliary_loss_clip": 0.01089579, + "auxiliary_loss_mlp": 0.01062933, + "balance_loss_clip": 1.0215677, + "balance_loss_mlp": 1.02508712, + "epoch": 0.1817525928152713, + "flos": 24242151467520.0, + "grad_norm": 1.8383173256231369, + "language_loss": 0.71631002, + "learning_rate": 3.763251248837859e-06, + "loss": 0.73783517, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.64453125, + "step": 3023, + "time_per_iteration": 2.419936180114746 + }, + { + "auxiliary_loss_clip": 0.01089952, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.02491307, + "balance_loss_mlp": 1.02630401, + "epoch": 0.18181271606793928, + "flos": 16471148538240.0, + "grad_norm": 1.7389662313216923, + "language_loss": 0.75945282, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.7809822, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.63671875, + "step": 3024, + "time_per_iteration": 3.7847676277160645 + }, + { + "auxiliary_loss_clip": 0.01088344, + "auxiliary_loss_mlp": 0.01061857, + "balance_loss_clip": 1.02330494, + "balance_loss_mlp": 1.02524614, + "epoch": 0.18187283932060724, + "flos": 18580252037760.0, + "grad_norm": 2.0457418530485794, + "language_loss": 0.90036964, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.9218716, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.6328125, + "step": 3025, + "time_per_iteration": 3.782660722732544 + }, + { + "auxiliary_loss_clip": 0.01086843, + "auxiliary_loss_mlp": 0.01059806, + "balance_loss_clip": 1.02213573, + "balance_loss_mlp": 1.02559924, + "epoch": 0.1819329625732752, + "flos": 20265788522880.0, + "grad_norm": 2.075642825326295, + "language_loss": 0.80575562, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.82722211, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.61328125, + "step": 3026, + "time_per_iteration": 2.403829574584961 + }, + { + "auxiliary_loss_clip": 0.01086328, + "auxiliary_loss_mlp": 0.01061823, + "balance_loss_clip": 1.02443886, + "balance_loss_mlp": 1.02452445, + "epoch": 0.18199308582594317, + "flos": 25914177256320.0, + "grad_norm": 1.6754307782109832, + "language_loss": 0.77353036, + "learning_rate": 3.762515489146692e-06, + "loss": 0.79501188, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.6171875, + "step": 3027, + "time_per_iteration": 2.409081220626831 + }, + { + "auxiliary_loss_clip": 0.01089802, + "auxiliary_loss_mlp": 0.0106136, + "balance_loss_clip": 1.0215199, + "balance_loss_mlp": 1.02519083, + "epoch": 0.18205320907861114, + "flos": 15376621415040.0, + "grad_norm": 2.3612617539968803, + "language_loss": 0.86634833, + "learning_rate": 3.762331382119546e-06, + "loss": 0.88785994, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6484375, + "step": 3028, + "time_per_iteration": 2.36936354637146 + }, + { + "auxiliary_loss_clip": 0.01085598, + "auxiliary_loss_mlp": 0.0106174, + "balance_loss_clip": 1.02104199, + "balance_loss_mlp": 1.02404308, + "epoch": 0.18211333233127913, + "flos": 25623280874880.0, + "grad_norm": 1.8104151260398915, + "language_loss": 0.83967531, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.86114872, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6171875, + "step": 3029, + "time_per_iteration": 2.4371275901794434 + }, + { + "auxiliary_loss_clip": 0.01087404, + "auxiliary_loss_mlp": 0.01067769, + "balance_loss_clip": 1.02649844, + "balance_loss_mlp": 1.02359748, + "epoch": 0.1821734555839471, + "flos": 14975120246400.0, + "grad_norm": 2.033148541034215, + "language_loss": 0.7999903, + "learning_rate": 3.761962967588891e-06, + "loss": 0.82154197, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.640625, + "step": 3030, + "time_per_iteration": 2.368514060974121 + }, + { + "auxiliary_loss_clip": 0.01089877, + "auxiliary_loss_mlp": 0.01064616, + "balance_loss_clip": 1.02384663, + "balance_loss_mlp": 1.02467167, + "epoch": 0.18223357883661506, + "flos": 20192959693440.0, + "grad_norm": 2.5875740927786097, + "language_loss": 0.87399149, + "learning_rate": 3.761778660099352e-06, + "loss": 0.89553648, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.65234375, + "step": 3031, + "time_per_iteration": 2.3956429958343506 + }, + { + "auxiliary_loss_clip": 0.01089175, + "auxiliary_loss_mlp": 0.01055962, + "balance_loss_clip": 1.01860213, + "balance_loss_mlp": 1.02454138, + "epoch": 0.18229370208928303, + "flos": 15231068490240.0, + "grad_norm": 2.1014230955334545, + "language_loss": 0.81965756, + "learning_rate": 3.76159428580299e-06, + "loss": 0.84110892, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.6484375, + "step": 3032, + "time_per_iteration": 2.36570143699646 + }, + { + "auxiliary_loss_clip": 0.01092341, + "auxiliary_loss_mlp": 0.01061443, + "balance_loss_clip": 1.01888514, + "balance_loss_mlp": 1.02569842, + "epoch": 0.182353825341951, + "flos": 23839393489920.0, + "grad_norm": 1.9896430044566205, + "language_loss": 0.8289839, + "learning_rate": 3.761409844706795e-06, + "loss": 0.85052168, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6640625, + "step": 3033, + "time_per_iteration": 2.434062957763672 + }, + { + "auxiliary_loss_clip": 0.01027891, + "auxiliary_loss_mlp": 0.01008749, + "balance_loss_clip": 1.0030508, + "balance_loss_mlp": 1.00880003, + "epoch": 0.18241394859461896, + "flos": 61188114038400.0, + "grad_norm": 0.8936893496221087, + "language_loss": 0.63629818, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65666461, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.05688477, + "router_z_loss_mlp": 0.19140625, + "step": 3034, + "time_per_iteration": 2.9707212448120117 + }, + { + "auxiliary_loss_clip": 0.01087839, + "auxiliary_loss_mlp": 0.0106205, + "balance_loss_clip": 1.02218652, + "balance_loss_mlp": 1.02434611, + "epoch": 0.18247407184728695, + "flos": 18470904059520.0, + "grad_norm": 1.9783715543198084, + "language_loss": 0.82273233, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.84423125, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6328125, + "step": 3035, + "time_per_iteration": 2.4014148712158203 + }, + { + "auxiliary_loss_clip": 0.01086215, + "auxiliary_loss_mlp": 0.01061388, + "balance_loss_clip": 1.02340817, + "balance_loss_mlp": 1.02401125, + "epoch": 0.18253419509995492, + "flos": 21794216423040.0, + "grad_norm": 1.7856736682505145, + "language_loss": 0.85678077, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.8782568, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.62109375, + "step": 3036, + "time_per_iteration": 2.4203968048095703 + }, + { + "auxiliary_loss_clip": 0.01085164, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.02251101, + "balance_loss_mlp": 1.02389228, + "epoch": 0.18259431835262288, + "flos": 20148934602240.0, + "grad_norm": 2.058689885950488, + "language_loss": 0.8164618, + "learning_rate": 3.760671412463617e-06, + "loss": 0.83793771, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.61328125, + "step": 3037, + "time_per_iteration": 2.3856518268585205 + }, + { + "auxiliary_loss_clip": 0.01091962, + "auxiliary_loss_mlp": 0.01070076, + "balance_loss_clip": 1.0237031, + "balance_loss_mlp": 1.02707267, + "epoch": 0.18265444160529085, + "flos": 16980740876160.0, + "grad_norm": 3.282182173943202, + "language_loss": 0.83216107, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.85378146, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.6484375, + "step": 3038, + "time_per_iteration": 2.368680238723755 + }, + { + "auxiliary_loss_clip": 0.01088557, + "auxiliary_loss_mlp": 0.01063609, + "balance_loss_clip": 1.02405512, + "balance_loss_mlp": 1.02520299, + "epoch": 0.1827145648579588, + "flos": 34421812295040.0, + "grad_norm": 1.674628729349638, + "language_loss": 0.68174195, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.70326364, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.6328125, + "step": 3039, + "time_per_iteration": 2.512702226638794 + }, + { + "auxiliary_loss_clip": 0.01090712, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_clip": 1.01910913, + "balance_loss_mlp": 1.02620816, + "epoch": 0.18277468811062678, + "flos": 53285035835520.0, + "grad_norm": 2.004027439386442, + "language_loss": 0.75724918, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.77878016, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.64453125, + "step": 3040, + "time_per_iteration": 2.679309606552124 + }, + { + "auxiliary_loss_clip": 0.01086328, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.01840568, + "balance_loss_mlp": 1.02325082, + "epoch": 0.18283481136329474, + "flos": 31649289540480.0, + "grad_norm": 2.5063584583952734, + "language_loss": 0.61691666, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.63838339, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6328125, + "step": 3041, + "time_per_iteration": 2.48581862449646 + }, + { + "auxiliary_loss_clip": 0.01089594, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_clip": 1.02195537, + "balance_loss_mlp": 1.02479601, + "epoch": 0.18289493461596273, + "flos": 53135782306560.0, + "grad_norm": 1.7987248428715559, + "language_loss": 0.61651057, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.63805264, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6484375, + "step": 3042, + "time_per_iteration": 2.705409049987793 + }, + { + "auxiliary_loss_clip": 0.01087403, + "auxiliary_loss_mlp": 0.01063906, + "balance_loss_clip": 1.02244544, + "balance_loss_mlp": 1.02621293, + "epoch": 0.1829550578686307, + "flos": 25588297825920.0, + "grad_norm": 1.6036165832240268, + "language_loss": 0.88743252, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.90894556, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.61328125, + "step": 3043, + "time_per_iteration": 2.441075086593628 + }, + { + "auxiliary_loss_clip": 0.01088491, + "auxiliary_loss_mlp": 0.01074911, + "balance_loss_clip": 1.02779973, + "balance_loss_mlp": 1.02418399, + "epoch": 0.18301518112129866, + "flos": 22600325871360.0, + "grad_norm": 1.8051722586035226, + "language_loss": 0.72632241, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.74795645, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.64453125, + "step": 3044, + "time_per_iteration": 2.3988680839538574 + }, + { + "auxiliary_loss_clip": 0.01093571, + "auxiliary_loss_mlp": 0.01073375, + "balance_loss_clip": 1.02557254, + "balance_loss_mlp": 1.02622437, + "epoch": 0.18307530437396663, + "flos": 34019403431040.0, + "grad_norm": 2.2116664182741674, + "language_loss": 0.67320204, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.69487149, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.671875, + "step": 3045, + "time_per_iteration": 2.4985058307647705 + }, + { + "auxiliary_loss_clip": 0.01087529, + "auxiliary_loss_mlp": 0.01064001, + "balance_loss_clip": 1.02456629, + "balance_loss_mlp": 1.02470303, + "epoch": 0.1831354276266346, + "flos": 21278933533440.0, + "grad_norm": 2.3995119247398513, + "language_loss": 0.81662399, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.8381393, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.62890625, + "step": 3046, + "time_per_iteration": 2.359522581100464 + }, + { + "auxiliary_loss_clip": 0.01089467, + "auxiliary_loss_mlp": 0.01070138, + "balance_loss_clip": 1.02359879, + "balance_loss_mlp": 1.0245229, + "epoch": 0.18319555087930256, + "flos": 21031887686400.0, + "grad_norm": 1.9304203599580447, + "language_loss": 0.81405413, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.83565015, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.6484375, + "step": 3047, + "time_per_iteration": 2.384692907333374 + }, + { + "auxiliary_loss_clip": 0.01086714, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.01946473, + "balance_loss_mlp": 1.02559137, + "epoch": 0.18325567413197055, + "flos": 34381627453440.0, + "grad_norm": 1.3983823274136562, + "language_loss": 0.81258094, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83403945, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.609375, + "step": 3048, + "time_per_iteration": 2.5276217460632324 + }, + { + "auxiliary_loss_clip": 0.01087679, + "auxiliary_loss_mlp": 0.01062351, + "balance_loss_clip": 1.02172399, + "balance_loss_mlp": 1.0239706, + "epoch": 0.18331579738463852, + "flos": 20557418042880.0, + "grad_norm": 3.0591990629794723, + "language_loss": 0.88414109, + "learning_rate": 3.758449708105424e-06, + "loss": 0.90564144, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.63671875, + "step": 3049, + "time_per_iteration": 2.396831750869751 + }, + { + "auxiliary_loss_clip": 0.01093324, + "auxiliary_loss_mlp": 0.01065109, + "balance_loss_clip": 1.02035737, + "balance_loss_mlp": 1.02563, + "epoch": 0.18337592063730648, + "flos": 19606907744640.0, + "grad_norm": 2.2962946083706397, + "language_loss": 0.79681325, + "learning_rate": 3.75826413248424e-06, + "loss": 0.81839752, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.67578125, + "step": 3050, + "time_per_iteration": 2.347169876098633 + }, + { + "auxiliary_loss_clip": 0.01084625, + "auxiliary_loss_mlp": 0.0105858, + "balance_loss_clip": 1.01933622, + "balance_loss_mlp": 1.02322197, + "epoch": 0.18343604388997445, + "flos": 20849815612800.0, + "grad_norm": 1.9239650577446885, + "language_loss": 1.00769615, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.02912831, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.61328125, + "step": 3051, + "time_per_iteration": 2.4059386253356934 + }, + { + "auxiliary_loss_clip": 0.01084276, + "auxiliary_loss_mlp": 0.01053223, + "balance_loss_clip": 1.01366949, + "balance_loss_mlp": 1.02367806, + "epoch": 0.1834961671426424, + "flos": 24393080741760.0, + "grad_norm": 1.4429815001713389, + "language_loss": 0.88052905, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.90190405, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.60546875, + "step": 3052, + "time_per_iteration": 2.4632773399353027 + }, + { + "auxiliary_loss_clip": 0.01084494, + "auxiliary_loss_mlp": 0.01060615, + "balance_loss_clip": 1.02084684, + "balance_loss_mlp": 1.02281189, + "epoch": 0.18355629039531038, + "flos": 21250548731520.0, + "grad_norm": 1.7459433360976353, + "language_loss": 0.75150394, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.77295506, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6171875, + "step": 3053, + "time_per_iteration": 2.396648645401001 + }, + { + "auxiliary_loss_clip": 0.01087427, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_clip": 1.01842225, + "balance_loss_mlp": 1.02511621, + "epoch": 0.18361641364797834, + "flos": 28655277920640.0, + "grad_norm": 1.867360908459658, + "language_loss": 0.6486389, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.67008436, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.625, + "step": 3054, + "time_per_iteration": 2.46193790435791 + }, + { + "auxiliary_loss_clip": 0.01087426, + "auxiliary_loss_mlp": 0.01062104, + "balance_loss_clip": 1.02429104, + "balance_loss_mlp": 1.02492166, + "epoch": 0.18367653690064634, + "flos": 20917896497280.0, + "grad_norm": 2.0060789126217378, + "language_loss": 0.80137599, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.82287133, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.625, + "step": 3055, + "time_per_iteration": 2.3853697776794434 + }, + { + "auxiliary_loss_clip": 0.01086468, + "auxiliary_loss_mlp": 0.01061232, + "balance_loss_clip": 1.02246523, + "balance_loss_mlp": 1.0258286, + "epoch": 0.1837366601533143, + "flos": 28764381519360.0, + "grad_norm": 1.718147857592923, + "language_loss": 0.71411479, + "learning_rate": 3.757149278859014e-06, + "loss": 0.73559183, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.609375, + "step": 3056, + "time_per_iteration": 2.461408853530884 + }, + { + "auxiliary_loss_clip": 0.01085411, + "auxiliary_loss_mlp": 0.01059335, + "balance_loss_clip": 1.02061629, + "balance_loss_mlp": 1.0238626, + "epoch": 0.18379678340598227, + "flos": 21250374174720.0, + "grad_norm": 1.5464326600052405, + "language_loss": 0.8165704, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.83801788, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.6171875, + "step": 3057, + "time_per_iteration": 2.404388904571533 + }, + { + "auxiliary_loss_clip": 0.0108944, + "auxiliary_loss_mlp": 0.01076888, + "balance_loss_clip": 1.03027689, + "balance_loss_mlp": 1.02452409, + "epoch": 0.18385690665865023, + "flos": 20448558823680.0, + "grad_norm": 1.9730440045246131, + "language_loss": 0.83663309, + "learning_rate": 3.756777127858533e-06, + "loss": 0.85829639, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.6484375, + "step": 3058, + "time_per_iteration": 2.4258644580841064 + }, + { + "auxiliary_loss_clip": 0.01089294, + "auxiliary_loss_mlp": 0.0106867, + "balance_loss_clip": 1.0251348, + "balance_loss_mlp": 1.024315, + "epoch": 0.1839170299113182, + "flos": 26139366725760.0, + "grad_norm": 2.0982911237163413, + "language_loss": 0.87007654, + "learning_rate": 3.756590952429017e-06, + "loss": 0.89165616, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.6484375, + "step": 3059, + "time_per_iteration": 2.477228879928589 + }, + { + "auxiliary_loss_clip": 0.01084699, + "auxiliary_loss_mlp": 0.0106195, + "balance_loss_clip": 1.02370763, + "balance_loss_mlp": 1.02313328, + "epoch": 0.18397715316398616, + "flos": 31756717393920.0, + "grad_norm": 1.7347866432020167, + "language_loss": 0.74107969, + "learning_rate": 3.756404710389396e-06, + "loss": 0.76254618, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.6171875, + "step": 3060, + "time_per_iteration": 3.891313314437866 + }, + { + "auxiliary_loss_clip": 0.01087447, + "auxiliary_loss_mlp": 0.01067679, + "balance_loss_clip": 1.02502561, + "balance_loss_mlp": 1.02473307, + "epoch": 0.18403727641665413, + "flos": 24610729357440.0, + "grad_norm": 1.5953962559028254, + "language_loss": 0.74338567, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.76493704, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.625, + "step": 3061, + "time_per_iteration": 2.430459976196289 + }, + { + "auxiliary_loss_clip": 0.01086381, + "auxiliary_loss_mlp": 0.01065581, + "balance_loss_clip": 1.02445364, + "balance_loss_mlp": 1.02450836, + "epoch": 0.18409739966932212, + "flos": 23438800016640.0, + "grad_norm": 1.5891660054967958, + "language_loss": 0.8270427, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84856236, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.62109375, + "step": 3062, + "time_per_iteration": 3.820281744003296 + }, + { + "auxiliary_loss_clip": 0.01089304, + "auxiliary_loss_mlp": 0.01069158, + "balance_loss_clip": 1.02714837, + "balance_loss_mlp": 1.02480853, + "epoch": 0.18415752292199009, + "flos": 21871025147520.0, + "grad_norm": 1.8823804651794063, + "language_loss": 0.74726188, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.76884645, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6484375, + "step": 3063, + "time_per_iteration": 2.376692056655884 + }, + { + "auxiliary_loss_clip": 0.01084267, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.02395201, + "balance_loss_mlp": 1.022686, + "epoch": 0.18421764617465805, + "flos": 25409507420160.0, + "grad_norm": 1.7072448736369765, + "language_loss": 0.67444527, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.69589436, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.6171875, + "step": 3064, + "time_per_iteration": 3.849440336227417 + }, + { + "auxiliary_loss_clip": 0.01085706, + "auxiliary_loss_mlp": 0.010659, + "balance_loss_clip": 1.02553558, + "balance_loss_mlp": 1.02343249, + "epoch": 0.18427776942732602, + "flos": 27196920852480.0, + "grad_norm": 1.8284256729450834, + "language_loss": 0.7099449, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.73146093, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.625, + "step": 3065, + "time_per_iteration": 3.81312894821167 + }, + { + "auxiliary_loss_clip": 0.01090762, + "auxiliary_loss_mlp": 0.0106585, + "balance_loss_clip": 1.02217162, + "balance_loss_mlp": 1.0258534, + "epoch": 0.18433789267999398, + "flos": 27851193331200.0, + "grad_norm": 2.203137863668416, + "language_loss": 0.75329155, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.77485764, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6484375, + "step": 3066, + "time_per_iteration": 2.4305264949798584 + }, + { + "auxiliary_loss_clip": 0.01086183, + "auxiliary_loss_mlp": 0.01054678, + "balance_loss_clip": 1.01634061, + "balance_loss_mlp": 1.02320278, + "epoch": 0.18439801593266195, + "flos": 17856013461120.0, + "grad_norm": 1.9777925812161226, + "language_loss": 0.84158009, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.86298871, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.62890625, + "step": 3067, + "time_per_iteration": 2.37589168548584 + }, + { + "auxiliary_loss_clip": 0.01031112, + "auxiliary_loss_mlp": 0.01013274, + "balance_loss_clip": 1.00688446, + "balance_loss_mlp": 1.00956666, + "epoch": 0.18445813918532994, + "flos": 56386403619840.0, + "grad_norm": 0.8184909765164737, + "language_loss": 0.5988096, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61925346, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.06396484, + "router_z_loss_mlp": 0.21484375, + "step": 3068, + "time_per_iteration": 2.8893840312957764 + }, + { + "auxiliary_loss_clip": 0.0108591, + "auxiliary_loss_mlp": 0.01060806, + "balance_loss_clip": 1.02048934, + "balance_loss_mlp": 1.02349842, + "epoch": 0.1845182624379979, + "flos": 20956196125440.0, + "grad_norm": 1.8488439121799434, + "language_loss": 0.77895951, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.8004266, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.625, + "step": 3069, + "time_per_iteration": 2.401163101196289 + }, + { + "auxiliary_loss_clip": 0.01087544, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.0248034, + "balance_loss_mlp": 1.0243454, + "epoch": 0.18457838569066587, + "flos": 20484135365760.0, + "grad_norm": 1.7342326555161953, + "language_loss": 0.86857545, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.89012426, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6328125, + "step": 3070, + "time_per_iteration": 2.3808462619781494 + }, + { + "auxiliary_loss_clip": 0.01087965, + "auxiliary_loss_mlp": 0.01070171, + "balance_loss_clip": 1.02582526, + "balance_loss_mlp": 1.02421904, + "epoch": 0.18463850894333383, + "flos": 25008844124160.0, + "grad_norm": 1.8366442257470255, + "language_loss": 0.7953608, + "learning_rate": 3.754351653708265e-06, + "loss": 0.81694216, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.640625, + "step": 3071, + "time_per_iteration": 2.4279744625091553 + }, + { + "auxiliary_loss_clip": 0.01092491, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_clip": 1.03527045, + "balance_loss_mlp": 1.02664018, + "epoch": 0.1846986321960018, + "flos": 16799681232000.0, + "grad_norm": 1.88570003969625, + "language_loss": 0.8066628, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.82837772, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.66015625, + "step": 3072, + "time_per_iteration": 2.365452766418457 + }, + { + "auxiliary_loss_clip": 0.01087126, + "auxiliary_loss_mlp": 0.01062748, + "balance_loss_clip": 1.0201664, + "balance_loss_mlp": 1.02374315, + "epoch": 0.18475875544866976, + "flos": 20813261552640.0, + "grad_norm": 1.9762396119604908, + "language_loss": 0.87374622, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.89524496, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6328125, + "step": 3073, + "time_per_iteration": 2.3982694149017334 + }, + { + "auxiliary_loss_clip": 0.01088042, + "auxiliary_loss_mlp": 0.01072552, + "balance_loss_clip": 1.03209221, + "balance_loss_mlp": 1.02439547, + "epoch": 0.18481887870133773, + "flos": 22600325871360.0, + "grad_norm": 2.3955878251263933, + "language_loss": 0.94812417, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.96973008, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.63671875, + "step": 3074, + "time_per_iteration": 2.3870174884796143 + }, + { + "auxiliary_loss_clip": 0.01085305, + "auxiliary_loss_mlp": 0.01064011, + "balance_loss_clip": 1.02054715, + "balance_loss_mlp": 1.0230577, + "epoch": 0.18487900195400572, + "flos": 29457582030720.0, + "grad_norm": 5.43571544791032, + "language_loss": 0.65834761, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67984074, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.62109375, + "step": 3075, + "time_per_iteration": 2.4557814598083496 + }, + { + "auxiliary_loss_clip": 0.01084786, + "auxiliary_loss_mlp": 0.01063901, + "balance_loss_clip": 1.02503872, + "balance_loss_mlp": 1.02424657, + "epoch": 0.1849391252066737, + "flos": 20627803077120.0, + "grad_norm": 1.7349929174226795, + "language_loss": 0.7393961, + "learning_rate": 3.753415784551761e-06, + "loss": 0.76088291, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.609375, + "step": 3076, + "time_per_iteration": 2.3872392177581787 + }, + { + "auxiliary_loss_clip": 0.01087814, + "auxiliary_loss_mlp": 0.01064472, + "balance_loss_clip": 1.02444196, + "balance_loss_mlp": 1.02336967, + "epoch": 0.18499924845934165, + "flos": 14427682128000.0, + "grad_norm": 2.56929167047946, + "language_loss": 0.83118182, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.8527047, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.64453125, + "step": 3077, + "time_per_iteration": 2.3927054405212402 + }, + { + "auxiliary_loss_clip": 0.01082005, + "auxiliary_loss_mlp": 0.01059814, + "balance_loss_clip": 1.02276385, + "balance_loss_mlp": 1.02213991, + "epoch": 0.18505937171200962, + "flos": 23726659109760.0, + "grad_norm": 1.695450509257729, + "language_loss": 0.79981065, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8212288, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.59765625, + "step": 3078, + "time_per_iteration": 2.40616774559021 + }, + { + "auxiliary_loss_clip": 0.01086317, + "auxiliary_loss_mlp": 0.01068974, + "balance_loss_clip": 1.02458084, + "balance_loss_mlp": 1.02352369, + "epoch": 0.18511949496467758, + "flos": 25956317134080.0, + "grad_norm": 1.853565857077892, + "language_loss": 0.79692447, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.81847745, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.62890625, + "step": 3079, + "time_per_iteration": 2.4430689811706543 + }, + { + "auxiliary_loss_clip": 0.01083662, + "auxiliary_loss_mlp": 0.01064207, + "balance_loss_clip": 1.0260365, + "balance_loss_mlp": 1.02200556, + "epoch": 0.18517961821734555, + "flos": 42411895205760.0, + "grad_norm": 1.8645053860812602, + "language_loss": 0.83218193, + "learning_rate": 3.752665892369369e-06, + "loss": 0.85366058, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.6171875, + "step": 3080, + "time_per_iteration": 2.6198067665100098 + }, + { + "auxiliary_loss_clip": 0.01087823, + "auxiliary_loss_mlp": 0.01071168, + "balance_loss_clip": 1.02455759, + "balance_loss_mlp": 1.02336192, + "epoch": 0.18523974147001354, + "flos": 24096423985920.0, + "grad_norm": 2.064912826032835, + "language_loss": 0.75803852, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.77962846, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 0.64453125, + "step": 3081, + "time_per_iteration": 2.4019148349761963 + }, + { + "auxiliary_loss_clip": 0.01088721, + "auxiliary_loss_mlp": 0.01062737, + "balance_loss_clip": 1.02070379, + "balance_loss_mlp": 1.02513361, + "epoch": 0.1852998647226815, + "flos": 27374210069760.0, + "grad_norm": 2.2895650575540523, + "language_loss": 0.73344982, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.75496441, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.63671875, + "step": 3082, + "time_per_iteration": 2.4332594871520996 + }, + { + "auxiliary_loss_clip": 0.01088038, + "auxiliary_loss_mlp": 0.01065402, + "balance_loss_clip": 1.02065074, + "balance_loss_mlp": 1.02449393, + "epoch": 0.18535998797534947, + "flos": 18331774824960.0, + "grad_norm": 1.9112992980851653, + "language_loss": 0.72203958, + "learning_rate": 3.752102775364407e-06, + "loss": 0.74357402, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6328125, + "step": 3083, + "time_per_iteration": 2.379075288772583 + }, + { + "auxiliary_loss_clip": 0.01084877, + "auxiliary_loss_mlp": 0.01056893, + "balance_loss_clip": 1.01946127, + "balance_loss_mlp": 1.02429295, + "epoch": 0.18542011122801744, + "flos": 37844522899200.0, + "grad_norm": 2.328167418971216, + "language_loss": 0.71095514, + "learning_rate": 3.751914936806767e-06, + "loss": 0.73237282, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.60546875, + "step": 3084, + "time_per_iteration": 2.522299289703369 + }, + { + "auxiliary_loss_clip": 0.01085694, + "auxiliary_loss_mlp": 0.01056977, + "balance_loss_clip": 1.01718545, + "balance_loss_mlp": 1.0235219, + "epoch": 0.1854802344806854, + "flos": 25185120912000.0, + "grad_norm": 1.57987008824658, + "language_loss": 0.78688335, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80831003, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.62109375, + "step": 3085, + "time_per_iteration": 2.498060941696167 + }, + { + "auxiliary_loss_clip": 0.01085612, + "auxiliary_loss_mlp": 0.01061127, + "balance_loss_clip": 1.0194757, + "balance_loss_mlp": 1.02344918, + "epoch": 0.18554035773335337, + "flos": 26683662821760.0, + "grad_norm": 1.73068349005614, + "language_loss": 0.75518072, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7766481, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.62109375, + "step": 3086, + "time_per_iteration": 2.433401584625244 + }, + { + "auxiliary_loss_clip": 0.01086411, + "auxiliary_loss_mlp": 0.01067174, + "balance_loss_clip": 1.02444935, + "balance_loss_mlp": 1.02542579, + "epoch": 0.18560048098602133, + "flos": 22345774081920.0, + "grad_norm": 3.179834886110542, + "language_loss": 0.72181493, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.7433508, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.609375, + "step": 3087, + "time_per_iteration": 2.400320291519165 + }, + { + "auxiliary_loss_clip": 0.01084798, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_clip": 1.02047467, + "balance_loss_mlp": 1.02305818, + "epoch": 0.18566060423868933, + "flos": 17747573178240.0, + "grad_norm": 1.8982128534767901, + "language_loss": 0.75159168, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.77311623, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.6171875, + "step": 3088, + "time_per_iteration": 2.356985330581665 + }, + { + "auxiliary_loss_clip": 0.01083691, + "auxiliary_loss_mlp": 0.01057141, + "balance_loss_clip": 1.01927972, + "balance_loss_mlp": 1.02326226, + "epoch": 0.1857207274913573, + "flos": 24676226801280.0, + "grad_norm": 1.8268488291895435, + "language_loss": 0.94516218, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.96657044, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.60546875, + "step": 3089, + "time_per_iteration": 2.418142080307007 + }, + { + "auxiliary_loss_clip": 0.01082072, + "auxiliary_loss_mlp": 0.01058493, + "balance_loss_clip": 1.01691282, + "balance_loss_mlp": 1.02210999, + "epoch": 0.18578085074402526, + "flos": 28146558366720.0, + "grad_norm": 2.8117484639316372, + "language_loss": 0.59140193, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.61280763, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.6015625, + "step": 3090, + "time_per_iteration": 2.4544997215270996 + }, + { + "auxiliary_loss_clip": 0.01082213, + "auxiliary_loss_mlp": 0.01058162, + "balance_loss_clip": 1.01631939, + "balance_loss_mlp": 1.02273297, + "epoch": 0.18584097399669322, + "flos": 23950731415680.0, + "grad_norm": 1.6377146111065106, + "language_loss": 0.82996571, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.85136944, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.59375, + "step": 3091, + "time_per_iteration": 2.4049360752105713 + }, + { + "auxiliary_loss_clip": 0.01083761, + "auxiliary_loss_mlp": 0.01062331, + "balance_loss_clip": 1.02141881, + "balance_loss_mlp": 1.02143121, + "epoch": 0.18590109724936119, + "flos": 17200728552960.0, + "grad_norm": 2.514709605222868, + "language_loss": 0.86232072, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.88378161, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.625, + "step": 3092, + "time_per_iteration": 2.360720634460449 + }, + { + "auxiliary_loss_clip": 0.01086163, + "auxiliary_loss_mlp": 0.01063756, + "balance_loss_clip": 1.022843, + "balance_loss_mlp": 1.02246141, + "epoch": 0.18596122050202915, + "flos": 17233791477120.0, + "grad_norm": 2.128200665200217, + "language_loss": 0.94877237, + "learning_rate": 3.750221401168038e-06, + "loss": 0.97027159, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.63671875, + "step": 3093, + "time_per_iteration": 2.3438549041748047 + }, + { + "auxiliary_loss_clip": 0.01087073, + "auxiliary_loss_mlp": 0.01064659, + "balance_loss_clip": 1.02603519, + "balance_loss_mlp": 1.02558815, + "epoch": 0.18602134375469712, + "flos": 19019878277760.0, + "grad_norm": 4.839807206379661, + "language_loss": 0.786358, + "learning_rate": 3.750032898603443e-06, + "loss": 0.80787528, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.6171875, + "step": 3094, + "time_per_iteration": 2.371535301208496 + }, + { + "auxiliary_loss_clip": 0.01082551, + "auxiliary_loss_mlp": 0.01061438, + "balance_loss_clip": 1.02374363, + "balance_loss_mlp": 1.0226934, + "epoch": 0.1860814670073651, + "flos": 50948229248640.0, + "grad_norm": 1.3978417674843782, + "language_loss": 0.71490455, + "learning_rate": 3.749844329677425e-06, + "loss": 0.7363444, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.59765625, + "step": 3095, + "time_per_iteration": 2.6391406059265137 + }, + { + "auxiliary_loss_clip": 0.01085902, + "auxiliary_loss_mlp": 0.0106689, + "balance_loss_clip": 1.02447522, + "balance_loss_mlp": 1.02333677, + "epoch": 0.18614159026003307, + "flos": 19389957356160.0, + "grad_norm": 1.9132089499732208, + "language_loss": 0.82299411, + "learning_rate": 3.749655694397135e-06, + "loss": 0.844522, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.625, + "step": 3096, + "time_per_iteration": 2.383352279663086 + }, + { + "auxiliary_loss_clip": 0.01085309, + "auxiliary_loss_mlp": 0.01064978, + "balance_loss_clip": 1.02425659, + "balance_loss_mlp": 1.02406645, + "epoch": 0.18620171351270104, + "flos": 21797707559040.0, + "grad_norm": 2.115629128419868, + "language_loss": 0.76656556, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.78806841, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.609375, + "step": 3097, + "time_per_iteration": 2.4038069248199463 + }, + { + "auxiliary_loss_clip": 0.01083918, + "auxiliary_loss_mlp": 0.01057491, + "balance_loss_clip": 1.01829481, + "balance_loss_mlp": 1.0239265, + "epoch": 0.186261836765369, + "flos": 16361940205440.0, + "grad_norm": 2.2039627540891016, + "language_loss": 0.68909633, + "learning_rate": 3.749278224802352e-06, + "loss": 0.71051037, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.6015625, + "step": 3098, + "time_per_iteration": 2.3690693378448486 + }, + { + "auxiliary_loss_clip": 0.01084656, + "auxiliary_loss_mlp": 0.01062663, + "balance_loss_clip": 1.02012897, + "balance_loss_mlp": 1.02260435, + "epoch": 0.18632196001803697, + "flos": 23368868830080.0, + "grad_norm": 1.7973197256364408, + "language_loss": 0.71624064, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.73771381, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.62109375, + "step": 3099, + "time_per_iteration": 2.4097647666931152 + }, + { + "auxiliary_loss_clip": 0.01085175, + "auxiliary_loss_mlp": 0.01068012, + "balance_loss_clip": 1.02819586, + "balance_loss_mlp": 1.02385616, + "epoch": 0.18638208327070493, + "flos": 22490908070400.0, + "grad_norm": 2.200512923547313, + "language_loss": 0.73407805, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.75560987, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.609375, + "step": 3100, + "time_per_iteration": 3.8744239807128906 + }, + { + "auxiliary_loss_clip": 0.01085691, + "auxiliary_loss_mlp": 0.01067252, + "balance_loss_clip": 1.02676892, + "balance_loss_mlp": 1.02420521, + "epoch": 0.18644220652337293, + "flos": 29164067297280.0, + "grad_norm": 1.6518247994404798, + "language_loss": 0.8157748, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.83730423, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6171875, + "step": 3101, + "time_per_iteration": 3.854264259338379 + }, + { + "auxiliary_loss_clip": 0.01081565, + "auxiliary_loss_mlp": 0.01058071, + "balance_loss_clip": 1.02235568, + "balance_loss_mlp": 1.02295184, + "epoch": 0.1865023297760409, + "flos": 24242640226560.0, + "grad_norm": 1.5926155058593665, + "language_loss": 0.78555799, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.80695438, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5859375, + "step": 3102, + "time_per_iteration": 2.4154343605041504 + }, + { + "auxiliary_loss_clip": 0.01086823, + "auxiliary_loss_mlp": 0.01060989, + "balance_loss_clip": 1.02169788, + "balance_loss_mlp": 1.02335382, + "epoch": 0.18656245302870886, + "flos": 19127899624320.0, + "grad_norm": 2.0726202983989666, + "language_loss": 0.78071654, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.80219471, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.6328125, + "step": 3103, + "time_per_iteration": 2.4057273864746094 + }, + { + "auxiliary_loss_clip": 0.01086125, + "auxiliary_loss_mlp": 0.01064113, + "balance_loss_clip": 1.02527416, + "balance_loss_mlp": 1.02468431, + "epoch": 0.18662257628137682, + "flos": 17785104756480.0, + "grad_norm": 1.7082538635663989, + "language_loss": 0.81054825, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.83205068, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.61328125, + "step": 3104, + "time_per_iteration": 3.9101979732513428 + }, + { + "auxiliary_loss_clip": 0.01083837, + "auxiliary_loss_mlp": 0.01056709, + "balance_loss_clip": 1.02032626, + "balance_loss_mlp": 1.0246985, + "epoch": 0.1866826995340448, + "flos": 24023246042880.0, + "grad_norm": 1.8658067668836602, + "language_loss": 0.86602473, + "learning_rate": 3.747954992113354e-06, + "loss": 0.88743025, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.58984375, + "step": 3105, + "time_per_iteration": 3.792893171310425 + }, + { + "auxiliary_loss_clip": 0.01089357, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.01921177, + "balance_loss_mlp": 1.02397943, + "epoch": 0.18674282278671275, + "flos": 26140030041600.0, + "grad_norm": 1.9035677490731786, + "language_loss": 0.88316184, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.90469599, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.65625, + "step": 3106, + "time_per_iteration": 2.4368598461151123 + }, + { + "auxiliary_loss_clip": 0.01086691, + "auxiliary_loss_mlp": 0.01067201, + "balance_loss_clip": 1.02640748, + "balance_loss_mlp": 1.02462292, + "epoch": 0.18680294603938072, + "flos": 19201112478720.0, + "grad_norm": 1.7679761986285105, + "language_loss": 0.8015362, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.82307518, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.62109375, + "step": 3107, + "time_per_iteration": 2.384347677230835 + }, + { + "auxiliary_loss_clip": 0.01084009, + "auxiliary_loss_mlp": 0.01069022, + "balance_loss_clip": 1.02419972, + "balance_loss_mlp": 1.02211428, + "epoch": 0.1868630692920487, + "flos": 28543730526720.0, + "grad_norm": 1.9526447864493444, + "language_loss": 0.77368689, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.79521716, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6171875, + "step": 3108, + "time_per_iteration": 2.458315372467041 + }, + { + "auxiliary_loss_clip": 0.01085451, + "auxiliary_loss_mlp": 0.01067036, + "balance_loss_clip": 1.0259093, + "balance_loss_mlp": 1.0231123, + "epoch": 0.18692319254471668, + "flos": 17237073144960.0, + "grad_norm": 1.6279058158068134, + "language_loss": 0.75597364, + "learning_rate": 3.747197400772658e-06, + "loss": 0.77749848, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.625, + "step": 3109, + "time_per_iteration": 2.369593381881714 + }, + { + "auxiliary_loss_clip": 0.0108214, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.01899552, + "balance_loss_mlp": 1.02215719, + "epoch": 0.18698331579738464, + "flos": 23184073670400.0, + "grad_norm": 1.469906943898414, + "language_loss": 0.85757136, + "learning_rate": 3.747007837284772e-06, + "loss": 0.87897849, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.59765625, + "step": 3110, + "time_per_iteration": 2.403210401535034 + }, + { + "auxiliary_loss_clip": 0.01085738, + "auxiliary_loss_mlp": 0.01066514, + "balance_loss_clip": 1.02564907, + "balance_loss_mlp": 1.02321792, + "epoch": 0.1870434390500526, + "flos": 25515643553280.0, + "grad_norm": 1.512226120124199, + "language_loss": 0.85980129, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.88132381, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.625, + "step": 3111, + "time_per_iteration": 2.4161765575408936 + }, + { + "auxiliary_loss_clip": 0.01082834, + "auxiliary_loss_mlp": 0.01059012, + "balance_loss_clip": 1.02212882, + "balance_loss_mlp": 1.02287769, + "epoch": 0.18710356230272057, + "flos": 19499794093440.0, + "grad_norm": 1.8712439647147094, + "language_loss": 0.785321, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.80673957, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.59765625, + "step": 3112, + "time_per_iteration": 2.3757436275482178 + }, + { + "auxiliary_loss_clip": 0.01083439, + "auxiliary_loss_mlp": 0.01060833, + "balance_loss_clip": 1.02220929, + "balance_loss_mlp": 1.02314901, + "epoch": 0.18716368555538854, + "flos": 26759633673600.0, + "grad_norm": 1.9213064707305052, + "language_loss": 0.66646767, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.68791032, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.6015625, + "step": 3113, + "time_per_iteration": 2.430325508117676 + }, + { + "auxiliary_loss_clip": 0.01090215, + "auxiliary_loss_mlp": 0.0106565, + "balance_loss_clip": 1.02247238, + "balance_loss_mlp": 1.02606773, + "epoch": 0.1872238088080565, + "flos": 25188716782080.0, + "grad_norm": 2.096058527663695, + "language_loss": 0.82670987, + "learning_rate": 3.746248920938024e-06, + "loss": 0.84826851, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.640625, + "step": 3114, + "time_per_iteration": 2.418806791305542 + }, + { + "auxiliary_loss_clip": 0.01084763, + "auxiliary_loss_mlp": 0.010654, + "balance_loss_clip": 1.02241302, + "balance_loss_mlp": 1.02269053, + "epoch": 0.1872839320607245, + "flos": 24133152602880.0, + "grad_norm": 2.2421652752466756, + "language_loss": 0.59772569, + "learning_rate": 3.74605902628851e-06, + "loss": 0.61922729, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6171875, + "step": 3115, + "time_per_iteration": 2.414844036102295 + }, + { + "auxiliary_loss_clip": 0.01083003, + "auxiliary_loss_mlp": 0.01061018, + "balance_loss_clip": 1.02198923, + "balance_loss_mlp": 1.02340877, + "epoch": 0.18734405531339246, + "flos": 21172867223040.0, + "grad_norm": 1.7294891103731427, + "language_loss": 0.72552228, + "learning_rate": 3.745869065428261e-06, + "loss": 0.74696255, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.59375, + "step": 3116, + "time_per_iteration": 2.4110524654388428 + }, + { + "auxiliary_loss_clip": 0.01079641, + "auxiliary_loss_mlp": 0.01050808, + "balance_loss_clip": 1.01602256, + "balance_loss_mlp": 1.02195764, + "epoch": 0.18740417856606043, + "flos": 17236758942720.0, + "grad_norm": 2.520459830212574, + "language_loss": 0.81244212, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.83374661, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.578125, + "step": 3117, + "time_per_iteration": 2.3642938137054443 + }, + { + "auxiliary_loss_clip": 0.01082681, + "auxiliary_loss_mlp": 0.010602, + "balance_loss_clip": 1.02336407, + "balance_loss_mlp": 1.02306271, + "epoch": 0.1874643018187284, + "flos": 32556787176960.0, + "grad_norm": 1.762504169801175, + "language_loss": 0.85533237, + "learning_rate": 3.745488945104381e-06, + "loss": 0.8767612, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.59765625, + "step": 3118, + "time_per_iteration": 2.54512619972229 + }, + { + "auxiliary_loss_clip": 0.01084295, + "auxiliary_loss_mlp": 0.01067272, + "balance_loss_clip": 1.02893388, + "balance_loss_mlp": 1.02388, + "epoch": 0.18752442507139636, + "flos": 23257042145280.0, + "grad_norm": 1.8919257838984636, + "language_loss": 0.78184927, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.80336493, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.60546875, + "step": 3119, + "time_per_iteration": 2.4746477603912354 + }, + { + "auxiliary_loss_clip": 0.01084768, + "auxiliary_loss_mlp": 0.0106419, + "balance_loss_clip": 1.02342021, + "balance_loss_mlp": 1.02265811, + "epoch": 0.18758454832406432, + "flos": 21759896689920.0, + "grad_norm": 1.5586316851429034, + "language_loss": 0.83838421, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.85987377, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.62109375, + "step": 3120, + "time_per_iteration": 2.4567296504974365 + }, + { + "auxiliary_loss_clip": 0.01081718, + "auxiliary_loss_mlp": 0.0105635, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.02281499, + "epoch": 0.1876446715767323, + "flos": 29568919956480.0, + "grad_norm": 1.6912308389007238, + "language_loss": 0.87107766, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.89245832, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.58984375, + "step": 3121, + "time_per_iteration": 2.5234107971191406 + }, + { + "auxiliary_loss_clip": 0.01082505, + "auxiliary_loss_mlp": 0.01060612, + "balance_loss_clip": 1.022084, + "balance_loss_mlp": 1.02321005, + "epoch": 0.18770479482940028, + "flos": 30338580078720.0, + "grad_norm": 1.7687768388054481, + "language_loss": 0.73104614, + "learning_rate": 3.744727910244937e-06, + "loss": 0.75247729, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.59375, + "step": 3122, + "time_per_iteration": 2.542243003845215 + }, + { + "auxiliary_loss_clip": 0.01082469, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_clip": 1.02620816, + "balance_loss_mlp": 1.02268994, + "epoch": 0.18776491808206824, + "flos": 14464480567680.0, + "grad_norm": 2.016185090872165, + "language_loss": 0.7269361, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.74839884, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.59765625, + "step": 3123, + "time_per_iteration": 2.410095453262329 + }, + { + "auxiliary_loss_clip": 0.01079774, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.01811075, + "balance_loss_mlp": 1.02104485, + "epoch": 0.1878250413347362, + "flos": 24497401484160.0, + "grad_norm": 1.8618684487130834, + "language_loss": 0.76249355, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.78383386, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5859375, + "step": 3124, + "time_per_iteration": 2.4556119441986084 + }, + { + "auxiliary_loss_clip": 0.01081597, + "auxiliary_loss_mlp": 0.0106466, + "balance_loss_clip": 1.02510655, + "balance_loss_mlp": 1.02226102, + "epoch": 0.18788516458740417, + "flos": 39784611173760.0, + "grad_norm": 1.7068599919495242, + "language_loss": 0.8250612, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.84652382, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.59375, + "step": 3125, + "time_per_iteration": 2.5839710235595703 + }, + { + "auxiliary_loss_clip": 0.01038893, + "auxiliary_loss_mlp": 0.01010345, + "balance_loss_clip": 1.00276339, + "balance_loss_mlp": 1.01709127, + "epoch": 0.18794528784007214, + "flos": 64696151738880.0, + "grad_norm": 0.95228770775427, + "language_loss": 0.63650155, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65699393, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.07568359, + "router_z_loss_mlp": 0.21875, + "step": 3126, + "time_per_iteration": 3.1437361240386963 + }, + { + "auxiliary_loss_clip": 0.01080757, + "auxiliary_loss_mlp": 0.01058995, + "balance_loss_clip": 1.02006078, + "balance_loss_mlp": 1.02312803, + "epoch": 0.1880054110927401, + "flos": 28620783630720.0, + "grad_norm": 1.523449309945607, + "language_loss": 0.8248347, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.84623218, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.578125, + "step": 3127, + "time_per_iteration": 2.474926233291626 + }, + { + "auxiliary_loss_clip": 0.01029125, + "auxiliary_loss_mlp": 0.01014344, + "balance_loss_clip": 1.00867009, + "balance_loss_mlp": 1.00902891, + "epoch": 0.1880655343454081, + "flos": 64485625040640.0, + "grad_norm": 0.7732827563800881, + "language_loss": 0.61985981, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64029443, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.20117188, + "step": 3128, + "time_per_iteration": 3.1134345531463623 + }, + { + "auxiliary_loss_clip": 0.01084577, + "auxiliary_loss_mlp": 0.01052966, + "balance_loss_clip": 1.0132457, + "balance_loss_mlp": 1.02285147, + "epoch": 0.18812565759807606, + "flos": 32123095868160.0, + "grad_norm": 1.9117802734220564, + "language_loss": 0.74230522, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.7636807, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6171875, + "step": 3129, + "time_per_iteration": 2.4954752922058105 + }, + { + "auxiliary_loss_clip": 0.01083372, + "auxiliary_loss_mlp": 0.01062488, + "balance_loss_clip": 1.02052617, + "balance_loss_mlp": 1.02337384, + "epoch": 0.18818578085074403, + "flos": 20623683536640.0, + "grad_norm": 1.7216782591208217, + "language_loss": 0.87057012, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.89202869, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6015625, + "step": 3130, + "time_per_iteration": 2.3849294185638428 + }, + { + "auxiliary_loss_clip": 0.01082307, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.0205245, + "balance_loss_mlp": 1.0220325, + "epoch": 0.188245904103412, + "flos": 28839235207680.0, + "grad_norm": 1.803559567871909, + "language_loss": 0.78691256, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.80832684, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.6015625, + "step": 3131, + "time_per_iteration": 2.454728126525879 + }, + { + "auxiliary_loss_clip": 0.01083681, + "auxiliary_loss_mlp": 0.01055931, + "balance_loss_clip": 1.01840413, + "balance_loss_mlp": 1.02564323, + "epoch": 0.18830602735607996, + "flos": 29419142757120.0, + "grad_norm": 1.831038243895863, + "language_loss": 0.83357215, + "learning_rate": 3.74282069289017e-06, + "loss": 0.85496831, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.578125, + "step": 3132, + "time_per_iteration": 2.5623202323913574 + }, + { + "auxiliary_loss_clip": 0.01087748, + "auxiliary_loss_mlp": 0.01063409, + "balance_loss_clip": 1.02397418, + "balance_loss_mlp": 1.0268743, + "epoch": 0.18836615060874792, + "flos": 28871774461440.0, + "grad_norm": 2.006780157694938, + "language_loss": 0.81906533, + "learning_rate": 3.742629607551614e-06, + "loss": 0.84057689, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.609375, + "step": 3133, + "time_per_iteration": 2.4789841175079346 + }, + { + "auxiliary_loss_clip": 0.01083065, + "auxiliary_loss_mlp": 0.01058188, + "balance_loss_clip": 1.01911139, + "balance_loss_mlp": 1.02500594, + "epoch": 0.18842627386141592, + "flos": 22600570250880.0, + "grad_norm": 1.6269043830474632, + "language_loss": 0.84168696, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.86309952, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.58203125, + "step": 3134, + "time_per_iteration": 2.4466588497161865 + }, + { + "auxiliary_loss_clip": 0.01080897, + "auxiliary_loss_mlp": 0.01057515, + "balance_loss_clip": 1.0204885, + "balance_loss_mlp": 1.02342844, + "epoch": 0.18848639711408388, + "flos": 24572394817920.0, + "grad_norm": 1.3686598160940613, + "language_loss": 0.84467357, + "learning_rate": 3.742247238639684e-06, + "loss": 0.86605769, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.57421875, + "step": 3135, + "time_per_iteration": 2.4523096084594727 + }, + { + "auxiliary_loss_clip": 0.01085028, + "auxiliary_loss_mlp": 0.0106195, + "balance_loss_clip": 1.02261126, + "balance_loss_mlp": 1.02433658, + "epoch": 0.18854652036675185, + "flos": 34165514937600.0, + "grad_norm": 1.692767507986843, + "language_loss": 0.80176973, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.82323945, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.60546875, + "step": 3136, + "time_per_iteration": 2.6902015209198 + }, + { + "auxiliary_loss_clip": 0.01081822, + "auxiliary_loss_mlp": 0.0105847, + "balance_loss_clip": 1.01958394, + "balance_loss_mlp": 1.02375031, + "epoch": 0.1886066436194198, + "flos": 24199278451200.0, + "grad_norm": 2.0229625998800307, + "language_loss": 0.83554947, + "learning_rate": 3.741864605462996e-06, + "loss": 0.85695243, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.578125, + "step": 3137, + "time_per_iteration": 2.483642339706421 + }, + { + "auxiliary_loss_clip": 0.01087958, + "auxiliary_loss_mlp": 0.01056083, + "balance_loss_clip": 1.01877069, + "balance_loss_mlp": 1.02805257, + "epoch": 0.18866676687208778, + "flos": 21250059972480.0, + "grad_norm": 1.6435975605610813, + "language_loss": 0.82577038, + "learning_rate": 3.741673189793504e-06, + "loss": 0.84721082, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.6015625, + "step": 3138, + "time_per_iteration": 2.4727394580841064 + }, + { + "auxiliary_loss_clip": 0.01085725, + "auxiliary_loss_mlp": 0.01060469, + "balance_loss_clip": 1.02136886, + "balance_loss_mlp": 1.0244664, + "epoch": 0.18872689012475574, + "flos": 37307069429760.0, + "grad_norm": 1.7730291902610371, + "language_loss": 0.65203702, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.67349893, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.61328125, + "step": 3139, + "time_per_iteration": 4.021524429321289 + }, + { + "auxiliary_loss_clip": 0.01084121, + "auxiliary_loss_mlp": 0.01067057, + "balance_loss_clip": 1.02364135, + "balance_loss_mlp": 1.02363467, + "epoch": 0.1887870133774237, + "flos": 21651246938880.0, + "grad_norm": 1.898163828307695, + "language_loss": 0.72688675, + "learning_rate": 3.741290160328514e-06, + "loss": 0.74839854, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.60546875, + "step": 3140, + "time_per_iteration": 2.4232425689697266 + }, + { + "auxiliary_loss_clip": 0.01085291, + "auxiliary_loss_mlp": 0.01064285, + "balance_loss_clip": 1.02198923, + "balance_loss_mlp": 1.02415287, + "epoch": 0.1888471366300917, + "flos": 15923745331200.0, + "grad_norm": 2.498201081769096, + "language_loss": 0.89059693, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.91209269, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.609375, + "step": 3141, + "time_per_iteration": 2.43583345413208 + }, + { + "auxiliary_loss_clip": 0.01086697, + "auxiliary_loss_mlp": 0.01068103, + "balance_loss_clip": 1.02304161, + "balance_loss_mlp": 1.02307081, + "epoch": 0.18890725988275966, + "flos": 18550959540480.0, + "grad_norm": 1.751968190766829, + "language_loss": 0.78508341, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.80663145, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 0.63671875, + "step": 3142, + "time_per_iteration": 3.8420066833496094 + }, + { + "auxiliary_loss_clip": 0.01083101, + "auxiliary_loss_mlp": 0.01057703, + "balance_loss_clip": 1.01788735, + "balance_loss_mlp": 1.02272296, + "epoch": 0.18896738313542763, + "flos": 28839584321280.0, + "grad_norm": 1.6477785775240157, + "language_loss": 0.80593175, + "learning_rate": 3.740715120924971e-06, + "loss": 0.82733977, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.6015625, + "step": 3143, + "time_per_iteration": 2.463413715362549 + }, + { + "auxiliary_loss_clip": 0.0108425, + "auxiliary_loss_mlp": 0.01067495, + "balance_loss_clip": 1.02565241, + "balance_loss_mlp": 1.02313209, + "epoch": 0.1890275063880956, + "flos": 22411830107520.0, + "grad_norm": 1.923034527201568, + "language_loss": 0.7324152, + "learning_rate": 3.740523309097912e-06, + "loss": 0.75393265, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.609375, + "step": 3144, + "time_per_iteration": 3.8124053478240967 + }, + { + "auxiliary_loss_clip": 0.01089826, + "auxiliary_loss_mlp": 0.01063362, + "balance_loss_clip": 1.02185297, + "balance_loss_mlp": 1.02646971, + "epoch": 0.18908762964076356, + "flos": 24242744960640.0, + "grad_norm": 2.451809397390331, + "language_loss": 0.77358723, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.79511917, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6328125, + "step": 3145, + "time_per_iteration": 3.8632805347442627 + }, + { + "auxiliary_loss_clip": 0.01082537, + "auxiliary_loss_mlp": 0.01058896, + "balance_loss_clip": 1.02163148, + "balance_loss_mlp": 1.02348208, + "epoch": 0.18914775289343153, + "flos": 16981962773760.0, + "grad_norm": 2.2480848927304735, + "language_loss": 0.78846687, + "learning_rate": 3.740139487448616e-06, + "loss": 0.80988121, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.58984375, + "step": 3146, + "time_per_iteration": 2.385925769805908 + }, + { + "auxiliary_loss_clip": 0.01084327, + "auxiliary_loss_mlp": 0.01063931, + "balance_loss_clip": 1.02237451, + "balance_loss_mlp": 1.02395523, + "epoch": 0.1892078761460995, + "flos": 21542701921920.0, + "grad_norm": 1.6969528198107229, + "language_loss": 0.80307269, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.82455528, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.60546875, + "step": 3147, + "time_per_iteration": 2.4046342372894287 + }, + { + "auxiliary_loss_clip": 0.01084084, + "auxiliary_loss_mlp": 0.01061577, + "balance_loss_clip": 1.02221406, + "balance_loss_mlp": 1.02401185, + "epoch": 0.18926799939876748, + "flos": 23000465496960.0, + "grad_norm": 2.925036254314726, + "language_loss": 0.70171297, + "learning_rate": 3.739755401854267e-06, + "loss": 0.72316957, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.6015625, + "step": 3148, + "time_per_iteration": 2.4274344444274902 + }, + { + "auxiliary_loss_clip": 0.01083246, + "auxiliary_loss_mlp": 0.01061048, + "balance_loss_clip": 1.02077913, + "balance_loss_mlp": 1.02347052, + "epoch": 0.18932812265143545, + "flos": 22271932823040.0, + "grad_norm": 2.2448464111302906, + "language_loss": 0.78012884, + "learning_rate": 3.739563260095902e-06, + "loss": 0.80157179, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.59765625, + "step": 3149, + "time_per_iteration": 2.4002270698547363 + }, + { + "auxiliary_loss_clip": 0.01082593, + "auxiliary_loss_mlp": 0.01063673, + "balance_loss_clip": 1.02609801, + "balance_loss_mlp": 1.02527165, + "epoch": 0.1893882459041034, + "flos": 18623439256320.0, + "grad_norm": 2.054983423126217, + "language_loss": 0.83103019, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.85249287, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5703125, + "step": 3150, + "time_per_iteration": 2.3827619552612305 + }, + { + "auxiliary_loss_clip": 0.01087494, + "auxiliary_loss_mlp": 0.0106954, + "balance_loss_clip": 1.0281502, + "balance_loss_mlp": 1.02588153, + "epoch": 0.18944836915677138, + "flos": 22891885568640.0, + "grad_norm": 1.9397348409952988, + "language_loss": 0.87356341, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.89513373, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6171875, + "step": 3151, + "time_per_iteration": 2.4008162021636963 + }, + { + "auxiliary_loss_clip": 0.01083739, + "auxiliary_loss_mlp": 0.01059437, + "balance_loss_clip": 1.01859593, + "balance_loss_mlp": 1.02279735, + "epoch": 0.18950849240943934, + "flos": 26795349861120.0, + "grad_norm": 1.749166141950305, + "language_loss": 0.75783861, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.77927041, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.609375, + "step": 3152, + "time_per_iteration": 2.4389827251434326 + }, + { + "auxiliary_loss_clip": 0.01084824, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_clip": 1.02688563, + "balance_loss_mlp": 1.02342331, + "epoch": 0.1895686156621073, + "flos": 24970125559680.0, + "grad_norm": 1.8917255608633212, + "language_loss": 0.77252138, + "learning_rate": 3.738794033491209e-06, + "loss": 0.79408598, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.6171875, + "step": 3153, + "time_per_iteration": 2.417081594467163 + }, + { + "auxiliary_loss_clip": 0.01086025, + "auxiliary_loss_mlp": 0.01060107, + "balance_loss_clip": 1.0199573, + "balance_loss_mlp": 1.02327836, + "epoch": 0.1896287389147753, + "flos": 21943469952000.0, + "grad_norm": 1.6891459572157945, + "language_loss": 0.81947798, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.84093928, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.62890625, + "step": 3154, + "time_per_iteration": 2.400468349456787 + }, + { + "auxiliary_loss_clip": 0.01088576, + "auxiliary_loss_mlp": 0.01069063, + "balance_loss_clip": 1.02488387, + "balance_loss_mlp": 1.0237596, + "epoch": 0.18968886216744327, + "flos": 18178297021440.0, + "grad_norm": 2.68237654687604, + "language_loss": 0.74585426, + "learning_rate": 3.738409024548223e-06, + "loss": 0.76743066, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.6484375, + "step": 3155, + "time_per_iteration": 2.349619150161743 + }, + { + "auxiliary_loss_clip": 0.01081762, + "auxiliary_loss_mlp": 0.01063729, + "balance_loss_clip": 1.02315044, + "balance_loss_mlp": 1.02136016, + "epoch": 0.18974898542011123, + "flos": 20411446181760.0, + "grad_norm": 1.6083172181443763, + "language_loss": 0.75737143, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.77882636, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6015625, + "step": 3156, + "time_per_iteration": 2.4029552936553955 + }, + { + "auxiliary_loss_clip": 0.01087632, + "auxiliary_loss_mlp": 0.0106095, + "balance_loss_clip": 1.01781976, + "balance_loss_mlp": 1.02517509, + "epoch": 0.1898091086727792, + "flos": 23983968896640.0, + "grad_norm": 1.692904118807614, + "language_loss": 0.69694459, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.7184304, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 0.625, + "step": 3157, + "time_per_iteration": 2.414433002471924 + }, + { + "auxiliary_loss_clip": 0.0108414, + "auxiliary_loss_mlp": 0.01065181, + "balance_loss_clip": 1.02164578, + "balance_loss_mlp": 1.02156997, + "epoch": 0.18986923192544716, + "flos": 27635813953920.0, + "grad_norm": 1.7326408635362065, + "language_loss": 0.82271975, + "learning_rate": 3.737831016747176e-06, + "loss": 0.84421295, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.625, + "step": 3158, + "time_per_iteration": 2.458814859390259 + }, + { + "auxiliary_loss_clip": 0.01091774, + "auxiliary_loss_mlp": 0.01069028, + "balance_loss_clip": 1.01986611, + "balance_loss_mlp": 1.02480114, + "epoch": 0.18992935517811513, + "flos": 25482964654080.0, + "grad_norm": 1.6702818893640745, + "language_loss": 0.74299872, + "learning_rate": 3.737638215672964e-06, + "loss": 0.76460677, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.671875, + "step": 3159, + "time_per_iteration": 2.4202733039855957 + }, + { + "auxiliary_loss_clip": 0.01086199, + "auxiliary_loss_mlp": 0.01073915, + "balance_loss_clip": 1.02968824, + "balance_loss_mlp": 1.02293503, + "epoch": 0.1899894784307831, + "flos": 17419843445760.0, + "grad_norm": 2.27787327405281, + "language_loss": 0.87633908, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.89794028, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.6328125, + "step": 3160, + "time_per_iteration": 2.3778696060180664 + }, + { + "auxiliary_loss_clip": 0.01080756, + "auxiliary_loss_mlp": 0.0106487, + "balance_loss_clip": 1.02445817, + "balance_loss_mlp": 1.02168226, + "epoch": 0.19004960168345109, + "flos": 27490959256320.0, + "grad_norm": 1.8789033275510132, + "language_loss": 0.75447452, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.77593076, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.58984375, + "step": 3161, + "time_per_iteration": 2.443905830383301 + }, + { + "auxiliary_loss_clip": 0.01084776, + "auxiliary_loss_mlp": 0.01068556, + "balance_loss_clip": 1.02890694, + "balance_loss_mlp": 1.024212, + "epoch": 0.19010972493611905, + "flos": 38653145965440.0, + "grad_norm": 1.545742370115131, + "language_loss": 0.82422173, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.84575498, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.60546875, + "step": 3162, + "time_per_iteration": 2.57304048538208 + }, + { + "auxiliary_loss_clip": 0.01084004, + "auxiliary_loss_mlp": 0.01065535, + "balance_loss_clip": 1.02178514, + "balance_loss_mlp": 1.02208757, + "epoch": 0.19016984818878702, + "flos": 19243741115520.0, + "grad_norm": 2.1445804524552994, + "language_loss": 0.77616632, + "learning_rate": 3.73686635253511e-06, + "loss": 0.79766172, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6171875, + "step": 3163, + "time_per_iteration": 2.389211416244507 + }, + { + "auxiliary_loss_clip": 0.01081251, + "auxiliary_loss_mlp": 0.01062002, + "balance_loss_clip": 1.02232957, + "balance_loss_mlp": 1.022012, + "epoch": 0.19022997144145498, + "flos": 37595382370560.0, + "grad_norm": 1.6088192648577622, + "language_loss": 0.7616421, + "learning_rate": 3.736673222076982e-06, + "loss": 0.78307462, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.59375, + "step": 3164, + "time_per_iteration": 2.5413148403167725 + }, + { + "auxiliary_loss_clip": 0.01084558, + "auxiliary_loss_mlp": 0.0106435, + "balance_loss_clip": 1.02451026, + "balance_loss_mlp": 1.02317524, + "epoch": 0.19029009469412295, + "flos": 61528762840320.0, + "grad_norm": 1.8716228735454377, + "language_loss": 0.6847474, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.70623648, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.61328125, + "step": 3165, + "time_per_iteration": 2.746187448501587 + }, + { + "auxiliary_loss_clip": 0.01086401, + "auxiliary_loss_mlp": 0.01065453, + "balance_loss_clip": 1.02294326, + "balance_loss_mlp": 1.02486587, + "epoch": 0.1903502179467909, + "flos": 13953980534400.0, + "grad_norm": 2.2561514765800816, + "language_loss": 0.75709558, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.77861404, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.6171875, + "step": 3166, + "time_per_iteration": 2.356621026992798 + }, + { + "auxiliary_loss_clip": 0.01036869, + "auxiliary_loss_mlp": 0.0101899, + "balance_loss_clip": 1.01150417, + "balance_loss_mlp": 1.01345515, + "epoch": 0.1904103411994589, + "flos": 66896168152320.0, + "grad_norm": 0.7926581973361835, + "language_loss": 0.50509405, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52565265, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.07470703, + "router_z_loss_mlp": 0.234375, + "step": 3167, + "time_per_iteration": 3.049598455429077 + }, + { + "auxiliary_loss_clip": 0.01085627, + "auxiliary_loss_mlp": 0.01063135, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.02472866, + "epoch": 0.19047046445212687, + "flos": 21907649030400.0, + "grad_norm": 2.1419219771123976, + "language_loss": 0.75736415, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.77885181, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.609375, + "step": 3168, + "time_per_iteration": 2.4138827323913574 + }, + { + "auxiliary_loss_clip": 0.01032128, + "auxiliary_loss_mlp": 0.01006693, + "balance_loss_clip": 1.00001717, + "balance_loss_mlp": 1.00868666, + "epoch": 0.19053058770479483, + "flos": 59252424595200.0, + "grad_norm": 0.8623446191001776, + "language_loss": 0.60113782, + "learning_rate": 3.73570658211056e-06, + "loss": 0.621526, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.06689453, + "router_z_loss_mlp": 0.234375, + "step": 3169, + "time_per_iteration": 2.9420535564422607 + }, + { + "auxiliary_loss_clip": 0.01089063, + "auxiliary_loss_mlp": 0.0106774, + "balance_loss_clip": 1.02413309, + "balance_loss_mlp": 1.02463043, + "epoch": 0.1905907109574628, + "flos": 23950172833920.0, + "grad_norm": 1.5010922674037304, + "language_loss": 0.80275166, + "learning_rate": 3.735513056633436e-06, + "loss": 0.82431972, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.64453125, + "step": 3170, + "time_per_iteration": 2.4400479793548584 + }, + { + "auxiliary_loss_clip": 0.01081967, + "auxiliary_loss_mlp": 0.01061914, + "balance_loss_clip": 1.02326584, + "balance_loss_mlp": 1.02309084, + "epoch": 0.19065083421013077, + "flos": 20811306516480.0, + "grad_norm": 1.8604138093733116, + "language_loss": 0.79856074, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.81999958, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5859375, + "step": 3171, + "time_per_iteration": 2.4013826847076416 + }, + { + "auxiliary_loss_clip": 0.01086407, + "auxiliary_loss_mlp": 0.01066253, + "balance_loss_clip": 1.02224112, + "balance_loss_mlp": 1.02432704, + "epoch": 0.19071095746279873, + "flos": 31283644204800.0, + "grad_norm": 2.061955186713241, + "language_loss": 0.81879348, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.84031999, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.62109375, + "step": 3172, + "time_per_iteration": 2.482989549636841 + }, + { + "auxiliary_loss_clip": 0.01084953, + "auxiliary_loss_mlp": 0.01067299, + "balance_loss_clip": 1.02438331, + "balance_loss_mlp": 1.02468586, + "epoch": 0.1907710807154667, + "flos": 14355237323520.0, + "grad_norm": 1.5297332414669371, + "language_loss": 0.82102787, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.8425504, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.6015625, + "step": 3173, + "time_per_iteration": 2.3886890411376953 + }, + { + "auxiliary_loss_clip": 0.01085202, + "auxiliary_loss_mlp": 0.01064, + "balance_loss_clip": 1.02415991, + "balance_loss_mlp": 1.02440619, + "epoch": 0.1908312039681347, + "flos": 26905815002880.0, + "grad_norm": 1.4634268782381288, + "language_loss": 0.80777693, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.82926893, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.609375, + "step": 3174, + "time_per_iteration": 2.466651201248169 + }, + { + "auxiliary_loss_clip": 0.01086902, + "auxiliary_loss_mlp": 0.01070027, + "balance_loss_clip": 1.02816033, + "balance_loss_mlp": 1.02515626, + "epoch": 0.19089132722080265, + "flos": 14494017444480.0, + "grad_norm": 1.8229771460757147, + "language_loss": 0.82319987, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.84476912, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6171875, + "step": 3175, + "time_per_iteration": 2.3872625827789307 + }, + { + "auxiliary_loss_clip": 0.01086228, + "auxiliary_loss_mlp": 0.01065538, + "balance_loss_clip": 1.02624691, + "balance_loss_mlp": 1.02434468, + "epoch": 0.19095145047347062, + "flos": 13952060409600.0, + "grad_norm": 2.0588505950047, + "language_loss": 0.87643075, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.89794838, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.6171875, + "step": 3176, + "time_per_iteration": 2.383037567138672 + }, + { + "auxiliary_loss_clip": 0.01088031, + "auxiliary_loss_mlp": 0.01061442, + "balance_loss_clip": 1.01838374, + "balance_loss_mlp": 1.02613175, + "epoch": 0.19101157372613858, + "flos": 25300648200960.0, + "grad_norm": 2.7567848265313644, + "language_loss": 0.82828796, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.84978265, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.62109375, + "step": 3177, + "time_per_iteration": 2.4586124420166016 + }, + { + "auxiliary_loss_clip": 0.01081206, + "auxiliary_loss_mlp": 0.0105969, + "balance_loss_clip": 1.02066112, + "balance_loss_mlp": 1.02247441, + "epoch": 0.19107169697880655, + "flos": 20557173663360.0, + "grad_norm": 2.095976919705923, + "language_loss": 0.77960569, + "learning_rate": 3.73396248424356e-06, + "loss": 0.80101466, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5859375, + "step": 3178, + "time_per_iteration": 2.4105772972106934 + }, + { + "auxiliary_loss_clip": 0.01085461, + "auxiliary_loss_mlp": 0.01050358, + "balance_loss_clip": 1.01337957, + "balance_loss_mlp": 1.02576017, + "epoch": 0.19113182023147451, + "flos": 22162130997120.0, + "grad_norm": 1.7153744981735082, + "language_loss": 0.82693523, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.84829342, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.59765625, + "step": 3179, + "time_per_iteration": 3.856678009033203 + }, + { + "auxiliary_loss_clip": 0.01086743, + "auxiliary_loss_mlp": 0.01065372, + "balance_loss_clip": 1.02634263, + "balance_loss_mlp": 1.02481163, + "epoch": 0.19119194348414248, + "flos": 18580985176320.0, + "grad_norm": 2.2561959521275896, + "language_loss": 0.81299472, + "learning_rate": 3.733574183478691e-06, + "loss": 0.83451587, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.6171875, + "step": 3180, + "time_per_iteration": 2.381539821624756 + }, + { + "auxiliary_loss_clip": 0.0108433, + "auxiliary_loss_mlp": 0.01065505, + "balance_loss_clip": 1.02487862, + "balance_loss_mlp": 1.02412152, + "epoch": 0.19125206673681047, + "flos": 19025603740800.0, + "grad_norm": 2.232466889612744, + "language_loss": 0.81384355, + "learning_rate": 3.733379934486615e-06, + "loss": 0.83534187, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6015625, + "step": 3181, + "time_per_iteration": 3.77317476272583 + }, + { + "auxiliary_loss_clip": 0.01085552, + "auxiliary_loss_mlp": 0.01059295, + "balance_loss_clip": 1.02119613, + "balance_loss_mlp": 1.02413929, + "epoch": 0.19131218998947844, + "flos": 21689057808000.0, + "grad_norm": 1.643304310203256, + "language_loss": 0.75847828, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.77992678, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.61328125, + "step": 3182, + "time_per_iteration": 2.423046350479126 + }, + { + "auxiliary_loss_clip": 0.01084635, + "auxiliary_loss_mlp": 0.01065519, + "balance_loss_clip": 1.02567887, + "balance_loss_mlp": 1.02418804, + "epoch": 0.1913723132421464, + "flos": 18441506828160.0, + "grad_norm": 2.0691802698709703, + "language_loss": 0.66590953, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.68741107, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.60546875, + "step": 3183, + "time_per_iteration": 3.8191378116607666 + }, + { + "auxiliary_loss_clip": 0.01084228, + "auxiliary_loss_mlp": 0.01062663, + "balance_loss_clip": 1.02186954, + "balance_loss_mlp": 1.02354097, + "epoch": 0.19143243649481437, + "flos": 27158935426560.0, + "grad_norm": 1.555961764338503, + "language_loss": 0.74122155, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.76269042, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.60546875, + "step": 3184, + "time_per_iteration": 3.8376667499542236 + }, + { + "auxiliary_loss_clip": 0.0108341, + "auxiliary_loss_mlp": 0.01067157, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.02270222, + "epoch": 0.19149255974748233, + "flos": 21718071014400.0, + "grad_norm": 1.7417618556848182, + "language_loss": 0.89541709, + "learning_rate": 3.732602281292598e-06, + "loss": 0.91692269, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.609375, + "step": 3185, + "time_per_iteration": 2.451084613800049 + }, + { + "auxiliary_loss_clip": 0.01083045, + "auxiliary_loss_mlp": 0.01062207, + "balance_loss_clip": 1.02141356, + "balance_loss_mlp": 1.02219248, + "epoch": 0.1915526830001503, + "flos": 22962270602880.0, + "grad_norm": 1.8510129003467697, + "language_loss": 0.74276721, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.76421976, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.609375, + "step": 3186, + "time_per_iteration": 2.4126322269439697 + }, + { + "auxiliary_loss_clip": 0.01087031, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_clip": 1.02829027, + "balance_loss_mlp": 1.02476883, + "epoch": 0.1916128062528183, + "flos": 26139541282560.0, + "grad_norm": 1.8053096043223287, + "language_loss": 0.85234058, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.87391943, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.62109375, + "step": 3187, + "time_per_iteration": 2.4239084720611572 + }, + { + "auxiliary_loss_clip": 0.01028324, + "auxiliary_loss_mlp": 0.01006342, + "balance_loss_clip": 1.00040507, + "balance_loss_mlp": 1.00807357, + "epoch": 0.19167292950548626, + "flos": 54922809847680.0, + "grad_norm": 0.8544133304846683, + "language_loss": 0.55913365, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57948029, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.05932617, + "router_z_loss_mlp": 0.203125, + "step": 3188, + "time_per_iteration": 3.097135066986084 + }, + { + "auxiliary_loss_clip": 0.01083165, + "auxiliary_loss_mlp": 0.01071926, + "balance_loss_clip": 1.02915382, + "balance_loss_mlp": 1.02323508, + "epoch": 0.19173305275815422, + "flos": 29934286001280.0, + "grad_norm": 1.7739328743056146, + "language_loss": 0.71771872, + "learning_rate": 3.731823576891397e-06, + "loss": 0.73926967, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.59765625, + "step": 3189, + "time_per_iteration": 2.5172438621520996 + }, + { + "auxiliary_loss_clip": 0.01079141, + "auxiliary_loss_mlp": 0.01060974, + "balance_loss_clip": 1.02232599, + "balance_loss_mlp": 1.02189255, + "epoch": 0.1917931760108222, + "flos": 24751359780480.0, + "grad_norm": 1.9957434834599905, + "language_loss": 0.75773591, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.77913707, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5703125, + "step": 3190, + "time_per_iteration": 2.434150218963623 + }, + { + "auxiliary_loss_clip": 0.01081405, + "auxiliary_loss_mlp": 0.0107479, + "balance_loss_clip": 1.03394938, + "balance_loss_mlp": 1.02324557, + "epoch": 0.19185329926349015, + "flos": 18842554149120.0, + "grad_norm": 1.8505138907992102, + "language_loss": 0.85646605, + "learning_rate": 3.73143383063572e-06, + "loss": 0.87802804, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.58203125, + "step": 3191, + "time_per_iteration": 2.3662095069885254 + }, + { + "auxiliary_loss_clip": 0.01079372, + "auxiliary_loss_mlp": 0.01058965, + "balance_loss_clip": 1.01891088, + "balance_loss_mlp": 1.02209711, + "epoch": 0.19191342251615812, + "flos": 22085880854400.0, + "grad_norm": 1.7640933763799815, + "language_loss": 0.9113009, + "learning_rate": 3.73123885901997e-06, + "loss": 0.9326843, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.57421875, + "step": 3192, + "time_per_iteration": 2.457148313522339 + }, + { + "auxiliary_loss_clip": 0.01085946, + "auxiliary_loss_mlp": 0.0106617, + "balance_loss_clip": 1.02361178, + "balance_loss_mlp": 1.02456343, + "epoch": 0.19197354576882608, + "flos": 22198056652800.0, + "grad_norm": 1.7622438233253315, + "language_loss": 0.76553321, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.78705442, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.61328125, + "step": 3193, + "time_per_iteration": 2.3855626583099365 + }, + { + "auxiliary_loss_clip": 0.01084739, + "auxiliary_loss_mlp": 0.01064267, + "balance_loss_clip": 1.02371192, + "balance_loss_mlp": 1.02173471, + "epoch": 0.19203366902149407, + "flos": 24895132225920.0, + "grad_norm": 1.979192657184348, + "language_loss": 0.76786828, + "learning_rate": 3.730848718849612e-06, + "loss": 0.78935832, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6328125, + "step": 3194, + "time_per_iteration": 2.4153823852539062 + }, + { + "auxiliary_loss_clip": 0.01023382, + "auxiliary_loss_mlp": 0.01010983, + "balance_loss_clip": 1.00430715, + "balance_loss_mlp": 1.00406742, + "epoch": 0.19209379227416204, + "flos": 68413633885440.0, + "grad_norm": 0.7797867732910367, + "language_loss": 0.68477917, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.06689453, + "router_z_loss_mlp": 0.19335938, + "step": 3195, + "time_per_iteration": 2.969627857208252 + }, + { + "auxiliary_loss_clip": 0.01081339, + "auxiliary_loss_mlp": 0.01067672, + "balance_loss_clip": 1.02787995, + "balance_loss_mlp": 1.02209496, + "epoch": 0.19215391552683, + "flos": 22054074739200.0, + "grad_norm": 1.808216375548017, + "language_loss": 0.75313151, + "learning_rate": 3.730458316143429e-06, + "loss": 0.77462161, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.59375, + "step": 3196, + "time_per_iteration": 2.415837526321411 + }, + { + "auxiliary_loss_clip": 0.01085512, + "auxiliary_loss_mlp": 0.01067906, + "balance_loss_clip": 1.02549171, + "balance_loss_mlp": 1.02533531, + "epoch": 0.19221403877949797, + "flos": 20301923646720.0, + "grad_norm": 2.3731255235729223, + "language_loss": 0.84916329, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.8706975, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.6015625, + "step": 3197, + "time_per_iteration": 2.389824628829956 + }, + { + "auxiliary_loss_clip": 0.01083172, + "auxiliary_loss_mlp": 0.01063367, + "balance_loss_clip": 1.02259755, + "balance_loss_mlp": 1.02283478, + "epoch": 0.19227416203216594, + "flos": 23184213315840.0, + "grad_norm": 2.0908205694070374, + "language_loss": 0.83494025, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.85640568, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.60546875, + "step": 3198, + "time_per_iteration": 2.423679828643799 + }, + { + "auxiliary_loss_clip": 0.01085099, + "auxiliary_loss_mlp": 0.01064407, + "balance_loss_clip": 1.02308929, + "balance_loss_mlp": 1.02354372, + "epoch": 0.1923342852848339, + "flos": 25775397135360.0, + "grad_norm": 1.7667595151577624, + "language_loss": 0.80170572, + "learning_rate": 3.729872219959029e-06, + "loss": 0.82320082, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.61328125, + "step": 3199, + "time_per_iteration": 2.450272798538208 + }, + { + "auxiliary_loss_clip": 0.01083197, + "auxiliary_loss_mlp": 0.01054686, + "balance_loss_clip": 1.01706314, + "balance_loss_mlp": 1.02469397, + "epoch": 0.19239440853750187, + "flos": 17127410964480.0, + "grad_norm": 3.1141269632375814, + "language_loss": 0.8540597, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.87543845, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5859375, + "step": 3200, + "time_per_iteration": 2.3859081268310547 + }, + { + "auxiliary_loss_clip": 0.01083398, + "auxiliary_loss_mlp": 0.01060581, + "balance_loss_clip": 1.02224326, + "balance_loss_mlp": 1.02414691, + "epoch": 0.19245453179016986, + "flos": 16434175541760.0, + "grad_norm": 2.105446510890205, + "language_loss": 0.80863166, + "learning_rate": 3.729481161172443e-06, + "loss": 0.83007145, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.59375, + "step": 3201, + "time_per_iteration": 2.3685388565063477 + }, + { + "auxiliary_loss_clip": 0.01082682, + "auxiliary_loss_mlp": 0.01057986, + "balance_loss_clip": 1.01831293, + "balance_loss_mlp": 1.022053, + "epoch": 0.19251465504283782, + "flos": 20229234462720.0, + "grad_norm": 2.367856076724108, + "language_loss": 0.71128309, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.73268974, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.60546875, + "step": 3202, + "time_per_iteration": 2.396688222885132 + }, + { + "auxiliary_loss_clip": 0.01081677, + "auxiliary_loss_mlp": 0.01058993, + "balance_loss_clip": 1.02082276, + "balance_loss_mlp": 1.02357626, + "epoch": 0.1925747782955058, + "flos": 19463344767360.0, + "grad_norm": 1.7449268271163034, + "language_loss": 0.92699647, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.94840318, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.58203125, + "step": 3203, + "time_per_iteration": 2.3831541538238525 + }, + { + "auxiliary_loss_clip": 0.01083652, + "auxiliary_loss_mlp": 0.01059027, + "balance_loss_clip": 1.01873422, + "balance_loss_mlp": 1.02430594, + "epoch": 0.19263490154817375, + "flos": 17784615997440.0, + "grad_norm": 2.191808487051013, + "language_loss": 0.84436929, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.86579603, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.59375, + "step": 3204, + "time_per_iteration": 2.389613628387451 + }, + { + "auxiliary_loss_clip": 0.01082067, + "auxiliary_loss_mlp": 0.01058398, + "balance_loss_clip": 1.01972651, + "balance_loss_mlp": 1.02440012, + "epoch": 0.19269502480084172, + "flos": 17456118215040.0, + "grad_norm": 2.0943506648191383, + "language_loss": 0.77907646, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.80048108, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.578125, + "step": 3205, + "time_per_iteration": 2.3694772720336914 + }, + { + "auxiliary_loss_clip": 0.01084801, + "auxiliary_loss_mlp": 0.01061508, + "balance_loss_clip": 1.02245498, + "balance_loss_mlp": 1.02421451, + "epoch": 0.19275514805350968, + "flos": 21505833659520.0, + "grad_norm": 2.40647781046479, + "language_loss": 0.86815929, + "learning_rate": 3.728502366649107e-06, + "loss": 0.88962233, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.609375, + "step": 3206, + "time_per_iteration": 2.400550365447998 + }, + { + "auxiliary_loss_clip": 0.01023745, + "auxiliary_loss_mlp": 0.01011363, + "balance_loss_clip": 1.00633216, + "balance_loss_mlp": 1.00745964, + "epoch": 0.19281527130617768, + "flos": 47693379928320.0, + "grad_norm": 0.8649513357461319, + "language_loss": 0.60694253, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62729359, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.05029297, + "router_z_loss_mlp": 0.16308594, + "step": 3207, + "time_per_iteration": 2.8528149127960205 + }, + { + "auxiliary_loss_clip": 0.01083167, + "auxiliary_loss_mlp": 0.01058959, + "balance_loss_clip": 1.02002525, + "balance_loss_mlp": 1.02385688, + "epoch": 0.19287539455884564, + "flos": 11800467918720.0, + "grad_norm": 2.028885252293734, + "language_loss": 0.78113526, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.80255651, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.59375, + "step": 3208, + "time_per_iteration": 2.400811195373535 + }, + { + "auxiliary_loss_clip": 0.01085929, + "auxiliary_loss_mlp": 0.01062822, + "balance_loss_clip": 1.02317262, + "balance_loss_mlp": 1.02446091, + "epoch": 0.1929355178115136, + "flos": 20630386517760.0, + "grad_norm": 1.9321199987250708, + "language_loss": 0.62718225, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.64866972, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.6171875, + "step": 3209, + "time_per_iteration": 2.410543918609619 + }, + { + "auxiliary_loss_clip": 0.01085459, + "auxiliary_loss_mlp": 0.01073398, + "balance_loss_clip": 1.02960062, + "balance_loss_mlp": 1.02472281, + "epoch": 0.19299564106418157, + "flos": 40806309467520.0, + "grad_norm": 1.8046361334581782, + "language_loss": 0.83063042, + "learning_rate": 3.727718151176243e-06, + "loss": 0.85221899, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.609375, + "step": 3210, + "time_per_iteration": 2.5702404975891113 + }, + { + "auxiliary_loss_clip": 0.01080992, + "auxiliary_loss_mlp": 0.01060753, + "balance_loss_clip": 1.02408397, + "balance_loss_mlp": 1.02318263, + "epoch": 0.19305576431684954, + "flos": 11360702033280.0, + "grad_norm": 1.9242553054528904, + "language_loss": 0.84630072, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.8677181, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.578125, + "step": 3211, + "time_per_iteration": 2.3886914253234863 + }, + { + "auxiliary_loss_clip": 0.01023072, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.02231717, + "balance_loss_mlp": 1.00753307, + "epoch": 0.1931158875695175, + "flos": 54509299171200.0, + "grad_norm": 0.9825845950165659, + "language_loss": 0.63727278, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65777385, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.04711914, + "router_z_loss_mlp": 0.15527344, + "step": 3212, + "time_per_iteration": 2.906512498855591 + }, + { + "auxiliary_loss_clip": 0.01084352, + "auxiliary_loss_mlp": 0.01073715, + "balance_loss_clip": 1.03747559, + "balance_loss_mlp": 1.02555978, + "epoch": 0.19317601082218547, + "flos": 19827419091840.0, + "grad_norm": 1.5503197983534085, + "language_loss": 0.77502429, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.79660499, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5859375, + "step": 3213, + "time_per_iteration": 2.5043790340423584 + }, + { + "auxiliary_loss_clip": 0.01087032, + "auxiliary_loss_mlp": 0.0106132, + "balance_loss_clip": 1.02171826, + "balance_loss_mlp": 1.0256449, + "epoch": 0.19323613407485346, + "flos": 13151222576640.0, + "grad_norm": 1.9578775752113764, + "language_loss": 0.72586149, + "learning_rate": 3.726932887459503e-06, + "loss": 0.74734497, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.6171875, + "step": 3214, + "time_per_iteration": 2.381150245666504 + }, + { + "auxiliary_loss_clip": 0.01084181, + "auxiliary_loss_mlp": 0.010619, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.02409434, + "epoch": 0.19329625732752143, + "flos": 14026390427520.0, + "grad_norm": 2.3404290889292625, + "language_loss": 0.77295029, + "learning_rate": 3.72673640779803e-06, + "loss": 0.79441112, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.6015625, + "step": 3215, + "time_per_iteration": 2.3550455570220947 + }, + { + "auxiliary_loss_clip": 0.01081176, + "auxiliary_loss_mlp": 0.01060444, + "balance_loss_clip": 1.02403808, + "balance_loss_mlp": 1.02485204, + "epoch": 0.1933563805801894, + "flos": 23440580496000.0, + "grad_norm": 2.02210099837734, + "language_loss": 0.89340574, + "learning_rate": 3.72653986265854e-06, + "loss": 0.91482198, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5625, + "step": 3216, + "time_per_iteration": 2.44232439994812 + }, + { + "auxiliary_loss_clip": 0.01084926, + "auxiliary_loss_mlp": 0.01065443, + "balance_loss_clip": 1.02596092, + "balance_loss_mlp": 1.02718139, + "epoch": 0.19341650383285736, + "flos": 20484275011200.0, + "grad_norm": 1.6429507209507557, + "language_loss": 0.81823635, + "learning_rate": 3.726343252048485e-06, + "loss": 0.83974004, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.578125, + "step": 3217, + "time_per_iteration": 2.4140210151672363 + }, + { + "auxiliary_loss_clip": 0.01085421, + "auxiliary_loss_mlp": 0.01066515, + "balance_loss_clip": 1.02259851, + "balance_loss_mlp": 1.02481484, + "epoch": 0.19347662708552532, + "flos": 17857514649600.0, + "grad_norm": 2.5065899131835443, + "language_loss": 0.66955304, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.6910724, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.60546875, + "step": 3218, + "time_per_iteration": 2.3839337825775146 + }, + { + "auxiliary_loss_clip": 0.01086285, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_clip": 1.02273226, + "balance_loss_mlp": 1.02641344, + "epoch": 0.1935367503381933, + "flos": 18186256811520.0, + "grad_norm": 1.6347597427004064, + "language_loss": 0.81344879, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.83495927, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.59765625, + "step": 3219, + "time_per_iteration": 3.8571202754974365 + }, + { + "auxiliary_loss_clip": 0.01083486, + "auxiliary_loss_mlp": 0.01061873, + "balance_loss_clip": 1.02298713, + "balance_loss_mlp": 1.0255233, + "epoch": 0.19359687359086128, + "flos": 15956319496320.0, + "grad_norm": 1.8562803379847965, + "language_loss": 0.8842082, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.90566182, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.578125, + "step": 3220, + "time_per_iteration": 3.7959933280944824 + }, + { + "auxiliary_loss_clip": 0.01079554, + "auxiliary_loss_mlp": 0.01056569, + "balance_loss_clip": 1.02137828, + "balance_loss_mlp": 1.02504492, + "epoch": 0.19365699684352924, + "flos": 21214134316800.0, + "grad_norm": 1.933790202867839, + "language_loss": 0.85026824, + "learning_rate": 3.725556155051766e-06, + "loss": 0.87162948, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3221, + "time_per_iteration": 2.410780191421509 + }, + { + "auxiliary_loss_clip": 0.01082433, + "auxiliary_loss_mlp": 0.0106338, + "balance_loss_clip": 1.02728367, + "balance_loss_mlp": 1.02499032, + "epoch": 0.1937171200961972, + "flos": 17310146353920.0, + "grad_norm": 2.1849648010151865, + "language_loss": 0.88021344, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.90167159, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.57421875, + "step": 3222, + "time_per_iteration": 3.8178672790527344 + }, + { + "auxiliary_loss_clip": 0.01082276, + "auxiliary_loss_mlp": 0.01060369, + "balance_loss_clip": 1.01955175, + "balance_loss_mlp": 1.02333605, + "epoch": 0.19377724334886517, + "flos": 22634924895360.0, + "grad_norm": 1.9763383561560408, + "language_loss": 0.79929829, + "learning_rate": 3.72516221392398e-06, + "loss": 0.82072473, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.58984375, + "step": 3223, + "time_per_iteration": 2.4038100242614746 + }, + { + "auxiliary_loss_clip": 0.01081392, + "auxiliary_loss_mlp": 0.01053121, + "balance_loss_clip": 1.01652336, + "balance_loss_mlp": 1.02402794, + "epoch": 0.19383736660153314, + "flos": 15077136839040.0, + "grad_norm": 1.9762995118129922, + "language_loss": 0.77114952, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.79249465, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.57421875, + "step": 3224, + "time_per_iteration": 3.8199617862701416 + }, + { + "auxiliary_loss_clip": 0.01082199, + "auxiliary_loss_mlp": 0.01061036, + "balance_loss_clip": 1.02167296, + "balance_loss_mlp": 1.02512729, + "epoch": 0.1938974898542011, + "flos": 47118152367360.0, + "grad_norm": 1.9528349696147782, + "language_loss": 0.73254651, + "learning_rate": 3.7247680111229e-06, + "loss": 0.75397885, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5703125, + "step": 3225, + "time_per_iteration": 2.6062021255493164 + }, + { + "auxiliary_loss_clip": 0.01083732, + "auxiliary_loss_mlp": 0.01055309, + "balance_loss_clip": 1.01565969, + "balance_loss_mlp": 1.02460873, + "epoch": 0.19395761310686907, + "flos": 25811357702400.0, + "grad_norm": 2.0575037467472184, + "language_loss": 0.70927036, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.73066068, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.58984375, + "step": 3226, + "time_per_iteration": 2.4351251125335693 + }, + { + "auxiliary_loss_clip": 0.01086455, + "auxiliary_loss_mlp": 0.01060883, + "balance_loss_clip": 1.02092433, + "balance_loss_mlp": 1.02792096, + "epoch": 0.19401773635953706, + "flos": 23038485834240.0, + "grad_norm": 1.5417084243803894, + "language_loss": 0.77933848, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.80081189, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5859375, + "step": 3227, + "time_per_iteration": 2.4073750972747803 + }, + { + "auxiliary_loss_clip": 0.01083169, + "auxiliary_loss_mlp": 0.01064298, + "balance_loss_clip": 1.02669954, + "balance_loss_mlp": 1.02405262, + "epoch": 0.19407785961220503, + "flos": 15919974904320.0, + "grad_norm": 1.8953026924740666, + "language_loss": 0.70795715, + "learning_rate": 3.724176216414662e-06, + "loss": 0.72943187, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.58984375, + "step": 3228, + "time_per_iteration": 2.3855338096618652 + }, + { + "auxiliary_loss_clip": 0.01083768, + "auxiliary_loss_mlp": 0.01062785, + "balance_loss_clip": 1.01979852, + "balance_loss_mlp": 1.02526546, + "epoch": 0.194137982864873, + "flos": 25920531123840.0, + "grad_norm": 1.915018809092652, + "language_loss": 0.75412071, + "learning_rate": 3.72397882074007e-06, + "loss": 0.77558625, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.5859375, + "step": 3229, + "time_per_iteration": 2.4520022869110107 + }, + { + "auxiliary_loss_clip": 0.01083981, + "auxiliary_loss_mlp": 0.01066145, + "balance_loss_clip": 1.02885604, + "balance_loss_mlp": 1.0254333, + "epoch": 0.19419810611754096, + "flos": 13260500732160.0, + "grad_norm": 1.8344317906532686, + "language_loss": 0.67613626, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.6976375, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5859375, + "step": 3230, + "time_per_iteration": 2.389158010482788 + }, + { + "auxiliary_loss_clip": 0.01081515, + "auxiliary_loss_mlp": 0.01061162, + "balance_loss_clip": 1.0223949, + "balance_loss_mlp": 1.0238595, + "epoch": 0.19425822937020892, + "flos": 15704665349760.0, + "grad_norm": 2.0465300869156025, + "language_loss": 0.83470774, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.85613453, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.578125, + "step": 3231, + "time_per_iteration": 2.4355874061584473 + }, + { + "auxiliary_loss_clip": 0.0108654, + "auxiliary_loss_mlp": 0.01071218, + "balance_loss_clip": 1.02985239, + "balance_loss_mlp": 1.02692461, + "epoch": 0.1943183526228769, + "flos": 23104472037120.0, + "grad_norm": 1.6758413739534268, + "language_loss": 0.87863314, + "learning_rate": 3.72338624150555e-06, + "loss": 0.90021074, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.59765625, + "step": 3232, + "time_per_iteration": 2.4273316860198975 + }, + { + "auxiliary_loss_clip": 0.01083553, + "auxiliary_loss_mlp": 0.01058245, + "balance_loss_clip": 1.02012181, + "balance_loss_mlp": 1.02533555, + "epoch": 0.19437847587554485, + "flos": 24711593875200.0, + "grad_norm": 1.9562143056477546, + "language_loss": 0.8675139, + "learning_rate": 3.723188584382096e-06, + "loss": 0.88893187, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.58203125, + "step": 3233, + "time_per_iteration": 2.4416403770446777 + }, + { + "auxiliary_loss_clip": 0.01087363, + "auxiliary_loss_mlp": 0.01068991, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.02430928, + "epoch": 0.19443859912821285, + "flos": 23114910533760.0, + "grad_norm": 1.6508397711375022, + "language_loss": 0.90070772, + "learning_rate": 3.722990861915158e-06, + "loss": 0.92227125, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6328125, + "step": 3234, + "time_per_iteration": 2.4311366081237793 + }, + { + "auxiliary_loss_clip": 0.01086359, + "auxiliary_loss_mlp": 0.01069979, + "balance_loss_clip": 1.02751637, + "balance_loss_mlp": 1.02408636, + "epoch": 0.1944987223808808, + "flos": 15083525617920.0, + "grad_norm": 2.308764307626172, + "language_loss": 0.80655551, + "learning_rate": 3.722793074112234e-06, + "loss": 0.82811892, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.625, + "step": 3235, + "time_per_iteration": 2.3727498054504395 + }, + { + "auxiliary_loss_clip": 0.01083305, + "auxiliary_loss_mlp": 0.01058739, + "balance_loss_clip": 1.02075875, + "balance_loss_mlp": 1.02464449, + "epoch": 0.19455884563354878, + "flos": 17125979598720.0, + "grad_norm": 1.9203238460183716, + "language_loss": 0.81176734, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.83318782, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5859375, + "step": 3236, + "time_per_iteration": 2.5392937660217285 + }, + { + "auxiliary_loss_clip": 0.01083235, + "auxiliary_loss_mlp": 0.0106356, + "balance_loss_clip": 1.02348173, + "balance_loss_mlp": 1.02295601, + "epoch": 0.19461896888621674, + "flos": 20192366200320.0, + "grad_norm": 1.8909620424721945, + "language_loss": 0.77735883, + "learning_rate": 3.72239730252843e-06, + "loss": 0.79882681, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.6015625, + "step": 3237, + "time_per_iteration": 2.414395332336426 + }, + { + "auxiliary_loss_clip": 0.01086879, + "auxiliary_loss_mlp": 0.01063061, + "balance_loss_clip": 1.02229166, + "balance_loss_mlp": 1.02436304, + "epoch": 0.1946790921388847, + "flos": 25300194353280.0, + "grad_norm": 1.6472097756194939, + "language_loss": 0.7622779, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.7837773, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.625, + "step": 3238, + "time_per_iteration": 2.433539867401123 + }, + { + "auxiliary_loss_clip": 0.01084108, + "auxiliary_loss_mlp": 0.01066145, + "balance_loss_clip": 1.02189469, + "balance_loss_mlp": 1.02447987, + "epoch": 0.19473921539155267, + "flos": 20192366200320.0, + "grad_norm": 3.0936659706921215, + "language_loss": 0.75546062, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.77696317, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.59375, + "step": 3239, + "time_per_iteration": 2.458822727203369 + }, + { + "auxiliary_loss_clip": 0.01083518, + "auxiliary_loss_mlp": 0.01062987, + "balance_loss_clip": 1.02278948, + "balance_loss_mlp": 1.02388477, + "epoch": 0.19479933864422067, + "flos": 20886474407040.0, + "grad_norm": 2.114702087426134, + "language_loss": 0.75190568, + "learning_rate": 3.721803155320412e-06, + "loss": 0.77337074, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.59765625, + "step": 3240, + "time_per_iteration": 2.4102861881256104 + }, + { + "auxiliary_loss_clip": 0.01082747, + "auxiliary_loss_mlp": 0.01056392, + "balance_loss_clip": 1.01619446, + "balance_loss_mlp": 1.02393258, + "epoch": 0.19485946189688863, + "flos": 23293945319040.0, + "grad_norm": 1.8024648636142755, + "language_loss": 0.68317902, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.70457041, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.58984375, + "step": 3241, + "time_per_iteration": 2.401703119277954 + }, + { + "auxiliary_loss_clip": 0.01083875, + "auxiliary_loss_mlp": 0.01062634, + "balance_loss_clip": 1.0203625, + "balance_loss_mlp": 1.02547669, + "epoch": 0.1949195851495566, + "flos": 23293910407680.0, + "grad_norm": 1.3808958963573523, + "language_loss": 0.84263456, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.86409962, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.5859375, + "step": 3242, + "time_per_iteration": 2.474928855895996 + }, + { + "auxiliary_loss_clip": 0.01026446, + "auxiliary_loss_mlp": 0.01008213, + "balance_loss_clip": 1.00435114, + "balance_loss_mlp": 1.01034582, + "epoch": 0.19497970840222456, + "flos": 64959536102400.0, + "grad_norm": 0.8457895392540817, + "language_loss": 0.57710612, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59745264, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.03857422, + "router_z_loss_mlp": 0.16113281, + "step": 3243, + "time_per_iteration": 3.0627028942108154 + }, + { + "auxiliary_loss_clip": 0.01086105, + "auxiliary_loss_mlp": 0.01066338, + "balance_loss_clip": 1.02416134, + "balance_loss_mlp": 1.0247947, + "epoch": 0.19503983165489253, + "flos": 19643741095680.0, + "grad_norm": 1.8690854211890973, + "language_loss": 0.85135633, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.87288076, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.61328125, + "step": 3244, + "time_per_iteration": 2.4119060039520264 + }, + { + "auxiliary_loss_clip": 0.01087512, + "auxiliary_loss_mlp": 0.01076431, + "balance_loss_clip": 1.03191853, + "balance_loss_mlp": 1.02830219, + "epoch": 0.1950999549075605, + "flos": 21140921462400.0, + "grad_norm": 1.7779362489232065, + "language_loss": 0.7875042, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.8091436, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.59375, + "step": 3245, + "time_per_iteration": 2.409534215927124 + }, + { + "auxiliary_loss_clip": 0.01087113, + "auxiliary_loss_mlp": 0.01073467, + "balance_loss_clip": 1.02828693, + "balance_loss_mlp": 1.02603102, + "epoch": 0.19516007816022846, + "flos": 20883821143680.0, + "grad_norm": 1.9074995762852822, + "language_loss": 0.86473, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.88633585, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 0.609375, + "step": 3246, + "time_per_iteration": 2.4371230602264404 + }, + { + "auxiliary_loss_clip": 0.0108779, + "auxiliary_loss_mlp": 0.01073544, + "balance_loss_clip": 1.02957916, + "balance_loss_mlp": 1.02581203, + "epoch": 0.19522020141289645, + "flos": 16909552880640.0, + "grad_norm": 1.9773480785277209, + "language_loss": 0.79132938, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.81294274, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.62109375, + "step": 3247, + "time_per_iteration": 2.3712823390960693 + }, + { + "auxiliary_loss_clip": 0.01085017, + "auxiliary_loss_mlp": 0.01062961, + "balance_loss_clip": 1.02188134, + "balance_loss_mlp": 1.02411008, + "epoch": 0.19528032466556441, + "flos": 26723603283840.0, + "grad_norm": 1.7209590045377845, + "language_loss": 0.77312672, + "learning_rate": 3.720215890515421e-06, + "loss": 0.79460657, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.609375, + "step": 3248, + "time_per_iteration": 2.449979305267334 + }, + { + "auxiliary_loss_clip": 0.01084721, + "auxiliary_loss_mlp": 0.01064659, + "balance_loss_clip": 1.02255416, + "balance_loss_mlp": 1.02447665, + "epoch": 0.19534044791823238, + "flos": 21031748040960.0, + "grad_norm": 2.6796879430926683, + "language_loss": 0.79201245, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.81350631, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6015625, + "step": 3249, + "time_per_iteration": 2.399061918258667 + }, + { + "auxiliary_loss_clip": 0.01086065, + "auxiliary_loss_mlp": 0.01071364, + "balance_loss_clip": 1.02897298, + "balance_loss_mlp": 1.02537286, + "epoch": 0.19540057117090034, + "flos": 22343016084480.0, + "grad_norm": 1.5513463413005752, + "language_loss": 0.74515313, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.76672739, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.60546875, + "step": 3250, + "time_per_iteration": 2.4383411407470703 + }, + { + "auxiliary_loss_clip": 0.01079614, + "auxiliary_loss_mlp": 0.010733, + "balance_loss_clip": 1.03322172, + "balance_loss_mlp": 1.02247238, + "epoch": 0.1954606944235683, + "flos": 20300631926400.0, + "grad_norm": 1.840475524506516, + "language_loss": 0.81221056, + "learning_rate": 3.719619589699017e-06, + "loss": 0.83373964, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5703125, + "step": 3251, + "time_per_iteration": 2.3836634159088135 + }, + { + "auxiliary_loss_clip": 0.01085229, + "auxiliary_loss_mlp": 0.01065036, + "balance_loss_clip": 1.02619755, + "balance_loss_mlp": 1.02483153, + "epoch": 0.19552081767623627, + "flos": 17345932364160.0, + "grad_norm": 2.1765413347089018, + "language_loss": 0.86041772, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.88192034, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.60546875, + "step": 3252, + "time_per_iteration": 2.3689029216766357 + }, + { + "auxiliary_loss_clip": 0.01087349, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_clip": 1.03168738, + "balance_loss_mlp": 1.02476823, + "epoch": 0.19558094092890424, + "flos": 31976286134400.0, + "grad_norm": 1.59396559352403, + "language_loss": 0.74936748, + "learning_rate": 3.719221729768117e-06, + "loss": 0.77103382, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 0.625, + "step": 3253, + "time_per_iteration": 2.5081100463867188 + }, + { + "auxiliary_loss_clip": 0.01086354, + "auxiliary_loss_mlp": 0.01066709, + "balance_loss_clip": 1.02315032, + "balance_loss_mlp": 1.02419031, + "epoch": 0.19564106418157223, + "flos": 22267918016640.0, + "grad_norm": 1.8586726193090524, + "language_loss": 0.78355157, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.8050822, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.62109375, + "step": 3254, + "time_per_iteration": 2.396371364593506 + }, + { + "auxiliary_loss_clip": 0.01022588, + "auxiliary_loss_mlp": 0.01010999, + "balance_loss_clip": 1.00637412, + "balance_loss_mlp": 1.00644422, + "epoch": 0.1957011874342402, + "flos": 54362000678400.0, + "grad_norm": 0.7617728207790383, + "language_loss": 0.55383474, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57417059, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.04614258, + "router_z_loss_mlp": 0.16210938, + "step": 3255, + "time_per_iteration": 3.0039737224578857 + }, + { + "auxiliary_loss_clip": 0.01087756, + "auxiliary_loss_mlp": 0.01068649, + "balance_loss_clip": 1.02322984, + "balance_loss_mlp": 1.02572203, + "epoch": 0.19576131068690816, + "flos": 16505817384960.0, + "grad_norm": 2.362473325419355, + "language_loss": 0.7297051, + "learning_rate": 3.718624450942688e-06, + "loss": 0.75126916, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.6171875, + "step": 3256, + "time_per_iteration": 2.4022655487060547 + }, + { + "auxiliary_loss_clip": 0.0108208, + "auxiliary_loss_mlp": 0.01053381, + "balance_loss_clip": 1.014328, + "balance_loss_mlp": 1.02375531, + "epoch": 0.19582143393957613, + "flos": 14718822888960.0, + "grad_norm": 2.1799805916113684, + "language_loss": 0.81944227, + "learning_rate": 3.718425227649987e-06, + "loss": 0.84079689, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.58203125, + "step": 3257, + "time_per_iteration": 2.3655078411102295 + }, + { + "auxiliary_loss_clip": 0.01084968, + "auxiliary_loss_mlp": 0.01058792, + "balance_loss_clip": 1.01816547, + "balance_loss_mlp": 1.0251615, + "epoch": 0.1958815571922441, + "flos": 24424363186560.0, + "grad_norm": 1.7454857819960676, + "language_loss": 0.76814157, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.78957915, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.6015625, + "step": 3258, + "time_per_iteration": 2.456507921218872 + }, + { + "auxiliary_loss_clip": 0.01085862, + "auxiliary_loss_mlp": 0.01061372, + "balance_loss_clip": 1.02029228, + "balance_loss_mlp": 1.02504611, + "epoch": 0.19594168044491206, + "flos": 24899112120960.0, + "grad_norm": 1.7018844568082085, + "language_loss": 0.7578097, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.77928209, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.609375, + "step": 3259, + "time_per_iteration": 3.8855929374694824 + }, + { + "auxiliary_loss_clip": 0.01085227, + "auxiliary_loss_mlp": 0.01061585, + "balance_loss_clip": 1.02107787, + "balance_loss_mlp": 1.02439713, + "epoch": 0.19600180369758005, + "flos": 12056206694400.0, + "grad_norm": 2.1035003053582044, + "language_loss": 0.78808713, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.80955523, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.609375, + "step": 3260, + "time_per_iteration": 3.84858775138855 + }, + { + "auxiliary_loss_clip": 0.01084348, + "auxiliary_loss_mlp": 0.01062177, + "balance_loss_clip": 1.02324343, + "balance_loss_mlp": 1.02443933, + "epoch": 0.19606192695024802, + "flos": 20849152296960.0, + "grad_norm": 2.002539916446017, + "language_loss": 0.84324801, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.86471331, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.59765625, + "step": 3261, + "time_per_iteration": 2.4095029830932617 + }, + { + "auxiliary_loss_clip": 0.01083439, + "auxiliary_loss_mlp": 0.01065347, + "balance_loss_clip": 1.02376664, + "balance_loss_mlp": 1.02535892, + "epoch": 0.19612205020291598, + "flos": 28474253187840.0, + "grad_norm": 1.7084418057694921, + "language_loss": 0.77505237, + "learning_rate": 3.717428133894807e-06, + "loss": 0.79654026, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.58203125, + "step": 3262, + "time_per_iteration": 3.893674373626709 + }, + { + "auxiliary_loss_clip": 0.01085646, + "auxiliary_loss_mlp": 0.01057003, + "balance_loss_clip": 1.01837969, + "balance_loss_mlp": 1.02680981, + "epoch": 0.19618217345558395, + "flos": 25555444369920.0, + "grad_norm": 1.508612408541025, + "language_loss": 0.86876476, + "learning_rate": 3.71722851973837e-06, + "loss": 0.8901912, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.58984375, + "step": 3263, + "time_per_iteration": 2.4454469680786133 + }, + { + "auxiliary_loss_clip": 0.01081015, + "auxiliary_loss_mlp": 0.01066584, + "balance_loss_clip": 1.02848434, + "balance_loss_mlp": 1.02335107, + "epoch": 0.1962422967082519, + "flos": 25263256268160.0, + "grad_norm": 1.639808589862323, + "language_loss": 0.75494194, + "learning_rate": 3.717028840464455e-06, + "loss": 0.77641797, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.578125, + "step": 3264, + "time_per_iteration": 3.8143575191497803 + }, + { + "auxiliary_loss_clip": 0.01080165, + "auxiliary_loss_mlp": 0.01059377, + "balance_loss_clip": 1.02285099, + "balance_loss_mlp": 1.02497673, + "epoch": 0.19630241996091988, + "flos": 18806349202560.0, + "grad_norm": 1.8375671917416017, + "language_loss": 0.8084079, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.82980335, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.55078125, + "step": 3265, + "time_per_iteration": 2.3842501640319824 + }, + { + "auxiliary_loss_clip": 0.01020701, + "auxiliary_loss_mlp": 0.01013692, + "balance_loss_clip": 1.00925708, + "balance_loss_mlp": 1.0046134, + "epoch": 0.19636254321358784, + "flos": 62318287526400.0, + "grad_norm": 0.7881727588794616, + "language_loss": 0.53524613, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55559003, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.04443359, + "router_z_loss_mlp": 0.16113281, + "step": 3266, + "time_per_iteration": 3.0483055114746094 + }, + { + "auxiliary_loss_clip": 0.01085162, + "auxiliary_loss_mlp": 0.01062716, + "balance_loss_clip": 1.02077794, + "balance_loss_mlp": 1.02371573, + "epoch": 0.19642266646625584, + "flos": 21068267189760.0, + "grad_norm": 1.7849483017988634, + "language_loss": 0.82185704, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.84333581, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6171875, + "step": 3267, + "time_per_iteration": 2.384989023208618 + }, + { + "auxiliary_loss_clip": 0.01078492, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_clip": 1.01675439, + "balance_loss_mlp": 1.02344894, + "epoch": 0.1964827897189238, + "flos": 14537763244800.0, + "grad_norm": 1.814344854024507, + "language_loss": 0.88595629, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.90726876, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.55078125, + "step": 3268, + "time_per_iteration": 2.3979086875915527 + }, + { + "auxiliary_loss_clip": 0.01083368, + "auxiliary_loss_mlp": 0.01054692, + "balance_loss_clip": 1.01952481, + "balance_loss_mlp": 1.0262723, + "epoch": 0.19654291297159177, + "flos": 19243636381440.0, + "grad_norm": 2.029877590060043, + "language_loss": 0.72062612, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.74200672, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5703125, + "step": 3269, + "time_per_iteration": 2.3872671127319336 + }, + { + "auxiliary_loss_clip": 0.01084577, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_clip": 1.02095032, + "balance_loss_mlp": 1.02571678, + "epoch": 0.19660303622425973, + "flos": 25774524351360.0, + "grad_norm": 1.7691858619469076, + "language_loss": 0.81743562, + "learning_rate": 3.715829397778135e-06, + "loss": 0.83885705, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.58984375, + "step": 3270, + "time_per_iteration": 2.443452835083008 + }, + { + "auxiliary_loss_clip": 0.01080909, + "auxiliary_loss_mlp": 0.01055231, + "balance_loss_clip": 1.01744175, + "balance_loss_mlp": 1.0236578, + "epoch": 0.1966631594769277, + "flos": 20594041925760.0, + "grad_norm": 1.9720930003833919, + "language_loss": 0.85886109, + "learning_rate": 3.715629262894028e-06, + "loss": 0.88022244, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5703125, + "step": 3271, + "time_per_iteration": 2.3985233306884766 + }, + { + "auxiliary_loss_clip": 0.01078661, + "auxiliary_loss_mlp": 0.01059644, + "balance_loss_clip": 1.02106833, + "balance_loss_mlp": 1.02459693, + "epoch": 0.19672328272959566, + "flos": 23622059076480.0, + "grad_norm": 1.722700810482503, + "language_loss": 0.81038868, + "learning_rate": 3.715429062953087e-06, + "loss": 0.83177179, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5390625, + "step": 3272, + "time_per_iteration": 2.460452079772949 + }, + { + "auxiliary_loss_clip": 0.01080254, + "auxiliary_loss_mlp": 0.01054354, + "balance_loss_clip": 1.01914001, + "balance_loss_mlp": 1.02412224, + "epoch": 0.19678340598226365, + "flos": 23109848386560.0, + "grad_norm": 1.7585527412374538, + "language_loss": 0.82201338, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.84335953, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5625, + "step": 3273, + "time_per_iteration": 2.4052979946136475 + }, + { + "auxiliary_loss_clip": 0.0108095, + "auxiliary_loss_mlp": 0.01064836, + "balance_loss_clip": 1.02780986, + "balance_loss_mlp": 1.02452135, + "epoch": 0.19684352923493162, + "flos": 24533711164800.0, + "grad_norm": 1.5992888757573545, + "language_loss": 0.78954595, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.8110038, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5625, + "step": 3274, + "time_per_iteration": 2.418731689453125 + }, + { + "auxiliary_loss_clip": 0.01078858, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.01866221, + "balance_loss_mlp": 1.02325594, + "epoch": 0.19690365248759958, + "flos": 21795438320640.0, + "grad_norm": 2.4849995833887903, + "language_loss": 0.83450687, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.85585356, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5546875, + "step": 3275, + "time_per_iteration": 2.3808791637420654 + }, + { + "auxiliary_loss_clip": 0.0107878, + "auxiliary_loss_mlp": 0.01056237, + "balance_loss_clip": 1.02006841, + "balance_loss_mlp": 1.02297354, + "epoch": 0.19696377574026755, + "flos": 19055803933440.0, + "grad_norm": 1.8051915476854958, + "language_loss": 0.82332826, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8446784, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5546875, + "step": 3276, + "time_per_iteration": 2.367443323135376 + }, + { + "auxiliary_loss_clip": 0.01079407, + "auxiliary_loss_mlp": 0.01049442, + "balance_loss_clip": 1.01148558, + "balance_loss_mlp": 1.02362704, + "epoch": 0.19702389899293551, + "flos": 22819545498240.0, + "grad_norm": 2.5247899354661456, + "language_loss": 0.90818912, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.92947757, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.55859375, + "step": 3277, + "time_per_iteration": 2.400944948196411 + }, + { + "auxiliary_loss_clip": 0.010814, + "auxiliary_loss_mlp": 0.01064346, + "balance_loss_clip": 1.02214634, + "balance_loss_mlp": 1.02279365, + "epoch": 0.19708402224560348, + "flos": 22893107466240.0, + "grad_norm": 1.8464015012354407, + "language_loss": 0.64160037, + "learning_rate": 3.714226497539239e-06, + "loss": 0.6630578, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.5859375, + "step": 3278, + "time_per_iteration": 2.389479160308838 + }, + { + "auxiliary_loss_clip": 0.01081127, + "auxiliary_loss_mlp": 0.01067398, + "balance_loss_clip": 1.02860701, + "balance_loss_mlp": 1.02465177, + "epoch": 0.19714414549827144, + "flos": 25661440857600.0, + "grad_norm": 2.194794940533298, + "language_loss": 0.76210839, + "learning_rate": 3.714025842413166e-06, + "loss": 0.78359365, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5625, + "step": 3279, + "time_per_iteration": 2.4656901359558105 + }, + { + "auxiliary_loss_clip": 0.01080549, + "auxiliary_loss_mlp": 0.01059672, + "balance_loss_clip": 1.02517271, + "balance_loss_mlp": 1.02327538, + "epoch": 0.19720426875093944, + "flos": 23914666114560.0, + "grad_norm": 1.569702998998282, + "language_loss": 0.84142232, + "learning_rate": 3.713825122291061e-06, + "loss": 0.86282456, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5703125, + "step": 3280, + "time_per_iteration": 2.425898313522339 + }, + { + "auxiliary_loss_clip": 0.01080791, + "auxiliary_loss_mlp": 0.01055744, + "balance_loss_clip": 1.02057719, + "balance_loss_mlp": 1.02253139, + "epoch": 0.1972643920036074, + "flos": 13881081882240.0, + "grad_norm": 1.9898685995801606, + "language_loss": 0.79225934, + "learning_rate": 3.713624337180536e-06, + "loss": 0.81362468, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.58203125, + "step": 3281, + "time_per_iteration": 2.374589443206787 + }, + { + "auxiliary_loss_clip": 0.01077823, + "auxiliary_loss_mlp": 0.01060471, + "balance_loss_clip": 1.02635348, + "balance_loss_mlp": 1.02358842, + "epoch": 0.19732451525627537, + "flos": 19862611608960.0, + "grad_norm": 1.7063324202471128, + "language_loss": 0.81290251, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.83428544, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.54296875, + "step": 3282, + "time_per_iteration": 2.396775722503662 + }, + { + "auxiliary_loss_clip": 0.01081887, + "auxiliary_loss_mlp": 0.01053984, + "balance_loss_clip": 1.01769662, + "balance_loss_mlp": 1.02379394, + "epoch": 0.19738463850894333, + "flos": 24972255152640.0, + "grad_norm": 2.021273994852195, + "language_loss": 0.74001527, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.761374, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.58203125, + "step": 3283, + "time_per_iteration": 2.4460651874542236 + }, + { + "auxiliary_loss_clip": 0.01078383, + "auxiliary_loss_mlp": 0.01068565, + "balance_loss_clip": 1.03294516, + "balance_loss_mlp": 1.02197134, + "epoch": 0.1974447617616113, + "flos": 18367909948800.0, + "grad_norm": 1.6573146809718862, + "language_loss": 0.80710816, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.82857764, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.56640625, + "step": 3284, + "time_per_iteration": 2.359327554702759 + }, + { + "auxiliary_loss_clip": 0.01081798, + "auxiliary_loss_mlp": 0.01061094, + "balance_loss_clip": 1.02356732, + "balance_loss_mlp": 1.02321589, + "epoch": 0.19750488501427926, + "flos": 22891850657280.0, + "grad_norm": 2.125667042280686, + "language_loss": 0.87403202, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.89546096, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5859375, + "step": 3285, + "time_per_iteration": 2.3930392265319824 + }, + { + "auxiliary_loss_clip": 0.01079454, + "auxiliary_loss_mlp": 0.0106265, + "balance_loss_clip": 1.02543306, + "balance_loss_mlp": 1.02448475, + "epoch": 0.19756500826694723, + "flos": 21870431654400.0, + "grad_norm": 1.9059839958766212, + "language_loss": 0.90037012, + "learning_rate": 3.712619437068174e-06, + "loss": 0.9217912, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.55078125, + "step": 3286, + "time_per_iteration": 2.423926591873169 + }, + { + "auxiliary_loss_clip": 0.01084052, + "auxiliary_loss_mlp": 0.01063933, + "balance_loss_clip": 1.02416515, + "balance_loss_mlp": 1.02584553, + "epoch": 0.19762513151961522, + "flos": 15158065104000.0, + "grad_norm": 2.2421801372417947, + "language_loss": 0.79497677, + "learning_rate": 3.712418262187102e-06, + "loss": 0.81645662, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.58203125, + "step": 3287, + "time_per_iteration": 2.4905669689178467 + }, + { + "auxiliary_loss_clip": 0.01082792, + "auxiliary_loss_mlp": 0.01054977, + "balance_loss_clip": 1.01821291, + "balance_loss_mlp": 1.02466774, + "epoch": 0.1976852547722832, + "flos": 16978331992320.0, + "grad_norm": 1.930721817429809, + "language_loss": 0.82889926, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.85027695, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.578125, + "step": 3288, + "time_per_iteration": 2.3741817474365234 + }, + { + "auxiliary_loss_clip": 0.0107636, + "auxiliary_loss_mlp": 0.01054543, + "balance_loss_clip": 1.0194, + "balance_loss_mlp": 1.02198744, + "epoch": 0.19774537802495115, + "flos": 20301888735360.0, + "grad_norm": 1.8539539368528961, + "language_loss": 0.73779112, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75910014, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.54296875, + "step": 3289, + "time_per_iteration": 2.3878018856048584 + }, + { + "auxiliary_loss_clip": 0.01080562, + "auxiliary_loss_mlp": 0.01053991, + "balance_loss_clip": 1.01720273, + "balance_loss_mlp": 1.02362847, + "epoch": 0.19780550127761912, + "flos": 27234242962560.0, + "grad_norm": 1.642155026436488, + "language_loss": 0.81419319, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.83553874, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5703125, + "step": 3290, + "time_per_iteration": 2.427053213119507 + }, + { + "auxiliary_loss_clip": 0.01019524, + "auxiliary_loss_mlp": 0.01004168, + "balance_loss_clip": 0.99937612, + "balance_loss_mlp": 1.00387287, + "epoch": 0.19786562453028708, + "flos": 63547368629760.0, + "grad_norm": 0.8971938191030218, + "language_loss": 0.60443646, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62467337, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.04785156, + "router_z_loss_mlp": 0.15625, + "step": 3291, + "time_per_iteration": 3.0537054538726807 + }, + { + "auxiliary_loss_clip": 0.01082555, + "auxiliary_loss_mlp": 0.01060364, + "balance_loss_clip": 1.01699543, + "balance_loss_mlp": 1.02243686, + "epoch": 0.19792574778295505, + "flos": 26285443320960.0, + "grad_norm": 1.7390938043985176, + "language_loss": 0.83536983, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.85679901, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.6015625, + "step": 3292, + "time_per_iteration": 2.419462203979492 + }, + { + "auxiliary_loss_clip": 0.01074734, + "auxiliary_loss_mlp": 0.01058425, + "balance_loss_clip": 1.02282953, + "balance_loss_mlp": 1.0214808, + "epoch": 0.19798587103562304, + "flos": 19937081272320.0, + "grad_norm": 1.6774547119094516, + "language_loss": 0.84225357, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.86358517, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.53125, + "step": 3293, + "time_per_iteration": 2.396024703979492 + }, + { + "auxiliary_loss_clip": 0.01087615, + "auxiliary_loss_mlp": 0.0107031, + "balance_loss_clip": 1.02539182, + "balance_loss_mlp": 1.02606118, + "epoch": 0.198045994288291, + "flos": 20119258080000.0, + "grad_norm": 1.855918326626777, + "language_loss": 0.63563347, + "learning_rate": 3.711008220265093e-06, + "loss": 0.65721273, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6171875, + "step": 3294, + "time_per_iteration": 2.376526117324829 + }, + { + "auxiliary_loss_clip": 0.01078704, + "auxiliary_loss_mlp": 0.01057871, + "balance_loss_clip": 1.0203675, + "balance_loss_mlp": 1.02210283, + "epoch": 0.19810611754095897, + "flos": 17966688071040.0, + "grad_norm": 1.7837364789569605, + "language_loss": 0.89377952, + "learning_rate": 3.710806526117251e-06, + "loss": 0.91514528, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5625, + "step": 3295, + "time_per_iteration": 2.366445779800415 + }, + { + "auxiliary_loss_clip": 0.01080684, + "auxiliary_loss_mlp": 0.01055031, + "balance_loss_clip": 1.02026963, + "balance_loss_mlp": 1.0234921, + "epoch": 0.19816624079362694, + "flos": 15084119111040.0, + "grad_norm": 2.268540756841552, + "language_loss": 0.83145887, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.85281605, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5703125, + "step": 3296, + "time_per_iteration": 2.363415479660034 + }, + { + "auxiliary_loss_clip": 0.01085123, + "auxiliary_loss_mlp": 0.01066935, + "balance_loss_clip": 1.01970458, + "balance_loss_mlp": 1.024611, + "epoch": 0.1982263640462949, + "flos": 24899147032320.0, + "grad_norm": 2.021485831276522, + "language_loss": 0.69310403, + "learning_rate": 3.710402943207354e-06, + "loss": 0.71462464, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.60546875, + "step": 3297, + "time_per_iteration": 2.45711612701416 + }, + { + "auxiliary_loss_clip": 0.01075817, + "auxiliary_loss_mlp": 0.01051969, + "balance_loss_clip": 1.01599121, + "balance_loss_mlp": 1.02189827, + "epoch": 0.19828648729896287, + "flos": 20375136501120.0, + "grad_norm": 1.608012730119746, + "language_loss": 0.82971895, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.85099679, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5390625, + "step": 3298, + "time_per_iteration": 3.8261466026306152 + }, + { + "auxiliary_loss_clip": 0.01083279, + "auxiliary_loss_mlp": 0.01061285, + "balance_loss_clip": 1.01741576, + "balance_loss_mlp": 1.02311277, + "epoch": 0.19834661055163083, + "flos": 18879038386560.0, + "grad_norm": 1.9040768396078307, + "language_loss": 0.86584336, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.88728899, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.6015625, + "step": 3299, + "time_per_iteration": 2.3663084506988525 + }, + { + "auxiliary_loss_clip": 0.0102024, + "auxiliary_loss_mlp": 0.01009211, + "balance_loss_clip": 1.00434709, + "balance_loss_mlp": 1.0033977, + "epoch": 0.19840673380429882, + "flos": 60255897292800.0, + "grad_norm": 0.7794681193137385, + "language_loss": 0.53296947, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55326396, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.04858398, + "router_z_loss_mlp": 0.16796875, + "step": 3300, + "time_per_iteration": 4.40226674079895 + }, + { + "auxiliary_loss_clip": 0.01078798, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.0246973, + "balance_loss_mlp": 1.02202022, + "epoch": 0.1984668570569668, + "flos": 19900981059840.0, + "grad_norm": 1.5823739569328903, + "language_loss": 0.75206113, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.77350754, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.56640625, + "step": 3301, + "time_per_iteration": 2.3982300758361816 + }, + { + "auxiliary_loss_clip": 0.01079684, + "auxiliary_loss_mlp": 0.01053771, + "balance_loss_clip": 1.01505184, + "balance_loss_mlp": 1.0228337, + "epoch": 0.19852698030963475, + "flos": 15629916395520.0, + "grad_norm": 2.2851310359430745, + "language_loss": 0.89997405, + "learning_rate": 3.709392851040235e-06, + "loss": 0.92130864, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5703125, + "step": 3302, + "time_per_iteration": 3.843670606613159 + }, + { + "auxiliary_loss_clip": 0.01082347, + "auxiliary_loss_mlp": 0.01061965, + "balance_loss_clip": 1.02191067, + "balance_loss_mlp": 1.02377748, + "epoch": 0.19858710356230272, + "flos": 43141335575040.0, + "grad_norm": 1.8194401724790086, + "language_loss": 0.75640321, + "learning_rate": 3.709190638115111e-06, + "loss": 0.77784634, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5859375, + "step": 3303, + "time_per_iteration": 3.9413540363311768 + }, + { + "auxiliary_loss_clip": 0.01081449, + "auxiliary_loss_mlp": 0.01067234, + "balance_loss_clip": 1.02539134, + "balance_loss_mlp": 1.02398872, + "epoch": 0.19864722681497068, + "flos": 35142873937920.0, + "grad_norm": 2.07927257164105, + "language_loss": 0.76674622, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.7882331, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.57421875, + "step": 3304, + "time_per_iteration": 2.492401599884033 + }, + { + "auxiliary_loss_clip": 0.01079154, + "auxiliary_loss_mlp": 0.01052422, + "balance_loss_clip": 1.01849544, + "balance_loss_mlp": 1.02315044, + "epoch": 0.19870735006763865, + "flos": 19425219696000.0, + "grad_norm": 1.6167147576218766, + "language_loss": 0.8774724, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.89878821, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.55859375, + "step": 3305, + "time_per_iteration": 2.385627508163452 + }, + { + "auxiliary_loss_clip": 0.01081246, + "auxiliary_loss_mlp": 0.01060764, + "balance_loss_clip": 1.0237143, + "balance_loss_mlp": 1.02230334, + "epoch": 0.19876747332030664, + "flos": 23546332604160.0, + "grad_norm": 1.5222421350462447, + "language_loss": 0.70133382, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.72275388, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.58984375, + "step": 3306, + "time_per_iteration": 2.4195263385772705 + }, + { + "auxiliary_loss_clip": 0.01078908, + "auxiliary_loss_mlp": 0.01059314, + "balance_loss_clip": 1.01971316, + "balance_loss_mlp": 1.02187896, + "epoch": 0.1988275965729746, + "flos": 19828361698560.0, + "grad_norm": 1.8611716202788697, + "language_loss": 0.77642268, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.79780495, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.5703125, + "step": 3307, + "time_per_iteration": 2.4185261726379395 + }, + { + "auxiliary_loss_clip": 0.01079714, + "auxiliary_loss_mlp": 0.01055969, + "balance_loss_clip": 1.02151775, + "balance_loss_mlp": 1.02368641, + "epoch": 0.19888771982564257, + "flos": 23512501630080.0, + "grad_norm": 1.7292401300168632, + "language_loss": 0.77697599, + "learning_rate": 3.708178601452737e-06, + "loss": 0.79833281, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5625, + "step": 3308, + "time_per_iteration": 2.4124317169189453 + }, + { + "auxiliary_loss_clip": 0.0108003, + "auxiliary_loss_mlp": 0.01052888, + "balance_loss_clip": 1.01433635, + "balance_loss_mlp": 1.02214146, + "epoch": 0.19894784307831054, + "flos": 18149528194560.0, + "grad_norm": 1.9707655771267316, + "language_loss": 0.77673101, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.79806024, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.578125, + "step": 3309, + "time_per_iteration": 2.377485513687134 + }, + { + "auxiliary_loss_clip": 0.01079918, + "auxiliary_loss_mlp": 0.01065487, + "balance_loss_clip": 1.02791262, + "balance_loss_mlp": 1.02305245, + "epoch": 0.1990079663309785, + "flos": 24275004923520.0, + "grad_norm": 1.5235879387182822, + "language_loss": 0.89239913, + "learning_rate": 3.707773333313917e-06, + "loss": 0.91385317, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5703125, + "step": 3310, + "time_per_iteration": 2.4250919818878174 + }, + { + "auxiliary_loss_clip": 0.01077007, + "auxiliary_loss_mlp": 0.01055742, + "balance_loss_clip": 1.01904905, + "balance_loss_mlp": 1.0223546, + "epoch": 0.19906808958364647, + "flos": 34896212115840.0, + "grad_norm": 1.9906853138709864, + "language_loss": 0.65763807, + "learning_rate": 3.70757060210226e-06, + "loss": 0.67896557, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.546875, + "step": 3311, + "time_per_iteration": 2.505826711654663 + }, + { + "auxiliary_loss_clip": 0.01082231, + "auxiliary_loss_mlp": 0.01060287, + "balance_loss_clip": 1.0223074, + "balance_loss_mlp": 1.02294958, + "epoch": 0.19912821283631443, + "flos": 24023734801920.0, + "grad_norm": 2.2549863836852344, + "language_loss": 0.76776177, + "learning_rate": 3.707367806139355e-06, + "loss": 0.78918695, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.59375, + "step": 3312, + "time_per_iteration": 2.396540641784668 + }, + { + "auxiliary_loss_clip": 0.01080368, + "auxiliary_loss_mlp": 0.01064157, + "balance_loss_clip": 1.0273211, + "balance_loss_mlp": 1.02369428, + "epoch": 0.19918833608898243, + "flos": 19858177866240.0, + "grad_norm": 1.884056568472047, + "language_loss": 0.85625732, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.87770247, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.56640625, + "step": 3313, + "time_per_iteration": 2.3903145790100098 + }, + { + "auxiliary_loss_clip": 0.01081106, + "auxiliary_loss_mlp": 0.01061317, + "balance_loss_clip": 1.02481461, + "balance_loss_mlp": 1.02377248, + "epoch": 0.1992484593416504, + "flos": 29094520135680.0, + "grad_norm": 1.9680400677524739, + "language_loss": 0.82680357, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.84822786, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.57421875, + "step": 3314, + "time_per_iteration": 2.4379985332489014 + }, + { + "auxiliary_loss_clip": 0.01076469, + "auxiliary_loss_mlp": 0.01050668, + "balance_loss_clip": 1.01714659, + "balance_loss_mlp": 1.02131641, + "epoch": 0.19930858259431836, + "flos": 23293875496320.0, + "grad_norm": 1.4736603471429648, + "language_loss": 0.88725781, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.90852916, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.55078125, + "step": 3315, + "time_per_iteration": 2.4478707313537598 + }, + { + "auxiliary_loss_clip": 0.01081077, + "auxiliary_loss_mlp": 0.01058332, + "balance_loss_clip": 1.02094746, + "balance_loss_mlp": 1.02407432, + "epoch": 0.19936870584698632, + "flos": 25377526748160.0, + "grad_norm": 1.4023462681535126, + "language_loss": 0.72763228, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.74902642, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5703125, + "step": 3316, + "time_per_iteration": 2.4666762351989746 + }, + { + "auxiliary_loss_clip": 0.01037722, + "auxiliary_loss_mlp": 0.01062013, + "balance_loss_clip": 1.05590916, + "balance_loss_mlp": 1.0161922, + "epoch": 0.1994288290996543, + "flos": 62164388920320.0, + "grad_norm": 0.8580797455409654, + "language_loss": 0.6646325, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68562984, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.06103516, + "router_z_loss_mlp": 0.21484375, + "step": 3317, + "time_per_iteration": 3.1550674438476562 + }, + { + "auxiliary_loss_clip": 0.0108279, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_clip": 1.0294416, + "balance_loss_mlp": 1.02317476, + "epoch": 0.19948895235232225, + "flos": 19024835690880.0, + "grad_norm": 2.0517361990808713, + "language_loss": 0.75580907, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.7773155, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.59375, + "step": 3318, + "time_per_iteration": 2.355504274368286 + }, + { + "auxiliary_loss_clip": 0.01079819, + "auxiliary_loss_mlp": 0.01058064, + "balance_loss_clip": 1.02325535, + "balance_loss_mlp": 1.02436829, + "epoch": 0.19954907560499022, + "flos": 37814287795200.0, + "grad_norm": 2.0248954144951203, + "language_loss": 0.81221652, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.8335954, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5546875, + "step": 3319, + "time_per_iteration": 2.527707815170288 + }, + { + "auxiliary_loss_clip": 0.01080979, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_clip": 1.03116071, + "balance_loss_mlp": 1.02323377, + "epoch": 0.1996091988576582, + "flos": 49563329414400.0, + "grad_norm": 2.1429904359690575, + "language_loss": 0.78197861, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.80349052, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.578125, + "step": 3320, + "time_per_iteration": 2.6357204914093018 + }, + { + "auxiliary_loss_clip": 0.01080131, + "auxiliary_loss_mlp": 0.01079927, + "balance_loss_clip": 1.04371142, + "balance_loss_mlp": 1.0231266, + "epoch": 0.19966932211032618, + "flos": 22634750338560.0, + "grad_norm": 1.5569904265538717, + "language_loss": 0.81730306, + "learning_rate": 3.705539729936701e-06, + "loss": 0.83890355, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5703125, + "step": 3321, + "time_per_iteration": 2.4090964794158936 + }, + { + "auxiliary_loss_clip": 0.01046601, + "auxiliary_loss_mlp": 0.01015904, + "balance_loss_clip": 1.00908506, + "balance_loss_mlp": 1.02618766, + "epoch": 0.19972944536299414, + "flos": 54079308466560.0, + "grad_norm": 0.9221185868853592, + "language_loss": 0.65278035, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67340541, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.06835938, + "router_z_loss_mlp": 0.20410156, + "step": 3322, + "time_per_iteration": 2.925981283187866 + }, + { + "auxiliary_loss_clip": 0.01037951, + "auxiliary_loss_mlp": 0.01009645, + "balance_loss_clip": 1.00356567, + "balance_loss_mlp": 1.01785469, + "epoch": 0.1997895686156621, + "flos": 69352204498560.0, + "grad_norm": 0.7842056666234318, + "language_loss": 0.57141411, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59189004, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.06079102, + "router_z_loss_mlp": 0.20117188, + "step": 3323, + "time_per_iteration": 3.2517261505126953 + }, + { + "auxiliary_loss_clip": 0.01078904, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_clip": 1.04778123, + "balance_loss_mlp": 1.02293754, + "epoch": 0.19984969186833007, + "flos": 18551064274560.0, + "grad_norm": 1.778717345735008, + "language_loss": 0.83088863, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.8525331, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5625, + "step": 3324, + "time_per_iteration": 2.4766030311584473 + }, + { + "auxiliary_loss_clip": 0.01078478, + "auxiliary_loss_mlp": 0.01074439, + "balance_loss_clip": 1.03784132, + "balance_loss_mlp": 1.02332258, + "epoch": 0.19990981512099804, + "flos": 26428552450560.0, + "grad_norm": 3.2817141480414027, + "language_loss": 0.5563674, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.5778966, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5546875, + "step": 3325, + "time_per_iteration": 2.5553600788116455 + }, + { + "auxiliary_loss_clip": 0.01084037, + "auxiliary_loss_mlp": 0.01084045, + "balance_loss_clip": 1.04937935, + "balance_loss_mlp": 1.02668869, + "epoch": 0.19996993837366603, + "flos": 16325071943040.0, + "grad_norm": 1.798446405901701, + "language_loss": 0.86941135, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.89109224, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.57421875, + "step": 3326, + "time_per_iteration": 2.4714813232421875 + }, + { + "auxiliary_loss_clip": 0.01083148, + "auxiliary_loss_mlp": 0.01075561, + "balance_loss_clip": 1.04361272, + "balance_loss_mlp": 1.0280726, + "epoch": 0.200030061626334, + "flos": 20843287188480.0, + "grad_norm": 1.8319564819365488, + "language_loss": 0.73640645, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.75799352, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.55078125, + "step": 3327, + "time_per_iteration": 2.4870636463165283 + }, + { + "auxiliary_loss_clip": 0.01090861, + "auxiliary_loss_mlp": 0.01068936, + "balance_loss_clip": 1.02978802, + "balance_loss_mlp": 1.03210163, + "epoch": 0.20009018487900196, + "flos": 23761677070080.0, + "grad_norm": 1.627857748091465, + "language_loss": 0.77914745, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.80074537, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5859375, + "step": 3328, + "time_per_iteration": 2.502701759338379 + }, + { + "auxiliary_loss_clip": 0.01086885, + "auxiliary_loss_mlp": 0.01051384, + "balance_loss_clip": 1.0206995, + "balance_loss_mlp": 1.03333902, + "epoch": 0.20015030813166992, + "flos": 28110283597440.0, + "grad_norm": 1.8589492427325165, + "language_loss": 0.70258236, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.72396505, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.5390625, + "step": 3329, + "time_per_iteration": 2.566476583480835 + }, + { + "auxiliary_loss_clip": 0.01098136, + "auxiliary_loss_mlp": 0.01056349, + "balance_loss_clip": 1.01810706, + "balance_loss_mlp": 1.0375998, + "epoch": 0.2002104313843379, + "flos": 26065979314560.0, + "grad_norm": 2.336731956622303, + "language_loss": 0.82191348, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.84345829, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.60546875, + "step": 3330, + "time_per_iteration": 2.50862979888916 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.01056281, + "balance_loss_clip": 1.0187062, + "balance_loss_mlp": 1.03907907, + "epoch": 0.20027055463700585, + "flos": 22965517359360.0, + "grad_norm": 2.439616203289545, + "language_loss": 0.79377413, + "learning_rate": 3.703502390349417e-06, + "loss": 0.81531036, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.58203125, + "step": 3331, + "time_per_iteration": 2.469548225402832 + }, + { + "auxiliary_loss_clip": 0.01100469, + "auxiliary_loss_mlp": 0.01062753, + "balance_loss_clip": 1.02181649, + "balance_loss_mlp": 1.0402782, + "epoch": 0.20033067788967382, + "flos": 17164698163200.0, + "grad_norm": 1.9423214555842956, + "language_loss": 0.80973554, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.83136773, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.6015625, + "step": 3332, + "time_per_iteration": 2.460225820541382 + }, + { + "auxiliary_loss_clip": 0.01060067, + "auxiliary_loss_mlp": 0.01052258, + "balance_loss_clip": 1.04415143, + "balance_loss_mlp": 1.03618073, + "epoch": 0.2003908011423418, + "flos": 60822747993600.0, + "grad_norm": 0.9675365705878571, + "language_loss": 0.62084913, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64197242, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.08105469, + "router_z_loss_mlp": 0.23828125, + "step": 3333, + "time_per_iteration": 2.984750747680664 + }, + { + "auxiliary_loss_clip": 0.0109666, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_clip": 1.03221703, + "balance_loss_mlp": 1.03770781, + "epoch": 0.20045092439500978, + "flos": 24205108648320.0, + "grad_norm": 2.1048184003341435, + "language_loss": 0.83132589, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.85298377, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.58984375, + "step": 3334, + "time_per_iteration": 2.488971710205078 + }, + { + "auxiliary_loss_clip": 0.01099543, + "auxiliary_loss_mlp": 0.01072902, + "balance_loss_clip": 1.03289545, + "balance_loss_mlp": 1.03825688, + "epoch": 0.20051104764767774, + "flos": 29386324212480.0, + "grad_norm": 2.219718965331155, + "language_loss": 0.77080059, + "learning_rate": 3.702685645366134e-06, + "loss": 0.79252505, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.61328125, + "step": 3335, + "time_per_iteration": 2.522547483444214 + }, + { + "auxiliary_loss_clip": 0.01096788, + "auxiliary_loss_mlp": 0.01079748, + "balance_loss_clip": 1.04236364, + "balance_loss_mlp": 1.03625655, + "epoch": 0.2005711709003457, + "flos": 23512676186880.0, + "grad_norm": 1.95160214847437, + "language_loss": 0.81367344, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.83543873, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.60546875, + "step": 3336, + "time_per_iteration": 2.451488733291626 + }, + { + "auxiliary_loss_clip": 0.01094572, + "auxiliary_loss_mlp": 0.01069689, + "balance_loss_clip": 1.02741766, + "balance_loss_mlp": 1.03294218, + "epoch": 0.20063129415301367, + "flos": 22522434894720.0, + "grad_norm": 1.9683041105257713, + "language_loss": 0.80090547, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.82254803, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.6171875, + "step": 3337, + "time_per_iteration": 2.4526665210723877 + }, + { + "auxiliary_loss_clip": 0.01095834, + "auxiliary_loss_mlp": 0.01080263, + "balance_loss_clip": 1.03932619, + "balance_loss_mlp": 1.03525698, + "epoch": 0.20069141740568164, + "flos": 25957050272640.0, + "grad_norm": 1.9015099033290792, + "language_loss": 0.70984638, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.73160732, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.60546875, + "step": 3338, + "time_per_iteration": 3.9459729194641113 + }, + { + "auxiliary_loss_clip": 0.01091621, + "auxiliary_loss_mlp": 0.01079449, + "balance_loss_clip": 1.04301882, + "balance_loss_mlp": 1.03196621, + "epoch": 0.2007515406583496, + "flos": 24789449940480.0, + "grad_norm": 1.9913561375805733, + "language_loss": 0.7141633, + "learning_rate": 3.701867867326735e-06, + "loss": 0.73587406, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.59765625, + "step": 3339, + "time_per_iteration": 3.9836084842681885 + }, + { + "auxiliary_loss_clip": 0.01092605, + "auxiliary_loss_mlp": 0.01072698, + "balance_loss_clip": 1.03443193, + "balance_loss_mlp": 1.03063977, + "epoch": 0.2008116639110176, + "flos": 37924054709760.0, + "grad_norm": 3.132017948914609, + "language_loss": 0.68006003, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.70171309, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.62109375, + "step": 3340, + "time_per_iteration": 2.6603782176971436 + }, + { + "auxiliary_loss_clip": 0.01087822, + "auxiliary_loss_mlp": 0.01062716, + "balance_loss_clip": 1.02509344, + "balance_loss_mlp": 1.02844286, + "epoch": 0.20087178716368556, + "flos": 20739490116480.0, + "grad_norm": 2.8267823543542225, + "language_loss": 0.76469362, + "learning_rate": 3.701458591066019e-06, + "loss": 0.78619909, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.59375, + "step": 3341, + "time_per_iteration": 3.8599376678466797 + }, + { + "auxiliary_loss_clip": 0.01082547, + "auxiliary_loss_mlp": 0.01054885, + "balance_loss_clip": 1.02126777, + "balance_loss_mlp": 1.02696514, + "epoch": 0.20093191041635353, + "flos": 23841139057920.0, + "grad_norm": 2.2938589596073067, + "language_loss": 0.73514783, + "learning_rate": 3.70125385615256e-06, + "loss": 0.75652212, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5546875, + "step": 3342, + "time_per_iteration": 3.9261491298675537 + }, + { + "auxiliary_loss_clip": 0.01086174, + "auxiliary_loss_mlp": 0.01071415, + "balance_loss_clip": 1.0307405, + "balance_loss_mlp": 1.02723813, + "epoch": 0.2009920336690215, + "flos": 21791179134720.0, + "grad_norm": 2.2110705331751275, + "language_loss": 0.7474966, + "learning_rate": 3.701049056727384e-06, + "loss": 0.76907253, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.58984375, + "step": 3343, + "time_per_iteration": 2.4341864585876465 + }, + { + "auxiliary_loss_clip": 0.01086399, + "auxiliary_loss_mlp": 0.01066653, + "balance_loss_clip": 1.02464414, + "balance_loss_mlp": 1.0278883, + "epoch": 0.20105215692168946, + "flos": 26358027770880.0, + "grad_norm": 1.871258141894508, + "language_loss": 0.82603049, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.84756106, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.5859375, + "step": 3344, + "time_per_iteration": 2.6114580631256104 + }, + { + "auxiliary_loss_clip": 0.01086493, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_clip": 1.02634394, + "balance_loss_mlp": 1.02756429, + "epoch": 0.20111228017435742, + "flos": 18806279379840.0, + "grad_norm": 2.7282885901765987, + "language_loss": 0.85468316, + "learning_rate": 3.700639264372948e-06, + "loss": 0.87616867, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5859375, + "step": 3345, + "time_per_iteration": 2.4305505752563477 + }, + { + "auxiliary_loss_clip": 0.01081648, + "auxiliary_loss_mlp": 0.01052595, + "balance_loss_clip": 1.01885843, + "balance_loss_mlp": 1.02870226, + "epoch": 0.20117240342702541, + "flos": 19974019357440.0, + "grad_norm": 1.806068562068603, + "language_loss": 0.70239371, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.72373611, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.53125, + "step": 3346, + "time_per_iteration": 2.5411899089813232 + }, + { + "auxiliary_loss_clip": 0.01086924, + "auxiliary_loss_mlp": 0.01061974, + "balance_loss_clip": 1.02792764, + "balance_loss_mlp": 1.02876008, + "epoch": 0.20123252667969338, + "flos": 23141759235840.0, + "grad_norm": 2.2839584858810187, + "language_loss": 0.75294083, + "learning_rate": 3.70022921406487e-06, + "loss": 0.7744298, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.58203125, + "step": 3347, + "time_per_iteration": 2.6223604679107666 + }, + { + "auxiliary_loss_clip": 0.01085592, + "auxiliary_loss_mlp": 0.01056815, + "balance_loss_clip": 1.02288795, + "balance_loss_mlp": 1.02920866, + "epoch": 0.20129264993236134, + "flos": 23220557907840.0, + "grad_norm": 1.807354776460203, + "language_loss": 0.87675911, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89818317, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.56640625, + "step": 3348, + "time_per_iteration": 2.559908390045166 + }, + { + "auxiliary_loss_clip": 0.01081382, + "auxiliary_loss_mlp": 0.01048923, + "balance_loss_clip": 1.01223004, + "balance_loss_mlp": 1.02684021, + "epoch": 0.2013527731850293, + "flos": 21870396743040.0, + "grad_norm": 2.193181988173124, + "language_loss": 0.72569764, + "learning_rate": 3.699818905865346e-06, + "loss": 0.74700069, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.546875, + "step": 3349, + "time_per_iteration": 2.4944732189178467 + }, + { + "auxiliary_loss_clip": 0.01087574, + "auxiliary_loss_mlp": 0.01062808, + "balance_loss_clip": 1.02373171, + "balance_loss_mlp": 1.02862132, + "epoch": 0.20141289643769728, + "flos": 18039831102720.0, + "grad_norm": 1.5460533535430128, + "language_loss": 0.73305702, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.75456083, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.58984375, + "step": 3350, + "time_per_iteration": 2.486546754837036 + }, + { + "auxiliary_loss_clip": 0.01085695, + "auxiliary_loss_mlp": 0.0105594, + "balance_loss_clip": 1.01722097, + "balance_loss_mlp": 1.02670956, + "epoch": 0.20147301969036524, + "flos": 23950277568000.0, + "grad_norm": 2.4536909489015697, + "language_loss": 0.78826702, + "learning_rate": 3.69940833983661e-06, + "loss": 0.80968332, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.58984375, + "step": 3351, + "time_per_iteration": 2.4541614055633545 + }, + { + "auxiliary_loss_clip": 0.01085353, + "auxiliary_loss_mlp": 0.01061253, + "balance_loss_clip": 1.01955318, + "balance_loss_mlp": 1.02586007, + "epoch": 0.2015331429430332, + "flos": 25587425041920.0, + "grad_norm": 1.5496842764746228, + "language_loss": 0.82215965, + "learning_rate": 3.699202960155748e-06, + "loss": 0.84362572, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.59375, + "step": 3352, + "time_per_iteration": 2.511916160583496 + }, + { + "auxiliary_loss_clip": 0.01082859, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_clip": 1.01832712, + "balance_loss_mlp": 1.02495241, + "epoch": 0.2015932661957012, + "flos": 26723742929280.0, + "grad_norm": 2.18908892610129, + "language_loss": 0.81921756, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.84060669, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.578125, + "step": 3353, + "time_per_iteration": 2.467240810394287 + }, + { + "auxiliary_loss_clip": 0.0108033, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_clip": 1.01801622, + "balance_loss_mlp": 1.02495325, + "epoch": 0.20165338944836916, + "flos": 15632220545280.0, + "grad_norm": 1.7512100636744692, + "language_loss": 0.91404587, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.93535334, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.5546875, + "step": 3354, + "time_per_iteration": 2.455674886703491 + }, + { + "auxiliary_loss_clip": 0.01051993, + "auxiliary_loss_mlp": 0.01023476, + "balance_loss_clip": 1.01470256, + "balance_loss_mlp": 1.02950597, + "epoch": 0.20171351270103713, + "flos": 57909629727360.0, + "grad_norm": 0.8480104990314864, + "language_loss": 0.55999786, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.58075255, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.22460938, + "step": 3355, + "time_per_iteration": 3.0621132850646973 + }, + { + "auxiliary_loss_clip": 0.01079836, + "auxiliary_loss_mlp": 0.01055364, + "balance_loss_clip": 1.02132964, + "balance_loss_mlp": 1.02500772, + "epoch": 0.2017736359537051, + "flos": 20813296464000.0, + "grad_norm": 1.537189298249129, + "language_loss": 0.85715771, + "learning_rate": 3.698380797170751e-06, + "loss": 0.87850964, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.546875, + "step": 3356, + "time_per_iteration": 2.582019805908203 + }, + { + "auxiliary_loss_clip": 0.01085362, + "auxiliary_loss_mlp": 0.01060545, + "balance_loss_clip": 1.01698613, + "balance_loss_mlp": 1.02296638, + "epoch": 0.20183375920637306, + "flos": 17091101283840.0, + "grad_norm": 2.473792910425018, + "language_loss": 0.71650076, + "learning_rate": 3.698175095398085e-06, + "loss": 0.73795986, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.625, + "step": 3357, + "time_per_iteration": 2.4947268962860107 + }, + { + "auxiliary_loss_clip": 0.01083136, + "auxiliary_loss_mlp": 0.01063642, + "balance_loss_clip": 1.0217514, + "balance_loss_mlp": 1.02314329, + "epoch": 0.20189388245904102, + "flos": 18660342430080.0, + "grad_norm": 1.6807148264465332, + "language_loss": 0.73630011, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.75776792, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.6015625, + "step": 3358, + "time_per_iteration": 2.5460891723632812 + }, + { + "auxiliary_loss_clip": 0.01076505, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.01758647, + "balance_loss_mlp": 1.02129185, + "epoch": 0.20195400571170902, + "flos": 16796678855040.0, + "grad_norm": 1.6387256325639246, + "language_loss": 0.8459956, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.86724925, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.55078125, + "step": 3359, + "time_per_iteration": 2.474761962890625 + }, + { + "auxiliary_loss_clip": 0.0104536, + "auxiliary_loss_mlp": 0.01022075, + "balance_loss_clip": 1.01539886, + "balance_loss_mlp": 1.02187252, + "epoch": 0.20201412896437698, + "flos": 67171703535360.0, + "grad_norm": 0.7855790809797424, + "language_loss": 0.5909586, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61163294, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.06689453, + "router_z_loss_mlp": 0.234375, + "step": 3360, + "time_per_iteration": 3.1155338287353516 + }, + { + "auxiliary_loss_clip": 0.01081928, + "auxiliary_loss_mlp": 0.01066346, + "balance_loss_clip": 1.02564836, + "balance_loss_mlp": 1.02390838, + "epoch": 0.20207425221704495, + "flos": 21323936142720.0, + "grad_norm": 2.9519421654197995, + "language_loss": 0.63439697, + "learning_rate": 3.697351644435763e-06, + "loss": 0.65587974, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.58203125, + "step": 3361, + "time_per_iteration": 2.6716678142547607 + }, + { + "auxiliary_loss_clip": 0.01078481, + "auxiliary_loss_mlp": 0.01070696, + "balance_loss_clip": 1.03664935, + "balance_loss_mlp": 1.02303696, + "epoch": 0.2021343754697129, + "flos": 22526100587520.0, + "grad_norm": 1.883599790244205, + "language_loss": 0.77750921, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.79900098, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5546875, + "step": 3362, + "time_per_iteration": 2.570836305618286 + }, + { + "auxiliary_loss_clip": 0.01078958, + "auxiliary_loss_mlp": 0.01063137, + "balance_loss_clip": 1.02830434, + "balance_loss_mlp": 1.02364099, + "epoch": 0.20219449872238088, + "flos": 19061773776000.0, + "grad_norm": 1.5128335746377586, + "language_loss": 0.78733063, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.80875158, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5546875, + "step": 3363, + "time_per_iteration": 2.4778952598571777 + }, + { + "auxiliary_loss_clip": 0.01082348, + "auxiliary_loss_mlp": 0.01076868, + "balance_loss_clip": 1.04527712, + "balance_loss_mlp": 1.02632153, + "epoch": 0.20225462197504884, + "flos": 24715887972480.0, + "grad_norm": 1.5882351827976677, + "language_loss": 0.76535451, + "learning_rate": 3.696733380367391e-06, + "loss": 0.78694665, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.5625, + "step": 3364, + "time_per_iteration": 2.533872127532959 + }, + { + "auxiliary_loss_clip": 0.01081601, + "auxiliary_loss_mlp": 0.01073941, + "balance_loss_clip": 1.03958511, + "balance_loss_mlp": 1.02495337, + "epoch": 0.2023147452277168, + "flos": 22017206476800.0, + "grad_norm": 2.0653837482382755, + "language_loss": 0.74019152, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.761747, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.56640625, + "step": 3365, + "time_per_iteration": 2.513274669647217 + }, + { + "auxiliary_loss_clip": 0.01083298, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03093266, + "balance_loss_mlp": 1.02705324, + "epoch": 0.2023748684803848, + "flos": 17744500978560.0, + "grad_norm": 1.778994636792695, + "language_loss": 0.87279904, + "learning_rate": 3.696320882607286e-06, + "loss": 0.89426756, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5625, + "step": 3366, + "time_per_iteration": 2.496013879776001 + }, + { + "auxiliary_loss_clip": 0.01083525, + "auxiliary_loss_mlp": 0.01066545, + "balance_loss_clip": 1.03497887, + "balance_loss_mlp": 1.02897644, + "epoch": 0.20243499173305277, + "flos": 31137602520960.0, + "grad_norm": 1.636083371096517, + "language_loss": 0.71423566, + "learning_rate": 3.696114537236335e-06, + "loss": 0.73573637, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.546875, + "step": 3367, + "time_per_iteration": 2.5839383602142334 + }, + { + "auxiliary_loss_clip": 0.01086378, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.023628, + "balance_loss_mlp": 1.02739215, + "epoch": 0.20249511498572073, + "flos": 33837820116480.0, + "grad_norm": 1.467762501803307, + "language_loss": 0.69720852, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.71870553, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.58984375, + "step": 3368, + "time_per_iteration": 2.6633589267730713 + }, + { + "auxiliary_loss_clip": 0.01083983, + "auxiliary_loss_mlp": 0.01050347, + "balance_loss_clip": 1.01806521, + "balance_loss_mlp": 1.03054571, + "epoch": 0.2025552382383887, + "flos": 21214553253120.0, + "grad_norm": 1.6350782677366928, + "language_loss": 0.79033387, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.81167716, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.53515625, + "step": 3369, + "time_per_iteration": 2.5226757526397705 + }, + { + "auxiliary_loss_clip": 0.0108531, + "auxiliary_loss_mlp": 0.01061452, + "balance_loss_clip": 1.0284071, + "balance_loss_mlp": 1.02952981, + "epoch": 0.20261536149105666, + "flos": 14646517729920.0, + "grad_norm": 2.4203761279325353, + "language_loss": 0.68332911, + "learning_rate": 3.695495115253795e-06, + "loss": 0.70479667, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.55859375, + "step": 3370, + "time_per_iteration": 2.449333429336548 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01017429, + "balance_loss_clip": 1.01132596, + "balance_loss_mlp": 1.02023554, + "epoch": 0.20267548474372463, + "flos": 66780361572480.0, + "grad_norm": 0.6826637093295438, + "language_loss": 0.58161908, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60221684, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.06103516, + "router_z_loss_mlp": 0.22070312, + "step": 3371, + "time_per_iteration": 3.169339179992676 + }, + { + "auxiliary_loss_clip": 0.01085223, + "auxiliary_loss_mlp": 0.01054794, + "balance_loss_clip": 1.020033, + "balance_loss_mlp": 1.02940655, + "epoch": 0.2027356079963926, + "flos": 24679648114560.0, + "grad_norm": 1.6696526973909118, + "language_loss": 0.92558646, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94698668, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.55859375, + "step": 3372, + "time_per_iteration": 2.482783079147339 + }, + { + "auxiliary_loss_clip": 0.01086909, + "auxiliary_loss_mlp": 0.01062907, + "balance_loss_clip": 1.02504671, + "balance_loss_mlp": 1.02960861, + "epoch": 0.20279573124906058, + "flos": 26391649276800.0, + "grad_norm": 1.5921135880823727, + "language_loss": 0.79852957, + "learning_rate": 3.694875114631167e-06, + "loss": 0.82002771, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5703125, + "step": 3373, + "time_per_iteration": 2.5324020385742188 + }, + { + "auxiliary_loss_clip": 0.01081092, + "auxiliary_loss_mlp": 0.01067765, + "balance_loss_clip": 1.03393364, + "balance_loss_mlp": 1.02831602, + "epoch": 0.20285585450172855, + "flos": 33798647704320.0, + "grad_norm": 1.9317379267857537, + "language_loss": 0.73018241, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.75167102, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.52734375, + "step": 3374, + "time_per_iteration": 2.6139607429504395 + }, + { + "auxiliary_loss_clip": 0.01030681, + "auxiliary_loss_mlp": 0.01037284, + "balance_loss_clip": 1.0304172, + "balance_loss_mlp": 1.00983405, + "epoch": 0.20291597775439651, + "flos": 71161332796800.0, + "grad_norm": 1.0195783722117788, + "language_loss": 0.62570882, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64638841, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.20898438, + "step": 3375, + "time_per_iteration": 3.0842885971069336 + }, + { + "auxiliary_loss_clip": 0.01079746, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_clip": 1.02853751, + "balance_loss_mlp": 1.02520347, + "epoch": 0.20297610100706448, + "flos": 19493440225920.0, + "grad_norm": 1.4802459177987417, + "language_loss": 0.82972914, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.85115695, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.54296875, + "step": 3376, + "time_per_iteration": 2.477184772491455 + }, + { + "auxiliary_loss_clip": 0.01083451, + "auxiliary_loss_mlp": 0.01068735, + "balance_loss_clip": 1.03116035, + "balance_loss_mlp": 1.02619326, + "epoch": 0.20303622425973245, + "flos": 25043128945920.0, + "grad_norm": 2.0210227006813644, + "language_loss": 0.82784545, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.84936732, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5703125, + "step": 3377, + "time_per_iteration": 3.9714958667755127 + }, + { + "auxiliary_loss_clip": 0.01081356, + "auxiliary_loss_mlp": 0.01057634, + "balance_loss_clip": 1.02244341, + "balance_loss_mlp": 1.02526879, + "epoch": 0.2030963475124004, + "flos": 21978941760000.0, + "grad_norm": 1.7960219855218151, + "language_loss": 0.78993821, + "learning_rate": 3.69384049496805e-06, + "loss": 0.81132805, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5625, + "step": 3378, + "time_per_iteration": 3.857881546020508 + }, + { + "auxiliary_loss_clip": 0.01080033, + "auxiliary_loss_mlp": 0.01057693, + "balance_loss_clip": 1.02104831, + "balance_loss_mlp": 1.02434254, + "epoch": 0.2031564707650684, + "flos": 19499375157120.0, + "grad_norm": 1.8394034080071586, + "language_loss": 0.81949568, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.840873, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5546875, + "step": 3379, + "time_per_iteration": 2.443969488143921 + }, + { + "auxiliary_loss_clip": 0.01079339, + "auxiliary_loss_mlp": 0.01051389, + "balance_loss_clip": 1.02051425, + "balance_loss_mlp": 1.02664304, + "epoch": 0.20321659401773637, + "flos": 22745983530240.0, + "grad_norm": 1.6701707804267938, + "language_loss": 0.87904489, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.90035212, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.52734375, + "step": 3380, + "time_per_iteration": 3.8981990814208984 + }, + { + "auxiliary_loss_clip": 0.01085424, + "auxiliary_loss_mlp": 0.0105873, + "balance_loss_clip": 1.02525663, + "balance_loss_mlp": 1.03153336, + "epoch": 0.20327671727040433, + "flos": 22454738035200.0, + "grad_norm": 1.9412546093708938, + "language_loss": 0.76518202, + "learning_rate": 3.693218952340186e-06, + "loss": 0.78662354, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5390625, + "step": 3381, + "time_per_iteration": 4.111631870269775 + }, + { + "auxiliary_loss_clip": 0.01090228, + "auxiliary_loss_mlp": 0.01059008, + "balance_loss_clip": 1.02164793, + "balance_loss_mlp": 1.03140843, + "epoch": 0.2033368405230723, + "flos": 19534044003840.0, + "grad_norm": 1.5254779028223935, + "language_loss": 0.80480748, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.82629985, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.58984375, + "step": 3382, + "time_per_iteration": 2.483736753463745 + }, + { + "auxiliary_loss_clip": 0.01091416, + "auxiliary_loss_mlp": 0.0106371, + "balance_loss_clip": 1.02484822, + "balance_loss_mlp": 1.03214049, + "epoch": 0.20339696377574026, + "flos": 13808357786880.0, + "grad_norm": 1.9243825332283717, + "language_loss": 0.82234561, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.84389687, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.59375, + "step": 3383, + "time_per_iteration": 2.554927349090576 + }, + { + "auxiliary_loss_clip": 0.01088875, + "auxiliary_loss_mlp": 0.0105834, + "balance_loss_clip": 1.02252972, + "balance_loss_mlp": 1.03181481, + "epoch": 0.20345708702840823, + "flos": 20338372972800.0, + "grad_norm": 2.0487493454658194, + "language_loss": 0.76764667, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.78911883, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5703125, + "step": 3384, + "time_per_iteration": 2.510700225830078 + }, + { + "auxiliary_loss_clip": 0.01094593, + "auxiliary_loss_mlp": 0.01065431, + "balance_loss_clip": 1.02504253, + "balance_loss_mlp": 1.03412771, + "epoch": 0.2035172102810762, + "flos": 20333066446080.0, + "grad_norm": 2.1678412139901426, + "language_loss": 0.79697943, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.81857961, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.60546875, + "step": 3385, + "time_per_iteration": 2.526949405670166 + }, + { + "auxiliary_loss_clip": 0.01084574, + "auxiliary_loss_mlp": 0.01072104, + "balance_loss_clip": 1.0349102, + "balance_loss_mlp": 1.02893019, + "epoch": 0.2035773335337442, + "flos": 23329870974720.0, + "grad_norm": 1.5878476148511826, + "language_loss": 0.70842159, + "learning_rate": 3.692181763924639e-06, + "loss": 0.72998834, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5546875, + "step": 3386, + "time_per_iteration": 2.549823045730591 + }, + { + "auxiliary_loss_clip": 0.0108487, + "auxiliary_loss_mlp": 0.01071434, + "balance_loss_clip": 1.03507566, + "balance_loss_mlp": 1.02828526, + "epoch": 0.20363745678641215, + "flos": 28329014465280.0, + "grad_norm": 1.3784503027152957, + "language_loss": 0.81966329, + "learning_rate": 3.691974133706947e-06, + "loss": 0.84122634, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.56640625, + "step": 3387, + "time_per_iteration": 2.597156524658203 + }, + { + "auxiliary_loss_clip": 0.01081003, + "auxiliary_loss_mlp": 0.01056402, + "balance_loss_clip": 1.02320182, + "balance_loss_mlp": 1.0281738, + "epoch": 0.20369758003908012, + "flos": 18914684751360.0, + "grad_norm": 2.441832893124233, + "language_loss": 0.81840312, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.83977717, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.52734375, + "step": 3388, + "time_per_iteration": 2.5161049365997314 + }, + { + "auxiliary_loss_clip": 0.01084263, + "auxiliary_loss_mlp": 0.0105857, + "balance_loss_clip": 1.02471447, + "balance_loss_mlp": 1.02758086, + "epoch": 0.20375770329174808, + "flos": 19205546221440.0, + "grad_norm": 1.8327658664859487, + "language_loss": 0.7355597, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.75698805, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.56640625, + "step": 3389, + "time_per_iteration": 2.5361363887786865 + }, + { + "auxiliary_loss_clip": 0.01080297, + "auxiliary_loss_mlp": 0.01054647, + "balance_loss_clip": 1.02236557, + "balance_loss_mlp": 1.02617085, + "epoch": 0.20381782654441605, + "flos": 19389992267520.0, + "grad_norm": 1.8813690128145175, + "language_loss": 0.8893441, + "learning_rate": 3.691350858126404e-06, + "loss": 0.91069353, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.5390625, + "step": 3390, + "time_per_iteration": 2.5815482139587402 + }, + { + "auxiliary_loss_clip": 0.01080487, + "auxiliary_loss_mlp": 0.01061363, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 1.02442288, + "epoch": 0.203877949797084, + "flos": 24826527671040.0, + "grad_norm": 2.590403192734352, + "language_loss": 0.73297858, + "learning_rate": 3.691142971316662e-06, + "loss": 0.75439709, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5625, + "step": 3391, + "time_per_iteration": 2.4868462085723877 + }, + { + "auxiliary_loss_clip": 0.01079446, + "auxiliary_loss_mlp": 0.01061406, + "balance_loss_clip": 1.02697802, + "balance_loss_mlp": 1.0242393, + "epoch": 0.20393807304975198, + "flos": 18002753372160.0, + "grad_norm": 2.441955708226163, + "language_loss": 0.88948178, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.91089034, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.55078125, + "step": 3392, + "time_per_iteration": 2.483677864074707 + }, + { + "auxiliary_loss_clip": 0.01082526, + "auxiliary_loss_mlp": 0.01059011, + "balance_loss_clip": 1.02162683, + "balance_loss_mlp": 1.02419424, + "epoch": 0.20399819630241997, + "flos": 24205841786880.0, + "grad_norm": 1.4008940769704208, + "language_loss": 0.81670475, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8381201, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5859375, + "step": 3393, + "time_per_iteration": 2.553766965866089 + }, + { + "auxiliary_loss_clip": 0.01079435, + "auxiliary_loss_mlp": 0.01055658, + "balance_loss_clip": 1.0229708, + "balance_loss_mlp": 1.0240674, + "epoch": 0.20405831955508794, + "flos": 20776079088000.0, + "grad_norm": 1.6832163701048999, + "language_loss": 0.87428629, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.89563721, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5546875, + "step": 3394, + "time_per_iteration": 2.491151809692383 + }, + { + "auxiliary_loss_clip": 0.01081471, + "auxiliary_loss_mlp": 0.01054626, + "balance_loss_clip": 1.01902962, + "balance_loss_mlp": 1.02578402, + "epoch": 0.2041184428077559, + "flos": 15486004304640.0, + "grad_norm": 2.297980616768581, + "language_loss": 0.86004883, + "learning_rate": 3.69031078287345e-06, + "loss": 0.88140988, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5546875, + "step": 3395, + "time_per_iteration": 2.4670214653015137 + }, + { + "auxiliary_loss_clip": 0.01084261, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_clip": 1.01575828, + "balance_loss_mlp": 1.02555966, + "epoch": 0.20417856606042387, + "flos": 15587776517760.0, + "grad_norm": 1.9271382792552922, + "language_loss": 0.86298382, + "learning_rate": 3.690102575501033e-06, + "loss": 0.88435137, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5859375, + "step": 3396, + "time_per_iteration": 2.5540900230407715 + }, + { + "auxiliary_loss_clip": 0.01079057, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.01362467, + "balance_loss_mlp": 1.02473426, + "epoch": 0.20423868931309183, + "flos": 24278216768640.0, + "grad_norm": 2.0306887647645886, + "language_loss": 0.79616487, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.81742525, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.54296875, + "step": 3397, + "time_per_iteration": 2.503183364868164 + }, + { + "auxiliary_loss_clip": 0.01079513, + "auxiliary_loss_mlp": 0.01052085, + "balance_loss_clip": 1.01882601, + "balance_loss_mlp": 1.02461553, + "epoch": 0.2042988125657598, + "flos": 18614152834560.0, + "grad_norm": 2.2973179795310585, + "language_loss": 0.89579952, + "learning_rate": 3.689685968497518e-06, + "loss": 0.91711545, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.546875, + "step": 3398, + "time_per_iteration": 2.4394702911376953 + }, + { + "auxiliary_loss_clip": 0.01082428, + "auxiliary_loss_mlp": 0.0106378, + "balance_loss_clip": 1.0257287, + "balance_loss_mlp": 1.02644777, + "epoch": 0.2043589358184278, + "flos": 17850462554880.0, + "grad_norm": 1.916967727223325, + "language_loss": 0.8019464, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.82340842, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.55859375, + "step": 3399, + "time_per_iteration": 2.4549760818481445 + }, + { + "auxiliary_loss_clip": 0.01081591, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.01605308, + "balance_loss_mlp": 1.02529454, + "epoch": 0.20441905907109575, + "flos": 21434121993600.0, + "grad_norm": 2.112543226758495, + "language_loss": 0.77452958, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.7958672, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5625, + "step": 3400, + "time_per_iteration": 2.482046365737915 + }, + { + "auxiliary_loss_clip": 0.01080747, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.01271224, + "balance_loss_mlp": 1.02632558, + "epoch": 0.20447918232376372, + "flos": 27706513190400.0, + "grad_norm": 1.6228678431976133, + "language_loss": 0.80872917, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.8299706, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.546875, + "step": 3401, + "time_per_iteration": 2.5284976959228516 + }, + { + "auxiliary_loss_clip": 0.01082363, + "auxiliary_loss_mlp": 0.01053504, + "balance_loss_clip": 1.01776505, + "balance_loss_mlp": 1.02507091, + "epoch": 0.20453930557643168, + "flos": 30522746833920.0, + "grad_norm": 1.5680828466368133, + "language_loss": 0.71153963, + "learning_rate": 3.688851985676991e-06, + "loss": 0.73289835, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5703125, + "step": 3402, + "time_per_iteration": 2.660501003265381 + }, + { + "auxiliary_loss_clip": 0.01081148, + "auxiliary_loss_mlp": 0.01049397, + "balance_loss_clip": 1.01718688, + "balance_loss_mlp": 1.02489662, + "epoch": 0.20459942882909965, + "flos": 18986815353600.0, + "grad_norm": 1.8242491661217968, + "language_loss": 0.8280772, + "learning_rate": 3.688643329848496e-06, + "loss": 0.84938264, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5625, + "step": 3403, + "time_per_iteration": 2.4998619556427 + }, + { + "auxiliary_loss_clip": 0.01080834, + "auxiliary_loss_mlp": 0.01060881, + "balance_loss_clip": 1.02561915, + "balance_loss_mlp": 1.02483559, + "epoch": 0.20465955208176762, + "flos": 20338023859200.0, + "grad_norm": 1.8483662171207136, + "language_loss": 0.85711038, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.87852752, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.55859375, + "step": 3404, + "time_per_iteration": 2.475260019302368 + }, + { + "auxiliary_loss_clip": 0.01079355, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.01823354, + "balance_loss_mlp": 1.0216608, + "epoch": 0.20471967533443558, + "flos": 21250234529280.0, + "grad_norm": 2.4795666752082184, + "language_loss": 0.87503958, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.89638788, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.578125, + "step": 3405, + "time_per_iteration": 2.5046210289001465 + }, + { + "auxiliary_loss_clip": 0.01077672, + "auxiliary_loss_mlp": 0.01052746, + "balance_loss_clip": 1.01772213, + "balance_loss_mlp": 1.02222705, + "epoch": 0.20477979858710357, + "flos": 14500685514240.0, + "grad_norm": 2.3004473795105325, + "language_loss": 0.85956526, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.88086951, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5546875, + "step": 3406, + "time_per_iteration": 2.5068047046661377 + }, + { + "auxiliary_loss_clip": 0.01079999, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.01773071, + "balance_loss_mlp": 1.02469444, + "epoch": 0.20483992183977154, + "flos": 11399525331840.0, + "grad_norm": 1.8719592103356284, + "language_loss": 0.69443166, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.71573848, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5546875, + "step": 3407, + "time_per_iteration": 2.4473609924316406 + }, + { + "auxiliary_loss_clip": 0.01078114, + "auxiliary_loss_mlp": 0.01053787, + "balance_loss_clip": 1.01811934, + "balance_loss_mlp": 1.0219233, + "epoch": 0.2049000450924395, + "flos": 19059329980800.0, + "grad_norm": 2.0956891094529464, + "language_loss": 0.85687715, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.87819618, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5625, + "step": 3408, + "time_per_iteration": 2.4824230670928955 + }, + { + "auxiliary_loss_clip": 0.01083142, + "auxiliary_loss_mlp": 0.0105932, + "balance_loss_clip": 1.02291358, + "balance_loss_mlp": 1.02445054, + "epoch": 0.20496016834510747, + "flos": 14573688900480.0, + "grad_norm": 2.1162215923682965, + "language_loss": 0.65972614, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.68115079, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.58984375, + "step": 3409, + "time_per_iteration": 2.4528567790985107 + }, + { + "auxiliary_loss_clip": 0.01079508, + "auxiliary_loss_mlp": 0.01057204, + "balance_loss_clip": 1.02146482, + "balance_loss_mlp": 1.02152753, + "epoch": 0.20502029159777543, + "flos": 22125576936960.0, + "grad_norm": 1.5327772103257251, + "language_loss": 0.81990111, + "learning_rate": 3.687180946553745e-06, + "loss": 0.84126818, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.578125, + "step": 3410, + "time_per_iteration": 2.5217361450195312 + }, + { + "auxiliary_loss_clip": 0.010765, + "auxiliary_loss_mlp": 0.01053156, + "balance_loss_clip": 1.01853764, + "balance_loss_mlp": 1.02225101, + "epoch": 0.2050804148504434, + "flos": 25366913694720.0, + "grad_norm": 2.1165403750549183, + "language_loss": 0.78134012, + "learning_rate": 3.686971778678803e-06, + "loss": 0.80263668, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.54296875, + "step": 3411, + "time_per_iteration": 2.555506944656372 + }, + { + "auxiliary_loss_clip": 0.01077983, + "auxiliary_loss_mlp": 0.01056447, + "balance_loss_clip": 1.02287769, + "balance_loss_mlp": 1.02289391, + "epoch": 0.2051405381031114, + "flos": 23619126522240.0, + "grad_norm": 1.9862186391934864, + "language_loss": 0.75212622, + "learning_rate": 3.686762546833722e-06, + "loss": 0.77347052, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5546875, + "step": 3412, + "time_per_iteration": 2.4741744995117188 + }, + { + "auxiliary_loss_clip": 0.01080721, + "auxiliary_loss_mlp": 0.01061385, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.02154613, + "epoch": 0.20520066135577936, + "flos": 19564732955520.0, + "grad_norm": 2.53559594632031, + "language_loss": 0.80178857, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.82320964, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.59375, + "step": 3413, + "time_per_iteration": 2.3855199813842773 + }, + { + "auxiliary_loss_clip": 0.01075017, + "auxiliary_loss_mlp": 0.01054537, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.02152658, + "epoch": 0.20526078460844732, + "flos": 17675372753280.0, + "grad_norm": 2.2932244490416616, + "language_loss": 0.85955203, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.88084751, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.53515625, + "step": 3414, + "time_per_iteration": 2.3977506160736084 + }, + { + "auxiliary_loss_clip": 0.01078475, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.01761889, + "balance_loss_mlp": 1.0209918, + "epoch": 0.2053209078611153, + "flos": 21499444880640.0, + "grad_norm": 2.2484380593846636, + "language_loss": 0.82257795, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.84390867, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.57421875, + "step": 3415, + "time_per_iteration": 2.475327730178833 + }, + { + "auxiliary_loss_clip": 0.01078817, + "auxiliary_loss_mlp": 0.01054044, + "balance_loss_clip": 1.02083254, + "balance_loss_mlp": 1.02266192, + "epoch": 0.20538103111378325, + "flos": 25662418375680.0, + "grad_norm": 3.0297632718918157, + "language_loss": 0.74498451, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.76631314, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5625, + "step": 3416, + "time_per_iteration": 2.4483890533447266 + }, + { + "auxiliary_loss_clip": 0.01079034, + "auxiliary_loss_mlp": 0.0105512, + "balance_loss_clip": 1.0203588, + "balance_loss_mlp": 1.02199173, + "epoch": 0.20544115436645122, + "flos": 23147833812480.0, + "grad_norm": 2.3860077446664283, + "language_loss": 0.80287045, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.82421196, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5703125, + "step": 3417, + "time_per_iteration": 3.812279462814331 + }, + { + "auxiliary_loss_clip": 0.01080304, + "auxiliary_loss_mlp": 0.01057842, + "balance_loss_clip": 1.01971865, + "balance_loss_mlp": 1.02265525, + "epoch": 0.20550127761911918, + "flos": 19389433685760.0, + "grad_norm": 2.1240258662215625, + "language_loss": 0.89830112, + "learning_rate": 3.685505812834798e-06, + "loss": 0.91968262, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.578125, + "step": 3418, + "time_per_iteration": 2.4312901496887207 + }, + { + "auxiliary_loss_clip": 0.01079762, + "auxiliary_loss_mlp": 0.01062499, + "balance_loss_clip": 1.02485275, + "balance_loss_mlp": 1.02160537, + "epoch": 0.20556140087178718, + "flos": 22892025214080.0, + "grad_norm": 2.2090514405943886, + "language_loss": 0.63966012, + "learning_rate": 3.685296133421035e-06, + "loss": 0.66108268, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.58203125, + "step": 3419, + "time_per_iteration": 5.221128225326538 + }, + { + "auxiliary_loss_clip": 0.01083013, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_clip": 1.01837397, + "balance_loss_mlp": 1.0244441, + "epoch": 0.20562152412445514, + "flos": 19788700527360.0, + "grad_norm": 1.7941150173476457, + "language_loss": 0.87209088, + "learning_rate": 3.685086390100674e-06, + "loss": 0.89353132, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.5859375, + "step": 3420, + "time_per_iteration": 3.8003735542297363 + }, + { + "auxiliary_loss_clip": 0.0107978, + "auxiliary_loss_mlp": 0.0105492, + "balance_loss_clip": 1.0182507, + "balance_loss_mlp": 1.02312565, + "epoch": 0.2056816473771231, + "flos": 31500699327360.0, + "grad_norm": 2.8645725146676906, + "language_loss": 0.73578554, + "learning_rate": 3.684876582881668e-06, + "loss": 0.75713253, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5625, + "step": 3421, + "time_per_iteration": 2.541628360748291 + }, + { + "auxiliary_loss_clip": 0.01077524, + "auxiliary_loss_mlp": 0.01054199, + "balance_loss_clip": 1.01772141, + "balance_loss_mlp": 1.02217698, + "epoch": 0.20574177062979107, + "flos": 23257251613440.0, + "grad_norm": 1.9289920939675889, + "language_loss": 0.7282325, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.74954975, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5546875, + "step": 3422, + "time_per_iteration": 2.5945487022399902 + }, + { + "auxiliary_loss_clip": 0.01033335, + "auxiliary_loss_mlp": 0.01038393, + "balance_loss_clip": 1.032933, + "balance_loss_mlp": 1.01216793, + "epoch": 0.20580189388245904, + "flos": 70309417777920.0, + "grad_norm": 0.8325219443895382, + "language_loss": 0.5571512, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57786846, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.0546875, + "router_z_loss_mlp": 0.21191406, + "step": 3423, + "time_per_iteration": 3.1985268592834473 + }, + { + "auxiliary_loss_clip": 0.010795, + "auxiliary_loss_mlp": 0.01056185, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.02165985, + "epoch": 0.205862017135127, + "flos": 30736520288640.0, + "grad_norm": 2.0242679550889546, + "language_loss": 0.73818767, + "learning_rate": 3.684246777912353e-06, + "loss": 0.75954449, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.578125, + "step": 3424, + "time_per_iteration": 2.55841064453125 + }, + { + "auxiliary_loss_clip": 0.010811, + "auxiliary_loss_mlp": 0.01055413, + "balance_loss_clip": 1.02005589, + "balance_loss_mlp": 1.02420771, + "epoch": 0.20592214038779497, + "flos": 21323482295040.0, + "grad_norm": 1.392352559189381, + "language_loss": 0.7596736, + "learning_rate": 3.684036715178351e-06, + "loss": 0.7810387, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5703125, + "step": 3425, + "time_per_iteration": 2.4185333251953125 + }, + { + "auxiliary_loss_clip": 0.0108057, + "auxiliary_loss_mlp": 0.01058527, + "balance_loss_clip": 1.02030873, + "balance_loss_mlp": 1.02395225, + "epoch": 0.20598226364046296, + "flos": 22890593848320.0, + "grad_norm": 1.7128955290262635, + "language_loss": 0.89190334, + "learning_rate": 3.683826588585508e-06, + "loss": 0.91329432, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.56640625, + "step": 3426, + "time_per_iteration": 2.431118965148926 + }, + { + "auxiliary_loss_clip": 0.01080031, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_clip": 1.02474856, + "balance_loss_mlp": 1.0242995, + "epoch": 0.20604238689313092, + "flos": 23877413827200.0, + "grad_norm": 1.7816908951563362, + "language_loss": 0.78802097, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.80944562, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.55859375, + "step": 3427, + "time_per_iteration": 2.438218116760254 + }, + { + "auxiliary_loss_clip": 0.0108135, + "auxiliary_loss_mlp": 0.0106142, + "balance_loss_clip": 1.02227187, + "balance_loss_mlp": 1.02267039, + "epoch": 0.2061025101457989, + "flos": 22490454222720.0, + "grad_norm": 1.4752005978713252, + "language_loss": 0.75699723, + "learning_rate": 3.683406143855174e-06, + "loss": 0.77842492, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5859375, + "step": 3428, + "time_per_iteration": 2.4123916625976562 + }, + { + "auxiliary_loss_clip": 0.01080412, + "auxiliary_loss_mlp": 0.0106419, + "balance_loss_clip": 1.02301526, + "balance_loss_mlp": 1.02141988, + "epoch": 0.20616263339846685, + "flos": 22777964202240.0, + "grad_norm": 3.459563831983276, + "language_loss": 0.74928808, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.77073413, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.58984375, + "step": 3429, + "time_per_iteration": 2.3799846172332764 + }, + { + "auxiliary_loss_clip": 0.0108691, + "auxiliary_loss_mlp": 0.0106323, + "balance_loss_clip": 1.02174497, + "balance_loss_mlp": 1.02726388, + "epoch": 0.20622275665113482, + "flos": 20881272614400.0, + "grad_norm": 1.8368102908029464, + "language_loss": 0.87156141, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.89306277, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.59765625, + "step": 3430, + "time_per_iteration": 2.4000511169433594 + }, + { + "auxiliary_loss_clip": 0.01082276, + "auxiliary_loss_mlp": 0.01064193, + "balance_loss_clip": 1.02285171, + "balance_loss_mlp": 1.02295256, + "epoch": 0.20628287990380278, + "flos": 19353403296000.0, + "grad_norm": 1.4931025478692177, + "language_loss": 0.71355724, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.73502189, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.59375, + "step": 3431, + "time_per_iteration": 2.4930005073547363 + }, + { + "auxiliary_loss_clip": 0.01034123, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.02020741, + "balance_loss_mlp": 1.01238775, + "epoch": 0.20634300315647078, + "flos": 71514759156480.0, + "grad_norm": 0.8246060400423088, + "language_loss": 0.6021865, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62280273, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.07275391, + "router_z_loss_mlp": 0.21679688, + "step": 3432, + "time_per_iteration": 3.208040475845337 + }, + { + "auxiliary_loss_clip": 0.01082014, + "auxiliary_loss_mlp": 0.01057958, + "balance_loss_clip": 1.01995432, + "balance_loss_mlp": 1.02340603, + "epoch": 0.20640312640913874, + "flos": 21722923693440.0, + "grad_norm": 1.9261022170804483, + "language_loss": 0.73669529, + "learning_rate": 3.682353915057679e-06, + "loss": 0.75809503, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5859375, + "step": 3433, + "time_per_iteration": 2.531592845916748 + }, + { + "auxiliary_loss_clip": 0.01082055, + "auxiliary_loss_mlp": 0.01068704, + "balance_loss_clip": 1.02993703, + "balance_loss_mlp": 1.02340579, + "epoch": 0.2064632496618067, + "flos": 20553682527360.0, + "grad_norm": 1.8957738984038566, + "language_loss": 0.87363064, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89513826, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5859375, + "step": 3434, + "time_per_iteration": 2.405364513397217 + }, + { + "auxiliary_loss_clip": 0.0108438, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.02893662, + "balance_loss_mlp": 1.02298903, + "epoch": 0.20652337291447467, + "flos": 29822040380160.0, + "grad_norm": 2.061814239983077, + "language_loss": 0.7072596, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.72880745, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.6171875, + "step": 3435, + "time_per_iteration": 2.431727647781372 + }, + { + "auxiliary_loss_clip": 0.0108102, + "auxiliary_loss_mlp": 0.01059518, + "balance_loss_clip": 1.02244377, + "balance_loss_mlp": 1.02384281, + "epoch": 0.20658349616714264, + "flos": 26212439934720.0, + "grad_norm": 1.5973930367228042, + "language_loss": 0.90899295, + "learning_rate": 3.681721812174988e-06, + "loss": 0.93039829, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5703125, + "step": 3436, + "time_per_iteration": 2.4210095405578613 + }, + { + "auxiliary_loss_clip": 0.01082341, + "auxiliary_loss_mlp": 0.01066704, + "balance_loss_clip": 1.02855754, + "balance_loss_mlp": 1.02380693, + "epoch": 0.2066436194198106, + "flos": 25993185396480.0, + "grad_norm": 2.9713264921302525, + "language_loss": 0.79147828, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.81296873, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5859375, + "step": 3437, + "time_per_iteration": 2.4167771339416504 + }, + { + "auxiliary_loss_clip": 0.01080874, + "auxiliary_loss_mlp": 0.01077791, + "balance_loss_clip": 1.03964353, + "balance_loss_mlp": 1.02237725, + "epoch": 0.20670374267247857, + "flos": 21360001443840.0, + "grad_norm": 1.9327046772419387, + "language_loss": 0.79931521, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.82090181, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5859375, + "step": 3438, + "time_per_iteration": 2.445948600769043 + }, + { + "auxiliary_loss_clip": 0.01031379, + "auxiliary_loss_mlp": 0.01065617, + "balance_loss_clip": 1.06018102, + "balance_loss_mlp": 1.008479, + "epoch": 0.20676386592514656, + "flos": 66379977567360.0, + "grad_norm": 0.8457568754135891, + "language_loss": 0.67197919, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69294918, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.05444336, + "router_z_loss_mlp": 0.22851562, + "step": 3439, + "time_per_iteration": 2.999277114868164 + }, + { + "auxiliary_loss_clip": 0.01082814, + "auxiliary_loss_mlp": 0.01067948, + "balance_loss_clip": 1.0284903, + "balance_loss_mlp": 1.02290118, + "epoch": 0.20682398917781453, + "flos": 17273627205120.0, + "grad_norm": 7.113654006740864, + "language_loss": 0.8594929, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.88100052, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.59765625, + "step": 3440, + "time_per_iteration": 2.3838064670562744 + }, + { + "auxiliary_loss_clip": 0.01082389, + "auxiliary_loss_mlp": 0.01061164, + "balance_loss_clip": 1.02482891, + "balance_loss_mlp": 1.02429354, + "epoch": 0.2068841124304825, + "flos": 18076315340160.0, + "grad_norm": 1.7907523385707482, + "language_loss": 0.86460376, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.88603926, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.58203125, + "step": 3441, + "time_per_iteration": 2.3778467178344727 + }, + { + "auxiliary_loss_clip": 0.01080851, + "auxiliary_loss_mlp": 0.01066987, + "balance_loss_clip": 1.02872038, + "balance_loss_mlp": 1.02391016, + "epoch": 0.20694423568315046, + "flos": 27345720533760.0, + "grad_norm": 1.8399187021927128, + "language_loss": 0.87594664, + "learning_rate": 3.680455884806959e-06, + "loss": 0.89742506, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5703125, + "step": 3442, + "time_per_iteration": 2.487086057662964 + }, + { + "auxiliary_loss_clip": 0.01083552, + "auxiliary_loss_mlp": 0.01064763, + "balance_loss_clip": 1.02437472, + "balance_loss_mlp": 1.02508533, + "epoch": 0.20700435893581842, + "flos": 20228815526400.0, + "grad_norm": 1.8898497726678651, + "language_loss": 0.7408967, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.76237988, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.5859375, + "step": 3443, + "time_per_iteration": 2.393321990966797 + }, + { + "auxiliary_loss_clip": 0.01081818, + "auxiliary_loss_mlp": 0.01058934, + "balance_loss_clip": 1.02195525, + "balance_loss_mlp": 1.02354717, + "epoch": 0.2070644821884864, + "flos": 20630072315520.0, + "grad_norm": 1.7833026802980583, + "language_loss": 0.86340815, + "learning_rate": 3.680033399147797e-06, + "loss": 0.88481563, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.58203125, + "step": 3444, + "time_per_iteration": 2.4629693031311035 + }, + { + "auxiliary_loss_clip": 0.01029894, + "auxiliary_loss_mlp": 0.01008196, + "balance_loss_clip": 1.00128222, + "balance_loss_mlp": 1.01031518, + "epoch": 0.20712460544115438, + "flos": 65937907532160.0, + "grad_norm": 0.710896124302317, + "language_loss": 0.57233882, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59271967, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.06933594, + "router_z_loss_mlp": 0.1953125, + "step": 3445, + "time_per_iteration": 2.9888806343078613 + }, + { + "auxiliary_loss_clip": 0.0108351, + "auxiliary_loss_mlp": 0.01060906, + "balance_loss_clip": 1.02070904, + "balance_loss_mlp": 1.02422142, + "epoch": 0.20718472869382235, + "flos": 19424765848320.0, + "grad_norm": 1.5551783445421308, + "language_loss": 0.79348969, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.81493384, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.59375, + "step": 3446, + "time_per_iteration": 2.541849136352539 + }, + { + "auxiliary_loss_clip": 0.0108625, + "auxiliary_loss_mlp": 0.01067196, + "balance_loss_clip": 1.02237344, + "balance_loss_mlp": 1.02435637, + "epoch": 0.2072448519464903, + "flos": 24497890243200.0, + "grad_norm": 2.1721581253454754, + "language_loss": 0.65285647, + "learning_rate": 3.679399192876334e-06, + "loss": 0.67439097, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.6171875, + "step": 3447, + "time_per_iteration": 2.4382736682891846 + }, + { + "auxiliary_loss_clip": 0.01084438, + "auxiliary_loss_mlp": 0.01065234, + "balance_loss_clip": 1.02320123, + "balance_loss_mlp": 1.0251044, + "epoch": 0.20730497519915828, + "flos": 23074586046720.0, + "grad_norm": 1.7109040521488625, + "language_loss": 0.88112891, + "learning_rate": 3.679187663409184e-06, + "loss": 0.90262568, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.59375, + "step": 3448, + "time_per_iteration": 2.4012832641601562 + }, + { + "auxiliary_loss_clip": 0.01081615, + "auxiliary_loss_mlp": 0.01065667, + "balance_loss_clip": 1.02275133, + "balance_loss_mlp": 1.02347136, + "epoch": 0.20736509845182624, + "flos": 21067987898880.0, + "grad_norm": 1.9360048281608466, + "language_loss": 0.77406693, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.79553974, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.58203125, + "step": 3449, + "time_per_iteration": 2.5030901432037354 + }, + { + "auxiliary_loss_clip": 0.0108582, + "auxiliary_loss_mlp": 0.01063154, + "balance_loss_clip": 1.01916552, + "balance_loss_mlp": 1.02312648, + "epoch": 0.2074252217044942, + "flos": 17632499736960.0, + "grad_norm": 2.083926693713025, + "language_loss": 0.78036571, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.80185544, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.625, + "step": 3450, + "time_per_iteration": 2.4779601097106934 + }, + { + "auxiliary_loss_clip": 0.01083465, + "auxiliary_loss_mlp": 0.01066219, + "balance_loss_clip": 1.02254081, + "balance_loss_mlp": 1.02321923, + "epoch": 0.20748534495716217, + "flos": 23545948579200.0, + "grad_norm": 1.5662543310249561, + "language_loss": 0.83037031, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.85186714, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.6015625, + "step": 3451, + "time_per_iteration": 2.4809937477111816 + }, + { + "auxiliary_loss_clip": 0.01026974, + "auxiliary_loss_mlp": 0.01067036, + "balance_loss_clip": 1.06074131, + "balance_loss_mlp": 1.00640106, + "epoch": 0.20754546820983016, + "flos": 52250313738240.0, + "grad_norm": 0.834651532400257, + "language_loss": 0.56653333, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58747351, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.06298828, + "router_z_loss_mlp": 0.20507812, + "step": 3452, + "time_per_iteration": 2.9846134185791016 + }, + { + "auxiliary_loss_clip": 0.01086407, + "auxiliary_loss_mlp": 0.01072722, + "balance_loss_clip": 1.03090334, + "balance_loss_mlp": 1.02404809, + "epoch": 0.20760559146249813, + "flos": 20411341447680.0, + "grad_norm": 1.8188041860647663, + "language_loss": 0.90183049, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.92342174, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.625, + "step": 3453, + "time_per_iteration": 2.4110021591186523 + }, + { + "auxiliary_loss_clip": 0.01082842, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_clip": 1.02254844, + "balance_loss_mlp": 1.0227809, + "epoch": 0.2076657147151661, + "flos": 23184876631680.0, + "grad_norm": 1.5689014006818618, + "language_loss": 0.81531161, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.83678401, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6015625, + "step": 3454, + "time_per_iteration": 2.513597011566162 + }, + { + "auxiliary_loss_clip": 0.0108189, + "auxiliary_loss_mlp": 0.0106854, + "balance_loss_clip": 1.02562428, + "balance_loss_mlp": 1.02203548, + "epoch": 0.20772583796783406, + "flos": 18292323121920.0, + "grad_norm": 4.5523196335609315, + "language_loss": 0.79911673, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.82062107, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.59765625, + "step": 3455, + "time_per_iteration": 2.3937573432922363 + }, + { + "auxiliary_loss_clip": 0.01082722, + "auxiliary_loss_mlp": 0.01067554, + "balance_loss_clip": 1.02664161, + "balance_loss_mlp": 1.02329707, + "epoch": 0.20778596122050202, + "flos": 17601845696640.0, + "grad_norm": 1.798640563179911, + "language_loss": 0.82354736, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.84505016, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.59375, + "step": 3456, + "time_per_iteration": 2.500481367111206 + }, + { + "auxiliary_loss_clip": 0.01084907, + "auxiliary_loss_mlp": 0.01068578, + "balance_loss_clip": 1.02420878, + "balance_loss_mlp": 1.02357125, + "epoch": 0.20784608447317, + "flos": 23804445352320.0, + "grad_norm": 1.6504554787600385, + "language_loss": 0.79892361, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.82045853, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 0.6171875, + "step": 3457, + "time_per_iteration": 3.891096830368042 + }, + { + "auxiliary_loss_clip": 0.01086265, + "auxiliary_loss_mlp": 0.01098262, + "balance_loss_clip": 1.05253351, + "balance_loss_mlp": 1.02315319, + "epoch": 0.20790620772583795, + "flos": 17638329934080.0, + "grad_norm": 1.884268839282867, + "language_loss": 0.85359001, + "learning_rate": 3.677068867939333e-06, + "loss": 0.87543529, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 0.6328125, + "step": 3458, + "time_per_iteration": 5.299649715423584 + }, + { + "auxiliary_loss_clip": 0.01081607, + "auxiliary_loss_mlp": 0.01076643, + "balance_loss_clip": 1.03756666, + "balance_loss_mlp": 1.02312899, + "epoch": 0.20796633097850595, + "flos": 27672228368640.0, + "grad_norm": 1.7777466724680218, + "language_loss": 0.77338862, + "learning_rate": 3.676856638489272e-06, + "loss": 0.79497111, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5859375, + "step": 3459, + "time_per_iteration": 2.4948678016662598 + }, + { + "auxiliary_loss_clip": 0.01079926, + "auxiliary_loss_mlp": 0.01092869, + "balance_loss_clip": 1.05481744, + "balance_loss_mlp": 1.02231193, + "epoch": 0.2080264542311739, + "flos": 19244578988160.0, + "grad_norm": 2.9488171572086737, + "language_loss": 0.78610408, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.807832, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.578125, + "step": 3460, + "time_per_iteration": 3.8273770809173584 + }, + { + "auxiliary_loss_clip": 0.01083825, + "auxiliary_loss_mlp": 0.01087094, + "balance_loss_clip": 1.04672968, + "balance_loss_mlp": 1.0236944, + "epoch": 0.20808657748384188, + "flos": 27524720407680.0, + "grad_norm": 2.1557182452839054, + "language_loss": 0.76956534, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.79127455, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.6015625, + "step": 3461, + "time_per_iteration": 2.4912352561950684 + }, + { + "auxiliary_loss_clip": 0.01083629, + "auxiliary_loss_mlp": 0.01084344, + "balance_loss_clip": 1.04264426, + "balance_loss_mlp": 1.02197635, + "epoch": 0.20814670073650984, + "flos": 26905710268800.0, + "grad_norm": 2.352920069315673, + "language_loss": 0.90057266, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.9222523, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.6171875, + "step": 3462, + "time_per_iteration": 2.5406596660614014 + }, + { + "auxiliary_loss_clip": 0.01029412, + "auxiliary_loss_mlp": 0.01076399, + "balance_loss_clip": 1.07196426, + "balance_loss_mlp": 1.0110569, + "epoch": 0.2082068239891778, + "flos": 70172383224960.0, + "grad_norm": 0.7836881169329297, + "language_loss": 0.59117579, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.61223388, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.04443359, + "router_z_loss_mlp": 0.18359375, + "step": 3463, + "time_per_iteration": 3.1784863471984863 + }, + { + "auxiliary_loss_clip": 0.01081233, + "auxiliary_loss_mlp": 0.01077884, + "balance_loss_clip": 1.03322852, + "balance_loss_mlp": 1.02097869, + "epoch": 0.20826694724184577, + "flos": 24606924019200.0, + "grad_norm": 2.43066250151913, + "language_loss": 0.69575334, + "learning_rate": 3.675794537601429e-06, + "loss": 0.71734452, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.6015625, + "step": 3464, + "time_per_iteration": 2.463078498840332 + }, + { + "auxiliary_loss_clip": 0.01085528, + "auxiliary_loss_mlp": 0.01075422, + "balance_loss_clip": 1.03140998, + "balance_loss_mlp": 1.02391517, + "epoch": 0.20832707049451377, + "flos": 12892097399040.0, + "grad_norm": 1.8719361988412462, + "language_loss": 0.86314642, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.88475591, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 0.6171875, + "step": 3465, + "time_per_iteration": 2.4493515491485596 + }, + { + "auxiliary_loss_clip": 0.01086221, + "auxiliary_loss_mlp": 0.01065666, + "balance_loss_clip": 1.02320361, + "balance_loss_mlp": 1.02550006, + "epoch": 0.20838719374718173, + "flos": 22197777361920.0, + "grad_norm": 2.2062954331755966, + "language_loss": 0.83942378, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.8609426, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.60546875, + "step": 3466, + "time_per_iteration": 2.45220947265625 + }, + { + "auxiliary_loss_clip": 0.01081548, + "auxiliary_loss_mlp": 0.01059279, + "balance_loss_clip": 1.02098918, + "balance_loss_mlp": 1.02347159, + "epoch": 0.2084473169998497, + "flos": 15157750901760.0, + "grad_norm": 1.7719595531401438, + "language_loss": 0.83769011, + "learning_rate": 3.675156514448716e-06, + "loss": 0.85909843, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.58203125, + "step": 3467, + "time_per_iteration": 2.497887134552002 + }, + { + "auxiliary_loss_clip": 0.01080472, + "auxiliary_loss_mlp": 0.01056439, + "balance_loss_clip": 1.01998472, + "balance_loss_mlp": 1.02457142, + "epoch": 0.20850744025251766, + "flos": 17455838924160.0, + "grad_norm": 1.9984752380161444, + "language_loss": 0.83218479, + "learning_rate": 3.674943713009518e-06, + "loss": 0.85355389, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.55859375, + "step": 3468, + "time_per_iteration": 2.4471094608306885 + }, + { + "auxiliary_loss_clip": 0.01083759, + "auxiliary_loss_mlp": 0.01064324, + "balance_loss_clip": 1.02107453, + "balance_loss_mlp": 1.02384639, + "epoch": 0.20856756350518563, + "flos": 25697890183680.0, + "grad_norm": 1.9447082890174219, + "language_loss": 0.9133445, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.9348253, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.6015625, + "step": 3469, + "time_per_iteration": 2.5326671600341797 + }, + { + "auxiliary_loss_clip": 0.01085035, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_clip": 1.02361465, + "balance_loss_mlp": 1.02733493, + "epoch": 0.2086276867578536, + "flos": 37887535560960.0, + "grad_norm": 1.8474981704439475, + "language_loss": 0.78199339, + "learning_rate": 3.674517919597092e-06, + "loss": 0.80347306, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.578125, + "step": 3470, + "time_per_iteration": 2.606663465499878 + }, + { + "auxiliary_loss_clip": 0.01080488, + "auxiliary_loss_mlp": 0.01063044, + "balance_loss_clip": 1.02344298, + "balance_loss_mlp": 1.02296996, + "epoch": 0.20868781001052156, + "flos": 25555863306240.0, + "grad_norm": 2.328080468389899, + "language_loss": 0.77165937, + "learning_rate": 3.674304927640011e-06, + "loss": 0.79309464, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.57421875, + "step": 3471, + "time_per_iteration": 2.6040830612182617 + }, + { + "auxiliary_loss_clip": 0.01085884, + "auxiliary_loss_mlp": 0.01063108, + "balance_loss_clip": 1.02009678, + "balance_loss_mlp": 1.0238719, + "epoch": 0.20874793326318955, + "flos": 27527897341440.0, + "grad_norm": 2.0834732381917487, + "language_loss": 0.77362728, + "learning_rate": 3.67409187219312e-06, + "loss": 0.79511726, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.62109375, + "step": 3472, + "time_per_iteration": 2.5315070152282715 + }, + { + "auxiliary_loss_clip": 0.0108492, + "auxiliary_loss_mlp": 0.01065155, + "balance_loss_clip": 1.02698469, + "balance_loss_mlp": 1.02654386, + "epoch": 0.20880805651585752, + "flos": 18547887340800.0, + "grad_norm": 2.001030060739717, + "language_loss": 0.86416578, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.88566649, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.58203125, + "step": 3473, + "time_per_iteration": 2.480185031890869 + }, + { + "auxiliary_loss_clip": 0.01023624, + "auxiliary_loss_mlp": 0.01015565, + "balance_loss_clip": 1.00981879, + "balance_loss_mlp": 1.00638008, + "epoch": 0.20886817976852548, + "flos": 65943318792960.0, + "grad_norm": 0.9007945072550463, + "language_loss": 0.63710821, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65750003, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.05737305, + "router_z_loss_mlp": 0.171875, + "step": 3474, + "time_per_iteration": 3.014744997024536 + }, + { + "auxiliary_loss_clip": 0.01087789, + "auxiliary_loss_mlp": 0.01065561, + "balance_loss_clip": 1.02369452, + "balance_loss_mlp": 1.02705932, + "epoch": 0.20892830302119345, + "flos": 36537688598400.0, + "grad_norm": 1.9005096448107774, + "language_loss": 0.72705245, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.74858594, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.60546875, + "step": 3475, + "time_per_iteration": 2.5894439220428467 + }, + { + "auxiliary_loss_clip": 0.010851, + "auxiliary_loss_mlp": 0.0106005, + "balance_loss_clip": 1.02080584, + "balance_loss_mlp": 1.02672887, + "epoch": 0.2089884262738614, + "flos": 20955777189120.0, + "grad_norm": 1.6112843232057985, + "language_loss": 0.72309959, + "learning_rate": 3.673239015669065e-06, + "loss": 0.74455106, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.5859375, + "step": 3476, + "time_per_iteration": 2.4683215618133545 + }, + { + "auxiliary_loss_clip": 0.01083568, + "auxiliary_loss_mlp": 0.01056778, + "balance_loss_clip": 1.01937032, + "balance_loss_mlp": 1.02644646, + "epoch": 0.20904854952652938, + "flos": 22782921615360.0, + "grad_norm": 1.8187398228568794, + "language_loss": 0.90713066, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.92853415, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5703125, + "step": 3477, + "time_per_iteration": 2.5477263927459717 + }, + { + "auxiliary_loss_clip": 0.01083927, + "auxiliary_loss_mlp": 0.01059135, + "balance_loss_clip": 1.02110767, + "balance_loss_mlp": 1.02620232, + "epoch": 0.20910867277919734, + "flos": 27302184201600.0, + "grad_norm": 2.2941911775548673, + "language_loss": 0.70504916, + "learning_rate": 3.672812206678344e-06, + "loss": 0.72647977, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.578125, + "step": 3478, + "time_per_iteration": 2.4730052947998047 + }, + { + "auxiliary_loss_clip": 0.01082967, + "auxiliary_loss_mlp": 0.01068742, + "balance_loss_clip": 1.03078604, + "balance_loss_mlp": 1.02595866, + "epoch": 0.20916879603186533, + "flos": 14318369061120.0, + "grad_norm": 1.9265775073012146, + "language_loss": 0.86240149, + "learning_rate": 3.672598707029127e-06, + "loss": 0.88391852, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5703125, + "step": 3479, + "time_per_iteration": 2.4959194660186768 + }, + { + "auxiliary_loss_clip": 0.01084455, + "auxiliary_loss_mlp": 0.0107245, + "balance_loss_clip": 1.03439832, + "balance_loss_mlp": 1.02585673, + "epoch": 0.2092289192845333, + "flos": 22271932823040.0, + "grad_norm": 2.6016377706739755, + "language_loss": 0.7616297, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.78319877, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5859375, + "step": 3480, + "time_per_iteration": 2.47739577293396 + }, + { + "auxiliary_loss_clip": 0.01078733, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_clip": 1.04080653, + "balance_loss_mlp": 1.02333844, + "epoch": 0.20928904253720126, + "flos": 14829811701120.0, + "grad_norm": 1.92079381505119, + "language_loss": 0.77507699, + "learning_rate": 3.67217151746346e-06, + "loss": 0.79661596, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5546875, + "step": 3481, + "time_per_iteration": 2.4941303730010986 + }, + { + "auxiliary_loss_clip": 0.01082629, + "auxiliary_loss_mlp": 0.01075945, + "balance_loss_clip": 1.04056406, + "balance_loss_mlp": 1.02446556, + "epoch": 0.20934916578986923, + "flos": 23258019663360.0, + "grad_norm": 1.859210068912591, + "language_loss": 0.86723137, + "learning_rate": 3.671957827563209e-06, + "loss": 0.88881713, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.58203125, + "step": 3482, + "time_per_iteration": 2.460158348083496 + }, + { + "auxiliary_loss_clip": 0.010816, + "auxiliary_loss_mlp": 0.01060279, + "balance_loss_clip": 1.02301466, + "balance_loss_mlp": 1.02432871, + "epoch": 0.2094092890425372, + "flos": 32013049662720.0, + "grad_norm": 2.049512072197987, + "language_loss": 0.72962904, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.75104785, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5703125, + "step": 3483, + "time_per_iteration": 2.554166793823242 + }, + { + "auxiliary_loss_clip": 0.0108243, + "auxiliary_loss_mlp": 0.01086651, + "balance_loss_clip": 1.04495144, + "balance_loss_mlp": 1.02306306, + "epoch": 0.20946941229520516, + "flos": 20009630810880.0, + "grad_norm": 1.618577042073715, + "language_loss": 0.77365232, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.7953431, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.59375, + "step": 3484, + "time_per_iteration": 2.447955369949341 + }, + { + "auxiliary_loss_clip": 0.0108159, + "auxiliary_loss_mlp": 0.01063806, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.02442789, + "epoch": 0.20952953554787315, + "flos": 30738684792960.0, + "grad_norm": 1.648411291324392, + "language_loss": 0.71616703, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.73762107, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5703125, + "step": 3485, + "time_per_iteration": 2.5829174518585205 + }, + { + "auxiliary_loss_clip": 0.0108349, + "auxiliary_loss_mlp": 0.01062726, + "balance_loss_clip": 1.02414954, + "balance_loss_mlp": 1.02446544, + "epoch": 0.20958965880054112, + "flos": 27048086259840.0, + "grad_norm": 1.88693161329905, + "language_loss": 0.84866309, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.87012523, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.58984375, + "step": 3486, + "time_per_iteration": 2.488337993621826 + }, + { + "auxiliary_loss_clip": 0.01080647, + "auxiliary_loss_mlp": 0.01065614, + "balance_loss_clip": 1.02811074, + "balance_loss_mlp": 1.02222359, + "epoch": 0.20964978205320908, + "flos": 34202697402240.0, + "grad_norm": 1.706761308043827, + "language_loss": 0.88384086, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.90530348, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5859375, + "step": 3487, + "time_per_iteration": 2.6209545135498047 + }, + { + "auxiliary_loss_clip": 0.01077501, + "auxiliary_loss_mlp": 0.01056964, + "balance_loss_clip": 1.02036667, + "balance_loss_mlp": 1.02181244, + "epoch": 0.20970990530587705, + "flos": 23476261772160.0, + "grad_norm": 3.0979526579659535, + "language_loss": 0.74752271, + "learning_rate": 3.670674357028504e-06, + "loss": 0.76886737, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.55859375, + "step": 3488, + "time_per_iteration": 2.489739179611206 + }, + { + "auxiliary_loss_clip": 0.01080814, + "auxiliary_loss_mlp": 0.0105861, + "balance_loss_clip": 1.02096391, + "balance_loss_mlp": 1.02325106, + "epoch": 0.209770028558545, + "flos": 18550470781440.0, + "grad_norm": 4.540321485983759, + "language_loss": 0.8176775, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.83907175, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.57421875, + "step": 3489, + "time_per_iteration": 2.512094497680664 + }, + { + "auxiliary_loss_clip": 0.01080209, + "auxiliary_loss_mlp": 0.01061789, + "balance_loss_clip": 1.02223492, + "balance_loss_mlp": 1.02297974, + "epoch": 0.20983015181121298, + "flos": 21615914776320.0, + "grad_norm": 1.7954753571760769, + "language_loss": 0.74746668, + "learning_rate": 3.670246026613266e-06, + "loss": 0.76888669, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5703125, + "step": 3490, + "time_per_iteration": 2.516413688659668 + }, + { + "auxiliary_loss_clip": 0.01076208, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_clip": 1.02349317, + "balance_loss_mlp": 1.02282238, + "epoch": 0.20989027506388094, + "flos": 16613873642880.0, + "grad_norm": 1.810607671683018, + "language_loss": 0.72204244, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.74339229, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53515625, + "step": 3491, + "time_per_iteration": 2.4933903217315674 + }, + { + "auxiliary_loss_clip": 0.01081335, + "auxiliary_loss_mlp": 0.01058512, + "balance_loss_clip": 1.01790929, + "balance_loss_mlp": 1.02358115, + "epoch": 0.20995039831654894, + "flos": 23215844874240.0, + "grad_norm": 2.2710123969800744, + "language_loss": 0.81423163, + "learning_rate": 3.669817442854444e-06, + "loss": 0.83563012, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.578125, + "step": 3492, + "time_per_iteration": 2.430368185043335 + }, + { + "auxiliary_loss_clip": 0.0107906, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.02072716, + "balance_loss_mlp": 1.02192986, + "epoch": 0.2100105215692169, + "flos": 18146595640320.0, + "grad_norm": 1.8259088592094537, + "language_loss": 0.88271517, + "learning_rate": 3.669603055991502e-06, + "loss": 0.90409029, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5703125, + "step": 3493, + "time_per_iteration": 2.5071730613708496 + }, + { + "auxiliary_loss_clip": 0.01074479, + "auxiliary_loss_mlp": 0.01057447, + "balance_loss_clip": 1.02204204, + "balance_loss_mlp": 1.02051008, + "epoch": 0.21007064482188487, + "flos": 15960683416320.0, + "grad_norm": 1.5520883966444965, + "language_loss": 0.71691978, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.73823905, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5390625, + "step": 3494, + "time_per_iteration": 2.46099853515625 + }, + { + "auxiliary_loss_clip": 0.01083311, + "auxiliary_loss_mlp": 0.01058053, + "balance_loss_clip": 1.02217078, + "balance_loss_mlp": 1.02450395, + "epoch": 0.21013076807455283, + "flos": 32232932605440.0, + "grad_norm": 1.6824633031720868, + "language_loss": 0.80413532, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.82554895, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5859375, + "step": 3495, + "time_per_iteration": 2.6416337490081787 + }, + { + "auxiliary_loss_clip": 0.01080842, + "auxiliary_loss_mlp": 0.010568, + "balance_loss_clip": 1.02122819, + "balance_loss_mlp": 1.02310038, + "epoch": 0.2101908913272208, + "flos": 23695481399040.0, + "grad_norm": 1.524301454157636, + "language_loss": 0.78966159, + "learning_rate": 3.668959515566116e-06, + "loss": 0.81103802, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.578125, + "step": 3496, + "time_per_iteration": 4.051529884338379 + }, + { + "auxiliary_loss_clip": 0.01082032, + "auxiliary_loss_mlp": 0.0106195, + "balance_loss_clip": 1.02292109, + "balance_loss_mlp": 1.02404821, + "epoch": 0.21025101457988876, + "flos": 20374752476160.0, + "grad_norm": 1.7153110884139955, + "language_loss": 0.8350966, + "learning_rate": 3.668744875505915e-06, + "loss": 0.85653639, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.578125, + "step": 3497, + "time_per_iteration": 2.4276392459869385 + }, + { + "auxiliary_loss_clip": 0.01080588, + "auxiliary_loss_mlp": 0.01054306, + "balance_loss_clip": 1.01549101, + "balance_loss_mlp": 1.02335525, + "epoch": 0.21031113783255675, + "flos": 25774454528640.0, + "grad_norm": 2.3097452453200105, + "language_loss": 0.69245642, + "learning_rate": 3.668530172166741e-06, + "loss": 0.71380532, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5703125, + "step": 3498, + "time_per_iteration": 5.447112083435059 + }, + { + "auxiliary_loss_clip": 0.01080769, + "auxiliary_loss_mlp": 0.01059955, + "balance_loss_clip": 1.02412105, + "balance_loss_mlp": 1.02241492, + "epoch": 0.21037126108522472, + "flos": 22017101742720.0, + "grad_norm": 1.6364546343620845, + "language_loss": 0.82991219, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.85131943, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.58203125, + "step": 3499, + "time_per_iteration": 2.4871790409088135 + }, + { + "auxiliary_loss_clip": 0.01079005, + "auxiliary_loss_mlp": 0.01046455, + "balance_loss_clip": 1.01281428, + "balance_loss_mlp": 1.02237368, + "epoch": 0.21043138433789269, + "flos": 25333327100160.0, + "grad_norm": 1.6090654867025471, + "language_loss": 0.79425931, + "learning_rate": 3.668100575684043e-06, + "loss": 0.81551385, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.56640625, + "step": 3500, + "time_per_iteration": 3.939704656600952 + }, + { + "auxiliary_loss_clip": 0.01077402, + "auxiliary_loss_mlp": 0.01050442, + "balance_loss_clip": 1.01467896, + "balance_loss_mlp": 1.02084911, + "epoch": 0.21049150759056065, + "flos": 25555479281280.0, + "grad_norm": 1.6052125454853359, + "language_loss": 0.75827396, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.77955246, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.56640625, + "step": 3501, + "time_per_iteration": 2.524071216583252 + }, + { + "auxiliary_loss_clip": 0.01075696, + "auxiliary_loss_mlp": 0.01054475, + "balance_loss_clip": 1.02042866, + "balance_loss_mlp": 1.02074742, + "epoch": 0.21055163084322862, + "flos": 24494538752640.0, + "grad_norm": 1.5169545182945614, + "language_loss": 0.76563883, + "learning_rate": 3.667670726183183e-06, + "loss": 0.78694057, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.55078125, + "step": 3502, + "time_per_iteration": 2.4781298637390137 + }, + { + "auxiliary_loss_clip": 0.01076829, + "auxiliary_loss_mlp": 0.01049451, + "balance_loss_clip": 1.01409316, + "balance_loss_mlp": 1.02208924, + "epoch": 0.21061175409589658, + "flos": 25737865557120.0, + "grad_norm": 1.9403607815419364, + "language_loss": 0.78484476, + "learning_rate": 3.667455706571316e-06, + "loss": 0.80610752, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.546875, + "step": 3503, + "time_per_iteration": 2.4953548908233643 + }, + { + "auxiliary_loss_clip": 0.01081256, + "auxiliary_loss_mlp": 0.01068273, + "balance_loss_clip": 1.02693176, + "balance_loss_mlp": 1.02182102, + "epoch": 0.21067187734856455, + "flos": 18988176896640.0, + "grad_norm": 2.7018364083388544, + "language_loss": 0.81515193, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.83664727, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.59375, + "step": 3504, + "time_per_iteration": 2.428044080734253 + }, + { + "auxiliary_loss_clip": 0.01081311, + "auxiliary_loss_mlp": 0.01067906, + "balance_loss_clip": 1.02885282, + "balance_loss_mlp": 1.02261972, + "epoch": 0.21073200060123254, + "flos": 24680206696320.0, + "grad_norm": 1.5373218428548072, + "language_loss": 0.77947247, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.80096471, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.58984375, + "step": 3505, + "time_per_iteration": 2.495490550994873 + }, + { + "auxiliary_loss_clip": 0.01073975, + "auxiliary_loss_mlp": 0.01060924, + "balance_loss_clip": 1.02623439, + "balance_loss_mlp": 1.01998496, + "epoch": 0.2107921238539005, + "flos": 28548059535360.0, + "grad_norm": 1.8592482758563134, + "language_loss": 0.6498819, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.67123085, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5390625, + "step": 3506, + "time_per_iteration": 2.545870065689087 + }, + { + "auxiliary_loss_clip": 0.01078284, + "auxiliary_loss_mlp": 0.01060732, + "balance_loss_clip": 1.02465916, + "balance_loss_mlp": 1.02273762, + "epoch": 0.21085224710656847, + "flos": 25884640379520.0, + "grad_norm": 1.6441995037273525, + "language_loss": 0.83744764, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.8588379, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5546875, + "step": 3507, + "time_per_iteration": 2.5157885551452637 + }, + { + "auxiliary_loss_clip": 0.01077584, + "auxiliary_loss_mlp": 0.01052375, + "balance_loss_clip": 1.01930594, + "balance_loss_mlp": 1.02296734, + "epoch": 0.21091237035923643, + "flos": 14975399537280.0, + "grad_norm": 1.9802373392745583, + "language_loss": 0.77682662, + "learning_rate": 3.666379660223824e-06, + "loss": 0.79812622, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.546875, + "step": 3508, + "time_per_iteration": 2.3967137336730957 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01061688, + "balance_loss_clip": 1.02351749, + "balance_loss_mlp": 1.02317226, + "epoch": 0.2109724936119044, + "flos": 16361591091840.0, + "grad_norm": 2.824364816398747, + "language_loss": 0.87341201, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.89484131, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.58203125, + "step": 3509, + "time_per_iteration": 2.3933122158050537 + }, + { + "auxiliary_loss_clip": 0.01080193, + "auxiliary_loss_mlp": 0.01057155, + "balance_loss_clip": 1.01996183, + "balance_loss_mlp": 1.02307642, + "epoch": 0.21103261686457236, + "flos": 31501188086400.0, + "grad_norm": 1.8902045118338804, + "language_loss": 0.69544005, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.71681356, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5703125, + "step": 3510, + "time_per_iteration": 2.577608346939087 + }, + { + "auxiliary_loss_clip": 0.01077392, + "auxiliary_loss_mlp": 0.01055244, + "balance_loss_clip": 1.01957703, + "balance_loss_mlp": 1.0203526, + "epoch": 0.21109274011724033, + "flos": 27342857802240.0, + "grad_norm": 1.740796558637469, + "language_loss": 0.74011064, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.76143706, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5703125, + "step": 3511, + "time_per_iteration": 2.503680944442749 + }, + { + "auxiliary_loss_clip": 0.01080123, + "auxiliary_loss_mlp": 0.01058537, + "balance_loss_clip": 1.01805365, + "balance_loss_mlp": 1.0220145, + "epoch": 0.21115286336990832, + "flos": 17819459400960.0, + "grad_norm": 2.465041930275446, + "language_loss": 0.72048414, + "learning_rate": 3.665517685689794e-06, + "loss": 0.74187076, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.578125, + "step": 3512, + "time_per_iteration": 2.3319976329803467 + }, + { + "auxiliary_loss_clip": 0.01078772, + "auxiliary_loss_mlp": 0.0105647, + "balance_loss_clip": 1.01813281, + "balance_loss_mlp": 1.02205968, + "epoch": 0.2112129866225763, + "flos": 27196781207040.0, + "grad_norm": 1.5815788880746124, + "language_loss": 0.74717116, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.76852357, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5703125, + "step": 3513, + "time_per_iteration": 2.449000597000122 + }, + { + "auxiliary_loss_clip": 0.01077166, + "auxiliary_loss_mlp": 0.01047945, + "balance_loss_clip": 1.01301718, + "balance_loss_mlp": 1.02311826, + "epoch": 0.21127310987524425, + "flos": 23730185157120.0, + "grad_norm": 2.0908572438768807, + "language_loss": 0.76032579, + "learning_rate": 3.665086319450502e-06, + "loss": 0.78157687, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5390625, + "step": 3514, + "time_per_iteration": 2.39233660697937 + }, + { + "auxiliary_loss_clip": 0.01080771, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.01662076, + "balance_loss_mlp": 1.02309346, + "epoch": 0.21133323312791222, + "flos": 18331530445440.0, + "grad_norm": 1.8918228637923897, + "language_loss": 0.79036659, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.81173968, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.578125, + "step": 3515, + "time_per_iteration": 2.3582193851470947 + }, + { + "auxiliary_loss_clip": 0.01078668, + "auxiliary_loss_mlp": 0.01056974, + "balance_loss_clip": 1.02218843, + "balance_loss_mlp": 1.02252889, + "epoch": 0.21139335638058018, + "flos": 17930238744960.0, + "grad_norm": 2.1427093922967884, + "language_loss": 0.70468462, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.72604102, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5625, + "step": 3516, + "time_per_iteration": 2.335310459136963 + }, + { + "auxiliary_loss_clip": 0.01082461, + "auxiliary_loss_mlp": 0.01065132, + "balance_loss_clip": 1.02209759, + "balance_loss_mlp": 1.02331352, + "epoch": 0.21145347963324815, + "flos": 24570928540800.0, + "grad_norm": 1.6965930147025639, + "language_loss": 0.86300814, + "learning_rate": 3.664438796560225e-06, + "loss": 0.88448411, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.58984375, + "step": 3517, + "time_per_iteration": 2.418478488922119 + }, + { + "auxiliary_loss_clip": 0.01077144, + "auxiliary_loss_mlp": 0.01048735, + "balance_loss_clip": 1.01130319, + "balance_loss_mlp": 1.02088678, + "epoch": 0.21151360288591614, + "flos": 35844488087040.0, + "grad_norm": 2.4284584833747163, + "language_loss": 0.65358388, + "learning_rate": 3.664222829354512e-06, + "loss": 0.67484266, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5625, + "step": 3518, + "time_per_iteration": 2.526299238204956 + }, + { + "auxiliary_loss_clip": 0.01077193, + "auxiliary_loss_mlp": 0.01053728, + "balance_loss_clip": 1.01717877, + "balance_loss_mlp": 1.02094662, + "epoch": 0.2115737261385841, + "flos": 24640510613760.0, + "grad_norm": 1.9842426761824759, + "language_loss": 0.91318178, + "learning_rate": 3.664006799041303e-06, + "loss": 0.93449104, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5625, + "step": 3519, + "time_per_iteration": 2.446359395980835 + }, + { + "auxiliary_loss_clip": 0.01082936, + "auxiliary_loss_mlp": 0.01059468, + "balance_loss_clip": 1.01862645, + "balance_loss_mlp": 1.02432239, + "epoch": 0.21163384939125207, + "flos": 25225759601280.0, + "grad_norm": 1.5747457148479167, + "language_loss": 0.82521361, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.84663767, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.5859375, + "step": 3520, + "time_per_iteration": 2.435229778289795 + }, + { + "auxiliary_loss_clip": 0.01077297, + "auxiliary_loss_mlp": 0.01051894, + "balance_loss_clip": 1.01725137, + "balance_loss_mlp": 1.02200842, + "epoch": 0.21169397264392004, + "flos": 26066328428160.0, + "grad_norm": 1.5068059781286331, + "language_loss": 0.77638853, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.79768044, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5546875, + "step": 3521, + "time_per_iteration": 2.4477579593658447 + }, + { + "auxiliary_loss_clip": 0.0107936, + "auxiliary_loss_mlp": 0.01047629, + "balance_loss_clip": 1.01327276, + "balance_loss_mlp": 1.02280521, + "epoch": 0.211754095896588, + "flos": 23107264945920.0, + "grad_norm": 1.9471875393671179, + "language_loss": 0.76783288, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78910285, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.56640625, + "step": 3522, + "time_per_iteration": 2.378406524658203 + }, + { + "auxiliary_loss_clip": 0.0107858, + "auxiliary_loss_mlp": 0.01066391, + "balance_loss_clip": 1.02864909, + "balance_loss_mlp": 1.02222884, + "epoch": 0.21181421914925597, + "flos": 27921264163200.0, + "grad_norm": 1.821971492874629, + "language_loss": 0.71163213, + "learning_rate": 3.663142046877374e-06, + "loss": 0.73308188, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5625, + "step": 3523, + "time_per_iteration": 2.4742448329925537 + }, + { + "auxiliary_loss_clip": 0.01080308, + "auxiliary_loss_mlp": 0.0107032, + "balance_loss_clip": 1.03422308, + "balance_loss_mlp": 1.02364218, + "epoch": 0.21187434240192393, + "flos": 17127690255360.0, + "grad_norm": 2.6182169141902, + "language_loss": 0.793571, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.8150773, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5703125, + "step": 3524, + "time_per_iteration": 2.386579751968384 + }, + { + "auxiliary_loss_clip": 0.01077842, + "auxiliary_loss_mlp": 0.01053302, + "balance_loss_clip": 1.01479781, + "balance_loss_mlp": 1.02127016, + "epoch": 0.21193446565459192, + "flos": 22346193018240.0, + "grad_norm": 1.7680579400449392, + "language_loss": 0.83069462, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.85200608, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.56640625, + "step": 3525, + "time_per_iteration": 2.421335220336914 + }, + { + "auxiliary_loss_clip": 0.01081224, + "auxiliary_loss_mlp": 0.0105613, + "balance_loss_clip": 1.01898408, + "balance_loss_mlp": 1.02399957, + "epoch": 0.2119945889072599, + "flos": 27198072927360.0, + "grad_norm": 1.7354535220365397, + "language_loss": 0.76685011, + "learning_rate": 3.662492820527356e-06, + "loss": 0.78822368, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5703125, + "step": 3526, + "time_per_iteration": 2.43612003326416 + }, + { + "auxiliary_loss_clip": 0.0108017, + "auxiliary_loss_mlp": 0.01053496, + "balance_loss_clip": 1.01434767, + "balance_loss_mlp": 1.02221513, + "epoch": 0.21205471215992786, + "flos": 20990934794880.0, + "grad_norm": 2.105942925841992, + "language_loss": 0.77934325, + "learning_rate": 3.662276285649284e-06, + "loss": 0.80067992, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.578125, + "step": 3527, + "time_per_iteration": 2.446697473526001 + }, + { + "auxiliary_loss_clip": 0.01075422, + "auxiliary_loss_mlp": 0.01061918, + "balance_loss_clip": 1.0229125, + "balance_loss_mlp": 1.01974666, + "epoch": 0.21211483541259582, + "flos": 20776602758400.0, + "grad_norm": 1.9034721732058224, + "language_loss": 0.7951569, + "learning_rate": 3.662059687737528e-06, + "loss": 0.81653029, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.55859375, + "step": 3528, + "time_per_iteration": 2.4107720851898193 + }, + { + "auxiliary_loss_clip": 0.0107714, + "auxiliary_loss_mlp": 0.01059639, + "balance_loss_clip": 1.0217067, + "balance_loss_mlp": 1.02140939, + "epoch": 0.21217495866526379, + "flos": 18988979857920.0, + "grad_norm": 1.7807344436843526, + "language_loss": 0.82401979, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.84538758, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.55859375, + "step": 3529, + "time_per_iteration": 2.3664729595184326 + }, + { + "auxiliary_loss_clip": 0.01078866, + "auxiliary_loss_mlp": 0.01065973, + "balance_loss_clip": 1.02699125, + "balance_loss_mlp": 1.02165914, + "epoch": 0.21223508191793175, + "flos": 20666277262080.0, + "grad_norm": 1.9949807434699562, + "language_loss": 0.78806585, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.80951422, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5703125, + "step": 3530, + "time_per_iteration": 2.424936532974243 + }, + { + "auxiliary_loss_clip": 0.01078064, + "auxiliary_loss_mlp": 0.0104916, + "balance_loss_clip": 1.01566267, + "balance_loss_mlp": 1.02271843, + "epoch": 0.21229520517059972, + "flos": 21615391105920.0, + "grad_norm": 3.6414356103505092, + "language_loss": 0.85269988, + "learning_rate": 3.661409515882308e-06, + "loss": 0.87397206, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5546875, + "step": 3531, + "time_per_iteration": 2.4150054454803467 + }, + { + "auxiliary_loss_clip": 0.0108037, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_clip": 1.01486778, + "balance_loss_mlp": 1.02400529, + "epoch": 0.2123553284232677, + "flos": 13990185480960.0, + "grad_norm": 2.6036539394578764, + "language_loss": 0.74962109, + "learning_rate": 3.661192665917977e-06, + "loss": 0.77096254, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5625, + "step": 3532, + "time_per_iteration": 2.381605625152588 + }, + { + "auxiliary_loss_clip": 0.01074943, + "auxiliary_loss_mlp": 0.0105145, + "balance_loss_clip": 1.01807165, + "balance_loss_mlp": 1.0208236, + "epoch": 0.21241545167593567, + "flos": 18295779346560.0, + "grad_norm": 1.6524261742824184, + "language_loss": 0.75960129, + "learning_rate": 3.660975752961054e-06, + "loss": 0.78086519, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5390625, + "step": 3533, + "time_per_iteration": 2.3671417236328125 + }, + { + "auxiliary_loss_clip": 0.01078039, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_clip": 1.01454663, + "balance_loss_mlp": 1.02113557, + "epoch": 0.21247557492860364, + "flos": 34711731158400.0, + "grad_norm": 1.679169777724469, + "language_loss": 0.72003299, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.7413379, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5703125, + "step": 3534, + "time_per_iteration": 2.576613187789917 + }, + { + "auxiliary_loss_clip": 0.01076444, + "auxiliary_loss_mlp": 0.01050297, + "balance_loss_clip": 1.01498747, + "balance_loss_mlp": 1.02218151, + "epoch": 0.2125356981812716, + "flos": 22052748107520.0, + "grad_norm": 2.3397176477962316, + "language_loss": 0.73893762, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.76020503, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.54296875, + "step": 3535, + "time_per_iteration": 2.394279956817627 + }, + { + "auxiliary_loss_clip": 0.01073877, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_clip": 1.02144361, + "balance_loss_mlp": 1.02090681, + "epoch": 0.21259582143393957, + "flos": 28547082017280.0, + "grad_norm": 1.9853459743827278, + "language_loss": 0.72891223, + "learning_rate": 3.660324636216996e-06, + "loss": 0.75020349, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.53125, + "step": 3536, + "time_per_iteration": 4.000921010971069 + }, + { + "auxiliary_loss_clip": 0.01078003, + "auxiliary_loss_mlp": 0.01052241, + "balance_loss_clip": 1.015095, + "balance_loss_mlp": 1.0221746, + "epoch": 0.21265594468660753, + "flos": 20119851573120.0, + "grad_norm": 1.9679903315656282, + "language_loss": 0.88912189, + "learning_rate": 3.660107471371981e-06, + "loss": 0.91042435, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.55859375, + "step": 3537, + "time_per_iteration": 3.9771268367767334 + }, + { + "auxiliary_loss_clip": 0.01073461, + "auxiliary_loss_mlp": 0.01054259, + "balance_loss_clip": 1.02166724, + "balance_loss_mlp": 1.02073753, + "epoch": 0.21271606793927553, + "flos": 23075039894400.0, + "grad_norm": 1.667033295500522, + "language_loss": 0.82401311, + "learning_rate": 3.659890243575524e-06, + "loss": 0.8452903, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.52734375, + "step": 3538, + "time_per_iteration": 3.8022384643554688 + }, + { + "auxiliary_loss_clip": 0.01074949, + "auxiliary_loss_mlp": 0.01051749, + "balance_loss_clip": 1.01891899, + "balance_loss_mlp": 1.02188754, + "epoch": 0.2127761911919435, + "flos": 26387215534080.0, + "grad_norm": 1.6003455254754893, + "language_loss": 0.88531435, + "learning_rate": 3.659672952835863e-06, + "loss": 0.90658134, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.53125, + "step": 3539, + "time_per_iteration": 4.895423889160156 + }, + { + "auxiliary_loss_clip": 0.01075665, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_clip": 1.01830924, + "balance_loss_mlp": 1.02130723, + "epoch": 0.21283631444461146, + "flos": 20227279426560.0, + "grad_norm": 2.896414643848383, + "language_loss": 0.60012078, + "learning_rate": 3.659455599161237e-06, + "loss": 0.62142026, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.54296875, + "step": 3540, + "time_per_iteration": 4.092722415924072 + }, + { + "auxiliary_loss_clip": 0.01076839, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_clip": 1.02112126, + "balance_loss_mlp": 1.02111316, + "epoch": 0.21289643769727942, + "flos": 13516134773760.0, + "grad_norm": 2.226847074772495, + "language_loss": 0.78767765, + "learning_rate": 3.659238182559888e-06, + "loss": 0.80900729, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.55859375, + "step": 3541, + "time_per_iteration": 3.605156421661377 + }, + { + "auxiliary_loss_clip": 0.01075684, + "auxiliary_loss_mlp": 0.01052774, + "balance_loss_clip": 1.01980042, + "balance_loss_mlp": 1.02143431, + "epoch": 0.2129565609499474, + "flos": 24825864355200.0, + "grad_norm": 4.605673856277093, + "language_loss": 0.71839589, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.73968053, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.54296875, + "step": 3542, + "time_per_iteration": 3.5339195728302 + }, + { + "auxiliary_loss_clip": 0.01073294, + "auxiliary_loss_mlp": 0.0105435, + "balance_loss_clip": 1.02209163, + "balance_loss_mlp": 1.02043009, + "epoch": 0.21301668420261535, + "flos": 23658124377600.0, + "grad_norm": 1.7624784051349671, + "language_loss": 0.78376412, + "learning_rate": 3.658803160610004e-06, + "loss": 0.8050406, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.53125, + "step": 3543, + "time_per_iteration": 2.5844695568084717 + }, + { + "auxiliary_loss_clip": 0.01075783, + "auxiliary_loss_mlp": 0.01053865, + "balance_loss_clip": 1.01784003, + "balance_loss_mlp": 1.02200592, + "epoch": 0.21307680745528332, + "flos": 16361870382720.0, + "grad_norm": 2.011890661585965, + "language_loss": 0.68564641, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.70694286, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53515625, + "step": 3544, + "time_per_iteration": 2.4860470294952393 + }, + { + "auxiliary_loss_clip": 0.01077432, + "auxiliary_loss_mlp": 0.01050087, + "balance_loss_clip": 1.01442003, + "balance_loss_mlp": 1.02228761, + "epoch": 0.2131369307079513, + "flos": 19098048545280.0, + "grad_norm": 1.8838346716196614, + "language_loss": 0.7238847, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.74515986, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.55078125, + "step": 3545, + "time_per_iteration": 2.502211570739746 + }, + { + "auxiliary_loss_clip": 0.01077857, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.02056777, + "balance_loss_mlp": 1.02163672, + "epoch": 0.21319705396061928, + "flos": 30370979687040.0, + "grad_norm": 1.6056765603748655, + "language_loss": 0.73789716, + "learning_rate": 3.658150155940946e-06, + "loss": 0.75925046, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5625, + "step": 3546, + "time_per_iteration": 2.5712828636169434 + }, + { + "auxiliary_loss_clip": 0.01077349, + "auxiliary_loss_mlp": 0.01058524, + "balance_loss_clip": 1.02316618, + "balance_loss_mlp": 1.02269018, + "epoch": 0.21325717721328724, + "flos": 21755288390400.0, + "grad_norm": 1.7571312181466094, + "language_loss": 0.81322825, + "learning_rate": 3.657932361952479e-06, + "loss": 0.83458698, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.546875, + "step": 3547, + "time_per_iteration": 2.5686182975769043 + }, + { + "auxiliary_loss_clip": 0.01078799, + "auxiliary_loss_mlp": 0.01059895, + "balance_loss_clip": 1.02043629, + "balance_loss_mlp": 1.02174842, + "epoch": 0.2133173004659552, + "flos": 28729607938560.0, + "grad_norm": 2.7259620468008565, + "language_loss": 0.77250826, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.79389518, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5703125, + "step": 3548, + "time_per_iteration": 2.5431466102600098 + }, + { + "auxiliary_loss_clip": 0.01082711, + "auxiliary_loss_mlp": 0.01056197, + "balance_loss_clip": 1.01623821, + "balance_loss_mlp": 1.02413034, + "epoch": 0.21337742371862317, + "flos": 16836130558080.0, + "grad_norm": 1.9597723203961, + "language_loss": 0.75296259, + "learning_rate": 3.657496585376922e-06, + "loss": 0.7743516, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5859375, + "step": 3549, + "time_per_iteration": 2.4662280082702637 + }, + { + "auxiliary_loss_clip": 0.01079935, + "auxiliary_loss_mlp": 0.01058668, + "balance_loss_clip": 1.02228498, + "balance_loss_mlp": 1.02219725, + "epoch": 0.21343754697129114, + "flos": 24423804604800.0, + "grad_norm": 1.7310685694299006, + "language_loss": 0.81919748, + "learning_rate": 3.657278602806357e-06, + "loss": 0.8405835, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.578125, + "step": 3550, + "time_per_iteration": 2.5224063396453857 + }, + { + "auxiliary_loss_clip": 0.01074352, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.01715434, + "balance_loss_mlp": 1.02145123, + "epoch": 0.21349767022395913, + "flos": 19276908773760.0, + "grad_norm": 1.6590566137264122, + "language_loss": 0.88705981, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90829718, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.53125, + "step": 3551, + "time_per_iteration": 2.5013210773468018 + }, + { + "auxiliary_loss_clip": 0.01078325, + "auxiliary_loss_mlp": 0.01052822, + "balance_loss_clip": 1.01784635, + "balance_loss_mlp": 1.0217371, + "epoch": 0.2135577934766271, + "flos": 17346595680000.0, + "grad_norm": 2.0666097222841664, + "language_loss": 0.84808642, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86939788, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.56640625, + "step": 3552, + "time_per_iteration": 2.4512176513671875 + }, + { + "auxiliary_loss_clip": 0.01077283, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_clip": 1.0203383, + "balance_loss_mlp": 1.0218184, + "epoch": 0.21361791672929506, + "flos": 24056169321600.0, + "grad_norm": 1.7440992287025727, + "language_loss": 0.78001273, + "learning_rate": 3.656624278062713e-06, + "loss": 0.80136687, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5546875, + "step": 3553, + "time_per_iteration": 2.526937246322632 + }, + { + "auxiliary_loss_clip": 0.01076171, + "auxiliary_loss_mlp": 0.01060145, + "balance_loss_clip": 1.02612209, + "balance_loss_mlp": 1.02222276, + "epoch": 0.21367803998196302, + "flos": 22161258213120.0, + "grad_norm": 1.6218052140369443, + "language_loss": 0.73949873, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.76086187, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5390625, + "step": 3554, + "time_per_iteration": 2.5060367584228516 + }, + { + "auxiliary_loss_clip": 0.01076968, + "auxiliary_loss_mlp": 0.01054896, + "balance_loss_clip": 1.02102828, + "balance_loss_mlp": 1.02224636, + "epoch": 0.213738163234631, + "flos": 20885811091200.0, + "grad_norm": 1.9929622491852619, + "language_loss": 0.69887239, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.720191, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.546875, + "step": 3555, + "time_per_iteration": 2.5676944255828857 + }, + { + "auxiliary_loss_clip": 0.01078282, + "auxiliary_loss_mlp": 0.01056024, + "balance_loss_clip": 1.01928389, + "balance_loss_mlp": 1.02157593, + "epoch": 0.21379828648729896, + "flos": 28401843294720.0, + "grad_norm": 2.0491694938709406, + "language_loss": 0.6664049, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.68774796, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.56640625, + "step": 3556, + "time_per_iteration": 2.480003595352173 + }, + { + "auxiliary_loss_clip": 0.01077046, + "auxiliary_loss_mlp": 0.01061459, + "balance_loss_clip": 1.02271557, + "balance_loss_mlp": 1.02118921, + "epoch": 0.21385840973996692, + "flos": 25478600734080.0, + "grad_norm": 1.9109193340671864, + "language_loss": 0.74595028, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.7673353, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.55859375, + "step": 3557, + "time_per_iteration": 2.4668121337890625 + }, + { + "auxiliary_loss_clip": 0.01079985, + "auxiliary_loss_mlp": 0.01055189, + "balance_loss_clip": 1.01661325, + "balance_loss_mlp": 1.02180529, + "epoch": 0.2139185329926349, + "flos": 28073031310080.0, + "grad_norm": 1.7917205395012528, + "language_loss": 0.6936155, + "learning_rate": 3.655532480546528e-06, + "loss": 0.71496725, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.58203125, + "step": 3558, + "time_per_iteration": 2.4951870441436768 + }, + { + "auxiliary_loss_clip": 0.01079654, + "auxiliary_loss_mlp": 0.01053426, + "balance_loss_clip": 1.01587486, + "balance_loss_mlp": 1.0208106, + "epoch": 0.21397865624530288, + "flos": 19607710705920.0, + "grad_norm": 1.864253143308329, + "language_loss": 0.816131, + "learning_rate": 3.655313932676286e-06, + "loss": 0.83746177, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.58984375, + "step": 3559, + "time_per_iteration": 2.4470503330230713 + }, + { + "auxiliary_loss_clip": 0.01075063, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_clip": 1.02895117, + "balance_loss_mlp": 1.02080607, + "epoch": 0.21403877949797084, + "flos": 24680311430400.0, + "grad_norm": 2.3465789422942445, + "language_loss": 0.69249439, + "learning_rate": 3.655095322036373e-06, + "loss": 0.71388137, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.54296875, + "step": 3560, + "time_per_iteration": 2.4751124382019043 + }, + { + "auxiliary_loss_clip": 0.0108045, + "auxiliary_loss_mlp": 0.01051752, + "balance_loss_clip": 1.01699042, + "balance_loss_mlp": 1.02241111, + "epoch": 0.2140989027506388, + "flos": 19860237636480.0, + "grad_norm": 2.0518011038148596, + "language_loss": 0.75555611, + "learning_rate": 3.65487664863508e-06, + "loss": 0.77687812, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.578125, + "step": 3561, + "time_per_iteration": 2.4217565059661865 + }, + { + "auxiliary_loss_clip": 0.01079073, + "auxiliary_loss_mlp": 0.01061642, + "balance_loss_clip": 1.02363825, + "balance_loss_mlp": 1.02318048, + "epoch": 0.21415902600330677, + "flos": 19134323314560.0, + "grad_norm": 2.528207343473672, + "language_loss": 0.79578805, + "learning_rate": 3.654657912480698e-06, + "loss": 0.81719518, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5546875, + "step": 3562, + "time_per_iteration": 2.4195830821990967 + }, + { + "auxiliary_loss_clip": 0.01078583, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.01877487, + "balance_loss_mlp": 1.02211988, + "epoch": 0.21421914925597474, + "flos": 22271548798080.0, + "grad_norm": 1.4919946865586253, + "language_loss": 0.85701585, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.87834966, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5625, + "step": 3563, + "time_per_iteration": 2.4329442977905273 + }, + { + "auxiliary_loss_clip": 0.01077155, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_clip": 1.01433599, + "balance_loss_mlp": 1.02217293, + "epoch": 0.2142792725086427, + "flos": 33873710860800.0, + "grad_norm": 1.5750206757641079, + "language_loss": 0.78234357, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.8036114, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.55078125, + "step": 3564, + "time_per_iteration": 2.5424301624298096 + }, + { + "auxiliary_loss_clip": 0.01078593, + "auxiliary_loss_mlp": 0.01053941, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.02208638, + "epoch": 0.2143393957613107, + "flos": 19859329941120.0, + "grad_norm": 1.6790447528623056, + "language_loss": 0.90101826, + "learning_rate": 3.654001327581981e-06, + "loss": 0.92234361, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5625, + "step": 3565, + "time_per_iteration": 2.4943928718566895 + }, + { + "auxiliary_loss_clip": 0.01032553, + "auxiliary_loss_mlp": 0.01020719, + "balance_loss_clip": 1.01542652, + "balance_loss_mlp": 1.01147461, + "epoch": 0.21439951901397866, + "flos": 68526891936000.0, + "grad_norm": 0.883926374346326, + "language_loss": 0.52250433, + "learning_rate": 3.653782340498215e-06, + "loss": 0.543037, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.05297852, + "router_z_loss_mlp": 0.2109375, + "step": 3566, + "time_per_iteration": 3.0206620693206787 + }, + { + "auxiliary_loss_clip": 0.01076713, + "auxiliary_loss_mlp": 0.01048933, + "balance_loss_clip": 1.01367116, + "balance_loss_mlp": 1.02275467, + "epoch": 0.21445964226664663, + "flos": 19681970901120.0, + "grad_norm": 1.8683246294094795, + "language_loss": 0.69002378, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.71128023, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5390625, + "step": 3567, + "time_per_iteration": 2.400717258453369 + }, + { + "auxiliary_loss_clip": 0.01077823, + "auxiliary_loss_mlp": 0.01051263, + "balance_loss_clip": 1.01688278, + "balance_loss_mlp": 1.02386391, + "epoch": 0.2145197655193146, + "flos": 31105796405760.0, + "grad_norm": 1.6801751952706347, + "language_loss": 0.7498889, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.77117974, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5390625, + "step": 3568, + "time_per_iteration": 2.532634735107422 + }, + { + "auxiliary_loss_clip": 0.01082105, + "auxiliary_loss_mlp": 0.01058109, + "balance_loss_clip": 1.02091551, + "balance_loss_mlp": 1.02528977, + "epoch": 0.21457988877198256, + "flos": 20119746839040.0, + "grad_norm": 1.7178452630680678, + "language_loss": 0.79574013, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.81714225, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5703125, + "step": 3569, + "time_per_iteration": 2.4438276290893555 + }, + { + "auxiliary_loss_clip": 0.01087303, + "auxiliary_loss_mlp": 0.01060123, + "balance_loss_clip": 1.01992536, + "balance_loss_mlp": 1.02662635, + "epoch": 0.21464001202465052, + "flos": 18587059752960.0, + "grad_norm": 2.198707820500583, + "language_loss": 0.7209388, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.74241304, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.609375, + "step": 3570, + "time_per_iteration": 2.3739547729492188 + }, + { + "auxiliary_loss_clip": 0.01082354, + "auxiliary_loss_mlp": 0.01061694, + "balance_loss_clip": 1.02373767, + "balance_loss_mlp": 1.02446556, + "epoch": 0.21470013527731852, + "flos": 21834087062400.0, + "grad_norm": 2.4370424525047976, + "language_loss": 0.81743014, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.83887058, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.578125, + "step": 3571, + "time_per_iteration": 2.4606640338897705 + }, + { + "auxiliary_loss_clip": 0.01082003, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.0197469, + "balance_loss_mlp": 1.02480567, + "epoch": 0.21476025852998648, + "flos": 17602229721600.0, + "grad_norm": 2.438234229883375, + "language_loss": 0.85016525, + "learning_rate": 3.652467101342991e-06, + "loss": 0.87158328, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.57421875, + "step": 3572, + "time_per_iteration": 2.3747923374176025 + }, + { + "auxiliary_loss_clip": 0.01083394, + "auxiliary_loss_mlp": 0.01066373, + "balance_loss_clip": 1.02548444, + "balance_loss_mlp": 1.02258468, + "epoch": 0.21482038178265445, + "flos": 24826946607360.0, + "grad_norm": 2.350614727132885, + "language_loss": 0.67205262, + "learning_rate": 3.652247675452598e-06, + "loss": 0.69355029, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.609375, + "step": 3573, + "time_per_iteration": 2.455657958984375 + }, + { + "auxiliary_loss_clip": 0.01076042, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.02054596, + "balance_loss_mlp": 1.02249169, + "epoch": 0.2148805050353224, + "flos": 23257111968000.0, + "grad_norm": 2.810532359432018, + "language_loss": 0.77042526, + "learning_rate": 3.652028186908807e-06, + "loss": 0.79175591, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5390625, + "step": 3574, + "time_per_iteration": 2.4501664638519287 + }, + { + "auxiliary_loss_clip": 0.01076666, + "auxiliary_loss_mlp": 0.01058528, + "balance_loss_clip": 1.0214541, + "balance_loss_mlp": 1.02212727, + "epoch": 0.21494062828799038, + "flos": 21320130804480.0, + "grad_norm": 2.214477791736336, + "language_loss": 0.75522923, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.77658117, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.546875, + "step": 3575, + "time_per_iteration": 3.8628697395324707 + }, + { + "auxiliary_loss_clip": 0.01079163, + "auxiliary_loss_mlp": 0.01060893, + "balance_loss_clip": 1.02322268, + "balance_loss_mlp": 1.02405834, + "epoch": 0.21500075154065834, + "flos": 18842344680960.0, + "grad_norm": 1.6358474657294553, + "language_loss": 0.70347989, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.72488046, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.55078125, + "step": 3576, + "time_per_iteration": 2.398956060409546 + }, + { + "auxiliary_loss_clip": 0.01079407, + "auxiliary_loss_mlp": 0.0105865, + "balance_loss_clip": 1.0186429, + "balance_loss_mlp": 1.02227306, + "epoch": 0.2150608747933263, + "flos": 18441018069120.0, + "grad_norm": 2.08734167681856, + "language_loss": 0.90776616, + "learning_rate": 3.651369345440292e-06, + "loss": 0.92914677, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5703125, + "step": 3577, + "time_per_iteration": 5.174992322921753 + }, + { + "auxiliary_loss_clip": 0.01028243, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.03452218, + "balance_loss_mlp": 1.00926816, + "epoch": 0.2151209980459943, + "flos": 66595042742400.0, + "grad_norm": 0.8360473313004207, + "language_loss": 0.56211936, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58280855, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.06152344, + "router_z_loss_mlp": 0.18945312, + "step": 3578, + "time_per_iteration": 3.00282883644104 + }, + { + "auxiliary_loss_clip": 0.01078035, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_clip": 1.02302027, + "balance_loss_mlp": 1.02239394, + "epoch": 0.21518112129866226, + "flos": 21574926973440.0, + "grad_norm": 1.6506500438371856, + "language_loss": 0.89359319, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.91495973, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5546875, + "step": 3579, + "time_per_iteration": 3.779792070388794 + }, + { + "auxiliary_loss_clip": 0.01081093, + "auxiliary_loss_mlp": 0.01061283, + "balance_loss_clip": 1.0218966, + "balance_loss_mlp": 1.02372634, + "epoch": 0.21524124455133023, + "flos": 20046603807360.0, + "grad_norm": 1.8169005984089306, + "language_loss": 0.79435802, + "learning_rate": 3.650709940390972e-06, + "loss": 0.81578183, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.57421875, + "step": 3580, + "time_per_iteration": 2.3938746452331543 + }, + { + "auxiliary_loss_clip": 0.01077655, + "auxiliary_loss_mlp": 0.01053594, + "balance_loss_clip": 1.01914263, + "balance_loss_mlp": 1.02311492, + "epoch": 0.2153013678039982, + "flos": 23950661592960.0, + "grad_norm": 1.667991509928619, + "language_loss": 0.7543329, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.77564538, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.546875, + "step": 3581, + "time_per_iteration": 2.4758265018463135 + }, + { + "auxiliary_loss_clip": 0.01080988, + "auxiliary_loss_mlp": 0.0105889, + "balance_loss_clip": 1.0206238, + "balance_loss_mlp": 1.02541649, + "epoch": 0.21536149105666616, + "flos": 20593797546240.0, + "grad_norm": 2.8453311658675013, + "language_loss": 0.72998321, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.75138199, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5546875, + "step": 3582, + "time_per_iteration": 2.3840277194976807 + }, + { + "auxiliary_loss_clip": 0.0108119, + "auxiliary_loss_mlp": 0.01066484, + "balance_loss_clip": 1.02864742, + "balance_loss_mlp": 1.02507997, + "epoch": 0.21542161430933413, + "flos": 12859209031680.0, + "grad_norm": 2.63675628288287, + "language_loss": 0.8717798, + "learning_rate": 3.650049971985889e-06, + "loss": 0.89325649, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5625, + "step": 3583, + "time_per_iteration": 2.4051544666290283 + }, + { + "auxiliary_loss_clip": 0.01085134, + "auxiliary_loss_mlp": 0.01069924, + "balance_loss_clip": 1.03392315, + "balance_loss_mlp": 1.02754772, + "epoch": 0.21548173756200212, + "flos": 26102742842880.0, + "grad_norm": 2.192854479089798, + "language_loss": 0.85377038, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.87532103, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.578125, + "step": 3584, + "time_per_iteration": 2.428769826889038 + }, + { + "auxiliary_loss_clip": 0.01079161, + "auxiliary_loss_mlp": 0.01058527, + "balance_loss_clip": 1.02481508, + "balance_loss_mlp": 1.02545786, + "epoch": 0.21554186081467008, + "flos": 22162689578880.0, + "grad_norm": 1.7890503104416033, + "language_loss": 0.91852522, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.93990207, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5390625, + "step": 3585, + "time_per_iteration": 2.4998252391815186 + }, + { + "auxiliary_loss_clip": 0.01077453, + "auxiliary_loss_mlp": 0.01060302, + "balance_loss_clip": 1.02809143, + "balance_loss_mlp": 1.02400374, + "epoch": 0.21560198406733805, + "flos": 22965622093440.0, + "grad_norm": 1.7529066461844183, + "language_loss": 0.76169896, + "learning_rate": 3.649389440450277e-06, + "loss": 0.78307641, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.53125, + "step": 3586, + "time_per_iteration": 2.3942840099334717 + }, + { + "auxiliary_loss_clip": 0.01080528, + "auxiliary_loss_mlp": 0.01060243, + "balance_loss_clip": 1.02571964, + "balance_loss_mlp": 1.02416503, + "epoch": 0.215662107320006, + "flos": 22782956526720.0, + "grad_norm": 1.7268256844416523, + "language_loss": 0.83908904, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.86049676, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5625, + "step": 3587, + "time_per_iteration": 2.4388835430145264 + }, + { + "auxiliary_loss_clip": 0.01078793, + "auxiliary_loss_mlp": 0.0106185, + "balance_loss_clip": 1.02670741, + "balance_loss_mlp": 1.02412879, + "epoch": 0.21572223057267398, + "flos": 30882527061120.0, + "grad_norm": 1.7517542120606797, + "language_loss": 0.7648021, + "learning_rate": 3.648948773354224e-06, + "loss": 0.78620857, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3588, + "time_per_iteration": 2.4840939044952393 + }, + { + "auxiliary_loss_clip": 0.01077395, + "auxiliary_loss_mlp": 0.01064873, + "balance_loss_clip": 1.03070712, + "balance_loss_mlp": 1.02225709, + "epoch": 0.21578235382534194, + "flos": 26909166493440.0, + "grad_norm": 1.9359750400478146, + "language_loss": 0.82516342, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.84658611, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.55078125, + "step": 3589, + "time_per_iteration": 2.4737401008605957 + }, + { + "auxiliary_loss_clip": 0.01075756, + "auxiliary_loss_mlp": 0.0105846, + "balance_loss_clip": 1.02651215, + "balance_loss_mlp": 1.0216347, + "epoch": 0.2158424770780099, + "flos": 24424572654720.0, + "grad_norm": 1.8140386543093965, + "language_loss": 0.74956858, + "learning_rate": 3.648507856144961e-06, + "loss": 0.77091074, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.5390625, + "step": 3590, + "time_per_iteration": 2.409327983856201 + }, + { + "auxiliary_loss_clip": 0.01077362, + "auxiliary_loss_mlp": 0.01061316, + "balance_loss_clip": 1.02400386, + "balance_loss_mlp": 1.02106369, + "epoch": 0.2159026003306779, + "flos": 23948881113600.0, + "grad_norm": 1.706182936332368, + "language_loss": 0.85513538, + "learning_rate": 3.648287303768775e-06, + "loss": 0.87652218, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5625, + "step": 3591, + "time_per_iteration": 2.4395828247070312 + }, + { + "auxiliary_loss_clip": 0.01078061, + "auxiliary_loss_mlp": 0.0106039, + "balance_loss_clip": 1.02226698, + "balance_loss_mlp": 1.0207262, + "epoch": 0.21596272358334587, + "flos": 30039758818560.0, + "grad_norm": 1.841778292137882, + "language_loss": 0.7077713, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.72915578, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.57421875, + "step": 3592, + "time_per_iteration": 2.519544839859009 + }, + { + "auxiliary_loss_clip": 0.01077606, + "auxiliary_loss_mlp": 0.01052168, + "balance_loss_clip": 1.01726413, + "balance_loss_mlp": 1.02181697, + "epoch": 0.21602284683601383, + "flos": 20375171412480.0, + "grad_norm": 2.560179532515289, + "language_loss": 0.86102891, + "learning_rate": 3.647846011515108e-06, + "loss": 0.8823266, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5546875, + "step": 3593, + "time_per_iteration": 2.483529806137085 + }, + { + "auxiliary_loss_clip": 0.01078269, + "auxiliary_loss_mlp": 0.01057896, + "balance_loss_clip": 1.02108431, + "balance_loss_mlp": 1.02242398, + "epoch": 0.2160829700886818, + "flos": 20776288556160.0, + "grad_norm": 2.3936630583189085, + "language_loss": 0.77533627, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.79669791, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.55859375, + "step": 3594, + "time_per_iteration": 2.3891115188598633 + }, + { + "auxiliary_loss_clip": 0.01073026, + "auxiliary_loss_mlp": 0.01049983, + "balance_loss_clip": 1.01770091, + "balance_loss_mlp": 1.02048504, + "epoch": 0.21614309334134976, + "flos": 22308661440000.0, + "grad_norm": 1.5475976172832224, + "language_loss": 0.81817615, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.83940625, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.5234375, + "step": 3595, + "time_per_iteration": 2.4194211959838867 + }, + { + "auxiliary_loss_clip": 0.01080284, + "auxiliary_loss_mlp": 0.01050928, + "balance_loss_clip": 1.01487911, + "balance_loss_mlp": 1.02340221, + "epoch": 0.21620321659401773, + "flos": 19608513667200.0, + "grad_norm": 1.9403694035521886, + "language_loss": 0.80626571, + "learning_rate": 3.647183604506897e-06, + "loss": 0.82757777, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5703125, + "step": 3596, + "time_per_iteration": 2.3840231895446777 + }, + { + "auxiliary_loss_clip": 0.01076966, + "auxiliary_loss_mlp": 0.01052329, + "balance_loss_clip": 1.01747227, + "balance_loss_mlp": 1.02330077, + "epoch": 0.2162633398466857, + "flos": 18843531667200.0, + "grad_norm": 1.5411130393797947, + "language_loss": 0.85201538, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.87330836, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53515625, + "step": 3597, + "time_per_iteration": 2.3868656158447266 + }, + { + "auxiliary_loss_clip": 0.01077778, + "auxiliary_loss_mlp": 0.01055636, + "balance_loss_clip": 1.01849079, + "balance_loss_mlp": 1.02268863, + "epoch": 0.21632346309935369, + "flos": 18767875017600.0, + "grad_norm": 2.8426852217499463, + "language_loss": 0.8139987, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.83533281, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.55078125, + "step": 3598, + "time_per_iteration": 2.3663063049316406 + }, + { + "auxiliary_loss_clip": 0.01078938, + "auxiliary_loss_mlp": 0.0105749, + "balance_loss_clip": 1.0190804, + "balance_loss_mlp": 1.02199316, + "epoch": 0.21638358635202165, + "flos": 26322939987840.0, + "grad_norm": 2.179651054724553, + "language_loss": 0.83550125, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.85686553, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5703125, + "step": 3599, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01077025, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_clip": 1.01948118, + "balance_loss_mlp": 1.0235368, + "epoch": 0.21644370960468962, + "flos": 20739804318720.0, + "grad_norm": 2.102827965564334, + "language_loss": 0.7794655, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.80078435, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53125, + "step": 3600, + "time_per_iteration": 2.400702953338623 + }, + { + "auxiliary_loss_clip": 0.01077151, + "auxiliary_loss_mlp": 0.01061535, + "balance_loss_clip": 1.02770364, + "balance_loss_mlp": 1.02125096, + "epoch": 0.21650383285735758, + "flos": 23951080529280.0, + "grad_norm": 1.921282567487024, + "language_loss": 0.81958187, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.84096873, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.55859375, + "step": 3601, + "time_per_iteration": 2.449150323867798 + }, + { + "auxiliary_loss_clip": 0.01079766, + "auxiliary_loss_mlp": 0.01059277, + "balance_loss_clip": 1.02384758, + "balance_loss_mlp": 1.02421272, + "epoch": 0.21656395611002555, + "flos": 23694957728640.0, + "grad_norm": 2.1117338382257107, + "language_loss": 0.85284764, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.87423807, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5546875, + "step": 3602, + "time_per_iteration": 2.471130132675171 + }, + { + "auxiliary_loss_clip": 0.01078156, + "auxiliary_loss_mlp": 0.01056977, + "balance_loss_clip": 1.01701808, + "balance_loss_mlp": 1.02177525, + "epoch": 0.2166240793626935, + "flos": 20665055364480.0, + "grad_norm": 1.77972753948087, + "language_loss": 0.76590347, + "learning_rate": 3.645635802397693e-06, + "loss": 0.78725481, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5625, + "step": 3603, + "time_per_iteration": 2.4325335025787354 + }, + { + "auxiliary_loss_clip": 0.01076662, + "auxiliary_loss_mlp": 0.01051139, + "balance_loss_clip": 1.01580524, + "balance_loss_mlp": 1.02152133, + "epoch": 0.2166842026153615, + "flos": 21579325804800.0, + "grad_norm": 2.0257707522519426, + "language_loss": 0.75924897, + "learning_rate": 3.645414438132855e-06, + "loss": 0.780527, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.55078125, + "step": 3604, + "time_per_iteration": 2.421057939529419 + }, + { + "auxiliary_loss_clip": 0.0107576, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.01122129, + "balance_loss_mlp": 1.02177656, + "epoch": 0.21674432586802947, + "flos": 25628761958400.0, + "grad_norm": 1.7708673281509733, + "language_loss": 0.8166647, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.83787405, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5390625, + "step": 3605, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.01035933, + "auxiliary_loss_mlp": 0.01009201, + "balance_loss_clip": 0.99971181, + "balance_loss_mlp": 1.00902617, + "epoch": 0.21680444912069743, + "flos": 56414893155840.0, + "grad_norm": 0.7560061165901457, + "language_loss": 0.58413363, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60458493, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.09472656, + "router_z_loss_mlp": 0.26953125, + "step": 3606, + "time_per_iteration": 3.1175642013549805 + }, + { + "auxiliary_loss_clip": 0.01078942, + "auxiliary_loss_mlp": 0.01059401, + "balance_loss_clip": 1.01886988, + "balance_loss_mlp": 1.02092862, + "epoch": 0.2168645723733654, + "flos": 23877797852160.0, + "grad_norm": 2.075685618677511, + "language_loss": 0.74941123, + "learning_rate": 3.644749971006248e-06, + "loss": 0.77079463, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.578125, + "step": 3607, + "time_per_iteration": 2.4181196689605713 + }, + { + "auxiliary_loss_clip": 0.01080918, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_clip": 1.01870489, + "balance_loss_mlp": 1.02289867, + "epoch": 0.21692469562603336, + "flos": 16945234156800.0, + "grad_norm": 2.0333566061137707, + "language_loss": 0.78896642, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.81035101, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.578125, + "step": 3608, + "time_per_iteration": 2.3698537349700928 + }, + { + "auxiliary_loss_clip": 0.01079013, + "auxiliary_loss_mlp": 0.01057049, + "balance_loss_clip": 1.01954579, + "balance_loss_mlp": 1.02161407, + "epoch": 0.21698481887870133, + "flos": 25117877900160.0, + "grad_norm": 1.797916668853597, + "language_loss": 0.75989258, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.78125316, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.57421875, + "step": 3609, + "time_per_iteration": 2.435075044631958 + }, + { + "auxiliary_loss_clip": 0.01078535, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.01784086, + "balance_loss_mlp": 1.02209151, + "epoch": 0.2170449421313693, + "flos": 17893719596160.0, + "grad_norm": 40.10880221878159, + "language_loss": 0.90571076, + "learning_rate": 3.6440849425579e-06, + "loss": 0.92704117, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.56640625, + "step": 3610, + "time_per_iteration": 2.3640034198760986 + }, + { + "auxiliary_loss_clip": 0.01075983, + "auxiliary_loss_mlp": 0.01052004, + "balance_loss_clip": 1.01678967, + "balance_loss_mlp": 1.02114391, + "epoch": 0.2171050653840373, + "flos": 22637333779200.0, + "grad_norm": 1.5743607102706705, + "language_loss": 0.79266703, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.81394684, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3611, + "time_per_iteration": 2.4144301414489746 + }, + { + "auxiliary_loss_clip": 0.01077189, + "auxiliary_loss_mlp": 0.01063537, + "balance_loss_clip": 1.02703524, + "balance_loss_mlp": 1.02165937, + "epoch": 0.21716518863670525, + "flos": 19498991132160.0, + "grad_norm": 1.8865436480322173, + "language_loss": 0.6514371, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.67284435, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5546875, + "step": 3612, + "time_per_iteration": 2.3780720233917236 + }, + { + "auxiliary_loss_clip": 0.01077525, + "auxiliary_loss_mlp": 0.01063038, + "balance_loss_clip": 1.02539194, + "balance_loss_mlp": 1.02107406, + "epoch": 0.21722531188937322, + "flos": 19791004677120.0, + "grad_norm": 1.918481300859056, + "language_loss": 0.77652746, + "learning_rate": 3.643419353014776e-06, + "loss": 0.7979331, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5625, + "step": 3613, + "time_per_iteration": 2.398261785507202 + }, + { + "auxiliary_loss_clip": 0.0107591, + "auxiliary_loss_mlp": 0.01065041, + "balance_loss_clip": 1.0259409, + "balance_loss_mlp": 1.02086699, + "epoch": 0.21728543514204118, + "flos": 13333539029760.0, + "grad_norm": 1.9527688733194357, + "language_loss": 0.73295522, + "learning_rate": 3.643197365185261e-06, + "loss": 0.75436473, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.55078125, + "step": 3614, + "time_per_iteration": 3.81750750541687 + }, + { + "auxiliary_loss_clip": 0.01076089, + "auxiliary_loss_mlp": 0.01057502, + "balance_loss_clip": 1.02154899, + "balance_loss_mlp": 1.02087808, + "epoch": 0.21734555839470915, + "flos": 15230963756160.0, + "grad_norm": 1.644551007125277, + "language_loss": 0.74653929, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.76787519, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.55078125, + "step": 3615, + "time_per_iteration": 2.3857126235961914 + }, + { + "auxiliary_loss_clip": 0.01081006, + "auxiliary_loss_mlp": 0.01058644, + "balance_loss_clip": 1.0186379, + "balance_loss_mlp": 1.02186024, + "epoch": 0.2174056816473771, + "flos": 19972972016640.0, + "grad_norm": 2.198393141482538, + "language_loss": 0.9181639, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.93956053, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.59375, + "step": 3616, + "time_per_iteration": 3.7836973667144775 + }, + { + "auxiliary_loss_clip": 0.01077658, + "auxiliary_loss_mlp": 0.01057392, + "balance_loss_clip": 1.01802945, + "balance_loss_mlp": 1.02116609, + "epoch": 0.21746580490004508, + "flos": 16686458092800.0, + "grad_norm": 3.025435472576452, + "language_loss": 0.8354528, + "learning_rate": 3.642531027869148e-06, + "loss": 0.8568033, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.56640625, + "step": 3617, + "time_per_iteration": 3.714735984802246 + }, + { + "auxiliary_loss_clip": 0.01079559, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_clip": 1.02043986, + "balance_loss_mlp": 1.02177596, + "epoch": 0.21752592815271307, + "flos": 25771207772160.0, + "grad_norm": 2.4405616283302227, + "language_loss": 0.76465499, + "learning_rate": 3.642308790849329e-06, + "loss": 0.78604329, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.578125, + "step": 3618, + "time_per_iteration": 3.788785219192505 + }, + { + "auxiliary_loss_clip": 0.01078525, + "auxiliary_loss_mlp": 0.01060816, + "balance_loss_clip": 1.02214408, + "balance_loss_mlp": 1.02132678, + "epoch": 0.21758605140538104, + "flos": 11253902584320.0, + "grad_norm": 1.9530326784205676, + "language_loss": 0.71730661, + "learning_rate": 3.642086491552996e-06, + "loss": 0.73869997, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5703125, + "step": 3619, + "time_per_iteration": 2.3366973400115967 + }, + { + "auxiliary_loss_clip": 0.01079288, + "auxiliary_loss_mlp": 0.01059566, + "balance_loss_clip": 1.02039349, + "balance_loss_mlp": 1.02140534, + "epoch": 0.217646174658049, + "flos": 19241681345280.0, + "grad_norm": 1.7006236640415535, + "language_loss": 0.79327059, + "learning_rate": 3.641864129988579e-06, + "loss": 0.81465912, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.578125, + "step": 3620, + "time_per_iteration": 2.422985076904297 + }, + { + "auxiliary_loss_clip": 0.01073879, + "auxiliary_loss_mlp": 0.01052213, + "balance_loss_clip": 1.01439977, + "balance_loss_mlp": 1.02006173, + "epoch": 0.21770629791071697, + "flos": 21943993622400.0, + "grad_norm": 1.4742900372777883, + "language_loss": 0.81288534, + "learning_rate": 3.641641706164509e-06, + "loss": 0.83414626, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5390625, + "step": 3621, + "time_per_iteration": 2.426555871963501 + }, + { + "auxiliary_loss_clip": 0.01075137, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.01069641, + "balance_loss_mlp": 1.01961946, + "epoch": 0.21776642116338493, + "flos": 24935596358400.0, + "grad_norm": 1.9743310511914032, + "language_loss": 0.89041531, + "learning_rate": 3.641419220089221e-06, + "loss": 0.91161144, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5546875, + "step": 3622, + "time_per_iteration": 2.4637105464935303 + }, + { + "auxiliary_loss_clip": 0.01079055, + "auxiliary_loss_mlp": 0.01053948, + "balance_loss_clip": 1.01379824, + "balance_loss_mlp": 1.02222657, + "epoch": 0.2178265444160529, + "flos": 17820367096320.0, + "grad_norm": 2.6543261005162537, + "language_loss": 0.79788852, + "learning_rate": 3.641196671771152e-06, + "loss": 0.81921852, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5703125, + "step": 3623, + "time_per_iteration": 2.391129493713379 + }, + { + "auxiliary_loss_clip": 0.01080691, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_clip": 1.02721643, + "balance_loss_mlp": 1.02127624, + "epoch": 0.2178866676687209, + "flos": 17711926813440.0, + "grad_norm": 1.9790541424059598, + "language_loss": 0.85859627, + "learning_rate": 3.640974061218741e-06, + "loss": 0.88008088, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.59375, + "step": 3624, + "time_per_iteration": 2.4465255737304688 + }, + { + "auxiliary_loss_clip": 0.01080506, + "auxiliary_loss_mlp": 0.01062274, + "balance_loss_clip": 1.02264917, + "balance_loss_mlp": 1.02290726, + "epoch": 0.21794679092138886, + "flos": 16944919954560.0, + "grad_norm": 2.6435606316248506, + "language_loss": 0.80112731, + "learning_rate": 3.640751388440429e-06, + "loss": 0.82255512, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.57421875, + "step": 3625, + "time_per_iteration": 2.4003117084503174 + }, + { + "auxiliary_loss_clip": 0.01038884, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.0205096, + "balance_loss_mlp": 1.01311755, + "epoch": 0.21800691417405682, + "flos": 63715371425280.0, + "grad_norm": 0.8215005596152994, + "language_loss": 0.6072557, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62792403, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.07421875, + "router_z_loss_mlp": 0.2578125, + "step": 3626, + "time_per_iteration": 3.1468913555145264 + }, + { + "auxiliary_loss_clip": 0.01080367, + "auxiliary_loss_mlp": 0.01059748, + "balance_loss_clip": 1.02086186, + "balance_loss_mlp": 1.02095318, + "epoch": 0.21806703742672479, + "flos": 21615321283200.0, + "grad_norm": 2.2255081583196605, + "language_loss": 0.92065489, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.94205594, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.59375, + "step": 3627, + "time_per_iteration": 2.4180872440338135 + }, + { + "auxiliary_loss_clip": 0.01076967, + "auxiliary_loss_mlp": 0.01051187, + "balance_loss_clip": 1.01401734, + "balance_loss_mlp": 1.02035201, + "epoch": 0.21812716067939275, + "flos": 19353857143680.0, + "grad_norm": 4.361490247217413, + "language_loss": 0.75762403, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.77890563, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.56640625, + "step": 3628, + "time_per_iteration": 2.4645495414733887 + }, + { + "auxiliary_loss_clip": 0.0107711, + "auxiliary_loss_mlp": 0.01062286, + "balance_loss_clip": 1.02354252, + "balance_loss_mlp": 1.02003717, + "epoch": 0.21818728393206072, + "flos": 23546995920000.0, + "grad_norm": 3.481698463952475, + "language_loss": 0.79471183, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.81610572, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5703125, + "step": 3629, + "time_per_iteration": 2.4796934127807617 + }, + { + "auxiliary_loss_clip": 0.0107795, + "auxiliary_loss_mlp": 0.01058313, + "balance_loss_clip": 1.02250242, + "balance_loss_mlp": 1.02115571, + "epoch": 0.21824740718472868, + "flos": 30224379421440.0, + "grad_norm": 1.7070116811665181, + "language_loss": 0.73199368, + "learning_rate": 3.63963709145597e-06, + "loss": 0.75335622, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.56640625, + "step": 3630, + "time_per_iteration": 2.53882098197937 + }, + { + "auxiliary_loss_clip": 0.01075091, + "auxiliary_loss_mlp": 0.01058275, + "balance_loss_clip": 1.02375197, + "balance_loss_mlp": 1.02105367, + "epoch": 0.21830753043739667, + "flos": 26133641262720.0, + "grad_norm": 2.089826153725213, + "language_loss": 0.78325498, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.80458868, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5390625, + "step": 3631, + "time_per_iteration": 2.463078498840332 + }, + { + "auxiliary_loss_clip": 0.01077848, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_clip": 1.02412391, + "balance_loss_mlp": 1.02143013, + "epoch": 0.21836765369006464, + "flos": 21719781671040.0, + "grad_norm": 2.8084006608511207, + "language_loss": 0.76494604, + "learning_rate": 3.639190937376594e-06, + "loss": 0.78635436, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5625, + "step": 3632, + "time_per_iteration": 2.4561684131622314 + }, + { + "auxiliary_loss_clip": 0.01073798, + "auxiliary_loss_mlp": 0.01065278, + "balance_loss_clip": 1.03003931, + "balance_loss_mlp": 1.01897359, + "epoch": 0.2184277769427326, + "flos": 19936592513280.0, + "grad_norm": 2.507264490092034, + "language_loss": 0.85630119, + "learning_rate": 3.638967767095249e-06, + "loss": 0.87769192, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3633, + "time_per_iteration": 2.452582597732544 + }, + { + "auxiliary_loss_clip": 0.01076631, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03817308, + "balance_loss_mlp": 1.02026665, + "epoch": 0.21848790019540057, + "flos": 20339175934080.0, + "grad_norm": 1.9893652520767453, + "language_loss": 0.81987011, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.84139788, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5625, + "step": 3634, + "time_per_iteration": 2.475949287414551 + }, + { + "auxiliary_loss_clip": 0.01082452, + "auxiliary_loss_mlp": 0.01068007, + "balance_loss_clip": 1.03002667, + "balance_loss_mlp": 1.02191794, + "epoch": 0.21854802344806853, + "flos": 15449904092160.0, + "grad_norm": 1.7808670314219763, + "language_loss": 0.76545846, + "learning_rate": 3.638521240091558e-06, + "loss": 0.78696311, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.60546875, + "step": 3635, + "time_per_iteration": 2.391291618347168 + }, + { + "auxiliary_loss_clip": 0.01077314, + "auxiliary_loss_mlp": 0.01070069, + "balance_loss_clip": 1.03237486, + "balance_loss_mlp": 1.02086711, + "epoch": 0.2186081467007365, + "flos": 16319939973120.0, + "grad_norm": 15.920512277141414, + "language_loss": 0.89669275, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.91816664, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.56640625, + "step": 3636, + "time_per_iteration": 2.5335419178009033 + }, + { + "auxiliary_loss_clip": 0.01078187, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_clip": 1.0287571, + "balance_loss_mlp": 1.02099907, + "epoch": 0.2186682699534045, + "flos": 21688185024000.0, + "grad_norm": 2.0684801242222712, + "language_loss": 0.77663094, + "learning_rate": 3.638074464556311e-06, + "loss": 0.79806066, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5703125, + "step": 3637, + "time_per_iteration": 2.4054672718048096 + }, + { + "auxiliary_loss_clip": 0.01081478, + "auxiliary_loss_mlp": 0.01062945, + "balance_loss_clip": 1.02162707, + "balance_loss_mlp": 1.02222538, + "epoch": 0.21872839320607246, + "flos": 17738566047360.0, + "grad_norm": 2.633944491478341, + "language_loss": 0.93846357, + "learning_rate": 3.63785098361053e-06, + "loss": 0.95990783, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.59375, + "step": 3638, + "time_per_iteration": 2.4505255222320557 + }, + { + "auxiliary_loss_clip": 0.01075278, + "auxiliary_loss_mlp": 0.01061854, + "balance_loss_clip": 1.02268159, + "balance_loss_mlp": 1.01991296, + "epoch": 0.21878851645874042, + "flos": 18651544767360.0, + "grad_norm": 3.364062918374491, + "language_loss": 0.92254508, + "learning_rate": 3.637627440557275e-06, + "loss": 0.94391638, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5546875, + "step": 3639, + "time_per_iteration": 2.3909716606140137 + }, + { + "auxiliary_loss_clip": 0.01078507, + "auxiliary_loss_mlp": 0.01056584, + "balance_loss_clip": 1.02175117, + "balance_loss_mlp": 1.02192044, + "epoch": 0.2188486397114084, + "flos": 25556107685760.0, + "grad_norm": 1.752420769180744, + "language_loss": 0.80976689, + "learning_rate": 3.637403835405024e-06, + "loss": 0.83111787, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.56640625, + "step": 3640, + "time_per_iteration": 2.458373546600342 + }, + { + "auxiliary_loss_clip": 0.01079664, + "auxiliary_loss_mlp": 0.0106, + "balance_loss_clip": 1.02016068, + "balance_loss_mlp": 1.02291846, + "epoch": 0.21890876296407635, + "flos": 17891171066880.0, + "grad_norm": 2.8004317026074608, + "language_loss": 0.74107325, + "learning_rate": 3.637180168162255e-06, + "loss": 0.76246989, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.56640625, + "step": 3641, + "time_per_iteration": 2.4070820808410645 + }, + { + "auxiliary_loss_clip": 0.0107843, + "auxiliary_loss_mlp": 0.01055139, + "balance_loss_clip": 1.01672947, + "balance_loss_mlp": 1.02261353, + "epoch": 0.21896888621674432, + "flos": 17748131760000.0, + "grad_norm": 2.014286715398528, + "language_loss": 0.82695532, + "learning_rate": 3.63695643883745e-06, + "loss": 0.84829104, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.55859375, + "step": 3642, + "time_per_iteration": 2.3967037200927734 + }, + { + "auxiliary_loss_clip": 0.0108207, + "auxiliary_loss_mlp": 0.01058216, + "balance_loss_clip": 1.01856732, + "balance_loss_mlp": 1.0238905, + "epoch": 0.21902900946941228, + "flos": 23075039894400.0, + "grad_norm": 1.6373162136761177, + "language_loss": 0.72934932, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.75075221, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.58203125, + "step": 3643, + "time_per_iteration": 2.4296765327453613 + }, + { + "auxiliary_loss_clip": 0.01082419, + "auxiliary_loss_mlp": 0.0105708, + "balance_loss_clip": 1.01876581, + "balance_loss_mlp": 1.02311659, + "epoch": 0.21908913272208028, + "flos": 48176718923520.0, + "grad_norm": 2.18705834746407, + "language_loss": 0.6904155, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.71181047, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.59375, + "step": 3644, + "time_per_iteration": 2.702288866043091 + }, + { + "auxiliary_loss_clip": 0.01079405, + "auxiliary_loss_mlp": 0.01058477, + "balance_loss_clip": 1.01837468, + "balance_loss_mlp": 1.02139044, + "epoch": 0.21914925597474824, + "flos": 22235658053760.0, + "grad_norm": 2.5026944135088063, + "language_loss": 0.79165423, + "learning_rate": 3.636284878455669e-06, + "loss": 0.81303304, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.58203125, + "step": 3645, + "time_per_iteration": 2.4037957191467285 + }, + { + "auxiliary_loss_clip": 0.01076793, + "auxiliary_loss_mlp": 0.01048189, + "balance_loss_clip": 1.01466691, + "balance_loss_mlp": 1.0221417, + "epoch": 0.2192093792274162, + "flos": 22124564507520.0, + "grad_norm": 1.708625233330519, + "language_loss": 0.83388233, + "learning_rate": 3.636060900887582e-06, + "loss": 0.85513216, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.546875, + "step": 3646, + "time_per_iteration": 2.4480836391448975 + }, + { + "auxiliary_loss_clip": 0.0107549, + "auxiliary_loss_mlp": 0.01054942, + "balance_loss_clip": 1.01801109, + "balance_loss_mlp": 1.02124405, + "epoch": 0.21926950248008417, + "flos": 15668530225920.0, + "grad_norm": 1.8864099734719884, + "language_loss": 0.84112442, + "learning_rate": 3.635836861279901e-06, + "loss": 0.86242872, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5390625, + "step": 3647, + "time_per_iteration": 2.4078726768493652 + }, + { + "auxiliary_loss_clip": 0.01075641, + "auxiliary_loss_mlp": 0.01056213, + "balance_loss_clip": 1.02097499, + "balance_loss_mlp": 1.02032423, + "epoch": 0.21932962573275214, + "flos": 30261212772480.0, + "grad_norm": 2.1014201236892314, + "language_loss": 0.73299098, + "learning_rate": 3.635612759641123e-06, + "loss": 0.75430954, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5546875, + "step": 3648, + "time_per_iteration": 2.5395350456237793 + }, + { + "auxiliary_loss_clip": 0.01077156, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.01824105, + "balance_loss_mlp": 1.01967406, + "epoch": 0.2193897489854201, + "flos": 10779363118080.0, + "grad_norm": 4.4825131658163295, + "language_loss": 0.75344253, + "learning_rate": 3.635388595979745e-06, + "loss": 0.77476752, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.57421875, + "step": 3649, + "time_per_iteration": 2.4040582180023193 + }, + { + "auxiliary_loss_clip": 0.01072919, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.01779914, + "balance_loss_mlp": 1.02040958, + "epoch": 0.21944987223808807, + "flos": 19132368278400.0, + "grad_norm": 2.005674220891219, + "language_loss": 0.87110919, + "learning_rate": 3.635164370304267e-06, + "loss": 0.89234567, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.5234375, + "step": 3650, + "time_per_iteration": 2.4472267627716064 + }, + { + "auxiliary_loss_clip": 0.01075728, + "auxiliary_loss_mlp": 0.01052311, + "balance_loss_clip": 1.01652408, + "balance_loss_mlp": 1.01943946, + "epoch": 0.21950999549075606, + "flos": 22709988051840.0, + "grad_norm": 2.1461400438689155, + "language_loss": 0.86145568, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.88273603, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5625, + "step": 3651, + "time_per_iteration": 2.4507038593292236 + }, + { + "auxiliary_loss_clip": 0.01073065, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.01636219, + "balance_loss_mlp": 1.01930022, + "epoch": 0.21957011874342403, + "flos": 10560562427520.0, + "grad_norm": 2.709895609614865, + "language_loss": 0.77428585, + "learning_rate": 3.634715732945027e-06, + "loss": 0.79549319, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.5390625, + "step": 3652, + "time_per_iteration": 2.4118905067443848 + }, + { + "auxiliary_loss_clip": 0.01027491, + "auxiliary_loss_mlp": 0.01011525, + "balance_loss_clip": 1.00601792, + "balance_loss_mlp": 1.00899887, + "epoch": 0.219630241996092, + "flos": 65745047848320.0, + "grad_norm": 0.7915258013978405, + "language_loss": 0.51722836, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53761852, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.18554688, + "step": 3653, + "time_per_iteration": 4.465547800064087 + }, + { + "auxiliary_loss_clip": 0.01074365, + "auxiliary_loss_mlp": 0.01062134, + "balance_loss_clip": 1.02758741, + "balance_loss_mlp": 1.02105224, + "epoch": 0.21969036524875996, + "flos": 23695376664960.0, + "grad_norm": 1.7426420820149264, + "language_loss": 0.76678616, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.78815114, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.53515625, + "step": 3654, + "time_per_iteration": 2.457834243774414 + }, + { + "auxiliary_loss_clip": 0.01078138, + "auxiliary_loss_mlp": 0.01060251, + "balance_loss_clip": 1.0223428, + "balance_loss_mlp": 1.02176058, + "epoch": 0.21975048850142792, + "flos": 19640040491520.0, + "grad_norm": 1.8295280314055467, + "language_loss": 0.74236947, + "learning_rate": 3.634042312013064e-06, + "loss": 0.76375335, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5625, + "step": 3655, + "time_per_iteration": 2.449251890182495 + }, + { + "auxiliary_loss_clip": 0.01075444, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.01141024, + "balance_loss_mlp": 1.02076614, + "epoch": 0.21981061175409589, + "flos": 22447651029120.0, + "grad_norm": 1.6848249092387113, + "language_loss": 0.81961161, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.8408218, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.546875, + "step": 3656, + "time_per_iteration": 4.727744102478027 + }, + { + "auxiliary_loss_clip": 0.01075946, + "auxiliary_loss_mlp": 0.01047495, + "balance_loss_clip": 1.01335359, + "balance_loss_mlp": 1.02166355, + "epoch": 0.21987073500676388, + "flos": 18150051864960.0, + "grad_norm": 2.0473443216253995, + "language_loss": 0.87136978, + "learning_rate": 3.63359305489566e-06, + "loss": 0.89260417, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.54296875, + "step": 3657, + "time_per_iteration": 3.764265775680542 + }, + { + "auxiliary_loss_clip": 0.01076045, + "auxiliary_loss_mlp": 0.01052876, + "balance_loss_clip": 1.01692247, + "balance_loss_mlp": 1.0216521, + "epoch": 0.21993085825943184, + "flos": 25625096265600.0, + "grad_norm": 1.6512844143681358, + "language_loss": 0.8201158, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.84140503, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.54296875, + "step": 3658, + "time_per_iteration": 3.8667969703674316 + }, + { + "auxiliary_loss_clip": 0.01032005, + "auxiliary_loss_mlp": 0.01022006, + "balance_loss_clip": 1.01759541, + "balance_loss_mlp": 1.01492071, + "epoch": 0.2199909815120998, + "flos": 70919349096960.0, + "grad_norm": 0.7813363199321396, + "language_loss": 0.58273, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60327005, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.17089844, + "step": 3659, + "time_per_iteration": 3.1734459400177 + }, + { + "auxiliary_loss_clip": 0.01077252, + "auxiliary_loss_mlp": 0.01047175, + "balance_loss_clip": 1.01486969, + "balance_loss_mlp": 1.02369976, + "epoch": 0.22005110476476777, + "flos": 21542457542400.0, + "grad_norm": 2.1069754495823334, + "language_loss": 0.75718296, + "learning_rate": 3.632918704645772e-06, + "loss": 0.77842724, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.53515625, + "step": 3660, + "time_per_iteration": 2.399705171585083 + }, + { + "auxiliary_loss_clip": 0.01078227, + "auxiliary_loss_mlp": 0.01052579, + "balance_loss_clip": 1.01736474, + "balance_loss_mlp": 1.02412689, + "epoch": 0.22011122801743574, + "flos": 22053411423360.0, + "grad_norm": 2.542889185780689, + "language_loss": 0.81714797, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83845603, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.54296875, + "step": 3661, + "time_per_iteration": 2.445868968963623 + }, + { + "auxiliary_loss_clip": 0.01076514, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_clip": 1.01586378, + "balance_loss_mlp": 1.0246768, + "epoch": 0.2201713512701037, + "flos": 26686385907840.0, + "grad_norm": 1.7159015368158572, + "language_loss": 0.74884677, + "learning_rate": 3.632468828196102e-06, + "loss": 0.77007103, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.51953125, + "step": 3662, + "time_per_iteration": 2.46297025680542 + }, + { + "auxiliary_loss_clip": 0.01075892, + "auxiliary_loss_mlp": 0.01050662, + "balance_loss_clip": 1.02211142, + "balance_loss_mlp": 1.02454472, + "epoch": 0.22023147452277167, + "flos": 22161153479040.0, + "grad_norm": 1.6700417600311428, + "language_loss": 0.80186772, + "learning_rate": 3.632243797111929e-06, + "loss": 0.82313323, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.515625, + "step": 3663, + "time_per_iteration": 2.4535279273986816 + }, + { + "auxiliary_loss_clip": 0.01077557, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.02404428, + "balance_loss_mlp": 1.02401364, + "epoch": 0.22029159777543966, + "flos": 22522330160640.0, + "grad_norm": 2.949447933059579, + "language_loss": 0.81599796, + "learning_rate": 3.632018704132908e-06, + "loss": 0.83733606, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.53515625, + "step": 3664, + "time_per_iteration": 2.4290482997894287 + }, + { + "auxiliary_loss_clip": 0.01080139, + "auxiliary_loss_mlp": 0.01063695, + "balance_loss_clip": 1.02871943, + "balance_loss_mlp": 1.02469277, + "epoch": 0.22035172102810763, + "flos": 13041630218880.0, + "grad_norm": 2.741169024872157, + "language_loss": 0.78783846, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.80927682, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5546875, + "step": 3665, + "time_per_iteration": 2.4008941650390625 + }, + { + "auxiliary_loss_clip": 0.01074832, + "auxiliary_loss_mlp": 0.01066725, + "balance_loss_clip": 1.0359689, + "balance_loss_mlp": 1.02334762, + "epoch": 0.2204118442807756, + "flos": 12165031002240.0, + "grad_norm": 3.1557182905746712, + "language_loss": 0.99302453, + "learning_rate": 3.631568332524466e-06, + "loss": 1.01444006, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.515625, + "step": 3666, + "time_per_iteration": 2.373237133026123 + }, + { + "auxiliary_loss_clip": 0.01073975, + "auxiliary_loss_mlp": 0.0106226, + "balance_loss_clip": 1.02971613, + "balance_loss_mlp": 1.02168369, + "epoch": 0.22047196753344356, + "flos": 40107383493120.0, + "grad_norm": 1.730850934356633, + "language_loss": 0.81527555, + "learning_rate": 3.631343053912122e-06, + "loss": 0.83663797, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5234375, + "step": 3667, + "time_per_iteration": 2.592818260192871 + }, + { + "auxiliary_loss_clip": 0.0107498, + "auxiliary_loss_mlp": 0.01073154, + "balance_loss_clip": 1.03929901, + "balance_loss_mlp": 1.02133656, + "epoch": 0.22053209078611152, + "flos": 20700178058880.0, + "grad_norm": 1.8766342092713673, + "language_loss": 0.78284168, + "learning_rate": 3.631117713439087e-06, + "loss": 0.80432296, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5390625, + "step": 3668, + "time_per_iteration": 2.394470691680908 + }, + { + "auxiliary_loss_clip": 0.01074389, + "auxiliary_loss_mlp": 0.01063818, + "balance_loss_clip": 1.03079724, + "balance_loss_mlp": 1.02239394, + "epoch": 0.2205922140387795, + "flos": 24715189745280.0, + "grad_norm": 1.6383201253323754, + "language_loss": 0.73074746, + "learning_rate": 3.630892311113904e-06, + "loss": 0.75212955, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.51953125, + "step": 3669, + "time_per_iteration": 2.495992422103882 + }, + { + "auxiliary_loss_clip": 0.01072941, + "auxiliary_loss_mlp": 0.01056893, + "balance_loss_clip": 1.0259223, + "balance_loss_mlp": 1.02150428, + "epoch": 0.22065233729144745, + "flos": 23476122126720.0, + "grad_norm": 1.7998602960158632, + "language_loss": 0.86199462, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.88329291, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.515625, + "step": 3670, + "time_per_iteration": 2.456986427307129 + }, + { + "auxiliary_loss_clip": 0.01077433, + "auxiliary_loss_mlp": 0.01055387, + "balance_loss_clip": 1.02534676, + "balance_loss_mlp": 1.02352405, + "epoch": 0.22071246054411545, + "flos": 35224116405120.0, + "grad_norm": 1.6766895852134245, + "language_loss": 0.77440917, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79573739, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.5390625, + "step": 3671, + "time_per_iteration": 2.589750051498413 + }, + { + "auxiliary_loss_clip": 0.01076145, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.01988554, + "balance_loss_mlp": 1.02338886, + "epoch": 0.2207725837967834, + "flos": 18149318726400.0, + "grad_norm": 2.0888457761017936, + "language_loss": 0.82253331, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.84380549, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.52734375, + "step": 3672, + "time_per_iteration": 2.401921510696411 + }, + { + "auxiliary_loss_clip": 0.01080346, + "auxiliary_loss_mlp": 0.01053853, + "balance_loss_clip": 1.0220958, + "balance_loss_mlp": 1.02720535, + "epoch": 0.22083270704945138, + "flos": 20478793927680.0, + "grad_norm": 1.9700190571663594, + "language_loss": 0.74882901, + "learning_rate": 3.629990083462682e-06, + "loss": 0.77017099, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.53125, + "step": 3673, + "time_per_iteration": 2.429598331451416 + }, + { + "auxiliary_loss_clip": 0.01079708, + "auxiliary_loss_mlp": 0.01053935, + "balance_loss_clip": 1.01804078, + "balance_loss_mlp": 1.0262903, + "epoch": 0.22089283030211934, + "flos": 34124527134720.0, + "grad_norm": 2.033280219669827, + "language_loss": 0.78043067, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.80176711, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53515625, + "step": 3674, + "time_per_iteration": 2.550156354904175 + }, + { + "auxiliary_loss_clip": 0.01083367, + "auxiliary_loss_mlp": 0.01052607, + "balance_loss_clip": 1.01581907, + "balance_loss_mlp": 1.02907455, + "epoch": 0.2209529535547873, + "flos": 18076245517440.0, + "grad_norm": 2.3671845016577424, + "language_loss": 0.76841748, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.78977722, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.54296875, + "step": 3675, + "time_per_iteration": 2.4818050861358643 + }, + { + "auxiliary_loss_clip": 0.01081156, + "auxiliary_loss_mlp": 0.01054185, + "balance_loss_clip": 1.02154565, + "balance_loss_mlp": 1.02614045, + "epoch": 0.22101307680745527, + "flos": 27234103317120.0, + "grad_norm": 1.8346643327463348, + "language_loss": 0.81429446, + "learning_rate": 3.629312763695772e-06, + "loss": 0.83564794, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.55078125, + "step": 3676, + "time_per_iteration": 2.477440595626831 + }, + { + "auxiliary_loss_clip": 0.01086325, + "auxiliary_loss_mlp": 0.01058877, + "balance_loss_clip": 1.02149343, + "balance_loss_mlp": 1.02979589, + "epoch": 0.22107320006012326, + "flos": 16542371445120.0, + "grad_norm": 2.499892925672237, + "language_loss": 0.76629579, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.78774774, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.56640625, + "step": 3677, + "time_per_iteration": 2.399937629699707 + }, + { + "auxiliary_loss_clip": 0.01080738, + "auxiliary_loss_mlp": 0.01051851, + "balance_loss_clip": 1.01909268, + "balance_loss_mlp": 1.02641773, + "epoch": 0.22113332331279123, + "flos": 22053376512000.0, + "grad_norm": 1.7973742561768564, + "language_loss": 0.85199714, + "learning_rate": 3.628860908251712e-06, + "loss": 0.87332302, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.54296875, + "step": 3678, + "time_per_iteration": 2.426067590713501 + }, + { + "auxiliary_loss_clip": 0.01084117, + "auxiliary_loss_mlp": 0.01057499, + "balance_loss_clip": 1.02207017, + "balance_loss_mlp": 1.03067911, + "epoch": 0.2211934465654592, + "flos": 26611636953600.0, + "grad_norm": 1.8345784904010092, + "language_loss": 0.90666413, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.92808032, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53515625, + "step": 3679, + "time_per_iteration": 2.501152276992798 + }, + { + "auxiliary_loss_clip": 0.01085341, + "auxiliary_loss_mlp": 0.0106454, + "balance_loss_clip": 1.02811015, + "balance_loss_mlp": 1.02970982, + "epoch": 0.22125356981812716, + "flos": 16359496410240.0, + "grad_norm": 2.667651978536447, + "language_loss": 0.87920356, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.90070236, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5546875, + "step": 3680, + "time_per_iteration": 2.3931174278259277 + }, + { + "auxiliary_loss_clip": 0.01080923, + "auxiliary_loss_mlp": 0.0106248, + "balance_loss_clip": 1.02848208, + "balance_loss_mlp": 1.0294441, + "epoch": 0.22131369307079513, + "flos": 21650094864000.0, + "grad_norm": 2.0484754987442377, + "language_loss": 0.82039022, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84182429, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.515625, + "step": 3681, + "time_per_iteration": 2.4510788917541504 + }, + { + "auxiliary_loss_clip": 0.01078967, + "auxiliary_loss_mlp": 0.01065651, + "balance_loss_clip": 1.0328449, + "balance_loss_mlp": 1.02690029, + "epoch": 0.2213738163234631, + "flos": 19608513667200.0, + "grad_norm": 2.2740220521415027, + "language_loss": 0.8120544, + "learning_rate": 3.62795645623335e-06, + "loss": 0.83350062, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.51953125, + "step": 3682, + "time_per_iteration": 2.434425115585327 + }, + { + "auxiliary_loss_clip": 0.01080444, + "auxiliary_loss_mlp": 0.01063062, + "balance_loss_clip": 1.02684617, + "balance_loss_mlp": 1.026196, + "epoch": 0.22143393957613106, + "flos": 23622268544640.0, + "grad_norm": 1.5774245557716333, + "language_loss": 0.78728312, + "learning_rate": 3.627730188876638e-06, + "loss": 0.8087182, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.54296875, + "step": 3683, + "time_per_iteration": 2.4775383472442627 + }, + { + "auxiliary_loss_clip": 0.01080756, + "auxiliary_loss_mlp": 0.0106646, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.02497244, + "epoch": 0.22149406282879905, + "flos": 26176584101760.0, + "grad_norm": 1.9483620246264541, + "language_loss": 0.74037635, + "learning_rate": 3.627503859796234e-06, + "loss": 0.76184845, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.55859375, + "step": 3684, + "time_per_iteration": 2.476248264312744 + }, + { + "auxiliary_loss_clip": 0.01078932, + "auxiliary_loss_mlp": 0.01060025, + "balance_loss_clip": 1.02392864, + "balance_loss_mlp": 1.02448726, + "epoch": 0.221554186081467, + "flos": 14537867978880.0, + "grad_norm": 1.8216885217031387, + "language_loss": 0.81001657, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.83140618, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.54296875, + "step": 3685, + "time_per_iteration": 2.4087557792663574 + }, + { + "auxiliary_loss_clip": 0.01075306, + "auxiliary_loss_mlp": 0.0106184, + "balance_loss_clip": 1.02660131, + "balance_loss_mlp": 1.02301884, + "epoch": 0.22161430933413498, + "flos": 22237124330880.0, + "grad_norm": 1.6290538355393331, + "language_loss": 0.88194227, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.90331376, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5234375, + "step": 3686, + "time_per_iteration": 2.452080488204956 + }, + { + "auxiliary_loss_clip": 0.01075388, + "auxiliary_loss_mlp": 0.01054316, + "balance_loss_clip": 1.01888716, + "balance_loss_mlp": 1.02251756, + "epoch": 0.22167443258680294, + "flos": 23475423899520.0, + "grad_norm": 1.897713983838019, + "language_loss": 0.80359101, + "learning_rate": 3.626824502298707e-06, + "loss": 0.82488811, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.52734375, + "step": 3687, + "time_per_iteration": 2.4678051471710205 + }, + { + "auxiliary_loss_clip": 0.01081957, + "auxiliary_loss_mlp": 0.01060926, + "balance_loss_clip": 1.02163458, + "balance_loss_mlp": 1.02371168, + "epoch": 0.2217345558394709, + "flos": 23220034237440.0, + "grad_norm": 1.8461983255296257, + "language_loss": 0.86874962, + "learning_rate": 3.626597926409383e-06, + "loss": 0.89017844, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.58203125, + "step": 3688, + "time_per_iteration": 2.450288772583008 + }, + { + "auxiliary_loss_clip": 0.01079312, + "auxiliary_loss_mlp": 0.01056853, + "balance_loss_clip": 1.01846802, + "balance_loss_mlp": 1.0239861, + "epoch": 0.22179467909213887, + "flos": 20010049747200.0, + "grad_norm": 1.776696639717824, + "language_loss": 0.82819569, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.84955734, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5546875, + "step": 3689, + "time_per_iteration": 2.433804512023926 + }, + { + "auxiliary_loss_clip": 0.0107979, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.02359629, + "balance_loss_mlp": 1.02598786, + "epoch": 0.22185480234480687, + "flos": 19682005812480.0, + "grad_norm": 1.7629901017607228, + "language_loss": 0.72167015, + "learning_rate": 3.626144589597061e-06, + "loss": 0.74306017, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5390625, + "step": 3690, + "time_per_iteration": 2.449148654937744 + }, + { + "auxiliary_loss_clip": 0.01079787, + "auxiliary_loss_mlp": 0.01058661, + "balance_loss_clip": 1.02022815, + "balance_loss_mlp": 1.02394783, + "epoch": 0.22191492559747483, + "flos": 21980233480320.0, + "grad_norm": 1.8348451931997531, + "language_loss": 0.74223411, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.76361853, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5546875, + "step": 3691, + "time_per_iteration": 2.437488079071045 + }, + { + "auxiliary_loss_clip": 0.01081643, + "auxiliary_loss_mlp": 0.01057941, + "balance_loss_clip": 1.0220356, + "balance_loss_mlp": 1.02809024, + "epoch": 0.2219750488501428, + "flos": 23220941932800.0, + "grad_norm": 2.118115705037344, + "language_loss": 0.73467183, + "learning_rate": 3.625691006130477e-06, + "loss": 0.75606769, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53515625, + "step": 3692, + "time_per_iteration": 2.4711406230926514 + }, + { + "auxiliary_loss_clip": 0.01082869, + "auxiliary_loss_mlp": 0.01061484, + "balance_loss_clip": 1.02309835, + "balance_loss_mlp": 1.02626967, + "epoch": 0.22203517210281076, + "flos": 22452643353600.0, + "grad_norm": 1.750287445627396, + "language_loss": 0.88279527, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.90423882, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.56640625, + "step": 3693, + "time_per_iteration": 3.989405870437622 + }, + { + "auxiliary_loss_clip": 0.0107885, + "auxiliary_loss_mlp": 0.01050057, + "balance_loss_clip": 1.01789463, + "balance_loss_mlp": 1.02682745, + "epoch": 0.22209529535547873, + "flos": 17563650802560.0, + "grad_norm": 2.15093843742596, + "language_loss": 0.87533534, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.89662439, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.51953125, + "step": 3694, + "time_per_iteration": 2.4410159587860107 + }, + { + "auxiliary_loss_clip": 0.01087041, + "auxiliary_loss_mlp": 0.0105706, + "balance_loss_clip": 1.01710129, + "balance_loss_mlp": 1.0292145, + "epoch": 0.2221554186081467, + "flos": 21467987879040.0, + "grad_norm": 2.676437127247946, + "language_loss": 0.71308362, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.73452461, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.578125, + "step": 3695, + "time_per_iteration": 4.64707088470459 + }, + { + "auxiliary_loss_clip": 0.01083555, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_clip": 1.01088798, + "balance_loss_mlp": 1.03160286, + "epoch": 0.22221554186081466, + "flos": 27672193457280.0, + "grad_norm": 1.6277341708000246, + "language_loss": 0.73188239, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.75313675, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.51953125, + "step": 3696, + "time_per_iteration": 3.9229955673217773 + }, + { + "auxiliary_loss_clip": 0.01083333, + "auxiliary_loss_mlp": 0.01049662, + "balance_loss_clip": 1.01468611, + "balance_loss_mlp": 1.02864492, + "epoch": 0.22227566511348265, + "flos": 25957713588480.0, + "grad_norm": 1.7378580175663, + "language_loss": 0.88799453, + "learning_rate": 3.624555968803217e-06, + "loss": 0.90932453, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.546875, + "step": 3697, + "time_per_iteration": 3.9732067584991455 + }, + { + "auxiliary_loss_clip": 0.01076953, + "auxiliary_loss_mlp": 0.01046602, + "balance_loss_clip": 1.01641798, + "balance_loss_mlp": 1.02610564, + "epoch": 0.22233578836615062, + "flos": 39202085272320.0, + "grad_norm": 1.559396353572843, + "language_loss": 0.67654616, + "learning_rate": 3.624328776493346e-06, + "loss": 0.69778168, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.5078125, + "step": 3698, + "time_per_iteration": 2.6268131732940674 + }, + { + "auxiliary_loss_clip": 0.01081389, + "auxiliary_loss_mlp": 0.01058821, + "balance_loss_clip": 1.02212787, + "balance_loss_mlp": 1.02598524, + "epoch": 0.22239591161881858, + "flos": 36282298936320.0, + "grad_norm": 1.7958569107260984, + "language_loss": 0.8409971, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.86239922, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5546875, + "step": 3699, + "time_per_iteration": 2.585388422012329 + }, + { + "auxiliary_loss_clip": 0.01081372, + "auxiliary_loss_mlp": 0.01049637, + "balance_loss_clip": 1.01551962, + "balance_loss_mlp": 1.02752662, + "epoch": 0.22245603487148655, + "flos": 19718559872640.0, + "grad_norm": 1.9710811813034157, + "language_loss": 0.81149131, + "learning_rate": 3.62387420709809e-06, + "loss": 0.83280134, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5390625, + "step": 3700, + "time_per_iteration": 2.4705026149749756 + }, + { + "auxiliary_loss_clip": 0.01084336, + "auxiliary_loss_mlp": 0.01067251, + "balance_loss_clip": 1.02722013, + "balance_loss_mlp": 1.02729952, + "epoch": 0.2225161581241545, + "flos": 46278700704000.0, + "grad_norm": 2.2984912798984, + "language_loss": 0.73602057, + "learning_rate": 3.623646830029943e-06, + "loss": 0.75753641, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5703125, + "step": 3701, + "time_per_iteration": 2.6548750400543213 + }, + { + "auxiliary_loss_clip": 0.01076999, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.02319264, + "balance_loss_mlp": 1.02383614, + "epoch": 0.22257628137682248, + "flos": 23695062462720.0, + "grad_norm": 1.7224129199326386, + "language_loss": 0.82523394, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.84656107, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.53125, + "step": 3702, + "time_per_iteration": 2.4736104011535645 + }, + { + "auxiliary_loss_clip": 0.01072571, + "auxiliary_loss_mlp": 0.01055312, + "balance_loss_clip": 1.02279139, + "balance_loss_mlp": 1.02236187, + "epoch": 0.22263640462949044, + "flos": 19352984359680.0, + "grad_norm": 1.8400514293373422, + "language_loss": 0.79894245, + "learning_rate": 3.623191891195723e-06, + "loss": 0.82022119, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.50390625, + "step": 3703, + "time_per_iteration": 2.437183141708374 + }, + { + "auxiliary_loss_clip": 0.0107706, + "auxiliary_loss_mlp": 0.01062353, + "balance_loss_clip": 1.02811599, + "balance_loss_mlp": 1.02176452, + "epoch": 0.22269652788215843, + "flos": 20775031747200.0, + "grad_norm": 1.9965288626340112, + "language_loss": 0.76487577, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7862699, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5546875, + "step": 3704, + "time_per_iteration": 2.4091475009918213 + }, + { + "auxiliary_loss_clip": 0.01072493, + "auxiliary_loss_mlp": 0.01055056, + "balance_loss_clip": 1.02387047, + "balance_loss_mlp": 1.02273822, + "epoch": 0.2227566511348264, + "flos": 47957045448960.0, + "grad_norm": 4.340216078379273, + "language_loss": 0.65674013, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.67801559, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.49804688, + "step": 3705, + "time_per_iteration": 2.7136647701263428 + }, + { + "auxiliary_loss_clip": 0.01031506, + "auxiliary_loss_mlp": 0.01044999, + "balance_loss_clip": 1.04023099, + "balance_loss_mlp": 1.01675391, + "epoch": 0.22281677438749437, + "flos": 66216166001280.0, + "grad_norm": 1.3418975660239376, + "language_loss": 0.65370679, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67447186, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.04760742, + "router_z_loss_mlp": 0.14746094, + "step": 3706, + "time_per_iteration": 2.9096553325653076 + }, + { + "auxiliary_loss_clip": 0.01076008, + "auxiliary_loss_mlp": 0.01054494, + "balance_loss_clip": 1.01982784, + "balance_loss_mlp": 1.02248907, + "epoch": 0.22287689764016233, + "flos": 21870536388480.0, + "grad_norm": 3.0578313577447576, + "language_loss": 0.82510471, + "learning_rate": 3.622281274977141e-06, + "loss": 0.84640968, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53515625, + "step": 3707, + "time_per_iteration": 2.4361467361450195 + }, + { + "auxiliary_loss_clip": 0.01075946, + "auxiliary_loss_mlp": 0.01054954, + "balance_loss_clip": 1.01973963, + "balance_loss_mlp": 1.02404392, + "epoch": 0.2229370208928303, + "flos": 27671250850560.0, + "grad_norm": 1.6828363891243179, + "language_loss": 0.80490088, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.8262099, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.51953125, + "step": 3708, + "time_per_iteration": 2.446653127670288 + }, + { + "auxiliary_loss_clip": 0.01080679, + "auxiliary_loss_mlp": 0.01051792, + "balance_loss_clip": 1.01726925, + "balance_loss_mlp": 1.02725673, + "epoch": 0.22299714414549826, + "flos": 30153331071360.0, + "grad_norm": 1.9305707872171725, + "language_loss": 0.82840008, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.84972477, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.53515625, + "step": 3709, + "time_per_iteration": 2.520690441131592 + }, + { + "auxiliary_loss_clip": 0.01080048, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.02552056, + "epoch": 0.22305726739816625, + "flos": 23142178172160.0, + "grad_norm": 2.1311535448042935, + "language_loss": 0.71046317, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.73179084, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3710, + "time_per_iteration": 2.46634840965271 + }, + { + "auxiliary_loss_clip": 0.01081765, + "auxiliary_loss_mlp": 0.01048907, + "balance_loss_clip": 1.01621962, + "balance_loss_mlp": 1.02687514, + "epoch": 0.22311739065083422, + "flos": 19171051931520.0, + "grad_norm": 4.247073522138863, + "language_loss": 0.92473722, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.94604397, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.546875, + "step": 3711, + "time_per_iteration": 2.4315686225891113 + }, + { + "auxiliary_loss_clip": 0.01083237, + "auxiliary_loss_mlp": 0.0105734, + "balance_loss_clip": 1.02238774, + "balance_loss_mlp": 1.02987075, + "epoch": 0.22317751390350218, + "flos": 13617138936960.0, + "grad_norm": 2.364382512654724, + "language_loss": 0.92011476, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.94152057, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.53125, + "step": 3712, + "time_per_iteration": 2.373652935028076 + }, + { + "auxiliary_loss_clip": 0.01083482, + "auxiliary_loss_mlp": 0.01054788, + "balance_loss_clip": 1.02000308, + "balance_loss_mlp": 1.03161216, + "epoch": 0.22323763715617015, + "flos": 11028468735360.0, + "grad_norm": 3.0535825222822024, + "language_loss": 0.78459632, + "learning_rate": 3.620913505310117e-06, + "loss": 0.80597901, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51953125, + "step": 3713, + "time_per_iteration": 2.3980233669281006 + }, + { + "auxiliary_loss_clip": 0.01082238, + "auxiliary_loss_mlp": 0.01059236, + "balance_loss_clip": 1.02418876, + "balance_loss_mlp": 1.03058314, + "epoch": 0.22329776040883811, + "flos": 41350012070400.0, + "grad_norm": 2.034392755089827, + "language_loss": 0.64015257, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.66156739, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.515625, + "step": 3714, + "time_per_iteration": 2.645505428314209 + }, + { + "auxiliary_loss_clip": 0.01083461, + "auxiliary_loss_mlp": 0.01059178, + "balance_loss_clip": 1.02802896, + "balance_loss_mlp": 1.03081226, + "epoch": 0.22335788366150608, + "flos": 25118296836480.0, + "grad_norm": 1.803924241902877, + "language_loss": 0.80821711, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.82964349, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.52734375, + "step": 3715, + "time_per_iteration": 2.465149402618408 + }, + { + "auxiliary_loss_clip": 0.01083669, + "auxiliary_loss_mlp": 0.01067205, + "balance_loss_clip": 1.03411222, + "balance_loss_mlp": 1.03130484, + "epoch": 0.22341800691417404, + "flos": 16982416621440.0, + "grad_norm": 1.543415868716006, + "language_loss": 0.78222841, + "learning_rate": 3.620228790579645e-06, + "loss": 0.80373716, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5234375, + "step": 3716, + "time_per_iteration": 2.407839059829712 + }, + { + "auxiliary_loss_clip": 0.0108078, + "auxiliary_loss_mlp": 0.01065142, + "balance_loss_clip": 1.03169155, + "balance_loss_mlp": 1.0288465, + "epoch": 0.22347813016684204, + "flos": 14135878051200.0, + "grad_norm": 2.347894321867998, + "language_loss": 0.81116652, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.83262575, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.51953125, + "step": 3717, + "time_per_iteration": 2.395129680633545 + }, + { + "auxiliary_loss_clip": 0.01080749, + "auxiliary_loss_mlp": 0.01065121, + "balance_loss_clip": 1.03188515, + "balance_loss_mlp": 1.02755177, + "epoch": 0.22353825341951, + "flos": 23582118614400.0, + "grad_norm": 9.571191063909529, + "language_loss": 0.69043493, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.71189362, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.53125, + "step": 3718, + "time_per_iteration": 2.467946767807007 + }, + { + "auxiliary_loss_clip": 0.01080045, + "auxiliary_loss_mlp": 0.01058701, + "balance_loss_clip": 1.02346301, + "balance_loss_mlp": 1.02714634, + "epoch": 0.22359837667217797, + "flos": 29822948075520.0, + "grad_norm": 1.786894299410668, + "language_loss": 0.82468098, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8460685, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53125, + "step": 3719, + "time_per_iteration": 2.4715735912323 + }, + { + "auxiliary_loss_clip": 0.01080576, + "auxiliary_loss_mlp": 0.01061185, + "balance_loss_clip": 1.02556574, + "balance_loss_mlp": 1.02506995, + "epoch": 0.22365849992484593, + "flos": 17602124987520.0, + "grad_norm": 1.9461292135264183, + "language_loss": 0.88537496, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.90679252, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5546875, + "step": 3720, + "time_per_iteration": 2.3799004554748535 + }, + { + "auxiliary_loss_clip": 0.01074996, + "auxiliary_loss_mlp": 0.01053132, + "balance_loss_clip": 1.02114856, + "balance_loss_mlp": 1.0249964, + "epoch": 0.2237186231775139, + "flos": 22709848406400.0, + "grad_norm": 1.5878534228818606, + "language_loss": 0.75521302, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77649432, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.5, + "step": 3721, + "time_per_iteration": 2.434608221054077 + }, + { + "auxiliary_loss_clip": 0.01076887, + "auxiliary_loss_mlp": 0.01055351, + "balance_loss_clip": 1.01851487, + "balance_loss_mlp": 1.02208793, + "epoch": 0.22377874643018186, + "flos": 13370651671680.0, + "grad_norm": 2.305769587653549, + "language_loss": 0.80373895, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.82506132, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.546875, + "step": 3722, + "time_per_iteration": 2.369907855987549 + }, + { + "auxiliary_loss_clip": 0.01071502, + "auxiliary_loss_mlp": 0.01047296, + "balance_loss_clip": 1.0150615, + "balance_loss_mlp": 1.02160835, + "epoch": 0.22383886968284986, + "flos": 17893998887040.0, + "grad_norm": 1.9277601432998899, + "language_loss": 0.84049833, + "learning_rate": 3.618628972906178e-06, + "loss": 0.86168623, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5, + "step": 3723, + "time_per_iteration": 2.408546209335327 + }, + { + "auxiliary_loss_clip": 0.01074041, + "auxiliary_loss_mlp": 0.01057539, + "balance_loss_clip": 1.02406478, + "balance_loss_mlp": 1.02105391, + "epoch": 0.22389899293551782, + "flos": 23877972408960.0, + "grad_norm": 4.3806221256252735, + "language_loss": 0.86645722, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.88777304, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.52734375, + "step": 3724, + "time_per_iteration": 2.412684679031372 + }, + { + "auxiliary_loss_clip": 0.01072975, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.01494443, + "balance_loss_mlp": 1.02132654, + "epoch": 0.2239591161881858, + "flos": 27271181047680.0, + "grad_norm": 1.7030504538746665, + "language_loss": 0.81114405, + "learning_rate": 3.618171329605121e-06, + "loss": 0.83234632, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.515625, + "step": 3725, + "time_per_iteration": 2.469985246658325 + }, + { + "auxiliary_loss_clip": 0.01073994, + "auxiliary_loss_mlp": 0.01048548, + "balance_loss_clip": 1.01531267, + "balance_loss_mlp": 1.02252865, + "epoch": 0.22401923944085375, + "flos": 22235762787840.0, + "grad_norm": 1.7866748850788903, + "language_loss": 0.78731221, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.80853766, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.515625, + "step": 3726, + "time_per_iteration": 2.417520523071289 + }, + { + "auxiliary_loss_clip": 0.01078495, + "auxiliary_loss_mlp": 0.01062286, + "balance_loss_clip": 1.02409148, + "balance_loss_mlp": 1.02169156, + "epoch": 0.22407936269352172, + "flos": 12052959937920.0, + "grad_norm": 2.7890993205189543, + "language_loss": 0.74366951, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.76507735, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.56640625, + "step": 3727, + "time_per_iteration": 2.4231162071228027 + }, + { + "auxiliary_loss_clip": 0.01078904, + "auxiliary_loss_mlp": 0.01056032, + "balance_loss_clip": 1.01840925, + "balance_loss_mlp": 1.02342677, + "epoch": 0.22413948594618968, + "flos": 19352565423360.0, + "grad_norm": 2.2068934721115028, + "language_loss": 0.88888526, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.91023463, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5546875, + "step": 3728, + "time_per_iteration": 2.4105615615844727 + }, + { + "auxiliary_loss_clip": 0.0107589, + "auxiliary_loss_mlp": 0.01054305, + "balance_loss_clip": 1.01708794, + "balance_loss_mlp": 1.02262068, + "epoch": 0.22419960919885765, + "flos": 24168868790400.0, + "grad_norm": 2.848761672193175, + "language_loss": 0.82694185, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.84824377, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.53125, + "step": 3729, + "time_per_iteration": 2.467771530151367 + }, + { + "auxiliary_loss_clip": 0.01076086, + "auxiliary_loss_mlp": 0.01052256, + "balance_loss_clip": 1.02183366, + "balance_loss_mlp": 1.02505112, + "epoch": 0.22425973245152564, + "flos": 27377805939840.0, + "grad_norm": 1.6000956780699833, + "language_loss": 0.88277751, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.90406096, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.51171875, + "step": 3730, + "time_per_iteration": 2.5001840591430664 + }, + { + "auxiliary_loss_clip": 0.01077315, + "auxiliary_loss_mlp": 0.01049955, + "balance_loss_clip": 1.01776862, + "balance_loss_mlp": 1.02575231, + "epoch": 0.2243198557041936, + "flos": 13734795818880.0, + "grad_norm": 1.8286053836821556, + "language_loss": 0.74970353, + "learning_rate": 3.616796927310559e-06, + "loss": 0.77097631, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.515625, + "step": 3731, + "time_per_iteration": 2.4410037994384766 + }, + { + "auxiliary_loss_clip": 0.01082913, + "auxiliary_loss_mlp": 0.01055538, + "balance_loss_clip": 1.01872563, + "balance_loss_mlp": 1.02821922, + "epoch": 0.22437997895686157, + "flos": 19529854640640.0, + "grad_norm": 1.8192810309320693, + "language_loss": 0.7696228, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.79100728, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.546875, + "step": 3732, + "time_per_iteration": 2.400709629058838 + }, + { + "auxiliary_loss_clip": 0.01081096, + "auxiliary_loss_mlp": 0.01051521, + "balance_loss_clip": 1.01695061, + "balance_loss_mlp": 1.0275892, + "epoch": 0.22444010220952954, + "flos": 23695097374080.0, + "grad_norm": 1.7034674376543364, + "language_loss": 0.89063776, + "learning_rate": 3.616338302646873e-06, + "loss": 0.91196394, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.53515625, + "step": 3733, + "time_per_iteration": 3.8822503089904785 + }, + { + "auxiliary_loss_clip": 0.01075758, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.01614618, + "balance_loss_mlp": 1.02302265, + "epoch": 0.2245002254621975, + "flos": 22381804471680.0, + "grad_norm": 1.6439294889564866, + "language_loss": 0.85971105, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.88097173, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.52734375, + "step": 3734, + "time_per_iteration": 2.390564441680908 + }, + { + "auxiliary_loss_clip": 0.01077334, + "auxiliary_loss_mlp": 0.01055745, + "balance_loss_clip": 1.0205785, + "balance_loss_mlp": 1.0244472, + "epoch": 0.22456034871486547, + "flos": 26941112254080.0, + "grad_norm": 2.066821635884086, + "language_loss": 0.7792089, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.80053967, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53125, + "step": 3735, + "time_per_iteration": 4.120121479034424 + }, + { + "auxiliary_loss_clip": 0.01071968, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_clip": 1.01902866, + "balance_loss_mlp": 1.02257586, + "epoch": 0.22462047196753343, + "flos": 28982344337280.0, + "grad_norm": 1.956759547756066, + "language_loss": 0.8522706, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.87349975, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.49414062, + "step": 3736, + "time_per_iteration": 5.205406188964844 + }, + { + "auxiliary_loss_clip": 0.0107658, + "auxiliary_loss_mlp": 0.01052589, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.02353334, + "epoch": 0.22468059522020142, + "flos": 20010294126720.0, + "grad_norm": 1.9800575663822655, + "language_loss": 0.8812409, + "learning_rate": 3.615420317888586e-06, + "loss": 0.90253258, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.53125, + "step": 3737, + "time_per_iteration": 2.4491443634033203 + }, + { + "auxiliary_loss_clip": 0.01076541, + "auxiliary_loss_mlp": 0.01057589, + "balance_loss_clip": 1.01891732, + "balance_loss_mlp": 1.02225041, + "epoch": 0.2247407184728694, + "flos": 29312971712640.0, + "grad_norm": 2.182586289678639, + "language_loss": 0.8053754, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.82671666, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.54296875, + "step": 3738, + "time_per_iteration": 2.4488210678100586 + }, + { + "auxiliary_loss_clip": 0.01075922, + "auxiliary_loss_mlp": 0.01048868, + "balance_loss_clip": 1.01482201, + "balance_loss_mlp": 1.02160263, + "epoch": 0.22480084172553735, + "flos": 22309254933120.0, + "grad_norm": 2.0400655902194704, + "language_loss": 0.7797246, + "learning_rate": 3.614960957933224e-06, + "loss": 0.80097252, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.54296875, + "step": 3739, + "time_per_iteration": 2.4318060874938965 + }, + { + "auxiliary_loss_clip": 0.01074603, + "auxiliary_loss_mlp": 0.01054272, + "balance_loss_clip": 1.01815176, + "balance_loss_mlp": 1.02145672, + "epoch": 0.22486096497820532, + "flos": 25590148128000.0, + "grad_norm": 2.0028869255423, + "language_loss": 0.76069522, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.78198391, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 3740, + "time_per_iteration": 2.4435157775878906 + }, + { + "auxiliary_loss_clip": 0.01072964, + "auxiliary_loss_mlp": 0.01050589, + "balance_loss_clip": 1.01713896, + "balance_loss_mlp": 1.02073121, + "epoch": 0.22492108823087328, + "flos": 17638853604480.0, + "grad_norm": 2.003208018822855, + "language_loss": 0.76900792, + "learning_rate": 3.614501353019939e-06, + "loss": 0.79024345, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.51953125, + "step": 3741, + "time_per_iteration": 2.508251190185547 + }, + { + "auxiliary_loss_clip": 0.01074664, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.01413596, + "balance_loss_mlp": 1.02292728, + "epoch": 0.22498121148354125, + "flos": 16033721713920.0, + "grad_norm": 1.7162476377977107, + "language_loss": 0.89720774, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.91842306, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.515625, + "step": 3742, + "time_per_iteration": 2.449946403503418 + }, + { + "auxiliary_loss_clip": 0.01075209, + "auxiliary_loss_mlp": 0.01060337, + "balance_loss_clip": 1.02278662, + "balance_loss_mlp": 1.02274776, + "epoch": 0.22504133473620924, + "flos": 24022652549760.0, + "grad_norm": 1.7928813574301303, + "language_loss": 0.82633126, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84768671, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5234375, + "step": 3743, + "time_per_iteration": 2.424997329711914 + }, + { + "auxiliary_loss_clip": 0.01074838, + "auxiliary_loss_mlp": 0.01050078, + "balance_loss_clip": 1.01472104, + "balance_loss_mlp": 1.0217067, + "epoch": 0.2251014579888772, + "flos": 16763022437760.0, + "grad_norm": 2.758491594045999, + "language_loss": 0.64393544, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66518462, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53125, + "step": 3744, + "time_per_iteration": 2.356741189956665 + }, + { + "auxiliary_loss_clip": 0.01073481, + "auxiliary_loss_mlp": 0.01054688, + "balance_loss_clip": 1.01787663, + "balance_loss_mlp": 1.02020311, + "epoch": 0.22516158124154517, + "flos": 13990150569600.0, + "grad_norm": 3.0169022852405827, + "language_loss": 0.78888905, + "learning_rate": 3.613581408598489e-06, + "loss": 0.81017077, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.53125, + "step": 3745, + "time_per_iteration": 2.3743629455566406 + }, + { + "auxiliary_loss_clip": 0.01075923, + "auxiliary_loss_mlp": 0.01057689, + "balance_loss_clip": 1.02099681, + "balance_loss_mlp": 1.02213264, + "epoch": 0.22522170449421314, + "flos": 14389207943040.0, + "grad_norm": 1.9753480622855957, + "language_loss": 0.81916124, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.84049731, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5390625, + "step": 3746, + "time_per_iteration": 2.368948221206665 + }, + { + "auxiliary_loss_clip": 0.01074352, + "auxiliary_loss_mlp": 0.01058549, + "balance_loss_clip": 1.0218569, + "balance_loss_mlp": 1.01998925, + "epoch": 0.2252818277468811, + "flos": 23804410440960.0, + "grad_norm": 2.642852693434061, + "language_loss": 0.87288272, + "learning_rate": 3.613121069229862e-06, + "loss": 0.89421177, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.54296875, + "step": 3747, + "time_per_iteration": 2.4143056869506836 + }, + { + "auxiliary_loss_clip": 0.01075121, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.01740003, + "balance_loss_mlp": 1.02112603, + "epoch": 0.22534195099954907, + "flos": 24716865490560.0, + "grad_norm": 1.7737672003025267, + "language_loss": 0.78601527, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.80728143, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5390625, + "step": 3748, + "time_per_iteration": 2.450960159301758 + }, + { + "auxiliary_loss_clip": 0.01078356, + "auxiliary_loss_mlp": 0.01059888, + "balance_loss_clip": 1.02331495, + "balance_loss_mlp": 1.02280903, + "epoch": 0.22540207425221703, + "flos": 21031294193280.0, + "grad_norm": 2.0841925296530346, + "language_loss": 0.81185114, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.83323359, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5546875, + "step": 3749, + "time_per_iteration": 2.4315288066864014 + }, + { + "auxiliary_loss_clip": 0.01072968, + "auxiliary_loss_mlp": 0.01053781, + "balance_loss_clip": 1.01837623, + "balance_loss_mlp": 1.02127957, + "epoch": 0.22546219750488503, + "flos": 19389363863040.0, + "grad_norm": 1.7099897909663124, + "language_loss": 0.80889589, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.83016342, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.515625, + "step": 3750, + "time_per_iteration": 2.402207374572754 + }, + { + "auxiliary_loss_clip": 0.01077923, + "auxiliary_loss_mlp": 0.01060427, + "balance_loss_clip": 1.02137411, + "balance_loss_mlp": 1.02212501, + "epoch": 0.225522320757553, + "flos": 25191439868160.0, + "grad_norm": 1.99409046581492, + "language_loss": 0.84323943, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.86462289, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.55859375, + "step": 3751, + "time_per_iteration": 2.462245464324951 + }, + { + "auxiliary_loss_clip": 0.01077869, + "auxiliary_loss_mlp": 0.01056406, + "balance_loss_clip": 1.01718593, + "balance_loss_mlp": 1.02190292, + "epoch": 0.22558244401022096, + "flos": 17162219456640.0, + "grad_norm": 1.9290964690679395, + "language_loss": 0.85115117, + "learning_rate": 3.611969150491165e-06, + "loss": 0.87249386, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.55859375, + "step": 3752, + "time_per_iteration": 2.3641231060028076 + }, + { + "auxiliary_loss_clip": 0.01074642, + "auxiliary_loss_mlp": 0.01056794, + "balance_loss_clip": 1.02157927, + "balance_loss_mlp": 1.02121472, + "epoch": 0.22564256726288892, + "flos": 15230125883520.0, + "grad_norm": 1.9468173262183865, + "language_loss": 0.79427242, + "learning_rate": 3.611738583330375e-06, + "loss": 0.81558681, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53515625, + "step": 3753, + "time_per_iteration": 2.3877739906311035 + }, + { + "auxiliary_loss_clip": 0.01074104, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02283669, + "balance_loss_mlp": 1.02077866, + "epoch": 0.2257026905155569, + "flos": 34567225574400.0, + "grad_norm": 1.761120256880445, + "language_loss": 0.80148923, + "learning_rate": 3.611507955052295e-06, + "loss": 0.82284135, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.53125, + "step": 3754, + "time_per_iteration": 2.5627999305725098 + }, + { + "auxiliary_loss_clip": 0.01076014, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.02265632, + "balance_loss_mlp": 1.0229336, + "epoch": 0.22576281376822485, + "flos": 19937395474560.0, + "grad_norm": 1.8309953566097916, + "language_loss": 0.71505678, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.73644328, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.53125, + "step": 3755, + "time_per_iteration": 2.475804090499878 + }, + { + "auxiliary_loss_clip": 0.01080936, + "auxiliary_loss_mlp": 0.01063887, + "balance_loss_clip": 1.02166355, + "balance_loss_mlp": 1.02398503, + "epoch": 0.22582293702089282, + "flos": 24601023999360.0, + "grad_norm": 2.3243880735699665, + "language_loss": 0.79018605, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.8116343, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.5703125, + "step": 3756, + "time_per_iteration": 2.4097156524658203 + }, + { + "auxiliary_loss_clip": 0.01078497, + "auxiliary_loss_mlp": 0.01058682, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.02217853, + "epoch": 0.2258830602735608, + "flos": 23034436116480.0, + "grad_norm": 1.8030275551302184, + "language_loss": 0.84123921, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.86261106, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.5625, + "step": 3757, + "time_per_iteration": 2.391437530517578 + }, + { + "auxiliary_loss_clip": 0.01079585, + "auxiliary_loss_mlp": 0.01057481, + "balance_loss_clip": 1.01556683, + "balance_loss_mlp": 1.02355886, + "epoch": 0.22594318352622877, + "flos": 22157487786240.0, + "grad_norm": 1.7585797077820955, + "language_loss": 0.74257922, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.76394987, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.5625, + "step": 3758, + "time_per_iteration": 2.3796911239624023 + }, + { + "auxiliary_loss_clip": 0.01078886, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.02268088, + "epoch": 0.22600330677889674, + "flos": 20593273875840.0, + "grad_norm": 2.2120709841464894, + "language_loss": 0.78754312, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.8089835, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.5625, + "step": 3759, + "time_per_iteration": 2.366210460662842 + }, + { + "auxiliary_loss_clip": 0.01077351, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.01825571, + "balance_loss_mlp": 1.02076125, + "epoch": 0.2260634300315647, + "flos": 35658436118400.0, + "grad_norm": 1.6400533011059164, + "language_loss": 0.79728132, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.81866109, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.56640625, + "step": 3760, + "time_per_iteration": 2.527611255645752 + }, + { + "auxiliary_loss_clip": 0.0102277, + "auxiliary_loss_mlp": 0.01012322, + "balance_loss_clip": 1.00726736, + "balance_loss_mlp": 1.00644112, + "epoch": 0.22612355328423267, + "flos": 72087579699840.0, + "grad_norm": 0.9653545928513323, + "language_loss": 0.60138416, + "learning_rate": 3.609891846556569e-06, + "loss": 0.6217351, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.16308594, + "step": 3761, + "time_per_iteration": 2.967061758041382 + }, + { + "auxiliary_loss_clip": 0.01081928, + "auxiliary_loss_mlp": 0.01066662, + "balance_loss_clip": 1.02169621, + "balance_loss_mlp": 1.02362823, + "epoch": 0.22618367653690064, + "flos": 22782677235840.0, + "grad_norm": 2.099608677852756, + "language_loss": 0.79281557, + "learning_rate": 3.609660729655211e-06, + "loss": 0.81430149, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 0.58203125, + "step": 3762, + "time_per_iteration": 2.4093644618988037 + }, + { + "auxiliary_loss_clip": 0.0108189, + "auxiliary_loss_mlp": 0.01066698, + "balance_loss_clip": 1.02425909, + "balance_loss_mlp": 1.02402282, + "epoch": 0.22624379978956863, + "flos": 20447232192000.0, + "grad_norm": 2.1827158605893775, + "language_loss": 0.80767208, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.82915801, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.578125, + "step": 3763, + "time_per_iteration": 2.389605760574341 + }, + { + "auxiliary_loss_clip": 0.01081098, + "auxiliary_loss_mlp": 0.0107746, + "balance_loss_clip": 1.03158855, + "balance_loss_mlp": 1.02301133, + "epoch": 0.2263039230422366, + "flos": 17493335591040.0, + "grad_norm": 1.748025847668226, + "language_loss": 0.92769641, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.94928199, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 0.58203125, + "step": 3764, + "time_per_iteration": 2.4005534648895264 + }, + { + "auxiliary_loss_clip": 0.01078018, + "auxiliary_loss_mlp": 0.01071112, + "balance_loss_clip": 1.03198719, + "balance_loss_mlp": 1.02362823, + "epoch": 0.22636404629490456, + "flos": 28328490794880.0, + "grad_norm": 2.4298426257038908, + "language_loss": 0.76489854, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.78638983, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.54296875, + "step": 3765, + "time_per_iteration": 2.4509835243225098 + }, + { + "auxiliary_loss_clip": 0.0107913, + "auxiliary_loss_mlp": 0.01072647, + "balance_loss_clip": 1.0311389, + "balance_loss_mlp": 1.0232811, + "epoch": 0.22642416954757252, + "flos": 17488308355200.0, + "grad_norm": 3.6673798661362746, + "language_loss": 0.92181659, + "learning_rate": 3.608735651752494e-06, + "loss": 0.94333434, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.55859375, + "step": 3766, + "time_per_iteration": 2.4195759296417236 + }, + { + "auxiliary_loss_clip": 0.01074924, + "auxiliary_loss_mlp": 0.01071165, + "balance_loss_clip": 1.02956128, + "balance_loss_mlp": 1.02158248, + "epoch": 0.2264842928002405, + "flos": 24383515029120.0, + "grad_norm": 1.5377269365882533, + "language_loss": 0.76294994, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.78441083, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.53515625, + "step": 3767, + "time_per_iteration": 2.4248480796813965 + }, + { + "auxiliary_loss_clip": 0.01077395, + "auxiliary_loss_mlp": 0.01067837, + "balance_loss_clip": 1.02473116, + "balance_loss_mlp": 1.02135706, + "epoch": 0.22654441605290845, + "flos": 19829443950720.0, + "grad_norm": 1.5689595315607066, + "language_loss": 0.72895324, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.75040555, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.5625, + "step": 3768, + "time_per_iteration": 2.424954891204834 + }, + { + "auxiliary_loss_clip": 0.01078614, + "auxiliary_loss_mlp": 0.01073538, + "balance_loss_clip": 1.03362703, + "balance_loss_mlp": 1.02361655, + "epoch": 0.22660453930557642, + "flos": 27453322944000.0, + "grad_norm": 1.7506208268526953, + "language_loss": 0.80012155, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.82164311, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.55078125, + "step": 3769, + "time_per_iteration": 2.4324588775634766 + }, + { + "auxiliary_loss_clip": 0.010791, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_clip": 1.02346492, + "balance_loss_mlp": 1.02205062, + "epoch": 0.2266646625582444, + "flos": 23987006184960.0, + "grad_norm": 1.7148365973901927, + "language_loss": 0.70762956, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.7290653, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.5703125, + "step": 3770, + "time_per_iteration": 2.4687676429748535 + }, + { + "auxiliary_loss_clip": 0.01080765, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.03230202, + "balance_loss_mlp": 1.02441049, + "epoch": 0.22672478581091238, + "flos": 26026946547840.0, + "grad_norm": 1.4973181902171637, + "language_loss": 0.81425035, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.83577543, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5625, + "step": 3771, + "time_per_iteration": 2.4268112182617188 + }, + { + "auxiliary_loss_clip": 0.01077122, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_clip": 1.02197647, + "balance_loss_mlp": 1.02422047, + "epoch": 0.22678490906358034, + "flos": 23840685210240.0, + "grad_norm": 1.7325538981947635, + "language_loss": 0.79400682, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81538135, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.52734375, + "step": 3772, + "time_per_iteration": 3.8738229274749756 + }, + { + "auxiliary_loss_clip": 0.0102236, + "auxiliary_loss_mlp": 0.01016896, + "balance_loss_clip": 1.01136458, + "balance_loss_mlp": 1.00635552, + "epoch": 0.2268450323162483, + "flos": 65044409351040.0, + "grad_norm": 0.6596604093680121, + "language_loss": 0.54462755, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56502008, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.05541992, + "router_z_loss_mlp": 0.16015625, + "step": 3773, + "time_per_iteration": 3.114429473876953 + }, + { + "auxiliary_loss_clip": 0.01077899, + "auxiliary_loss_mlp": 0.01058079, + "balance_loss_clip": 1.02019453, + "balance_loss_mlp": 1.02411282, + "epoch": 0.22690515556891627, + "flos": 22525053246720.0, + "grad_norm": 1.7139651221697103, + "language_loss": 0.71583951, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.73719925, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5390625, + "step": 3774, + "time_per_iteration": 4.066363573074341 + }, + { + "auxiliary_loss_clip": 0.01079371, + "auxiliary_loss_mlp": 0.01061415, + "balance_loss_clip": 1.02207589, + "balance_loss_mlp": 1.02534449, + "epoch": 0.22696527882158424, + "flos": 18222461758080.0, + "grad_norm": 2.00572599222934, + "language_loss": 0.7539171, + "learning_rate": 3.606650658627658e-06, + "loss": 0.775325, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.5390625, + "step": 3775, + "time_per_iteration": 3.7913928031921387 + }, + { + "auxiliary_loss_clip": 0.01080676, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.01606894, + "balance_loss_mlp": 1.02507305, + "epoch": 0.22702540207425223, + "flos": 17018307365760.0, + "grad_norm": 2.018726172943752, + "language_loss": 0.84216273, + "learning_rate": 3.606418687985928e-06, + "loss": 0.86350924, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5546875, + "step": 3776, + "time_per_iteration": 3.7953555583953857 + }, + { + "auxiliary_loss_clip": 0.01080264, + "auxiliary_loss_mlp": 0.0106147, + "balance_loss_clip": 1.02117705, + "balance_loss_mlp": 1.0243696, + "epoch": 0.2270855253269202, + "flos": 21324634369920.0, + "grad_norm": 2.144313083575548, + "language_loss": 0.84432113, + "learning_rate": 3.606186656428641e-06, + "loss": 0.86573851, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5546875, + "step": 3777, + "time_per_iteration": 2.4127323627471924 + }, + { + "auxiliary_loss_clip": 0.01080335, + "auxiliary_loss_mlp": 0.01064941, + "balance_loss_clip": 1.02836788, + "balance_loss_mlp": 1.02578938, + "epoch": 0.22714564857958816, + "flos": 23549334981120.0, + "grad_norm": 1.8445656371394266, + "language_loss": 0.74792439, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.76937711, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.546875, + "step": 3778, + "time_per_iteration": 2.4186604022979736 + }, + { + "auxiliary_loss_clip": 0.01079663, + "auxiliary_loss_mlp": 0.01065255, + "balance_loss_clip": 1.02727509, + "balance_loss_mlp": 1.02390528, + "epoch": 0.22720577183225613, + "flos": 25988821476480.0, + "grad_norm": 2.479747366543403, + "language_loss": 0.67030466, + "learning_rate": 3.605722410602591e-06, + "loss": 0.69175386, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.55859375, + "step": 3779, + "time_per_iteration": 2.439324140548706 + }, + { + "auxiliary_loss_clip": 0.01077739, + "auxiliary_loss_mlp": 0.01066768, + "balance_loss_clip": 1.03136313, + "balance_loss_mlp": 1.02465129, + "epoch": 0.2272658950849241, + "flos": 20813017173120.0, + "grad_norm": 1.735996973023303, + "language_loss": 0.72194207, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7433871, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53125, + "step": 3780, + "time_per_iteration": 2.392807960510254 + }, + { + "auxiliary_loss_clip": 0.01078731, + "auxiliary_loss_mlp": 0.01068096, + "balance_loss_clip": 1.02906656, + "balance_loss_mlp": 1.02450752, + "epoch": 0.22732601833759206, + "flos": 23908347158400.0, + "grad_norm": 2.6206620092282016, + "language_loss": 0.90543306, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.92690134, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.54296875, + "step": 3781, + "time_per_iteration": 2.4279842376708984 + }, + { + "auxiliary_loss_clip": 0.01077243, + "auxiliary_loss_mlp": 0.0107685, + "balance_loss_clip": 1.03860784, + "balance_loss_mlp": 1.02314115, + "epoch": 0.22738614159026002, + "flos": 15923500951680.0, + "grad_norm": 2.370614931144449, + "language_loss": 0.76325476, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.78479564, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5390625, + "step": 3782, + "time_per_iteration": 2.3630990982055664 + }, + { + "auxiliary_loss_clip": 0.01072768, + "auxiliary_loss_mlp": 0.01063222, + "balance_loss_clip": 1.03053451, + "balance_loss_mlp": 1.02091706, + "epoch": 0.22744626484292801, + "flos": 24204410421120.0, + "grad_norm": 1.4296595950945508, + "language_loss": 0.83800763, + "learning_rate": 3.604793188351095e-06, + "loss": 0.85936755, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.51953125, + "step": 3783, + "time_per_iteration": 2.449126958847046 + }, + { + "auxiliary_loss_clip": 0.01074811, + "auxiliary_loss_mlp": 0.01079996, + "balance_loss_clip": 1.04041862, + "balance_loss_mlp": 1.02110767, + "epoch": 0.22750638809559598, + "flos": 24790427458560.0, + "grad_norm": 11.897368688239734, + "language_loss": 0.77707493, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.79862297, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.53515625, + "step": 3784, + "time_per_iteration": 2.4196720123291016 + }, + { + "auxiliary_loss_clip": 0.01072876, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.02802753, + "balance_loss_mlp": 1.02039123, + "epoch": 0.22756651134826394, + "flos": 22235413674240.0, + "grad_norm": 1.570363928891433, + "language_loss": 0.71642983, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73778939, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.52734375, + "step": 3785, + "time_per_iteration": 2.3873963356018066 + }, + { + "auxiliary_loss_clip": 0.01022912, + "auxiliary_loss_mlp": 0.01137434, + "balance_loss_clip": 1.1314497, + "balance_loss_mlp": 1.00831246, + "epoch": 0.2276266346009319, + "flos": 62704006894080.0, + "grad_norm": 0.9088014436395508, + "language_loss": 0.6195038, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.64110726, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.05981445, + "router_z_loss_mlp": 0.14648438, + "step": 3786, + "time_per_iteration": 3.0614724159240723 + }, + { + "auxiliary_loss_clip": 0.0108226, + "auxiliary_loss_mlp": 0.01060798, + "balance_loss_clip": 1.02217436, + "balance_loss_mlp": 1.0263077, + "epoch": 0.22768675785359987, + "flos": 18613245139200.0, + "grad_norm": 2.7531578060162145, + "language_loss": 0.88978052, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.91121113, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.55859375, + "step": 3787, + "time_per_iteration": 2.4823479652404785 + }, + { + "auxiliary_loss_clip": 0.01079054, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.02144182, + "balance_loss_mlp": 1.02721858, + "epoch": 0.22774688110626784, + "flos": 26868981651840.0, + "grad_norm": 1.3316552857065989, + "language_loss": 0.74358332, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.76492661, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.51953125, + "step": 3788, + "time_per_iteration": 2.4530677795410156 + }, + { + "auxiliary_loss_clip": 0.01083123, + "auxiliary_loss_mlp": 0.01061204, + "balance_loss_clip": 1.02510715, + "balance_loss_mlp": 1.02870059, + "epoch": 0.2278070043589358, + "flos": 15552863291520.0, + "grad_norm": 2.2482902875850375, + "language_loss": 0.69660223, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.71804541, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.546875, + "step": 3789, + "time_per_iteration": 2.5739314556121826 + }, + { + "auxiliary_loss_clip": 0.01083261, + "auxiliary_loss_mlp": 0.01075228, + "balance_loss_clip": 1.0384872, + "balance_loss_mlp": 1.03025842, + "epoch": 0.2278671276116038, + "flos": 22415775091200.0, + "grad_norm": 1.9028082488207625, + "language_loss": 0.7663188, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78790361, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.53125, + "step": 3790, + "time_per_iteration": 2.5688347816467285 + }, + { + "auxiliary_loss_clip": 0.01082448, + "auxiliary_loss_mlp": 0.01078365, + "balance_loss_clip": 1.04400921, + "balance_loss_mlp": 1.0302856, + "epoch": 0.22792725086427176, + "flos": 20630316695040.0, + "grad_norm": 1.9588276168581793, + "language_loss": 0.92733955, + "learning_rate": 3.602931823424522e-06, + "loss": 0.94894767, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5234375, + "step": 3791, + "time_per_iteration": 2.4066824913024902 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.01082148, + "balance_loss_clip": 1.04581261, + "balance_loss_mlp": 1.0305692, + "epoch": 0.22798737411693973, + "flos": 31427661029760.0, + "grad_norm": 1.9173723442049408, + "language_loss": 0.84493244, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.86660421, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.54296875, + "step": 3792, + "time_per_iteration": 2.6513619422912598 + }, + { + "auxiliary_loss_clip": 0.01052317, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_clip": 1.09550703, + "balance_loss_mlp": 1.03480268, + "epoch": 0.2280474973696077, + "flos": 52394121095040.0, + "grad_norm": 1.1981163944147672, + "language_loss": 0.65805137, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67958331, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.17578125, + "step": 3793, + "time_per_iteration": 2.80916166305542 + }, + { + "auxiliary_loss_clip": 0.0108691, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.06173587, + "balance_loss_mlp": 1.02894855, + "epoch": 0.22810762062227566, + "flos": 26394861121920.0, + "grad_norm": 1.8279853037267464, + "language_loss": 0.79174948, + "learning_rate": 3.602232808409293e-06, + "loss": 0.81362718, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.578125, + "step": 3794, + "time_per_iteration": 2.4345741271972656 + }, + { + "auxiliary_loss_clip": 0.01080164, + "auxiliary_loss_mlp": 0.01068714, + "balance_loss_clip": 1.03488255, + "balance_loss_mlp": 1.02650511, + "epoch": 0.22816774387494362, + "flos": 25629076160640.0, + "grad_norm": 1.9275908242812172, + "language_loss": 0.8207283, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.84221709, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.53515625, + "step": 3795, + "time_per_iteration": 2.4776947498321533 + }, + { + "auxiliary_loss_clip": 0.0107757, + "auxiliary_loss_mlp": 0.01079634, + "balance_loss_clip": 1.04401445, + "balance_loss_mlp": 1.02432489, + "epoch": 0.22822786712761162, + "flos": 22450618494720.0, + "grad_norm": 2.495370410469278, + "language_loss": 0.78674644, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.8083185, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.53125, + "step": 3796, + "time_per_iteration": 2.4067163467407227 + }, + { + "auxiliary_loss_clip": 0.01077288, + "auxiliary_loss_mlp": 0.01066944, + "balance_loss_clip": 1.03492391, + "balance_loss_mlp": 1.02462089, + "epoch": 0.22828799038027958, + "flos": 12201759619200.0, + "grad_norm": 2.4594310372463206, + "language_loss": 0.97052014, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.99196243, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.52734375, + "step": 3797, + "time_per_iteration": 2.412616491317749 + }, + { + "auxiliary_loss_clip": 0.0107689, + "auxiliary_loss_mlp": 0.01064951, + "balance_loss_clip": 1.03028536, + "balance_loss_mlp": 1.0254519, + "epoch": 0.22834811363294755, + "flos": 22084763690880.0, + "grad_norm": 2.1766701571844163, + "language_loss": 0.82676709, + "learning_rate": 3.601299937834666e-06, + "loss": 0.84818554, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.515625, + "step": 3798, + "time_per_iteration": 2.506627082824707 + }, + { + "auxiliary_loss_clip": 0.0108, + "auxiliary_loss_mlp": 0.01061674, + "balance_loss_clip": 1.02419424, + "balance_loss_mlp": 1.02669489, + "epoch": 0.2284082368856155, + "flos": 24859555683840.0, + "grad_norm": 1.8327463962727781, + "language_loss": 0.80652761, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.8279444, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.53125, + "step": 3799, + "time_per_iteration": 2.5439326763153076 + }, + { + "auxiliary_loss_clip": 0.01082891, + "auxiliary_loss_mlp": 0.01053404, + "balance_loss_clip": 1.02004957, + "balance_loss_mlp": 1.02948916, + "epoch": 0.22846836013828348, + "flos": 23291815726080.0, + "grad_norm": 1.7479429438928942, + "language_loss": 0.77442139, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.79578435, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.53515625, + "step": 3800, + "time_per_iteration": 2.574922800064087 + }, + { + "auxiliary_loss_clip": 0.01083571, + "auxiliary_loss_mlp": 0.01051859, + "balance_loss_clip": 1.01964855, + "balance_loss_mlp": 1.03085637, + "epoch": 0.22852848339095144, + "flos": 27415093138560.0, + "grad_norm": 1.9179511185760247, + "language_loss": 0.65726745, + "learning_rate": 3.600599647297484e-06, + "loss": 0.67862177, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.52734375, + "step": 3801, + "time_per_iteration": 2.635030508041382 + }, + { + "auxiliary_loss_clip": 0.01087872, + "auxiliary_loss_mlp": 0.01054117, + "balance_loss_clip": 1.02266932, + "balance_loss_mlp": 1.03665161, + "epoch": 0.2285886066436194, + "flos": 26320007433600.0, + "grad_norm": 1.5476234461322504, + "language_loss": 0.81905222, + "learning_rate": 3.60036609571682e-06, + "loss": 0.8404721, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.51171875, + "step": 3802, + "time_per_iteration": 2.5267748832702637 + }, + { + "auxiliary_loss_clip": 0.01089294, + "auxiliary_loss_mlp": 0.01071616, + "balance_loss_clip": 1.03258693, + "balance_loss_mlp": 1.03399134, + "epoch": 0.2286487298962874, + "flos": 29715171108480.0, + "grad_norm": 2.0171383866546484, + "language_loss": 0.79933929, + "learning_rate": 3.600132483450114e-06, + "loss": 0.82094842, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5546875, + "step": 3803, + "time_per_iteration": 2.535651445388794 + }, + { + "auxiliary_loss_clip": 0.01091819, + "auxiliary_loss_mlp": 0.0105943, + "balance_loss_clip": 1.02302325, + "balance_loss_mlp": 1.03698051, + "epoch": 0.22870885314895537, + "flos": 21286160184960.0, + "grad_norm": 2.1772108885169725, + "language_loss": 0.86666542, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.88817799, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.546875, + "step": 3804, + "time_per_iteration": 2.467409372329712 + }, + { + "auxiliary_loss_clip": 0.01091565, + "auxiliary_loss_mlp": 0.01064342, + "balance_loss_clip": 1.02822185, + "balance_loss_mlp": 1.03576231, + "epoch": 0.22876897640162333, + "flos": 14938566186240.0, + "grad_norm": 2.046465337724459, + "language_loss": 0.78884363, + "learning_rate": 3.59966507689401e-06, + "loss": 0.81040269, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.55859375, + "step": 3805, + "time_per_iteration": 2.3974766731262207 + }, + { + "auxiliary_loss_clip": 0.01091523, + "auxiliary_loss_mlp": 0.01059099, + "balance_loss_clip": 1.02166772, + "balance_loss_mlp": 1.03579283, + "epoch": 0.2288290996542913, + "flos": 18112939223040.0, + "grad_norm": 5.759009039180327, + "language_loss": 0.81158751, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.83309376, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.55859375, + "step": 3806, + "time_per_iteration": 2.3993659019470215 + }, + { + "auxiliary_loss_clip": 0.01093333, + "auxiliary_loss_mlp": 0.01067699, + "balance_loss_clip": 1.03365278, + "balance_loss_mlp": 1.03891003, + "epoch": 0.22888922290695926, + "flos": 39853983778560.0, + "grad_norm": 3.435165574401829, + "language_loss": 0.72109228, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.7427026, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.546875, + "step": 3807, + "time_per_iteration": 2.6026110649108887 + }, + { + "auxiliary_loss_clip": 0.01093624, + "auxiliary_loss_mlp": 0.01059472, + "balance_loss_clip": 1.02177811, + "balance_loss_mlp": 1.03782499, + "epoch": 0.22894934615962723, + "flos": 23402664892800.0, + "grad_norm": 2.2443196715142735, + "language_loss": 0.6716392, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.69317019, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5546875, + "step": 3808, + "time_per_iteration": 2.4397480487823486 + }, + { + "auxiliary_loss_clip": 0.01090073, + "auxiliary_loss_mlp": 0.01056719, + "balance_loss_clip": 1.02150416, + "balance_loss_mlp": 1.0351932, + "epoch": 0.22900946941229522, + "flos": 18842030478720.0, + "grad_norm": 2.1241815276886, + "language_loss": 0.76037002, + "learning_rate": 3.598729535939222e-06, + "loss": 0.78183794, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3809, + "time_per_iteration": 2.4108328819274902 + }, + { + "auxiliary_loss_clip": 0.01087021, + "auxiliary_loss_mlp": 0.01063797, + "balance_loss_clip": 1.02946448, + "balance_loss_mlp": 1.03298903, + "epoch": 0.22906959266496318, + "flos": 22928299983360.0, + "grad_norm": 1.5071166635324513, + "language_loss": 0.82967126, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.85117948, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5390625, + "step": 3810, + "time_per_iteration": 2.4223828315734863 + }, + { + "auxiliary_loss_clip": 0.01080442, + "auxiliary_loss_mlp": 0.01055736, + "balance_loss_clip": 1.02216661, + "balance_loss_mlp": 1.02760851, + "epoch": 0.22912971591763115, + "flos": 19353508030080.0, + "grad_norm": 2.2565356386753948, + "language_loss": 0.79861641, + "learning_rate": 3.598261401682441e-06, + "loss": 0.81997824, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.52734375, + "step": 3811, + "time_per_iteration": 3.8563482761383057 + }, + { + "auxiliary_loss_clip": 0.01083473, + "auxiliary_loss_mlp": 0.01066743, + "balance_loss_clip": 1.02976441, + "balance_loss_mlp": 1.02873874, + "epoch": 0.22918983917029911, + "flos": 19932647529600.0, + "grad_norm": 1.6796878744160415, + "language_loss": 0.83922154, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.86072367, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.546875, + "step": 3812, + "time_per_iteration": 2.393127202987671 + }, + { + "auxiliary_loss_clip": 0.01083805, + "auxiliary_loss_mlp": 0.01059066, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.02780664, + "epoch": 0.22924996242296708, + "flos": 16689949228800.0, + "grad_norm": 2.5196303163533735, + "language_loss": 0.85880673, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.88023543, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.55859375, + "step": 3813, + "time_per_iteration": 2.473724126815796 + }, + { + "auxiliary_loss_clip": 0.01077553, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.02354074, + "epoch": 0.22931008567563504, + "flos": 33034782867840.0, + "grad_norm": 1.5347605163130804, + "language_loss": 0.71178246, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.73314559, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5390625, + "step": 3814, + "time_per_iteration": 3.988420248031616 + }, + { + "auxiliary_loss_clip": 0.01073868, + "auxiliary_loss_mlp": 0.01057143, + "balance_loss_clip": 1.02085614, + "balance_loss_mlp": 1.02183843, + "epoch": 0.229370208928303, + "flos": 23329591683840.0, + "grad_norm": 2.674126582663077, + "language_loss": 0.68373144, + "learning_rate": 3.597324405965139e-06, + "loss": 0.70504153, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51953125, + "step": 3815, + "time_per_iteration": 5.24737548828125 + }, + { + "auxiliary_loss_clip": 0.01078775, + "auxiliary_loss_mlp": 0.01077661, + "balance_loss_clip": 1.03834581, + "balance_loss_mlp": 1.02509201, + "epoch": 0.229430332180971, + "flos": 28616070597120.0, + "grad_norm": 1.5676399597643875, + "language_loss": 0.84253085, + "learning_rate": 3.597090005586848e-06, + "loss": 0.86409521, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5390625, + "step": 3816, + "time_per_iteration": 2.4860639572143555 + }, + { + "auxiliary_loss_clip": 0.01076402, + "auxiliary_loss_mlp": 0.01059769, + "balance_loss_clip": 1.02271843, + "balance_loss_mlp": 1.02305841, + "epoch": 0.22949045543363897, + "flos": 17237247701760.0, + "grad_norm": 2.5632249904248177, + "language_loss": 0.88786429, + "learning_rate": 3.596855544646742e-06, + "loss": 0.909226, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.53515625, + "step": 3817, + "time_per_iteration": 2.395220994949341 + }, + { + "auxiliary_loss_clip": 0.01082759, + "auxiliary_loss_mlp": 0.01058699, + "balance_loss_clip": 1.0239141, + "balance_loss_mlp": 1.02735281, + "epoch": 0.22955057868630693, + "flos": 27488236170240.0, + "grad_norm": 1.6143322631932495, + "language_loss": 0.76532567, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.78674024, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5546875, + "step": 3818, + "time_per_iteration": 2.4864964485168457 + }, + { + "auxiliary_loss_clip": 0.0108143, + "auxiliary_loss_mlp": 0.01057728, + "balance_loss_clip": 1.02034402, + "balance_loss_mlp": 1.02562475, + "epoch": 0.2296107019389749, + "flos": 23475319165440.0, + "grad_norm": 1.6822920567235238, + "language_loss": 0.76447725, + "learning_rate": 3.596386441116659e-06, + "loss": 0.78586888, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.55859375, + "step": 3819, + "time_per_iteration": 2.433497905731201 + }, + { + "auxiliary_loss_clip": 0.0108093, + "auxiliary_loss_mlp": 0.01054437, + "balance_loss_clip": 1.02110624, + "balance_loss_mlp": 1.02717686, + "epoch": 0.22967082519164286, + "flos": 31283818761600.0, + "grad_norm": 1.7059456985632468, + "language_loss": 0.81796867, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83932233, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5390625, + "step": 3820, + "time_per_iteration": 2.4901673793792725 + }, + { + "auxiliary_loss_clip": 0.01083998, + "auxiliary_loss_mlp": 0.01057688, + "balance_loss_clip": 1.0202086, + "balance_loss_mlp": 1.02829921, + "epoch": 0.22973094844431083, + "flos": 14642188721280.0, + "grad_norm": 2.2725665362968512, + "language_loss": 0.71303183, + "learning_rate": 3.595917095446042e-06, + "loss": 0.73444867, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.55859375, + "step": 3821, + "time_per_iteration": 2.3944218158721924 + }, + { + "auxiliary_loss_clip": 0.01080394, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.01527905, + "balance_loss_mlp": 1.02829242, + "epoch": 0.2297910716969788, + "flos": 22822652609280.0, + "grad_norm": 1.4576916618847127, + "language_loss": 0.84348977, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.8647691, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.51953125, + "step": 3822, + "time_per_iteration": 2.4481804370880127 + }, + { + "auxiliary_loss_clip": 0.01082642, + "auxiliary_loss_mlp": 0.01050669, + "balance_loss_clip": 1.01566875, + "balance_loss_mlp": 1.02915716, + "epoch": 0.2298511949496468, + "flos": 23037927252480.0, + "grad_norm": 1.4446168406178788, + "language_loss": 0.67887318, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.70020628, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.53515625, + "step": 3823, + "time_per_iteration": 2.4602413177490234 + }, + { + "auxiliary_loss_clip": 0.01044774, + "auxiliary_loss_mlp": 0.01018885, + "balance_loss_clip": 1.01485598, + "balance_loss_mlp": 1.02918291, + "epoch": 0.22991131820231475, + "flos": 66887684691840.0, + "grad_norm": 0.8946945510217301, + "language_loss": 0.56779897, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58843565, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.15625, + "step": 3824, + "time_per_iteration": 3.1251044273376465 + }, + { + "auxiliary_loss_clip": 0.01083086, + "auxiliary_loss_mlp": 0.01062631, + "balance_loss_clip": 1.02827525, + "balance_loss_mlp": 1.03049159, + "epoch": 0.22997144145498272, + "flos": 17886492944640.0, + "grad_norm": 2.196981468235083, + "language_loss": 0.75365782, + "learning_rate": 3.594977677968009e-06, + "loss": 0.77511501, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5234375, + "step": 3825, + "time_per_iteration": 2.3993124961853027 + }, + { + "auxiliary_loss_clip": 0.01086892, + "auxiliary_loss_mlp": 0.01072959, + "balance_loss_clip": 1.03559852, + "balance_loss_mlp": 1.03190875, + "epoch": 0.23003156470765068, + "flos": 24675807864960.0, + "grad_norm": 1.7754565771592172, + "language_loss": 0.89169484, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.91329336, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.55078125, + "step": 3826, + "time_per_iteration": 2.472381830215454 + }, + { + "auxiliary_loss_clip": 0.01085286, + "auxiliary_loss_mlp": 0.01069287, + "balance_loss_clip": 1.03183174, + "balance_loss_mlp": 1.02884078, + "epoch": 0.23009168796031865, + "flos": 15813245278080.0, + "grad_norm": 2.2254702792978898, + "language_loss": 0.83982635, + "learning_rate": 3.594507606303083e-06, + "loss": 0.86137211, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5625, + "step": 3827, + "time_per_iteration": 2.396425724029541 + }, + { + "auxiliary_loss_clip": 0.01079954, + "auxiliary_loss_mlp": 0.01064881, + "balance_loss_clip": 1.03124022, + "balance_loss_mlp": 1.02825069, + "epoch": 0.2301518112129866, + "flos": 16212023360640.0, + "grad_norm": 1.7662975057166757, + "language_loss": 0.88212711, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.90357548, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.515625, + "step": 3828, + "time_per_iteration": 2.3893747329711914 + }, + { + "auxiliary_loss_clip": 0.01081232, + "auxiliary_loss_mlp": 0.01082532, + "balance_loss_clip": 1.04786634, + "balance_loss_mlp": 1.02656078, + "epoch": 0.2302119344656546, + "flos": 20594391039360.0, + "grad_norm": 1.9307601830250327, + "language_loss": 0.72571397, + "learning_rate": 3.594037292782607e-06, + "loss": 0.74735165, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.546875, + "step": 3829, + "time_per_iteration": 2.4291698932647705 + }, + { + "auxiliary_loss_clip": 0.01077458, + "auxiliary_loss_mlp": 0.01075462, + "balance_loss_clip": 1.04329932, + "balance_loss_mlp": 1.02562404, + "epoch": 0.23027205771832257, + "flos": 26795698974720.0, + "grad_norm": 1.6759985216374191, + "language_loss": 0.8572855, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.87881464, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.51953125, + "step": 3830, + "time_per_iteration": 2.479471206665039 + }, + { + "auxiliary_loss_clip": 0.01076807, + "auxiliary_loss_mlp": 0.01080235, + "balance_loss_clip": 1.04595089, + "balance_loss_mlp": 1.02388465, + "epoch": 0.23033218097099054, + "flos": 43871439260160.0, + "grad_norm": 1.7005254020765854, + "language_loss": 0.68420684, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.70577729, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.52734375, + "step": 3831, + "time_per_iteration": 2.6173925399780273 + }, + { + "auxiliary_loss_clip": 0.01078386, + "auxiliary_loss_mlp": 0.01085482, + "balance_loss_clip": 1.04945683, + "balance_loss_mlp": 1.02323794, + "epoch": 0.2303923042236585, + "flos": 26066468073600.0, + "grad_norm": 3.3710234354371855, + "language_loss": 0.77231652, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.79395521, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.55078125, + "step": 3832, + "time_per_iteration": 2.4454007148742676 + }, + { + "auxiliary_loss_clip": 0.01077886, + "auxiliary_loss_mlp": 0.01072717, + "balance_loss_clip": 1.03628659, + "balance_loss_mlp": 1.02339613, + "epoch": 0.23045242747632647, + "flos": 18295395321600.0, + "grad_norm": 1.7456724105041774, + "language_loss": 0.88826501, + "learning_rate": 3.593095940460389e-06, + "loss": 0.90977097, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.54296875, + "step": 3833, + "time_per_iteration": 2.3832011222839355 + }, + { + "auxiliary_loss_clip": 0.01077375, + "auxiliary_loss_mlp": 0.01073637, + "balance_loss_clip": 1.0368011, + "balance_loss_mlp": 1.02321827, + "epoch": 0.23051255072899443, + "flos": 25519344157440.0, + "grad_norm": 1.6209488467526094, + "language_loss": 0.76802683, + "learning_rate": 3.592860451331624e-06, + "loss": 0.78953695, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.54296875, + "step": 3834, + "time_per_iteration": 2.4406964778900146 + }, + { + "auxiliary_loss_clip": 0.0107803, + "auxiliary_loss_mlp": 0.01071765, + "balance_loss_clip": 1.03287911, + "balance_loss_mlp": 1.0253247, + "epoch": 0.2305726739816624, + "flos": 21214134316800.0, + "grad_norm": 1.8089720173055888, + "language_loss": 0.87640178, + "learning_rate": 3.592624901801432e-06, + "loss": 0.89789969, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.52734375, + "step": 3835, + "time_per_iteration": 2.4242966175079346 + }, + { + "auxiliary_loss_clip": 0.01083558, + "auxiliary_loss_mlp": 0.01074649, + "balance_loss_clip": 1.03530979, + "balance_loss_mlp": 1.02573037, + "epoch": 0.2306327972343304, + "flos": 23330010620160.0, + "grad_norm": 2.120321685173461, + "language_loss": 0.84784555, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.86942762, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.578125, + "step": 3836, + "time_per_iteration": 2.4211487770080566 + }, + { + "auxiliary_loss_clip": 0.0108259, + "auxiliary_loss_mlp": 0.01052278, + "balance_loss_clip": 1.01560903, + "balance_loss_mlp": 1.02748346, + "epoch": 0.23069292048699835, + "flos": 20665718680320.0, + "grad_norm": 1.58680414190843, + "language_loss": 0.81130826, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.83265698, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.55078125, + "step": 3837, + "time_per_iteration": 2.469966173171997 + }, + { + "auxiliary_loss_clip": 0.01027624, + "auxiliary_loss_mlp": 0.01008995, + "balance_loss_clip": 1.00482225, + "balance_loss_mlp": 1.01326168, + "epoch": 0.23075304373966632, + "flos": 70451828680320.0, + "grad_norm": 0.8979738606755284, + "language_loss": 0.65471232, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67507851, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.14355469, + "step": 3838, + "time_per_iteration": 2.972496271133423 + }, + { + "auxiliary_loss_clip": 0.0108291, + "auxiliary_loss_mlp": 0.01059192, + "balance_loss_clip": 1.02104473, + "balance_loss_mlp": 1.02980888, + "epoch": 0.23081316699233428, + "flos": 16617050576640.0, + "grad_norm": 1.8882657201820703, + "language_loss": 0.76819861, + "learning_rate": 3.591682099845058e-06, + "loss": 0.78961968, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.53125, + "step": 3839, + "time_per_iteration": 2.42195463180542 + }, + { + "auxiliary_loss_clip": 0.01085871, + "auxiliary_loss_mlp": 0.01056913, + "balance_loss_clip": 1.01774132, + "balance_loss_mlp": 1.0291419, + "epoch": 0.23087329024500225, + "flos": 13297229349120.0, + "grad_norm": 2.0166163954711616, + "language_loss": 0.70947504, + "learning_rate": 3.591446248441752e-06, + "loss": 0.73090291, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.5703125, + "step": 3840, + "time_per_iteration": 2.3671774864196777 + }, + { + "auxiliary_loss_clip": 0.01086988, + "auxiliary_loss_mlp": 0.01061636, + "balance_loss_clip": 1.01955509, + "balance_loss_mlp": 1.03212357, + "epoch": 0.23093341349767021, + "flos": 17784755642880.0, + "grad_norm": 2.392429705017942, + "language_loss": 0.80433583, + "learning_rate": 3.591210336690645e-06, + "loss": 0.82582206, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.546875, + "step": 3841, + "time_per_iteration": 2.404345989227295 + }, + { + "auxiliary_loss_clip": 0.01081794, + "auxiliary_loss_mlp": 0.01060668, + "balance_loss_clip": 1.02426171, + "balance_loss_mlp": 1.02805543, + "epoch": 0.23099353675033818, + "flos": 23986936362240.0, + "grad_norm": 1.7891897988560148, + "language_loss": 0.84517938, + "learning_rate": 3.590974364600683e-06, + "loss": 0.86660397, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5390625, + "step": 3842, + "time_per_iteration": 2.420278310775757 + }, + { + "auxiliary_loss_clip": 0.01083304, + "auxiliary_loss_mlp": 0.01060791, + "balance_loss_clip": 1.02405024, + "balance_loss_mlp": 1.02943003, + "epoch": 0.23105366000300617, + "flos": 35993601970560.0, + "grad_norm": 1.4771713114810963, + "language_loss": 0.67328256, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.69472349, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5390625, + "step": 3843, + "time_per_iteration": 2.691020965576172 + }, + { + "auxiliary_loss_clip": 0.01079702, + "auxiliary_loss_mlp": 0.0107002, + "balance_loss_clip": 1.03218281, + "balance_loss_mlp": 1.02738094, + "epoch": 0.23111378325567414, + "flos": 31244087767680.0, + "grad_norm": 1.7520987629700062, + "language_loss": 0.79233742, + "learning_rate": 3.590502239439987e-06, + "loss": 0.81383467, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5234375, + "step": 3844, + "time_per_iteration": 2.492827892303467 + }, + { + "auxiliary_loss_clip": 0.01081066, + "auxiliary_loss_mlp": 0.01066393, + "balance_loss_clip": 1.02805495, + "balance_loss_mlp": 1.0264852, + "epoch": 0.2311739065083421, + "flos": 19207221966720.0, + "grad_norm": 1.7027254585797362, + "language_loss": 0.79021549, + "learning_rate": 3.590266086387156e-06, + "loss": 0.81169003, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.546875, + "step": 3845, + "time_per_iteration": 2.4231648445129395 + }, + { + "auxiliary_loss_clip": 0.01074266, + "auxiliary_loss_mlp": 0.01056544, + "balance_loss_clip": 1.01970863, + "balance_loss_mlp": 1.02345395, + "epoch": 0.23123402976101007, + "flos": 23359268206080.0, + "grad_norm": 2.107521785294402, + "language_loss": 0.77725959, + "learning_rate": 3.590029873031276e-06, + "loss": 0.79856771, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5078125, + "step": 3846, + "time_per_iteration": 2.4166083335876465 + }, + { + "auxiliary_loss_clip": 0.0107782, + "auxiliary_loss_mlp": 0.01063569, + "balance_loss_clip": 1.02544558, + "balance_loss_mlp": 1.0240407, + "epoch": 0.23129415301367803, + "flos": 13734516528000.0, + "grad_norm": 15.412235081732916, + "language_loss": 0.72215766, + "learning_rate": 3.589793599381304e-06, + "loss": 0.74357158, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5390625, + "step": 3847, + "time_per_iteration": 2.422774314880371 + }, + { + "auxiliary_loss_clip": 0.01020206, + "auxiliary_loss_mlp": 0.01011304, + "balance_loss_clip": 1.00698876, + "balance_loss_mlp": 1.00599849, + "epoch": 0.231354276266346, + "flos": 69733699591680.0, + "grad_norm": 0.7991716046350351, + "language_loss": 0.61126554, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63158065, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.04321289, + "router_z_loss_mlp": 0.14257812, + "step": 3848, + "time_per_iteration": 2.9412503242492676 + }, + { + "auxiliary_loss_clip": 0.01075455, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_clip": 1.0276283, + "balance_loss_mlp": 1.0217917, + "epoch": 0.231414399519014, + "flos": 18835118029440.0, + "grad_norm": 1.962019724401641, + "language_loss": 0.79821956, + "learning_rate": 3.589320871234923e-06, + "loss": 0.8196336, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5390625, + "step": 3849, + "time_per_iteration": 2.4136805534362793 + }, + { + "auxiliary_loss_clip": 0.01077863, + "auxiliary_loss_mlp": 0.01066217, + "balance_loss_clip": 1.02864218, + "balance_loss_mlp": 1.02325392, + "epoch": 0.23147452277168196, + "flos": 36134057836800.0, + "grad_norm": 1.908384855656575, + "language_loss": 0.734909, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7563498, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.546875, + "step": 3850, + "time_per_iteration": 3.988124370574951 + }, + { + "auxiliary_loss_clip": 0.0107442, + "auxiliary_loss_mlp": 0.01057031, + "balance_loss_clip": 1.0183599, + "balance_loss_mlp": 1.02175093, + "epoch": 0.23153464602434992, + "flos": 20811900009600.0, + "grad_norm": 2.1193110614384936, + "language_loss": 0.78200793, + "learning_rate": 3.588847902019718e-06, + "loss": 0.80332243, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5234375, + "step": 3851, + "time_per_iteration": 2.4412426948547363 + }, + { + "auxiliary_loss_clip": 0.01075785, + "auxiliary_loss_mlp": 0.01059049, + "balance_loss_clip": 1.02218997, + "balance_loss_mlp": 1.02336788, + "epoch": 0.2315947692770179, + "flos": 19938198435840.0, + "grad_norm": 2.7730733839084567, + "language_loss": 0.71329618, + "learning_rate": 3.588611327033723e-06, + "loss": 0.73464447, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5234375, + "step": 3852, + "time_per_iteration": 2.4036614894866943 + }, + { + "auxiliary_loss_clip": 0.01079925, + "auxiliary_loss_mlp": 0.0105848, + "balance_loss_clip": 1.02319384, + "balance_loss_mlp": 1.02525377, + "epoch": 0.23165489252968585, + "flos": 12854845111680.0, + "grad_norm": 2.0900406131465563, + "language_loss": 0.69817513, + "learning_rate": 3.588374691807428e-06, + "loss": 0.71955913, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 3853, + "time_per_iteration": 2.4215612411499023 + }, + { + "auxiliary_loss_clip": 0.01077925, + "auxiliary_loss_mlp": 0.01061294, + "balance_loss_clip": 1.02188349, + "balance_loss_mlp": 1.02374065, + "epoch": 0.23171501578235382, + "flos": 30626962842240.0, + "grad_norm": 1.577177893863858, + "language_loss": 0.81854284, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.83993506, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.54296875, + "step": 3854, + "time_per_iteration": 3.972559690475464 + }, + { + "auxiliary_loss_clip": 0.01083504, + "auxiliary_loss_mlp": 0.01067245, + "balance_loss_clip": 1.02640402, + "balance_loss_mlp": 1.02523434, + "epoch": 0.23177513903502178, + "flos": 23841627816960.0, + "grad_norm": 1.9540903752090786, + "language_loss": 0.68880332, + "learning_rate": 3.587901240669831e-06, + "loss": 0.71031082, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.58203125, + "step": 3855, + "time_per_iteration": 3.8932440280914307 + }, + { + "auxiliary_loss_clip": 0.01081233, + "auxiliary_loss_mlp": 0.0106835, + "balance_loss_clip": 1.02848661, + "balance_loss_mlp": 1.02511775, + "epoch": 0.23183526228768978, + "flos": 29568989779200.0, + "grad_norm": 2.2537957095707366, + "language_loss": 0.73104459, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.75254041, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5625, + "step": 3856, + "time_per_iteration": 2.478543996810913 + }, + { + "auxiliary_loss_clip": 0.01078323, + "auxiliary_loss_mlp": 0.01058525, + "balance_loss_clip": 1.02405012, + "balance_loss_mlp": 1.02492869, + "epoch": 0.23189538554035774, + "flos": 34457284103040.0, + "grad_norm": 1.5371372799892635, + "language_loss": 0.7830472, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.8044157, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.53515625, + "step": 3857, + "time_per_iteration": 2.578979253768921 + }, + { + "auxiliary_loss_clip": 0.01084473, + "auxiliary_loss_mlp": 0.01071975, + "balance_loss_clip": 1.03170657, + "balance_loss_mlp": 1.02664995, + "epoch": 0.2319555087930257, + "flos": 18002858106240.0, + "grad_norm": 2.1201502130445844, + "language_loss": 0.92967904, + "learning_rate": 3.587190612385584e-06, + "loss": 0.95124352, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.578125, + "step": 3858, + "time_per_iteration": 2.371002674102783 + }, + { + "auxiliary_loss_clip": 0.01076029, + "auxiliary_loss_mlp": 0.01054053, + "balance_loss_clip": 1.02138996, + "balance_loss_mlp": 1.02453637, + "epoch": 0.23201563204569367, + "flos": 23142876399360.0, + "grad_norm": 1.69314717698541, + "language_loss": 0.7776745, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.79897529, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.515625, + "step": 3859, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01080219, + "auxiliary_loss_mlp": 0.01054932, + "balance_loss_clip": 1.017524, + "balance_loss_mlp": 1.02576613, + "epoch": 0.23207575529836164, + "flos": 20666940577920.0, + "grad_norm": 2.1788548230399183, + "language_loss": 0.85462558, + "learning_rate": 3.58671655924898e-06, + "loss": 0.87597704, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.54296875, + "step": 3860, + "time_per_iteration": 2.397991418838501 + }, + { + "auxiliary_loss_clip": 0.01078931, + "auxiliary_loss_mlp": 0.01065067, + "balance_loss_clip": 1.02882791, + "balance_loss_mlp": 1.02547121, + "epoch": 0.2321358785510296, + "flos": 16471253272320.0, + "grad_norm": 2.2026270961483836, + "language_loss": 0.84940714, + "learning_rate": 3.586479442423508e-06, + "loss": 0.87084711, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.53515625, + "step": 3861, + "time_per_iteration": 2.3635354042053223 + }, + { + "auxiliary_loss_clip": 0.01078382, + "auxiliary_loss_mlp": 0.01058052, + "balance_loss_clip": 1.01890373, + "balance_loss_mlp": 1.02474999, + "epoch": 0.2321960018036976, + "flos": 21615251460480.0, + "grad_norm": 1.487606543572715, + "language_loss": 0.87306678, + "learning_rate": 3.586242265438576e-06, + "loss": 0.89443111, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.53515625, + "step": 3862, + "time_per_iteration": 2.4222333431243896 + }, + { + "auxiliary_loss_clip": 0.01076676, + "auxiliary_loss_mlp": 0.01054823, + "balance_loss_clip": 1.02237463, + "balance_loss_mlp": 1.02482557, + "epoch": 0.23225612505636556, + "flos": 22270431634560.0, + "grad_norm": 1.4158006110965622, + "language_loss": 0.76120085, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.78251582, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51953125, + "step": 3863, + "time_per_iteration": 2.39215350151062 + }, + { + "auxiliary_loss_clip": 0.01075587, + "auxiliary_loss_mlp": 0.01056609, + "balance_loss_clip": 1.02184796, + "balance_loss_mlp": 1.0244441, + "epoch": 0.23231624830903352, + "flos": 17051475024000.0, + "grad_norm": 1.627684964442918, + "language_loss": 0.75652093, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77784288, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51171875, + "step": 3864, + "time_per_iteration": 2.3792147636413574 + }, + { + "auxiliary_loss_clip": 0.01076871, + "auxiliary_loss_mlp": 0.01054122, + "balance_loss_clip": 1.0180254, + "balance_loss_mlp": 1.02295578, + "epoch": 0.2323763715617015, + "flos": 34638657949440.0, + "grad_norm": 1.7801298398750258, + "language_loss": 0.71828538, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.73959529, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5390625, + "step": 3865, + "time_per_iteration": 2.493830919265747 + }, + { + "auxiliary_loss_clip": 0.01082239, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_clip": 1.02732337, + "balance_loss_mlp": 1.0243386, + "epoch": 0.23243649481436945, + "flos": 25550661513600.0, + "grad_norm": 1.7526053152444734, + "language_loss": 0.96896851, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.99049997, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 0.578125, + "step": 3866, + "time_per_iteration": 2.4490180015563965 + }, + { + "auxiliary_loss_clip": 0.01075136, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.02199411, + "epoch": 0.23249661806703742, + "flos": 20482494531840.0, + "grad_norm": 2.3110849101851745, + "language_loss": 0.75179374, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.77314633, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.53125, + "step": 3867, + "time_per_iteration": 2.3598415851593018 + }, + { + "auxiliary_loss_clip": 0.01077354, + "auxiliary_loss_mlp": 0.01059715, + "balance_loss_clip": 1.02216387, + "balance_loss_mlp": 1.02394557, + "epoch": 0.23255674131970538, + "flos": 20375555437440.0, + "grad_norm": 1.673926165795074, + "language_loss": 0.83805048, + "learning_rate": 3.584817940684145e-06, + "loss": 0.85942113, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.53125, + "step": 3868, + "time_per_iteration": 2.413832902908325 + }, + { + "auxiliary_loss_clip": 0.0107445, + "auxiliary_loss_mlp": 0.01050032, + "balance_loss_clip": 1.01560462, + "balance_loss_mlp": 1.02368855, + "epoch": 0.23261686457237338, + "flos": 17055140716800.0, + "grad_norm": 1.8419555056956922, + "language_loss": 0.75278968, + "learning_rate": 3.58458034283495e-06, + "loss": 0.7740345, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 3869, + "time_per_iteration": 2.3546109199523926 + }, + { + "auxiliary_loss_clip": 0.0107661, + "auxiliary_loss_mlp": 0.01058667, + "balance_loss_clip": 1.0198288, + "balance_loss_mlp": 1.02419043, + "epoch": 0.23267698782504134, + "flos": 29168570862720.0, + "grad_norm": 2.1177823780160794, + "language_loss": 0.81323314, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.83458591, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5234375, + "step": 3870, + "time_per_iteration": 2.4818832874298096 + }, + { + "auxiliary_loss_clip": 0.01079202, + "auxiliary_loss_mlp": 0.01057979, + "balance_loss_clip": 1.01663709, + "balance_loss_mlp": 1.02382565, + "epoch": 0.2327371110777093, + "flos": 21173705095680.0, + "grad_norm": 1.9144938696687324, + "language_loss": 0.72030073, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.74167252, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.5546875, + "step": 3871, + "time_per_iteration": 2.3687267303466797 + }, + { + "auxiliary_loss_clip": 0.01078578, + "auxiliary_loss_mlp": 0.01057375, + "balance_loss_clip": 1.01748729, + "balance_loss_mlp": 1.02328372, + "epoch": 0.23279723433037727, + "flos": 24861964567680.0, + "grad_norm": 2.349815006854031, + "language_loss": 0.70717758, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.72853708, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5546875, + "step": 3872, + "time_per_iteration": 2.4941580295562744 + }, + { + "auxiliary_loss_clip": 0.0108243, + "auxiliary_loss_mlp": 0.01065535, + "balance_loss_clip": 1.02340651, + "balance_loss_mlp": 1.0251205, + "epoch": 0.23285735758304524, + "flos": 38799082915200.0, + "grad_norm": 2.194373677702728, + "language_loss": 0.79815519, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.8196348, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.57421875, + "step": 3873, + "time_per_iteration": 2.542754888534546 + }, + { + "auxiliary_loss_clip": 0.01021409, + "auxiliary_loss_mlp": 0.01007128, + "balance_loss_clip": 1.0027169, + "balance_loss_mlp": 1.00707817, + "epoch": 0.2329174808357132, + "flos": 53941086927360.0, + "grad_norm": 0.8628338971461693, + "language_loss": 0.60627913, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6265645, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.14257812, + "step": 3874, + "time_per_iteration": 2.9577412605285645 + }, + { + "auxiliary_loss_clip": 0.01077826, + "auxiliary_loss_mlp": 0.01059301, + "balance_loss_clip": 1.01996207, + "balance_loss_mlp": 1.02446008, + "epoch": 0.23297760408838117, + "flos": 21214937278080.0, + "grad_norm": 2.2134337132195085, + "language_loss": 0.83059251, + "learning_rate": 3.583153494218927e-06, + "loss": 0.85196382, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.53125, + "step": 3875, + "time_per_iteration": 2.4001595973968506 + }, + { + "auxiliary_loss_clip": 0.01078438, + "auxiliary_loss_mlp": 0.01053373, + "balance_loss_clip": 1.01887381, + "balance_loss_mlp": 1.02576542, + "epoch": 0.23303772734104916, + "flos": 28401738560640.0, + "grad_norm": 1.5933742876896877, + "language_loss": 0.62685668, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.64817482, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.52734375, + "step": 3876, + "time_per_iteration": 2.457732677459717 + }, + { + "auxiliary_loss_clip": 0.01084608, + "auxiliary_loss_mlp": 0.0105608, + "balance_loss_clip": 1.01845789, + "balance_loss_mlp": 1.02851391, + "epoch": 0.23309785059371713, + "flos": 24313618753920.0, + "grad_norm": 1.6830564169295352, + "language_loss": 0.71796131, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.7393682, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5625, + "step": 3877, + "time_per_iteration": 2.4071102142333984 + }, + { + "auxiliary_loss_clip": 0.01080447, + "auxiliary_loss_mlp": 0.01065147, + "balance_loss_clip": 1.02695262, + "balance_loss_mlp": 1.02436316, + "epoch": 0.2331579738463851, + "flos": 15992140417920.0, + "grad_norm": 2.1846063140342293, + "language_loss": 0.8306247, + "learning_rate": 3.582439259339073e-06, + "loss": 0.85208064, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.55859375, + "step": 3878, + "time_per_iteration": 2.3781116008758545 + }, + { + "auxiliary_loss_clip": 0.01085143, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_clip": 1.02095199, + "balance_loss_mlp": 1.02827954, + "epoch": 0.23321809709905306, + "flos": 36425547711360.0, + "grad_norm": 2.041269393547575, + "language_loss": 0.76607305, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.78753096, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.5703125, + "step": 3879, + "time_per_iteration": 2.5186405181884766 + }, + { + "auxiliary_loss_clip": 0.01080217, + "auxiliary_loss_mlp": 0.0105105, + "balance_loss_clip": 1.01886356, + "balance_loss_mlp": 1.02479959, + "epoch": 0.23327822035172102, + "flos": 21323691763200.0, + "grad_norm": 2.3035415892809765, + "language_loss": 0.91065085, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.9319635, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5546875, + "step": 3880, + "time_per_iteration": 2.420231819152832 + }, + { + "auxiliary_loss_clip": 0.01081128, + "auxiliary_loss_mlp": 0.01053846, + "balance_loss_clip": 1.01896501, + "balance_loss_mlp": 1.02666008, + "epoch": 0.233338343604389, + "flos": 19170877374720.0, + "grad_norm": 1.7346536572233047, + "language_loss": 0.7331928, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.75454247, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.546875, + "step": 3881, + "time_per_iteration": 2.3863301277160645 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01060656, + "balance_loss_clip": 1.02482128, + "balance_loss_mlp": 1.02559638, + "epoch": 0.23339846685705698, + "flos": 26907106723200.0, + "grad_norm": 1.5510522823000543, + "language_loss": 0.68820971, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70959175, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.51953125, + "step": 3882, + "time_per_iteration": 2.4737913608551025 + }, + { + "auxiliary_loss_clip": 0.01079139, + "auxiliary_loss_mlp": 0.01054259, + "balance_loss_clip": 1.01885366, + "balance_loss_mlp": 1.02509236, + "epoch": 0.23345859010972494, + "flos": 32341791824640.0, + "grad_norm": 2.827983133658746, + "language_loss": 0.78419244, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.80552638, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5390625, + "step": 3883, + "time_per_iteration": 2.467496871948242 + }, + { + "auxiliary_loss_clip": 0.01019394, + "auxiliary_loss_mlp": 0.01019819, + "balance_loss_clip": 1.01555181, + "balance_loss_mlp": 1.00523627, + "epoch": 0.2335187133623929, + "flos": 58480633013760.0, + "grad_norm": 0.7979469821507169, + "language_loss": 0.59082848, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6112206, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.14160156, + "step": 3884, + "time_per_iteration": 3.167285203933716 + }, + { + "auxiliary_loss_clip": 0.01077011, + "auxiliary_loss_mlp": 0.01059213, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.02404857, + "epoch": 0.23357883661506088, + "flos": 24501067176960.0, + "grad_norm": 1.6105170347528268, + "language_loss": 0.81751716, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.83887941, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.53125, + "step": 3885, + "time_per_iteration": 2.431936025619507 + }, + { + "auxiliary_loss_clip": 0.01076049, + "auxiliary_loss_mlp": 0.01054912, + "balance_loss_clip": 1.01891112, + "balance_loss_mlp": 1.02228153, + "epoch": 0.23363895986772884, + "flos": 18947642941440.0, + "grad_norm": 2.4431487082985295, + "language_loss": 0.88447118, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90578079, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5390625, + "step": 3886, + "time_per_iteration": 2.3971192836761475 + }, + { + "auxiliary_loss_clip": 0.01079711, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.01462352, + "balance_loss_mlp": 1.02490902, + "epoch": 0.2336990831203968, + "flos": 31685459575680.0, + "grad_norm": 1.8075117581229074, + "language_loss": 0.74260265, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.76391387, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.546875, + "step": 3887, + "time_per_iteration": 2.467482566833496 + }, + { + "auxiliary_loss_clip": 0.01077955, + "auxiliary_loss_mlp": 0.01051728, + "balance_loss_clip": 1.01658547, + "balance_loss_mlp": 1.02382064, + "epoch": 0.23375920637306477, + "flos": 27708503137920.0, + "grad_norm": 1.9829235660399014, + "language_loss": 0.85209596, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.87339282, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5390625, + "step": 3888, + "time_per_iteration": 2.444537878036499 + }, + { + "auxiliary_loss_clip": 0.01077123, + "auxiliary_loss_mlp": 0.0105708, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.0232029, + "epoch": 0.23381932962573276, + "flos": 17674674526080.0, + "grad_norm": 2.1854554441469594, + "language_loss": 0.88621247, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.90755451, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5390625, + "step": 3889, + "time_per_iteration": 2.3650519847869873 + }, + { + "auxiliary_loss_clip": 0.01081605, + "auxiliary_loss_mlp": 0.01054908, + "balance_loss_clip": 1.017452, + "balance_loss_mlp": 1.02556682, + "epoch": 0.23387945287840073, + "flos": 14390010904320.0, + "grad_norm": 2.4739840690192554, + "language_loss": 0.77979279, + "learning_rate": 3.579576921697125e-06, + "loss": 0.80115789, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5625, + "step": 3890, + "time_per_iteration": 3.8126108646392822 + }, + { + "auxiliary_loss_clip": 0.01077608, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.01964426, + "balance_loss_mlp": 1.02353132, + "epoch": 0.2339395761310687, + "flos": 46096244605440.0, + "grad_norm": 1.8256540184503642, + "language_loss": 0.74825859, + "learning_rate": 3.579338004009412e-06, + "loss": 0.76959562, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5390625, + "step": 3891, + "time_per_iteration": 2.61108136177063 + }, + { + "auxiliary_loss_clip": 0.01074359, + "auxiliary_loss_mlp": 0.01051772, + "balance_loss_clip": 1.01875091, + "balance_loss_mlp": 1.02368879, + "epoch": 0.23399969938373666, + "flos": 22380966599040.0, + "grad_norm": 1.644076662523735, + "language_loss": 0.84570456, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.86696583, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.5078125, + "step": 3892, + "time_per_iteration": 2.403780698776245 + }, + { + "auxiliary_loss_clip": 0.01078595, + "auxiliary_loss_mlp": 0.01060694, + "balance_loss_clip": 1.02388263, + "balance_loss_mlp": 1.02363729, + "epoch": 0.23405982263640462, + "flos": 43506841265280.0, + "grad_norm": 1.883219731303419, + "language_loss": 0.65962172, + "learning_rate": 3.578859988977082e-06, + "loss": 0.6810146, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.55078125, + "step": 3893, + "time_per_iteration": 2.6260251998901367 + }, + { + "auxiliary_loss_clip": 0.01077034, + "auxiliary_loss_mlp": 0.01059422, + "balance_loss_clip": 1.02058387, + "balance_loss_mlp": 1.02384555, + "epoch": 0.2341199458890726, + "flos": 22563597254400.0, + "grad_norm": 2.150981373372674, + "language_loss": 0.8097434, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.83110797, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.53125, + "step": 3894, + "time_per_iteration": 3.900834798812866 + }, + { + "auxiliary_loss_clip": 0.0107881, + "auxiliary_loss_mlp": 0.01060512, + "balance_loss_clip": 1.02234161, + "balance_loss_mlp": 1.02504623, + "epoch": 0.23418006914174055, + "flos": 25632672030720.0, + "grad_norm": 1.4283242960616884, + "language_loss": 0.83029962, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.8516928, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5390625, + "step": 3895, + "time_per_iteration": 3.853341579437256 + }, + { + "auxiliary_loss_clip": 0.01079886, + "auxiliary_loss_mlp": 0.01064895, + "balance_loss_clip": 1.02593708, + "balance_loss_mlp": 1.02480507, + "epoch": 0.23424019239440855, + "flos": 13545287625600.0, + "grad_norm": 2.186795249919968, + "language_loss": 0.82349551, + "learning_rate": 3.578142517422292e-06, + "loss": 0.84494328, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.55078125, + "step": 3896, + "time_per_iteration": 2.3750054836273193 + }, + { + "auxiliary_loss_clip": 0.01080671, + "auxiliary_loss_mlp": 0.01068503, + "balance_loss_clip": 1.02689874, + "balance_loss_mlp": 1.02411413, + "epoch": 0.2343003156470765, + "flos": 22418393443200.0, + "grad_norm": 1.9027400253851017, + "language_loss": 0.84629679, + "learning_rate": 3.577903240538623e-06, + "loss": 0.86778849, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.56640625, + "step": 3897, + "time_per_iteration": 2.3953840732574463 + }, + { + "auxiliary_loss_clip": 0.01079571, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.02258277, + "balance_loss_mlp": 1.02370369, + "epoch": 0.23436043889974448, + "flos": 14790010884480.0, + "grad_norm": 1.7954760734954813, + "language_loss": 0.80590415, + "learning_rate": 3.577663903820705e-06, + "loss": 0.82732427, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.55859375, + "step": 3898, + "time_per_iteration": 2.37689208984375 + }, + { + "auxiliary_loss_clip": 0.01075648, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.02208471, + "balance_loss_mlp": 1.02273881, + "epoch": 0.23442056215241244, + "flos": 22964609664000.0, + "grad_norm": 2.092190475990345, + "language_loss": 0.75740743, + "learning_rate": 3.577424507277614e-06, + "loss": 0.77875447, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53125, + "step": 3899, + "time_per_iteration": 2.390730142593384 + }, + { + "auxiliary_loss_clip": 0.0108134, + "auxiliary_loss_mlp": 0.01062585, + "balance_loss_clip": 1.02062297, + "balance_loss_mlp": 1.02439356, + "epoch": 0.2344806854050804, + "flos": 23070885442560.0, + "grad_norm": 1.7475747249663987, + "language_loss": 0.76987612, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.79131544, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.5703125, + "step": 3900, + "time_per_iteration": 2.4205384254455566 + }, + { + "auxiliary_loss_clip": 0.01077797, + "auxiliary_loss_mlp": 0.01065451, + "balance_loss_clip": 1.02747142, + "balance_loss_mlp": 1.02340817, + "epoch": 0.23454080865774837, + "flos": 16326119283840.0, + "grad_norm": 1.8934772353221367, + "language_loss": 0.68097222, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.70240474, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.54296875, + "step": 3901, + "time_per_iteration": 2.3614702224731445 + }, + { + "auxiliary_loss_clip": 0.01023192, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.02532554, + "balance_loss_mlp": 1.00854576, + "epoch": 0.23460093191041637, + "flos": 67757860218240.0, + "grad_norm": 0.7849834948752606, + "language_loss": 0.58328414, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6038177, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.04833984, + "router_z_loss_mlp": 0.14648438, + "step": 3902, + "time_per_iteration": 2.9888994693756104 + }, + { + "auxiliary_loss_clip": 0.01076614, + "auxiliary_loss_mlp": 0.01058702, + "balance_loss_clip": 1.02119851, + "balance_loss_mlp": 1.02271461, + "epoch": 0.23466105516308433, + "flos": 20076769088640.0, + "grad_norm": 1.9035244787634606, + "language_loss": 0.82970989, + "learning_rate": 3.576466323035108e-06, + "loss": 0.85106307, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5390625, + "step": 3903, + "time_per_iteration": 2.372032403945923 + }, + { + "auxiliary_loss_clip": 0.01078053, + "auxiliary_loss_mlp": 0.01059313, + "balance_loss_clip": 1.01990211, + "balance_loss_mlp": 1.02333045, + "epoch": 0.2347211784157523, + "flos": 24534549037440.0, + "grad_norm": 1.9246864911882018, + "language_loss": 0.83908719, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.86046088, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.546875, + "step": 3904, + "time_per_iteration": 2.4618828296661377 + }, + { + "auxiliary_loss_clip": 0.01077943, + "auxiliary_loss_mlp": 0.0106214, + "balance_loss_clip": 1.02451754, + "balance_loss_mlp": 1.02366877, + "epoch": 0.23478130166842026, + "flos": 23803921681920.0, + "grad_norm": 2.1011057547738314, + "language_loss": 0.73126912, + "learning_rate": 3.57598687219895e-06, + "loss": 0.75266993, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.54296875, + "step": 3905, + "time_per_iteration": 2.401921510696411 + }, + { + "auxiliary_loss_clip": 0.0107669, + "auxiliary_loss_mlp": 0.01054752, + "balance_loss_clip": 1.01693845, + "balance_loss_mlp": 1.02413869, + "epoch": 0.23484142492108823, + "flos": 24092583736320.0, + "grad_norm": 1.7092114234134448, + "language_loss": 0.72172213, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.74303657, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5234375, + "step": 3906, + "time_per_iteration": 2.4239165782928467 + }, + { + "auxiliary_loss_clip": 0.0108305, + "auxiliary_loss_mlp": 0.01067421, + "balance_loss_clip": 1.02367139, + "balance_loss_mlp": 1.02458882, + "epoch": 0.2349015481737562, + "flos": 29094555047040.0, + "grad_norm": 2.1408830986576164, + "language_loss": 0.75929606, + "learning_rate": 3.575507182316473e-06, + "loss": 0.78080076, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.5859375, + "step": 3907, + "time_per_iteration": 2.4485673904418945 + }, + { + "auxiliary_loss_clip": 0.01081038, + "auxiliary_loss_mlp": 0.0107316, + "balance_loss_clip": 1.03281951, + "balance_loss_mlp": 1.02531314, + "epoch": 0.23496167142642416, + "flos": 18915313155840.0, + "grad_norm": 1.8173964837277947, + "language_loss": 0.75041509, + "learning_rate": 3.575267247755601e-06, + "loss": 0.77195716, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.5546875, + "step": 3908, + "time_per_iteration": 2.3869290351867676 + }, + { + "auxiliary_loss_clip": 0.01020545, + "auxiliary_loss_mlp": 0.01024796, + "balance_loss_clip": 1.02048075, + "balance_loss_mlp": 1.00620461, + "epoch": 0.23502179467909215, + "flos": 55865255621760.0, + "grad_norm": 1.0492259729787043, + "language_loss": 0.73460305, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.7550565, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.04321289, + "router_z_loss_mlp": 0.14355469, + "step": 3909, + "time_per_iteration": 2.7609524726867676 + }, + { + "auxiliary_loss_clip": 0.01079723, + "auxiliary_loss_mlp": 0.01059548, + "balance_loss_clip": 1.02333236, + "balance_loss_mlp": 1.02489233, + "epoch": 0.23508191793176011, + "flos": 23400709856640.0, + "grad_norm": 1.5904369967128753, + "language_loss": 0.89161479, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.91300756, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.546875, + "step": 3910, + "time_per_iteration": 2.405442714691162 + }, + { + "auxiliary_loss_clip": 0.01082675, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_clip": 1.01911759, + "balance_loss_mlp": 1.0265888, + "epoch": 0.23514204118442808, + "flos": 20046638718720.0, + "grad_norm": 2.2365501359952353, + "language_loss": 0.78027737, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.80168432, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5625, + "step": 3911, + "time_per_iteration": 2.3970396518707275 + }, + { + "auxiliary_loss_clip": 0.010775, + "auxiliary_loss_mlp": 0.01062506, + "balance_loss_clip": 1.02760148, + "balance_loss_mlp": 1.02599573, + "epoch": 0.23520216443709605, + "flos": 21579500361600.0, + "grad_norm": 1.6613118140115142, + "language_loss": 0.82257295, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.84397304, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.515625, + "step": 3912, + "time_per_iteration": 2.4070284366607666 + }, + { + "auxiliary_loss_clip": 0.01076327, + "auxiliary_loss_mlp": 0.01054918, + "balance_loss_clip": 1.01977539, + "balance_loss_mlp": 1.02469397, + "epoch": 0.235262287689764, + "flos": 23184667163520.0, + "grad_norm": 1.950961030018402, + "language_loss": 0.73088837, + "learning_rate": 3.574066679118909e-06, + "loss": 0.75220084, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.515625, + "step": 3913, + "time_per_iteration": 2.4060187339782715 + }, + { + "auxiliary_loss_clip": 0.01082864, + "auxiliary_loss_mlp": 0.01057797, + "balance_loss_clip": 1.01666963, + "balance_loss_mlp": 1.02515292, + "epoch": 0.23532241094243198, + "flos": 23184108581760.0, + "grad_norm": 2.519771638224627, + "language_loss": 0.77479368, + "learning_rate": 3.57382638628884e-06, + "loss": 0.79620028, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.578125, + "step": 3914, + "time_per_iteration": 2.419952869415283 + }, + { + "auxiliary_loss_clip": 0.01080778, + "auxiliary_loss_mlp": 0.01061066, + "balance_loss_clip": 1.02196503, + "balance_loss_mlp": 1.02583456, + "epoch": 0.23538253419509997, + "flos": 17018377188480.0, + "grad_norm": 3.1562011585268834, + "language_loss": 0.91888511, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.94030356, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.546875, + "step": 3915, + "time_per_iteration": 2.3770058155059814 + }, + { + "auxiliary_loss_clip": 0.01016867, + "auxiliary_loss_mlp": 0.01010157, + "balance_loss_clip": 1.00631893, + "balance_loss_mlp": 1.0036037, + "epoch": 0.23544265744776793, + "flos": 63445807751040.0, + "grad_norm": 0.8141721448585336, + "language_loss": 0.59362686, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61389709, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.03833008, + "router_z_loss_mlp": 0.1328125, + "step": 3916, + "time_per_iteration": 2.9524471759796143 + }, + { + "auxiliary_loss_clip": 0.01017187, + "auxiliary_loss_mlp": 0.0100862, + "balance_loss_clip": 1.00423265, + "balance_loss_mlp": 1.00402701, + "epoch": 0.2355027807004359, + "flos": 70511668617600.0, + "grad_norm": 0.7612913765875053, + "language_loss": 0.49457031, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51482838, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.04394531, + "router_z_loss_mlp": 0.13183594, + "step": 3917, + "time_per_iteration": 3.0565991401672363 + }, + { + "auxiliary_loss_clip": 0.01084215, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_clip": 1.03537929, + "balance_loss_mlp": 1.02581286, + "epoch": 0.23556290395310386, + "flos": 21433214298240.0, + "grad_norm": 2.031603758792656, + "language_loss": 0.78153861, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.80312556, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5859375, + "step": 3918, + "time_per_iteration": 2.4157657623291016 + }, + { + "auxiliary_loss_clip": 0.0108235, + "auxiliary_loss_mlp": 0.01073531, + "balance_loss_clip": 1.03762507, + "balance_loss_mlp": 1.02570891, + "epoch": 0.23562302720577183, + "flos": 18185453850240.0, + "grad_norm": 2.1384653607966895, + "language_loss": 0.71288717, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.73444599, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.56640625, + "step": 3919, + "time_per_iteration": 2.3611302375793457 + }, + { + "auxiliary_loss_clip": 0.01074906, + "auxiliary_loss_mlp": 0.01063324, + "balance_loss_clip": 1.02870536, + "balance_loss_mlp": 1.02353549, + "epoch": 0.2356831504584398, + "flos": 33729065631360.0, + "grad_norm": 1.6832214196775594, + "language_loss": 0.71265924, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.73404151, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.515625, + "step": 3920, + "time_per_iteration": 2.517735004425049 + }, + { + "auxiliary_loss_clip": 0.01076712, + "auxiliary_loss_mlp": 0.01069378, + "balance_loss_clip": 1.03340054, + "balance_loss_mlp": 1.02455449, + "epoch": 0.23574327371110776, + "flos": 24931721197440.0, + "grad_norm": 1.6399914833714748, + "language_loss": 0.78651619, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.80797708, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 3921, + "time_per_iteration": 2.420454502105713 + }, + { + "auxiliary_loss_clip": 0.01078331, + "auxiliary_loss_mlp": 0.01062016, + "balance_loss_clip": 1.02503717, + "balance_loss_mlp": 1.02490425, + "epoch": 0.23580339696377575, + "flos": 17821135146240.0, + "grad_norm": 2.083858806669315, + "language_loss": 0.76961392, + "learning_rate": 3.571901895946612e-06, + "loss": 0.79101735, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53515625, + "step": 3922, + "time_per_iteration": 2.4158968925476074 + }, + { + "auxiliary_loss_clip": 0.01077404, + "auxiliary_loss_mlp": 0.01060275, + "balance_loss_clip": 1.02203238, + "balance_loss_mlp": 1.02326322, + "epoch": 0.23586352021644372, + "flos": 26285408409600.0, + "grad_norm": 2.3672797348494123, + "language_loss": 0.81485039, + "learning_rate": 3.571661066327956e-06, + "loss": 0.83622718, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.54296875, + "step": 3923, + "time_per_iteration": 2.439154863357544 + }, + { + "auxiliary_loss_clip": 0.01077481, + "auxiliary_loss_mlp": 0.01066302, + "balance_loss_clip": 1.02736783, + "balance_loss_mlp": 1.02342272, + "epoch": 0.23592364346911168, + "flos": 14245819522560.0, + "grad_norm": 1.6869276354047356, + "language_loss": 0.75625324, + "learning_rate": 3.571420177111754e-06, + "loss": 0.77769113, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.54296875, + "step": 3924, + "time_per_iteration": 2.383049726486206 + }, + { + "auxiliary_loss_clip": 0.01077398, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.02202809, + "balance_loss_mlp": 1.02372408, + "epoch": 0.23598376672177965, + "flos": 18586955018880.0, + "grad_norm": 2.3558105543786265, + "language_loss": 0.83884943, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.86022747, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5390625, + "step": 3925, + "time_per_iteration": 2.380774736404419 + }, + { + "auxiliary_loss_clip": 0.0108122, + "auxiliary_loss_mlp": 0.01063989, + "balance_loss_clip": 1.02357745, + "balance_loss_mlp": 1.02457261, + "epoch": 0.2360438899744476, + "flos": 22674411509760.0, + "grad_norm": 2.0424717848448544, + "language_loss": 0.61381578, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.63526785, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.56640625, + "step": 3926, + "time_per_iteration": 2.4361178874969482 + }, + { + "auxiliary_loss_clip": 0.01072223, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_clip": 1.01818657, + "balance_loss_mlp": 1.02197289, + "epoch": 0.23610401322711558, + "flos": 29568850133760.0, + "grad_norm": 2.018462948724551, + "language_loss": 0.74100161, + "learning_rate": 3.570697151969235e-06, + "loss": 0.76224977, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.50390625, + "step": 3927, + "time_per_iteration": 2.4562835693359375 + }, + { + "auxiliary_loss_clip": 0.0107568, + "auxiliary_loss_mlp": 0.01054897, + "balance_loss_clip": 1.02075505, + "balance_loss_mlp": 1.0219357, + "epoch": 0.23616413647978354, + "flos": 17857549560960.0, + "grad_norm": 2.0727158242587644, + "language_loss": 0.76951468, + "learning_rate": 3.570456024454221e-06, + "loss": 0.79082048, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5390625, + "step": 3928, + "time_per_iteration": 2.402432918548584 + }, + { + "auxiliary_loss_clip": 0.01078899, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_clip": 1.02184606, + "balance_loss_mlp": 1.02482915, + "epoch": 0.23622425973245154, + "flos": 11034089464320.0, + "grad_norm": 2.5642000241660816, + "language_loss": 0.8430776, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.8644594, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5390625, + "step": 3929, + "time_per_iteration": 3.7743000984191895 + }, + { + "auxiliary_loss_clip": 0.01085194, + "auxiliary_loss_mlp": 0.01060572, + "balance_loss_clip": 1.01613092, + "balance_loss_mlp": 1.02695155, + "epoch": 0.2362843829851195, + "flos": 23402944183680.0, + "grad_norm": 1.8025649725781583, + "language_loss": 0.72718388, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74864149, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.58203125, + "step": 3930, + "time_per_iteration": 2.427443742752075 + }, + { + "auxiliary_loss_clip": 0.01078782, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.01379025, + "balance_loss_mlp": 1.02416706, + "epoch": 0.23634450623778747, + "flos": 39528313816320.0, + "grad_norm": 1.7998679800413144, + "language_loss": 0.7563386, + "learning_rate": 3.569732284634665e-06, + "loss": 0.77765429, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.546875, + "step": 3931, + "time_per_iteration": 2.5272216796875 + }, + { + "auxiliary_loss_clip": 0.01081307, + "auxiliary_loss_mlp": 0.0105527, + "balance_loss_clip": 1.01600206, + "balance_loss_mlp": 1.0264225, + "epoch": 0.23640462949045543, + "flos": 24206016343680.0, + "grad_norm": 2.053289642228667, + "language_loss": 0.81657898, + "learning_rate": 3.569490918967136e-06, + "loss": 0.83794475, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.546875, + "step": 3932, + "time_per_iteration": 2.4401259422302246 + }, + { + "auxiliary_loss_clip": 0.01078027, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.01959026, + "balance_loss_mlp": 1.02547073, + "epoch": 0.2364647527431234, + "flos": 26176409544960.0, + "grad_norm": 1.4462524065467668, + "language_loss": 0.87698877, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.89831781, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.52734375, + "step": 3933, + "time_per_iteration": 3.8063340187072754 + }, + { + "auxiliary_loss_clip": 0.0108405, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_clip": 1.01827276, + "balance_loss_mlp": 1.02805293, + "epoch": 0.23652487599579136, + "flos": 22635937324800.0, + "grad_norm": 1.923574500297908, + "language_loss": 0.83753073, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85893637, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5625, + "step": 3934, + "time_per_iteration": 3.9151978492736816 + }, + { + "auxiliary_loss_clip": 0.01080195, + "auxiliary_loss_mlp": 0.01059742, + "balance_loss_clip": 1.02045071, + "balance_loss_mlp": 1.0257659, + "epoch": 0.23658499924845935, + "flos": 21761188410240.0, + "grad_norm": 1.6584110059564192, + "language_loss": 0.80188096, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.82328033, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.546875, + "step": 3935, + "time_per_iteration": 2.443239212036133 + }, + { + "auxiliary_loss_clip": 0.01078792, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_clip": 1.02280235, + "balance_loss_mlp": 1.02650952, + "epoch": 0.23664512250112732, + "flos": 21797917027200.0, + "grad_norm": 1.8551049702115125, + "language_loss": 0.81177258, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.83314186, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5234375, + "step": 3936, + "time_per_iteration": 2.390111207962036 + }, + { + "auxiliary_loss_clip": 0.01082331, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_clip": 1.02083254, + "balance_loss_mlp": 1.02752805, + "epoch": 0.23670524575379528, + "flos": 22636775197440.0, + "grad_norm": 1.4832195847087861, + "language_loss": 0.80785537, + "learning_rate": 3.568283198083826e-06, + "loss": 0.82925588, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.546875, + "step": 3937, + "time_per_iteration": 2.45430064201355 + }, + { + "auxiliary_loss_clip": 0.01073655, + "auxiliary_loss_mlp": 0.01053941, + "balance_loss_clip": 1.02080083, + "balance_loss_mlp": 1.02344048, + "epoch": 0.23676536900646325, + "flos": 16724129316480.0, + "grad_norm": 2.2110801907466677, + "language_loss": 0.87530708, + "learning_rate": 3.568041475462147e-06, + "loss": 0.89658308, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5, + "step": 3938, + "time_per_iteration": 2.3888449668884277 + }, + { + "auxiliary_loss_clip": 0.01075703, + "auxiliary_loss_mlp": 0.01064944, + "balance_loss_clip": 1.02651072, + "balance_loss_mlp": 1.02338862, + "epoch": 0.23682549225913122, + "flos": 11135093627520.0, + "grad_norm": 2.8506664392006242, + "language_loss": 0.95270014, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.97410661, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5234375, + "step": 3939, + "time_per_iteration": 2.4036147594451904 + }, + { + "auxiliary_loss_clip": 0.01078983, + "auxiliary_loss_mlp": 0.01062495, + "balance_loss_clip": 1.02203536, + "balance_loss_mlp": 1.02385592, + "epoch": 0.23688561551179918, + "flos": 22558290727680.0, + "grad_norm": 1.6208198112365364, + "language_loss": 0.83595556, + "learning_rate": 3.567557851847088e-06, + "loss": 0.85737038, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.55078125, + "step": 3940, + "time_per_iteration": 2.398594617843628 + }, + { + "auxiliary_loss_clip": 0.01082765, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.02810502, + "balance_loss_mlp": 1.02408004, + "epoch": 0.23694573876446715, + "flos": 18513916721280.0, + "grad_norm": 2.1103817979441066, + "language_loss": 0.91125536, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.93277764, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.5859375, + "step": 3941, + "time_per_iteration": 2.431144952774048 + }, + { + "auxiliary_loss_clip": 0.01078575, + "auxiliary_loss_mlp": 0.01067426, + "balance_loss_clip": 1.02751493, + "balance_loss_mlp": 1.02310205, + "epoch": 0.23700586201713514, + "flos": 15334970296320.0, + "grad_norm": 2.124074872017382, + "language_loss": 0.86314225, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.88460231, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5546875, + "step": 3942, + "time_per_iteration": 2.383678913116455 + }, + { + "auxiliary_loss_clip": 0.01077178, + "auxiliary_loss_mlp": 0.01057203, + "balance_loss_clip": 1.01836419, + "balance_loss_mlp": 1.02251041, + "epoch": 0.2370659852698031, + "flos": 23946576963840.0, + "grad_norm": 1.8545072067336272, + "language_loss": 0.82265413, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.84399796, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.546875, + "step": 3943, + "time_per_iteration": 2.5040805339813232 + }, + { + "auxiliary_loss_clip": 0.0107849, + "auxiliary_loss_mlp": 0.01059226, + "balance_loss_clip": 1.01886177, + "balance_loss_mlp": 1.02220345, + "epoch": 0.23712610852247107, + "flos": 15331863185280.0, + "grad_norm": 2.473068128632784, + "language_loss": 0.69051814, + "learning_rate": 3.566589891386959e-06, + "loss": 0.71189523, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.5625, + "step": 3944, + "time_per_iteration": 2.4462296962738037 + }, + { + "auxiliary_loss_clip": 0.01077164, + "auxiliary_loss_mlp": 0.01062209, + "balance_loss_clip": 1.02451491, + "balance_loss_mlp": 1.0221628, + "epoch": 0.23718623177513903, + "flos": 19681551964800.0, + "grad_norm": 1.8208133984438517, + "language_loss": 0.77391869, + "learning_rate": 3.566347752735866e-06, + "loss": 0.7953124, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.55078125, + "step": 3945, + "time_per_iteration": 2.3933048248291016 + }, + { + "auxiliary_loss_clip": 0.01078977, + "auxiliary_loss_mlp": 0.01051762, + "balance_loss_clip": 1.01747787, + "balance_loss_mlp": 1.02521038, + "epoch": 0.237246355027807, + "flos": 24972150418560.0, + "grad_norm": 1.7113427036696485, + "language_loss": 0.65392619, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.6752336, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.53515625, + "step": 3946, + "time_per_iteration": 2.4553017616271973 + }, + { + "auxiliary_loss_clip": 0.01075482, + "auxiliary_loss_mlp": 0.01056593, + "balance_loss_clip": 1.0171231, + "balance_loss_mlp": 1.02182102, + "epoch": 0.23730647828047496, + "flos": 15376516680960.0, + "grad_norm": 2.166216705863622, + "language_loss": 0.78668833, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.80800909, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.53515625, + "step": 3947, + "time_per_iteration": 2.3876068592071533 + }, + { + "auxiliary_loss_clip": 0.01079457, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_clip": 1.01963031, + "balance_loss_mlp": 1.02537441, + "epoch": 0.23736660153314296, + "flos": 28149316364160.0, + "grad_norm": 1.5108802865678734, + "language_loss": 0.81493664, + "learning_rate": 3.565620980442944e-06, + "loss": 0.83629251, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5390625, + "step": 3948, + "time_per_iteration": 2.492887258529663 + }, + { + "auxiliary_loss_clip": 0.01077972, + "auxiliary_loss_mlp": 0.01057591, + "balance_loss_clip": 1.02170861, + "balance_loss_mlp": 1.02406836, + "epoch": 0.23742672478581092, + "flos": 22085601563520.0, + "grad_norm": 1.713528398090911, + "language_loss": 0.81397402, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.83532965, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5390625, + "step": 3949, + "time_per_iteration": 2.4455695152282715 + }, + { + "auxiliary_loss_clip": 0.01077005, + "auxiliary_loss_mlp": 0.01054845, + "balance_loss_clip": 1.0169605, + "balance_loss_mlp": 1.02282453, + "epoch": 0.2374868480384789, + "flos": 19536068862720.0, + "grad_norm": 1.669113887673585, + "language_loss": 0.74316537, + "learning_rate": 3.565136168723163e-06, + "loss": 0.76448393, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.54296875, + "step": 3950, + "time_per_iteration": 2.395921468734741 + }, + { + "auxiliary_loss_clip": 0.01074335, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.01352823, + "balance_loss_mlp": 1.02419209, + "epoch": 0.23754697129114685, + "flos": 19421623825920.0, + "grad_norm": 1.84496879742927, + "language_loss": 0.73657465, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75777662, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5, + "step": 3951, + "time_per_iteration": 2.462151288986206 + }, + { + "auxiliary_loss_clip": 0.01078471, + "auxiliary_loss_mlp": 0.01059743, + "balance_loss_clip": 1.02088106, + "balance_loss_mlp": 1.02376544, + "epoch": 0.23760709454381482, + "flos": 19499968650240.0, + "grad_norm": 1.7823565503616736, + "language_loss": 0.75906181, + "learning_rate": 3.564651119602903e-06, + "loss": 0.78044403, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.546875, + "step": 3952, + "time_per_iteration": 2.3894307613372803 + }, + { + "auxiliary_loss_clip": 0.01078169, + "auxiliary_loss_mlp": 0.01059294, + "balance_loss_clip": 1.02596354, + "balance_loss_mlp": 1.02391243, + "epoch": 0.23766721779648278, + "flos": 27635360106240.0, + "grad_norm": 2.32734103973321, + "language_loss": 0.71840394, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73977852, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.54296875, + "step": 3953, + "time_per_iteration": 2.4721179008483887 + }, + { + "auxiliary_loss_clip": 0.0107791, + "auxiliary_loss_mlp": 0.01058801, + "balance_loss_clip": 1.02220368, + "balance_loss_mlp": 1.02347124, + "epoch": 0.23772734104915075, + "flos": 23403223474560.0, + "grad_norm": 1.925823514329314, + "language_loss": 0.83508605, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.85645318, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.546875, + "step": 3954, + "time_per_iteration": 2.434901714324951 + }, + { + "auxiliary_loss_clip": 0.01078501, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.01801836, + "balance_loss_mlp": 1.02425802, + "epoch": 0.23778746430181874, + "flos": 15704595527040.0, + "grad_norm": 2.426720640614203, + "language_loss": 0.68861401, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.70995545, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.54296875, + "step": 3955, + "time_per_iteration": 2.3716843128204346 + }, + { + "auxiliary_loss_clip": 0.0107431, + "auxiliary_loss_mlp": 0.01053697, + "balance_loss_clip": 1.01786304, + "balance_loss_mlp": 1.02260876, + "epoch": 0.2378475875544867, + "flos": 19425464075520.0, + "grad_norm": 1.5806167571069345, + "language_loss": 0.84614837, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86742842, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.51953125, + "step": 3956, + "time_per_iteration": 2.4694573879241943 + }, + { + "auxiliary_loss_clip": 0.01072267, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.01212215, + "balance_loss_mlp": 1.02222812, + "epoch": 0.23790771080715467, + "flos": 22267603814400.0, + "grad_norm": 1.9615500310051626, + "language_loss": 0.86739689, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.88856405, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.5, + "step": 3957, + "time_per_iteration": 2.4048397541046143 + }, + { + "auxiliary_loss_clip": 0.01075318, + "auxiliary_loss_mlp": 0.01057061, + "balance_loss_clip": 1.02274024, + "balance_loss_mlp": 1.02256763, + "epoch": 0.23796783405982264, + "flos": 20046289605120.0, + "grad_norm": 2.1378156335997094, + "language_loss": 0.71441412, + "learning_rate": 3.563194548575151e-06, + "loss": 0.73573792, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.52734375, + "step": 3958, + "time_per_iteration": 2.4352898597717285 + }, + { + "auxiliary_loss_clip": 0.0107549, + "auxiliary_loss_mlp": 0.01059731, + "balance_loss_clip": 1.02179825, + "balance_loss_mlp": 1.02172995, + "epoch": 0.2380279573124906, + "flos": 14245086384000.0, + "grad_norm": 2.379183965493073, + "language_loss": 0.69287455, + "learning_rate": 3.562951579215745e-06, + "loss": 0.71422672, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.53515625, + "step": 3959, + "time_per_iteration": 2.3870909214019775 + }, + { + "auxiliary_loss_clip": 0.01074025, + "auxiliary_loss_mlp": 0.01053887, + "balance_loss_clip": 1.01948357, + "balance_loss_mlp": 1.02135372, + "epoch": 0.23808808056515857, + "flos": 21178103927040.0, + "grad_norm": 1.8965668801310605, + "language_loss": 0.74054658, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.76182568, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.52734375, + "step": 3960, + "time_per_iteration": 2.431682825088501 + }, + { + "auxiliary_loss_clip": 0.01076553, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.01458871, + "balance_loss_mlp": 1.02304292, + "epoch": 0.23814820381782653, + "flos": 22527217751040.0, + "grad_norm": 1.7296033976366532, + "language_loss": 0.76493096, + "learning_rate": 3.562465462704307e-06, + "loss": 0.78621805, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.53515625, + "step": 3961, + "time_per_iteration": 2.4458205699920654 + }, + { + "auxiliary_loss_clip": 0.01075924, + "auxiliary_loss_mlp": 0.01062578, + "balance_loss_clip": 1.02121258, + "balance_loss_mlp": 1.02124274, + "epoch": 0.23820832707049452, + "flos": 22303389824640.0, + "grad_norm": 2.0898375366064554, + "language_loss": 0.67958164, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.70096672, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.546875, + "step": 3962, + "time_per_iteration": 2.403914213180542 + }, + { + "auxiliary_loss_clip": 0.0107492, + "auxiliary_loss_mlp": 0.0105338, + "balance_loss_clip": 1.01752186, + "balance_loss_mlp": 1.02208567, + "epoch": 0.2382684503231625, + "flos": 24863046819840.0, + "grad_norm": 1.820191376082861, + "language_loss": 0.75947493, + "learning_rate": 3.561979109197483e-06, + "loss": 0.7807579, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 3963, + "time_per_iteration": 2.459411382675171 + }, + { + "auxiliary_loss_clip": 0.0107913, + "auxiliary_loss_mlp": 0.01053349, + "balance_loss_clip": 1.01682317, + "balance_loss_mlp": 1.02497435, + "epoch": 0.23832857357583045, + "flos": 21870536388480.0, + "grad_norm": 2.488666506472448, + "language_loss": 0.79385018, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.81517494, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5390625, + "step": 3964, + "time_per_iteration": 2.425105333328247 + }, + { + "auxiliary_loss_clip": 0.01073998, + "auxiliary_loss_mlp": 0.0105491, + "balance_loss_clip": 1.02088797, + "balance_loss_mlp": 1.02191877, + "epoch": 0.23838869682849842, + "flos": 21286998057600.0, + "grad_norm": 2.5090335301211457, + "language_loss": 0.73518348, + "learning_rate": 3.561492518769045e-06, + "loss": 0.75647259, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5234375, + "step": 3965, + "time_per_iteration": 2.453462839126587 + }, + { + "auxiliary_loss_clip": 0.0107433, + "auxiliary_loss_mlp": 0.01052358, + "balance_loss_clip": 1.01869369, + "balance_loss_mlp": 1.02297115, + "epoch": 0.23844882008116638, + "flos": 16179658663680.0, + "grad_norm": 1.7527664008769912, + "language_loss": 0.80005145, + "learning_rate": 3.561249134732282e-06, + "loss": 0.82131839, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51171875, + "step": 3966, + "time_per_iteration": 2.4368093013763428 + }, + { + "auxiliary_loss_clip": 0.0107707, + "auxiliary_loss_mlp": 0.01056382, + "balance_loss_clip": 1.02257383, + "balance_loss_mlp": 1.02455652, + "epoch": 0.23850894333383435, + "flos": 21068651214720.0, + "grad_norm": 1.532383176698599, + "language_loss": 0.70064682, + "learning_rate": 3.561005691492797e-06, + "loss": 0.72198129, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5234375, + "step": 3967, + "time_per_iteration": 2.3868916034698486 + }, + { + "auxiliary_loss_clip": 0.01074362, + "auxiliary_loss_mlp": 0.01061574, + "balance_loss_clip": 1.02411819, + "balance_loss_mlp": 1.02198434, + "epoch": 0.23856906658650234, + "flos": 17200658730240.0, + "grad_norm": 2.0387623600773086, + "language_loss": 0.70291322, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.72427255, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5234375, + "step": 3968, + "time_per_iteration": 2.4130916595458984 + }, + { + "auxiliary_loss_clip": 0.01074555, + "auxiliary_loss_mlp": 0.01059565, + "balance_loss_clip": 1.02375472, + "balance_loss_mlp": 1.02082109, + "epoch": 0.2386291898391703, + "flos": 29493018927360.0, + "grad_norm": 2.300239420982784, + "language_loss": 0.78684676, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.80818802, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5390625, + "step": 3969, + "time_per_iteration": 3.903502941131592 + }, + { + "auxiliary_loss_clip": 0.01071671, + "auxiliary_loss_mlp": 0.01047663, + "balance_loss_clip": 1.01561987, + "balance_loss_mlp": 1.02214098, + "epoch": 0.23868931309183827, + "flos": 21141375310080.0, + "grad_norm": 3.2777187657664433, + "language_loss": 0.78482622, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.80601954, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.49414062, + "step": 3970, + "time_per_iteration": 2.417163848876953 + }, + { + "auxiliary_loss_clip": 0.0107495, + "auxiliary_loss_mlp": 0.01062242, + "balance_loss_clip": 1.02507257, + "balance_loss_mlp": 1.02118909, + "epoch": 0.23874943634450624, + "flos": 25658403569280.0, + "grad_norm": 2.0040316741195485, + "language_loss": 0.86180788, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.88317978, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5390625, + "step": 3971, + "time_per_iteration": 2.4627456665039062 + }, + { + "auxiliary_loss_clip": 0.01020481, + "auxiliary_loss_mlp": 0.01020322, + "balance_loss_clip": 1.0164361, + "balance_loss_mlp": 1.00798035, + "epoch": 0.2388095595971742, + "flos": 58983243079680.0, + "grad_norm": 0.7530235810556672, + "language_loss": 0.62846982, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64887786, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.03881836, + "router_z_loss_mlp": 0.125, + "step": 3972, + "time_per_iteration": 3.1184005737304688 + }, + { + "auxiliary_loss_clip": 0.01073938, + "auxiliary_loss_mlp": 0.01049, + "balance_loss_clip": 1.01426256, + "balance_loss_mlp": 1.02158201, + "epoch": 0.23886968284984217, + "flos": 16799401941120.0, + "grad_norm": 3.0657404849008376, + "language_loss": 0.84538603, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.86661541, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5234375, + "step": 3973, + "time_per_iteration": 3.9098262786865234 + }, + { + "auxiliary_loss_clip": 0.01075796, + "auxiliary_loss_mlp": 0.01059883, + "balance_loss_clip": 1.02507389, + "balance_loss_mlp": 1.02222466, + "epoch": 0.23892980610251013, + "flos": 22381560092160.0, + "grad_norm": 1.6517086498178086, + "language_loss": 0.80565089, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.82700765, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53515625, + "step": 3974, + "time_per_iteration": 3.838167428970337 + }, + { + "auxiliary_loss_clip": 0.0107534, + "auxiliary_loss_mlp": 0.01057331, + "balance_loss_clip": 1.01870751, + "balance_loss_mlp": 1.02172232, + "epoch": 0.23898992935517813, + "flos": 12822375680640.0, + "grad_norm": 1.838919502493363, + "language_loss": 0.8665818, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.88790858, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.53515625, + "step": 3975, + "time_per_iteration": 2.3493382930755615 + }, + { + "auxiliary_loss_clip": 0.01073377, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_clip": 1.0178597, + "balance_loss_mlp": 1.02103019, + "epoch": 0.2390500526078461, + "flos": 22344587095680.0, + "grad_norm": 2.176846875396924, + "language_loss": 0.85229903, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.87357754, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5234375, + "step": 3976, + "time_per_iteration": 2.426551103591919 + }, + { + "auxiliary_loss_clip": 0.01072137, + "auxiliary_loss_mlp": 0.01053747, + "balance_loss_clip": 1.01588643, + "balance_loss_mlp": 1.02019858, + "epoch": 0.23911017586051406, + "flos": 22634121934080.0, + "grad_norm": 6.509190957931099, + "language_loss": 0.75725096, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.77850986, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.51953125, + "step": 3977, + "time_per_iteration": 2.4058144092559814 + }, + { + "auxiliary_loss_clip": 0.01076447, + "auxiliary_loss_mlp": 0.01062771, + "balance_loss_clip": 1.02696013, + "balance_loss_mlp": 1.02269435, + "epoch": 0.23917029911318202, + "flos": 23652329091840.0, + "grad_norm": 1.7344880242107745, + "language_loss": 0.73472315, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.75611532, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5390625, + "step": 3978, + "time_per_iteration": 2.453629732131958 + }, + { + "auxiliary_loss_clip": 0.0107818, + "auxiliary_loss_mlp": 0.01057078, + "balance_loss_clip": 1.01747656, + "balance_loss_mlp": 1.02360213, + "epoch": 0.23923042236585, + "flos": 22782502679040.0, + "grad_norm": 2.8917501261807246, + "language_loss": 0.80336416, + "learning_rate": 3.558079758168997e-06, + "loss": 0.82471675, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.546875, + "step": 3979, + "time_per_iteration": 2.4246766567230225 + }, + { + "auxiliary_loss_clip": 0.01074087, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_clip": 1.03039598, + "balance_loss_mlp": 1.02157116, + "epoch": 0.23929054561851795, + "flos": 28146453632640.0, + "grad_norm": 1.7209041263400275, + "language_loss": 0.83129001, + "learning_rate": 3.557835546134977e-06, + "loss": 0.85271561, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5234375, + "step": 3980, + "time_per_iteration": 2.482715368270874 + }, + { + "auxiliary_loss_clip": 0.01073134, + "auxiliary_loss_mlp": 0.01055505, + "balance_loss_clip": 1.01742935, + "balance_loss_mlp": 1.02136803, + "epoch": 0.23935066887118592, + "flos": 21685531760640.0, + "grad_norm": 1.8140468471541995, + "language_loss": 0.84734297, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86862934, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.515625, + "step": 3981, + "time_per_iteration": 2.4528872966766357 + }, + { + "auxiliary_loss_clip": 0.01077341, + "auxiliary_loss_mlp": 0.01058149, + "balance_loss_clip": 1.02047849, + "balance_loss_mlp": 1.02256787, + "epoch": 0.2394107921238539, + "flos": 32120966275200.0, + "grad_norm": 2.5778757783985577, + "language_loss": 0.77666903, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79802394, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.546875, + "step": 3982, + "time_per_iteration": 2.5151655673980713 + }, + { + "auxiliary_loss_clip": 0.01075816, + "auxiliary_loss_mlp": 0.01057734, + "balance_loss_clip": 1.02187538, + "balance_loss_mlp": 1.02244854, + "epoch": 0.23947091537652188, + "flos": 17018237543040.0, + "grad_norm": 3.242368148338704, + "language_loss": 0.78947818, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.81081372, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53515625, + "step": 3983, + "time_per_iteration": 2.4055511951446533 + }, + { + "auxiliary_loss_clip": 0.0107449, + "auxiliary_loss_mlp": 0.01059133, + "balance_loss_clip": 1.01872063, + "balance_loss_mlp": 1.02171326, + "epoch": 0.23953103862918984, + "flos": 20592575648640.0, + "grad_norm": 1.8207216195305846, + "language_loss": 0.74724627, + "learning_rate": 3.556858107358737e-06, + "loss": 0.76858246, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.52734375, + "step": 3984, + "time_per_iteration": 2.467677116394043 + }, + { + "auxiliary_loss_clip": 0.01078624, + "auxiliary_loss_mlp": 0.01059093, + "balance_loss_clip": 1.02092195, + "balance_loss_mlp": 1.02335751, + "epoch": 0.2395911618818578, + "flos": 20703354992640.0, + "grad_norm": 2.0409626954330964, + "language_loss": 0.80851054, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.82988769, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5546875, + "step": 3985, + "time_per_iteration": 2.40859055519104 + }, + { + "auxiliary_loss_clip": 0.01076842, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.02195275, + "balance_loss_mlp": 1.02363205, + "epoch": 0.23965128513452577, + "flos": 27052275623040.0, + "grad_norm": 1.947394766236021, + "language_loss": 0.75091326, + "learning_rate": 3.556369033716254e-06, + "loss": 0.77228338, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.53125, + "step": 3986, + "time_per_iteration": 2.463433265686035 + }, + { + "auxiliary_loss_clip": 0.01080054, + "auxiliary_loss_mlp": 0.01062253, + "balance_loss_clip": 1.02305675, + "balance_loss_mlp": 1.0243268, + "epoch": 0.23971140838719374, + "flos": 23143330247040.0, + "grad_norm": 1.955056923679609, + "language_loss": 0.89247888, + "learning_rate": 3.556124408363871e-06, + "loss": 0.91390193, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.55859375, + "step": 3987, + "time_per_iteration": 2.4156103134155273 + }, + { + "auxiliary_loss_clip": 0.010722, + "auxiliary_loss_mlp": 0.01050684, + "balance_loss_clip": 1.01768708, + "balance_loss_mlp": 1.02254272, + "epoch": 0.23977153163986173, + "flos": 18033756526080.0, + "grad_norm": 2.3098642636749864, + "language_loss": 0.85421145, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.8754403, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.49609375, + "step": 3988, + "time_per_iteration": 2.3934428691864014 + }, + { + "auxiliary_loss_clip": 0.0107641, + "auxiliary_loss_mlp": 0.01057907, + "balance_loss_clip": 1.02003407, + "balance_loss_mlp": 1.02280843, + "epoch": 0.2398316548925297, + "flos": 18112415552640.0, + "grad_norm": 1.7469935438475088, + "language_loss": 0.86585778, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.88720095, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.53515625, + "step": 3989, + "time_per_iteration": 2.424023151397705 + }, + { + "auxiliary_loss_clip": 0.01074542, + "auxiliary_loss_mlp": 0.01051432, + "balance_loss_clip": 1.01569307, + "balance_loss_mlp": 1.02212703, + "epoch": 0.23989177814519766, + "flos": 12566916195840.0, + "grad_norm": 2.0359668325166655, + "language_loss": 0.86223918, + "learning_rate": 3.555390178293477e-06, + "loss": 0.88349891, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5234375, + "step": 3990, + "time_per_iteration": 2.372601270675659 + }, + { + "auxiliary_loss_clip": 0.01073526, + "auxiliary_loss_mlp": 0.01055634, + "balance_loss_clip": 1.02187443, + "balance_loss_mlp": 1.02186215, + "epoch": 0.23995190139786562, + "flos": 25263430824960.0, + "grad_norm": 1.5013292090190815, + "language_loss": 0.77250898, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.79380059, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.515625, + "step": 3991, + "time_per_iteration": 2.4849021434783936 + }, + { + "auxiliary_loss_clip": 0.01019178, + "auxiliary_loss_mlp": 0.01011908, + "balance_loss_clip": 1.00752103, + "balance_loss_mlp": 1.00590038, + "epoch": 0.2400120246505336, + "flos": 61957425047040.0, + "grad_norm": 0.8880633501765254, + "language_loss": 0.63784277, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65815365, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.04394531, + "router_z_loss_mlp": 0.1328125, + "step": 3992, + "time_per_iteration": 3.004757881164551 + }, + { + "auxiliary_loss_clip": 0.01019162, + "auxiliary_loss_mlp": 0.01009359, + "balance_loss_clip": 1.00423312, + "balance_loss_mlp": 1.00566673, + "epoch": 0.24007214790320155, + "flos": 66705333327360.0, + "grad_norm": 0.7573507134863463, + "language_loss": 0.62977213, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65005732, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.05126953, + "router_z_loss_mlp": 0.13476562, + "step": 3993, + "time_per_iteration": 3.1419780254364014 + }, + { + "auxiliary_loss_clip": 0.01081069, + "auxiliary_loss_mlp": 0.01059175, + "balance_loss_clip": 1.02281642, + "balance_loss_mlp": 1.02498007, + "epoch": 0.24013227115586952, + "flos": 25807971300480.0, + "grad_norm": 1.847983505272801, + "language_loss": 0.7845881, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.80599058, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5625, + "step": 3994, + "time_per_iteration": 2.457596778869629 + }, + { + "auxiliary_loss_clip": 0.01077279, + "auxiliary_loss_mlp": 0.01065415, + "balance_loss_clip": 1.02505112, + "balance_loss_mlp": 1.02334154, + "epoch": 0.2401923944085375, + "flos": 25556282242560.0, + "grad_norm": 2.077118922438508, + "language_loss": 0.79705656, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.81848347, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5390625, + "step": 3995, + "time_per_iteration": 2.4557464122772217 + }, + { + "auxiliary_loss_clip": 0.01021171, + "auxiliary_loss_mlp": 0.01007207, + "balance_loss_clip": 1.0028199, + "balance_loss_mlp": 1.00831294, + "epoch": 0.24025251766120548, + "flos": 54937751909760.0, + "grad_norm": 0.9164335503466496, + "language_loss": 0.63519537, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65547907, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.04394531, + "router_z_loss_mlp": 0.12890625, + "step": 3996, + "time_per_iteration": 3.11061692237854 + }, + { + "auxiliary_loss_clip": 0.01080885, + "auxiliary_loss_mlp": 0.01068858, + "balance_loss_clip": 1.03063965, + "balance_loss_mlp": 1.02498055, + "epoch": 0.24031264091387344, + "flos": 20630037404160.0, + "grad_norm": 2.687051859787606, + "language_loss": 0.71523559, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.73673302, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.55859375, + "step": 3997, + "time_per_iteration": 2.390627145767212 + }, + { + "auxiliary_loss_clip": 0.0107919, + "auxiliary_loss_mlp": 0.01063947, + "balance_loss_clip": 1.02987719, + "balance_loss_mlp": 1.02494764, + "epoch": 0.2403727641665414, + "flos": 20885217598080.0, + "grad_norm": 1.831951281247983, + "language_loss": 0.88359123, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.90502262, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.54296875, + "step": 3998, + "time_per_iteration": 2.396756410598755 + }, + { + "auxiliary_loss_clip": 0.01079247, + "auxiliary_loss_mlp": 0.01058418, + "balance_loss_clip": 1.02067673, + "balance_loss_mlp": 1.02250886, + "epoch": 0.24043288741920937, + "flos": 22818952005120.0, + "grad_norm": 1.7233867684568847, + "language_loss": 0.77454561, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.79592222, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.56640625, + "step": 3999, + "time_per_iteration": 2.3993148803710938 + }, + { + "auxiliary_loss_clip": 0.01077512, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_clip": 1.03583789, + "balance_loss_mlp": 1.0239985, + "epoch": 0.24049301067187734, + "flos": 27958551361920.0, + "grad_norm": 2.154910441416475, + "language_loss": 0.74891686, + "learning_rate": 3.552938912398679e-06, + "loss": 0.77038801, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.53515625, + "step": 4000, + "time_per_iteration": 2.5048508644104004 + }, + { + "auxiliary_loss_clip": 0.01079859, + "auxiliary_loss_mlp": 0.01075256, + "balance_loss_clip": 1.03672743, + "balance_loss_mlp": 1.02325892, + "epoch": 0.24055313392454533, + "flos": 27450250744320.0, + "grad_norm": 1.7317595373730905, + "language_loss": 0.67870224, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.70025337, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.56640625, + "step": 4001, + "time_per_iteration": 2.4317779541015625 + }, + { + "auxiliary_loss_clip": 0.010764, + "auxiliary_loss_mlp": 0.01067419, + "balance_loss_clip": 1.02970147, + "balance_loss_mlp": 1.02202022, + "epoch": 0.2406132571772133, + "flos": 25555444369920.0, + "grad_norm": 1.7614096401265154, + "language_loss": 0.84261513, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.86405337, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.54296875, + "step": 4002, + "time_per_iteration": 2.493609666824341 + }, + { + "auxiliary_loss_clip": 0.0107564, + "auxiliary_loss_mlp": 0.01064927, + "balance_loss_clip": 1.0281148, + "balance_loss_mlp": 1.02226675, + "epoch": 0.24067338042988126, + "flos": 24790217990400.0, + "grad_norm": 2.109105661742295, + "language_loss": 0.84948802, + "learning_rate": 3.552202383898897e-06, + "loss": 0.87089366, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.53515625, + "step": 4003, + "time_per_iteration": 2.4183051586151123 + }, + { + "auxiliary_loss_clip": 0.01078542, + "auxiliary_loss_mlp": 0.01066714, + "balance_loss_clip": 1.0267787, + "balance_loss_mlp": 1.02382565, + "epoch": 0.24073350368254923, + "flos": 21176882029440.0, + "grad_norm": 2.137383665741016, + "language_loss": 0.88617098, + "learning_rate": 3.551956756667215e-06, + "loss": 0.90762353, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.546875, + "step": 4004, + "time_per_iteration": 2.4606592655181885 + }, + { + "auxiliary_loss_clip": 0.01081638, + "auxiliary_loss_mlp": 0.01067394, + "balance_loss_clip": 1.02922344, + "balance_loss_mlp": 1.0248735, + "epoch": 0.2407936269352172, + "flos": 22493142397440.0, + "grad_norm": 2.0558894859289034, + "language_loss": 0.79546916, + "learning_rate": 3.551711070585177e-06, + "loss": 0.8169595, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.56640625, + "step": 4005, + "time_per_iteration": 2.4592299461364746 + }, + { + "auxiliary_loss_clip": 0.01077054, + "auxiliary_loss_mlp": 0.01053745, + "balance_loss_clip": 1.01745737, + "balance_loss_mlp": 1.02484846, + "epoch": 0.24085375018788516, + "flos": 18550156579200.0, + "grad_norm": 1.840116936072851, + "language_loss": 0.79304707, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81435502, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51953125, + "step": 4006, + "time_per_iteration": 2.4487011432647705 + }, + { + "auxiliary_loss_clip": 0.01083178, + "auxiliary_loss_mlp": 0.01071087, + "balance_loss_clip": 1.02988863, + "balance_loss_mlp": 1.02474582, + "epoch": 0.24091387344055312, + "flos": 24169392460800.0, + "grad_norm": 1.7138987185899188, + "language_loss": 0.72930193, + "learning_rate": 3.551219521907302e-06, + "loss": 0.7508446, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.58203125, + "step": 4007, + "time_per_iteration": 2.4413487911224365 + }, + { + "auxiliary_loss_clip": 0.01077413, + "auxiliary_loss_mlp": 0.01051977, + "balance_loss_clip": 1.01647663, + "balance_loss_mlp": 1.02386427, + "epoch": 0.24097399669322112, + "flos": 11035520830080.0, + "grad_norm": 1.8450255246582303, + "language_loss": 0.77505124, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.79634517, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5390625, + "step": 4008, + "time_per_iteration": 2.465357542037964 + }, + { + "auxiliary_loss_clip": 0.0108049, + "auxiliary_loss_mlp": 0.01054006, + "balance_loss_clip": 1.01759934, + "balance_loss_mlp": 1.02672732, + "epoch": 0.24103411994588908, + "flos": 17164139581440.0, + "grad_norm": 3.0711405382426578, + "language_loss": 0.76561511, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.78696012, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5390625, + "step": 4009, + "time_per_iteration": 3.776369571685791 + }, + { + "auxiliary_loss_clip": 0.01080384, + "auxiliary_loss_mlp": 0.01054209, + "balance_loss_clip": 1.0169208, + "balance_loss_mlp": 1.02711654, + "epoch": 0.24109424319855705, + "flos": 20666905666560.0, + "grad_norm": 1.9726961828837983, + "language_loss": 0.81644273, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8377887, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.53125, + "step": 4010, + "time_per_iteration": 2.40700364112854 + }, + { + "auxiliary_loss_clip": 0.01082261, + "auxiliary_loss_mlp": 0.01068792, + "balance_loss_clip": 1.02554309, + "balance_loss_mlp": 1.02668428, + "epoch": 0.241154366451225, + "flos": 28180598808960.0, + "grad_norm": 2.0270255832860293, + "language_loss": 0.71875942, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.7402699, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 0.5546875, + "step": 4011, + "time_per_iteration": 2.444082021713257 + }, + { + "auxiliary_loss_clip": 0.01081837, + "auxiliary_loss_mlp": 0.01053045, + "balance_loss_clip": 1.0184505, + "balance_loss_mlp": 1.02667844, + "epoch": 0.24121448970389298, + "flos": 21688638871680.0, + "grad_norm": 1.6089169072599032, + "language_loss": 0.70895493, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.7303037, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.55078125, + "step": 4012, + "time_per_iteration": 3.9555392265319824 + }, + { + "auxiliary_loss_clip": 0.01081763, + "auxiliary_loss_mlp": 0.01057022, + "balance_loss_clip": 1.02009082, + "balance_loss_mlp": 1.02704, + "epoch": 0.24127461295656094, + "flos": 39674634791040.0, + "grad_norm": 2.359561900469933, + "language_loss": 0.75559062, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.77697849, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.546875, + "step": 4013, + "time_per_iteration": 5.497684955596924 + }, + { + "auxiliary_loss_clip": 0.01081478, + "auxiliary_loss_mlp": 0.0105799, + "balance_loss_clip": 1.02372968, + "balance_loss_mlp": 1.02782941, + "epoch": 0.2413347362092289, + "flos": 19134846984960.0, + "grad_norm": 2.076783381083543, + "language_loss": 0.89921892, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.92061365, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5390625, + "step": 4014, + "time_per_iteration": 2.399038791656494 + }, + { + "auxiliary_loss_clip": 0.01085017, + "auxiliary_loss_mlp": 0.01060062, + "balance_loss_clip": 1.02248681, + "balance_loss_mlp": 1.02813566, + "epoch": 0.2413948594618969, + "flos": 26938319345280.0, + "grad_norm": 2.0197882489369507, + "language_loss": 0.96836126, + "learning_rate": 3.549250975045952e-06, + "loss": 0.98981202, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5703125, + "step": 4015, + "time_per_iteration": 2.447695255279541 + }, + { + "auxiliary_loss_clip": 0.0107916, + "auxiliary_loss_mlp": 0.01057615, + "balance_loss_clip": 1.02278161, + "balance_loss_mlp": 1.02442491, + "epoch": 0.24145498271456486, + "flos": 25226946587520.0, + "grad_norm": 1.7531597012689384, + "language_loss": 0.84258318, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.86395097, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.546875, + "step": 4016, + "time_per_iteration": 2.444931983947754 + }, + { + "auxiliary_loss_clip": 0.01077259, + "auxiliary_loss_mlp": 0.01060783, + "balance_loss_clip": 1.02642643, + "balance_loss_mlp": 1.02584529, + "epoch": 0.24151510596723283, + "flos": 40660163049600.0, + "grad_norm": 1.987612800760057, + "language_loss": 0.70728099, + "learning_rate": 3.54875825066639e-06, + "loss": 0.72866142, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4017, + "time_per_iteration": 2.559148073196411 + }, + { + "auxiliary_loss_clip": 0.01079824, + "auxiliary_loss_mlp": 0.01068088, + "balance_loss_clip": 1.03001308, + "balance_loss_mlp": 1.02367389, + "epoch": 0.2415752292199008, + "flos": 18145792679040.0, + "grad_norm": 1.779031024214066, + "language_loss": 0.85864198, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.88012111, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5625, + "step": 4018, + "time_per_iteration": 2.429260015487671 + }, + { + "auxiliary_loss_clip": 0.01017905, + "auxiliary_loss_mlp": 0.01019244, + "balance_loss_clip": 1.01521432, + "balance_loss_mlp": 1.00593841, + "epoch": 0.24163535247256876, + "flos": 67285275788160.0, + "grad_norm": 0.83927061487824, + "language_loss": 0.60754818, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62791967, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.12011719, + "step": 4019, + "time_per_iteration": 3.0853142738342285 + }, + { + "auxiliary_loss_clip": 0.01076133, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.02285337, + "balance_loss_mlp": 1.02239799, + "epoch": 0.24169547572523672, + "flos": 24928963200000.0, + "grad_norm": 2.24034955991038, + "language_loss": 0.74860072, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.76992702, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.53515625, + "step": 4020, + "time_per_iteration": 2.4534990787506104 + }, + { + "auxiliary_loss_clip": 0.01078018, + "auxiliary_loss_mlp": 0.01055379, + "balance_loss_clip": 1.02152395, + "balance_loss_mlp": 1.02515209, + "epoch": 0.24175559897790472, + "flos": 18727480707840.0, + "grad_norm": 2.1000586244176924, + "language_loss": 0.83267361, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.8540076, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.52734375, + "step": 4021, + "time_per_iteration": 2.4020185470581055 + }, + { + "auxiliary_loss_clip": 0.01079737, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.02509165, + "balance_loss_mlp": 1.02375758, + "epoch": 0.24181572223057268, + "flos": 23038171632000.0, + "grad_norm": 2.323561537512663, + "language_loss": 0.78040266, + "learning_rate": 3.547525412122378e-06, + "loss": 0.80184525, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.55859375, + "step": 4022, + "time_per_iteration": 2.470710515975952 + }, + { + "auxiliary_loss_clip": 0.01080567, + "auxiliary_loss_mlp": 0.01062884, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.02369666, + "epoch": 0.24187584548324065, + "flos": 20375101589760.0, + "grad_norm": 1.7775226075205641, + "language_loss": 0.77044034, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.79187489, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.56640625, + "step": 4023, + "time_per_iteration": 2.391624689102173 + }, + { + "auxiliary_loss_clip": 0.01079005, + "auxiliary_loss_mlp": 0.0105863, + "balance_loss_clip": 1.02208042, + "balance_loss_mlp": 1.02612901, + "epoch": 0.2419359687359086, + "flos": 21396450769920.0, + "grad_norm": 1.845963525267873, + "language_loss": 0.84466577, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.86604208, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.52734375, + "step": 4024, + "time_per_iteration": 2.4090967178344727 + }, + { + "auxiliary_loss_clip": 0.01076221, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_clip": 1.01936054, + "balance_loss_mlp": 1.02317858, + "epoch": 0.24199609198857658, + "flos": 18368398707840.0, + "grad_norm": 1.6746360328574112, + "language_loss": 0.87281585, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.89413351, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.53125, + "step": 4025, + "time_per_iteration": 2.3693811893463135 + }, + { + "auxiliary_loss_clip": 0.0107815, + "auxiliary_loss_mlp": 0.01058617, + "balance_loss_clip": 1.02368915, + "balance_loss_mlp": 1.02392602, + "epoch": 0.24205621524124454, + "flos": 19462856008320.0, + "grad_norm": 1.8871058157001273, + "language_loss": 0.73747367, + "learning_rate": 3.546538084949365e-06, + "loss": 0.7588414, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.54296875, + "step": 4026, + "time_per_iteration": 2.455371379852295 + }, + { + "auxiliary_loss_clip": 0.01077656, + "auxiliary_loss_mlp": 0.01055538, + "balance_loss_clip": 1.02242172, + "balance_loss_mlp": 1.02483821, + "epoch": 0.2421163384939125, + "flos": 14975434448640.0, + "grad_norm": 1.8684419400818408, + "language_loss": 0.66084582, + "learning_rate": 3.546291106520509e-06, + "loss": 0.68217778, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.52734375, + "step": 4027, + "time_per_iteration": 2.3672025203704834 + }, + { + "auxiliary_loss_clip": 0.01080366, + "auxiliary_loss_mlp": 0.01055988, + "balance_loss_clip": 1.01929498, + "balance_loss_mlp": 1.02544177, + "epoch": 0.2421764617465805, + "flos": 18661040657280.0, + "grad_norm": 2.150020879041981, + "language_loss": 0.72415972, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.74552321, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.546875, + "step": 4028, + "time_per_iteration": 2.435603380203247 + }, + { + "auxiliary_loss_clip": 0.01022265, + "auxiliary_loss_mlp": 0.01007043, + "balance_loss_clip": 1.00306118, + "balance_loss_mlp": 1.00923181, + "epoch": 0.24223658499924847, + "flos": 64343877454080.0, + "grad_norm": 0.9370061142320416, + "language_loss": 0.55335116, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57364428, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.13085938, + "step": 4029, + "time_per_iteration": 2.997204065322876 + }, + { + "auxiliary_loss_clip": 0.01080181, + "auxiliary_loss_mlp": 0.01059426, + "balance_loss_clip": 1.0187757, + "balance_loss_mlp": 1.02542043, + "epoch": 0.24229670825191643, + "flos": 25774070503680.0, + "grad_norm": 2.1093250819254914, + "language_loss": 0.75381267, + "learning_rate": 3.54554981945833e-06, + "loss": 0.77520871, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.546875, + "step": 4030, + "time_per_iteration": 2.4757838249206543 + }, + { + "auxiliary_loss_clip": 0.01078186, + "auxiliary_loss_mlp": 0.01057806, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.02560997, + "epoch": 0.2423568315045844, + "flos": 20666067793920.0, + "grad_norm": 1.8835533094366022, + "language_loss": 0.78362024, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.80498016, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5234375, + "step": 4031, + "time_per_iteration": 2.415560722351074 + }, + { + "auxiliary_loss_clip": 0.01082176, + "auxiliary_loss_mlp": 0.01061099, + "balance_loss_clip": 1.02204585, + "balance_loss_mlp": 1.02561474, + "epoch": 0.24241695475725236, + "flos": 22415775091200.0, + "grad_norm": 2.5950219368272496, + "language_loss": 0.67922282, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.70065558, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5625, + "step": 4032, + "time_per_iteration": 2.4404025077819824 + }, + { + "auxiliary_loss_clip": 0.0107995, + "auxiliary_loss_mlp": 0.01055751, + "balance_loss_clip": 1.02041721, + "balance_loss_mlp": 1.02571726, + "epoch": 0.24247707800992033, + "flos": 17128039368960.0, + "grad_norm": 2.0965458164921387, + "language_loss": 0.82925487, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.85061193, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.54296875, + "step": 4033, + "time_per_iteration": 2.366330146789551 + }, + { + "auxiliary_loss_clip": 0.01077019, + "auxiliary_loss_mlp": 0.01049303, + "balance_loss_clip": 1.01530421, + "balance_loss_mlp": 1.02631962, + "epoch": 0.2425372012625883, + "flos": 31612386366720.0, + "grad_norm": 1.8414824625200181, + "language_loss": 0.70649588, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.72775906, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4034, + "time_per_iteration": 2.512877941131592 + }, + { + "auxiliary_loss_clip": 0.01079243, + "auxiliary_loss_mlp": 0.01057042, + "balance_loss_clip": 1.02120733, + "balance_loss_mlp": 1.02503729, + "epoch": 0.24259732451525629, + "flos": 16325106854400.0, + "grad_norm": 2.49452827183652, + "language_loss": 0.97683889, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.99820173, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.54296875, + "step": 4035, + "time_per_iteration": 2.3536953926086426 + }, + { + "auxiliary_loss_clip": 0.01073839, + "auxiliary_loss_mlp": 0.01051327, + "balance_loss_clip": 1.01943851, + "balance_loss_mlp": 1.02462959, + "epoch": 0.24265744776792425, + "flos": 22855540976640.0, + "grad_norm": 1.4985255712496286, + "language_loss": 0.79332823, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.81457984, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4921875, + "step": 4036, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01078473, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.02111983, + "balance_loss_mlp": 1.02519464, + "epoch": 0.24271757102059222, + "flos": 21870501477120.0, + "grad_norm": 1.579150100521124, + "language_loss": 0.75897521, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.78034902, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.53125, + "step": 4037, + "time_per_iteration": 2.3892405033111572 + }, + { + "auxiliary_loss_clip": 0.01077016, + "auxiliary_loss_mlp": 0.01060729, + "balance_loss_clip": 1.02353573, + "balance_loss_mlp": 1.02291155, + "epoch": 0.24277769427326018, + "flos": 19207571080320.0, + "grad_norm": 2.7699569039793683, + "language_loss": 0.78764749, + "learning_rate": 3.543570475921171e-06, + "loss": 0.80902499, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.54296875, + "step": 4038, + "time_per_iteration": 2.4123494625091553 + }, + { + "auxiliary_loss_clip": 0.01076612, + "auxiliary_loss_mlp": 0.01061571, + "balance_loss_clip": 1.0248065, + "balance_loss_mlp": 1.02333963, + "epoch": 0.24283781752592815, + "flos": 19498886398080.0, + "grad_norm": 1.9446176555507715, + "language_loss": 0.73951364, + "learning_rate": 3.543322794484905e-06, + "loss": 0.76089549, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.53125, + "step": 4039, + "time_per_iteration": 2.3720524311065674 + }, + { + "auxiliary_loss_clip": 0.01077891, + "auxiliary_loss_mlp": 0.0106295, + "balance_loss_clip": 1.02811766, + "balance_loss_mlp": 1.02352035, + "epoch": 0.2428979407785961, + "flos": 19901155616640.0, + "grad_norm": 2.168588271105371, + "language_loss": 0.79834509, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.81975353, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.546875, + "step": 4040, + "time_per_iteration": 2.441763401031494 + }, + { + "auxiliary_loss_clip": 0.01071998, + "auxiliary_loss_mlp": 0.01050446, + "balance_loss_clip": 1.01756835, + "balance_loss_mlp": 1.0214678, + "epoch": 0.2429580640312641, + "flos": 24714770808960.0, + "grad_norm": 1.7118062181089024, + "language_loss": 0.81460315, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.83582759, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.50390625, + "step": 4041, + "time_per_iteration": 2.416722059249878 + }, + { + "auxiliary_loss_clip": 0.01077696, + "auxiliary_loss_mlp": 0.01059254, + "balance_loss_clip": 1.02258515, + "balance_loss_mlp": 1.02502942, + "epoch": 0.24301818728393207, + "flos": 25629145983360.0, + "grad_norm": 1.9311063059099964, + "language_loss": 0.78507715, + "learning_rate": 3.542579399075957e-06, + "loss": 0.80644667, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5234375, + "step": 4042, + "time_per_iteration": 2.492285966873169 + }, + { + "auxiliary_loss_clip": 0.01075065, + "auxiliary_loss_mlp": 0.0104853, + "balance_loss_clip": 1.01636708, + "balance_loss_mlp": 1.02317119, + "epoch": 0.24307831053660003, + "flos": 26140169687040.0, + "grad_norm": 2.034610150191903, + "language_loss": 0.82251257, + "learning_rate": 3.542331483604246e-06, + "loss": 0.84374857, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.51953125, + "step": 4043, + "time_per_iteration": 2.464989185333252 + }, + { + "auxiliary_loss_clip": 0.01078247, + "auxiliary_loss_mlp": 0.01056528, + "balance_loss_clip": 1.02055049, + "balance_loss_mlp": 1.02341557, + "epoch": 0.243138433789268, + "flos": 14971629110400.0, + "grad_norm": 2.4000180548885983, + "language_loss": 0.75167894, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.7730267, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.546875, + "step": 4044, + "time_per_iteration": 2.4083938598632812 + }, + { + "auxiliary_loss_clip": 0.01077344, + "auxiliary_loss_mlp": 0.01054739, + "balance_loss_clip": 1.02057397, + "balance_loss_mlp": 1.02513921, + "epoch": 0.24319855704193596, + "flos": 25190532172800.0, + "grad_norm": 1.7637474235695692, + "language_loss": 0.84580266, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.86712348, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.51953125, + "step": 4045, + "time_per_iteration": 2.503157377243042 + }, + { + "auxiliary_loss_clip": 0.01078927, + "auxiliary_loss_mlp": 0.0105616, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.02618515, + "epoch": 0.24325868029460393, + "flos": 22126135518720.0, + "grad_norm": 1.5663968267718156, + "language_loss": 0.87577933, + "learning_rate": 3.541587386314541e-06, + "loss": 0.89713013, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.52734375, + "step": 4046, + "time_per_iteration": 2.4831786155700684 + }, + { + "auxiliary_loss_clip": 0.01074936, + "auxiliary_loss_mlp": 0.01054695, + "balance_loss_clip": 1.01898038, + "balance_loss_mlp": 1.02477837, + "epoch": 0.2433188035472719, + "flos": 23581106184960.0, + "grad_norm": 1.802450615123667, + "language_loss": 0.74445379, + "learning_rate": 3.5413392369578e-06, + "loss": 0.76575005, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.50390625, + "step": 4047, + "time_per_iteration": 2.4012045860290527 + }, + { + "auxiliary_loss_clip": 0.01077826, + "auxiliary_loss_mlp": 0.01054701, + "balance_loss_clip": 1.01779377, + "balance_loss_mlp": 1.0248189, + "epoch": 0.2433789267999399, + "flos": 24461650385280.0, + "grad_norm": 3.85577516962487, + "language_loss": 0.75266147, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.7739867, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53125, + "step": 4048, + "time_per_iteration": 3.854748010635376 + }, + { + "auxiliary_loss_clip": 0.01076663, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.01829147, + "balance_loss_mlp": 1.02549636, + "epoch": 0.24343905005260785, + "flos": 16726957136640.0, + "grad_norm": 1.898110146032987, + "language_loss": 0.74694324, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.76821017, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.51171875, + "step": 4049, + "time_per_iteration": 2.3780853748321533 + }, + { + "auxiliary_loss_clip": 0.01074201, + "auxiliary_loss_mlp": 0.01046884, + "balance_loss_clip": 1.01505554, + "balance_loss_mlp": 1.02330399, + "epoch": 0.24349917330527582, + "flos": 20042833380480.0, + "grad_norm": 1.6514405730114698, + "language_loss": 0.75455326, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.77576411, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.5078125, + "step": 4050, + "time_per_iteration": 2.404536485671997 + }, + { + "auxiliary_loss_clip": 0.01074891, + "auxiliary_loss_mlp": 0.01049501, + "balance_loss_clip": 1.0165987, + "balance_loss_mlp": 1.0237174, + "epoch": 0.24355929655794378, + "flos": 17419599066240.0, + "grad_norm": 2.837810372424419, + "language_loss": 0.78151459, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.80275846, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.51171875, + "step": 4051, + "time_per_iteration": 2.360166072845459 + }, + { + "auxiliary_loss_clip": 0.01076745, + "auxiliary_loss_mlp": 0.01053023, + "balance_loss_clip": 1.01814246, + "balance_loss_mlp": 1.02445269, + "epoch": 0.24361941981061175, + "flos": 25409751799680.0, + "grad_norm": 2.225092795586866, + "language_loss": 0.71622491, + "learning_rate": 3.540097613646296e-06, + "loss": 0.73752266, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5234375, + "step": 4052, + "time_per_iteration": 5.361985206604004 + }, + { + "auxiliary_loss_clip": 0.01079084, + "auxiliary_loss_mlp": 0.0105512, + "balance_loss_clip": 1.01892769, + "balance_loss_mlp": 1.02556324, + "epoch": 0.2436795430632797, + "flos": 22819685143680.0, + "grad_norm": 4.859315187168331, + "language_loss": 0.82454562, + "learning_rate": 3.539849113744351e-06, + "loss": 0.84588766, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.53515625, + "step": 4053, + "time_per_iteration": 3.8306734561920166 + }, + { + "auxiliary_loss_clip": 0.01079796, + "auxiliary_loss_mlp": 0.01056215, + "balance_loss_clip": 1.01873565, + "balance_loss_mlp": 1.02462983, + "epoch": 0.2437396663159477, + "flos": 15156913029120.0, + "grad_norm": 1.6102297878755432, + "language_loss": 0.78477401, + "learning_rate": 3.539600555451172e-06, + "loss": 0.8061341, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.55078125, + "step": 4054, + "time_per_iteration": 2.3878581523895264 + }, + { + "auxiliary_loss_clip": 0.01076121, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.02334356, + "epoch": 0.24379978956861567, + "flos": 22090035306240.0, + "grad_norm": 1.5859927161887852, + "language_loss": 0.85046244, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.87182963, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.52734375, + "step": 4055, + "time_per_iteration": 2.395505666732788 + }, + { + "auxiliary_loss_clip": 0.01078852, + "auxiliary_loss_mlp": 0.01055163, + "balance_loss_clip": 1.01744509, + "balance_loss_mlp": 1.02343678, + "epoch": 0.24385991282128364, + "flos": 31466414505600.0, + "grad_norm": 4.145669128228798, + "language_loss": 0.57510948, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.59644961, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5546875, + "step": 4056, + "time_per_iteration": 2.571068048477173 + }, + { + "auxiliary_loss_clip": 0.01078051, + "auxiliary_loss_mlp": 0.01064581, + "balance_loss_clip": 1.02838922, + "balance_loss_mlp": 1.0230974, + "epoch": 0.2439200360739516, + "flos": 23837752656000.0, + "grad_norm": 2.4295222152856066, + "language_loss": 0.8144123, + "learning_rate": 3.538854530318506e-06, + "loss": 0.83583862, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.55078125, + "step": 4057, + "time_per_iteration": 2.3896138668060303 + }, + { + "auxiliary_loss_clip": 0.01073432, + "auxiliary_loss_mlp": 0.01058698, + "balance_loss_clip": 1.02374601, + "balance_loss_mlp": 1.02191103, + "epoch": 0.24398015932661957, + "flos": 19169027072640.0, + "grad_norm": 1.8453101238365188, + "language_loss": 0.80755377, + "learning_rate": 3.538605738554673e-06, + "loss": 0.82887506, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.515625, + "step": 4058, + "time_per_iteration": 2.4148991107940674 + }, + { + "auxiliary_loss_clip": 0.01077334, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.0191555, + "balance_loss_mlp": 1.02241826, + "epoch": 0.24404028257928753, + "flos": 25261371054720.0, + "grad_norm": 1.7857767011827075, + "language_loss": 0.8669045, + "learning_rate": 3.538356888446756e-06, + "loss": 0.88823211, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.55078125, + "step": 4059, + "time_per_iteration": 2.4296374320983887 + }, + { + "auxiliary_loss_clip": 0.01073479, + "auxiliary_loss_mlp": 0.01052023, + "balance_loss_clip": 1.01964605, + "balance_loss_mlp": 1.02305579, + "epoch": 0.2441004058319555, + "flos": 26466433142400.0, + "grad_norm": 2.1923685084732787, + "language_loss": 0.75336993, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.77462494, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.50390625, + "step": 4060, + "time_per_iteration": 2.4816532135009766 + }, + { + "auxiliary_loss_clip": 0.01081117, + "auxiliary_loss_mlp": 0.01062244, + "balance_loss_clip": 1.02049661, + "balance_loss_mlp": 1.0231719, + "epoch": 0.2441605290846235, + "flos": 26759319471360.0, + "grad_norm": 2.126842377447476, + "language_loss": 0.75674349, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.77817714, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.578125, + "step": 4061, + "time_per_iteration": 2.435314655303955 + }, + { + "auxiliary_loss_clip": 0.01074519, + "auxiliary_loss_mlp": 0.01050087, + "balance_loss_clip": 1.01692331, + "balance_loss_mlp": 1.02293003, + "epoch": 0.24422065233729146, + "flos": 21104786338560.0, + "grad_norm": 1.8316670769531485, + "language_loss": 0.77502209, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.79626811, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.515625, + "step": 4062, + "time_per_iteration": 2.4098353385925293 + }, + { + "auxiliary_loss_clip": 0.01072214, + "auxiliary_loss_mlp": 0.01054627, + "balance_loss_clip": 1.02112937, + "balance_loss_mlp": 1.02218962, + "epoch": 0.24428077558995942, + "flos": 25262034370560.0, + "grad_norm": 1.814163256189854, + "language_loss": 0.86056113, + "learning_rate": 3.537360904763011e-06, + "loss": 0.8818295, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5, + "step": 4063, + "time_per_iteration": 2.4213056564331055 + }, + { + "auxiliary_loss_clip": 0.01080323, + "auxiliary_loss_mlp": 0.0105313, + "balance_loss_clip": 1.01665139, + "balance_loss_mlp": 1.02515435, + "epoch": 0.24434089884262739, + "flos": 20484240099840.0, + "grad_norm": 2.365248376969329, + "language_loss": 0.71151829, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.73285282, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5546875, + "step": 4064, + "time_per_iteration": 2.419502019882202 + }, + { + "auxiliary_loss_clip": 0.01080727, + "auxiliary_loss_mlp": 0.01056408, + "balance_loss_clip": 1.01697361, + "balance_loss_mlp": 1.02358961, + "epoch": 0.24440102209529535, + "flos": 23620802267520.0, + "grad_norm": 1.8645364614035302, + "language_loss": 0.71372253, + "learning_rate": 3.536862563102088e-06, + "loss": 0.73509383, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5703125, + "step": 4065, + "time_per_iteration": 2.4062345027923584 + }, + { + "auxiliary_loss_clip": 0.01083289, + "auxiliary_loss_mlp": 0.01055913, + "balance_loss_clip": 1.01628804, + "balance_loss_mlp": 1.02581334, + "epoch": 0.24446114534796332, + "flos": 20553787261440.0, + "grad_norm": 1.901042979799193, + "language_loss": 0.86078542, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.88217747, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.578125, + "step": 4066, + "time_per_iteration": 2.391258478164673 + }, + { + "auxiliary_loss_clip": 0.01018473, + "auxiliary_loss_mlp": 0.01017425, + "balance_loss_clip": 1.01222765, + "balance_loss_mlp": 1.00539827, + "epoch": 0.24452126860063128, + "flos": 60386717623680.0, + "grad_norm": 0.7496188756551327, + "language_loss": 0.52401447, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54437339, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.05200195, + "router_z_loss_mlp": 0.13085938, + "step": 4067, + "time_per_iteration": 2.942322254180908 + }, + { + "auxiliary_loss_clip": 0.01080322, + "auxiliary_loss_mlp": 0.01062183, + "balance_loss_clip": 1.02286816, + "balance_loss_mlp": 1.02500129, + "epoch": 0.24458139185329927, + "flos": 15120777905280.0, + "grad_norm": 3.9295171125283317, + "language_loss": 0.7506758, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.77210093, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5546875, + "step": 4068, + "time_per_iteration": 2.413534641265869 + }, + { + "auxiliary_loss_clip": 0.01080881, + "auxiliary_loss_mlp": 0.01057166, + "balance_loss_clip": 1.01954353, + "balance_loss_mlp": 1.02718222, + "epoch": 0.24464151510596724, + "flos": 27997549217280.0, + "grad_norm": 2.480080020663797, + "language_loss": 0.78916574, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.81054622, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5390625, + "step": 4069, + "time_per_iteration": 2.467576503753662 + }, + { + "auxiliary_loss_clip": 0.01080329, + "auxiliary_loss_mlp": 0.01070378, + "balance_loss_clip": 1.03392386, + "balance_loss_mlp": 1.02777433, + "epoch": 0.2447016383586352, + "flos": 19791842549760.0, + "grad_norm": 1.9403531348174947, + "language_loss": 0.82166547, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.84317255, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.52734375, + "step": 4070, + "time_per_iteration": 2.4292781352996826 + }, + { + "auxiliary_loss_clip": 0.01077136, + "auxiliary_loss_mlp": 0.01060688, + "balance_loss_clip": 1.02452016, + "balance_loss_mlp": 1.02433181, + "epoch": 0.24476176161130317, + "flos": 26066153871360.0, + "grad_norm": 1.5008924857237538, + "language_loss": 0.84870189, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.87008011, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.52734375, + "step": 4071, + "time_per_iteration": 2.464749574661255 + }, + { + "auxiliary_loss_clip": 0.01082924, + "auxiliary_loss_mlp": 0.01074001, + "balance_loss_clip": 1.03449476, + "balance_loss_mlp": 1.02630389, + "epoch": 0.24482188486397113, + "flos": 18842554149120.0, + "grad_norm": 1.8849275454239685, + "language_loss": 0.81087017, + "learning_rate": 3.535116532028798e-06, + "loss": 0.83243942, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5625, + "step": 4072, + "time_per_iteration": 2.4048304557800293 + }, + { + "auxiliary_loss_clip": 0.01078391, + "auxiliary_loss_mlp": 0.01074576, + "balance_loss_clip": 1.04062521, + "balance_loss_mlp": 1.02600002, + "epoch": 0.2448820081166391, + "flos": 21250723288320.0, + "grad_norm": 3.295255207109717, + "language_loss": 0.71148777, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.73301744, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5234375, + "step": 4073, + "time_per_iteration": 2.4344606399536133 + }, + { + "auxiliary_loss_clip": 0.01078822, + "auxiliary_loss_mlp": 0.01064011, + "balance_loss_clip": 1.02877283, + "balance_loss_mlp": 1.0254252, + "epoch": 0.2449421313693071, + "flos": 23949474606720.0, + "grad_norm": 2.203530145134386, + "language_loss": 0.6982922, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.71972054, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53515625, + "step": 4074, + "time_per_iteration": 2.4109864234924316 + }, + { + "auxiliary_loss_clip": 0.0101987, + "auxiliary_loss_mlp": 0.01006861, + "balance_loss_clip": 1.00206864, + "balance_loss_mlp": 1.00734973, + "epoch": 0.24500225462197506, + "flos": 60684631188480.0, + "grad_norm": 0.9064427775436178, + "language_loss": 0.68771493, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70798224, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.04785156, + "router_z_loss_mlp": 0.125, + "step": 4075, + "time_per_iteration": 3.102020263671875 + }, + { + "auxiliary_loss_clip": 0.01075938, + "auxiliary_loss_mlp": 0.01072903, + "balance_loss_clip": 1.03516102, + "balance_loss_mlp": 1.02394795, + "epoch": 0.24506237787464302, + "flos": 26283069348480.0, + "grad_norm": 1.925283636322416, + "language_loss": 0.81347185, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.83496022, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.51953125, + "step": 4076, + "time_per_iteration": 2.456791400909424 + }, + { + "auxiliary_loss_clip": 0.01079452, + "auxiliary_loss_mlp": 0.01084592, + "balance_loss_clip": 1.04401362, + "balance_loss_mlp": 1.02329731, + "epoch": 0.245122501127311, + "flos": 20551413288960.0, + "grad_norm": 2.069191218410067, + "language_loss": 0.84023339, + "learning_rate": 3.533867620434151e-06, + "loss": 0.86187387, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.5625, + "step": 4077, + "time_per_iteration": 2.3764889240264893 + }, + { + "auxiliary_loss_clip": 0.01076783, + "auxiliary_loss_mlp": 0.01082822, + "balance_loss_clip": 1.04515219, + "balance_loss_mlp": 1.02233672, + "epoch": 0.24518262437997895, + "flos": 29131318575360.0, + "grad_norm": 1.9341679867177062, + "language_loss": 0.64506721, + "learning_rate": 3.533617663584082e-06, + "loss": 0.66666329, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.54296875, + "step": 4078, + "time_per_iteration": 2.4667975902557373 + }, + { + "auxiliary_loss_clip": 0.01073985, + "auxiliary_loss_mlp": 0.0106914, + "balance_loss_clip": 1.03554678, + "balance_loss_mlp": 1.02268958, + "epoch": 0.24524274763264692, + "flos": 23475807924480.0, + "grad_norm": 1.4981269244582167, + "language_loss": 0.76422048, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.78565168, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51171875, + "step": 4079, + "time_per_iteration": 2.406325101852417 + }, + { + "auxiliary_loss_clip": 0.01073614, + "auxiliary_loss_mlp": 0.01076204, + "balance_loss_clip": 1.0379374, + "balance_loss_mlp": 1.0211364, + "epoch": 0.24530287088531488, + "flos": 17200239793920.0, + "grad_norm": 1.8352484979806343, + "language_loss": 0.76581204, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.78731024, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5234375, + "step": 4080, + "time_per_iteration": 2.3974366188049316 + }, + { + "auxiliary_loss_clip": 0.01073728, + "auxiliary_loss_mlp": 0.0106845, + "balance_loss_clip": 1.03502345, + "balance_loss_mlp": 1.02232099, + "epoch": 0.24536299413798288, + "flos": 14866540318080.0, + "grad_norm": 1.7905302539273775, + "language_loss": 0.84700578, + "learning_rate": 3.532867444142186e-06, + "loss": 0.86842752, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.515625, + "step": 4081, + "time_per_iteration": 2.3796255588531494 + }, + { + "auxiliary_loss_clip": 0.01075695, + "auxiliary_loss_mlp": 0.01057281, + "balance_loss_clip": 1.02240014, + "balance_loss_mlp": 1.02428603, + "epoch": 0.24542311739065084, + "flos": 35260600642560.0, + "grad_norm": 2.217441464887436, + "language_loss": 0.7490865, + "learning_rate": 3.532617254729267e-06, + "loss": 0.77041626, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.515625, + "step": 4082, + "time_per_iteration": 2.5373880863189697 + }, + { + "auxiliary_loss_clip": 0.0107565, + "auxiliary_loss_mlp": 0.0105755, + "balance_loss_clip": 1.02455258, + "balance_loss_mlp": 1.02363372, + "epoch": 0.2454832406433188, + "flos": 21502167966720.0, + "grad_norm": 1.5032267305197065, + "language_loss": 0.7280035, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74933541, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.51953125, + "step": 4083, + "time_per_iteration": 2.4061079025268555 + }, + { + "auxiliary_loss_clip": 0.01076867, + "auxiliary_loss_mlp": 0.01068816, + "balance_loss_clip": 1.02992964, + "balance_loss_mlp": 1.02271867, + "epoch": 0.24554336389598677, + "flos": 14755795885440.0, + "grad_norm": 2.0395061278324946, + "language_loss": 0.7722435, + "learning_rate": 3.532116701561919e-06, + "loss": 0.79370034, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5390625, + "step": 4084, + "time_per_iteration": 2.3770596981048584 + }, + { + "auxiliary_loss_clip": 0.01077131, + "auxiliary_loss_mlp": 0.01047083, + "balance_loss_clip": 1.01313186, + "balance_loss_mlp": 1.02434993, + "epoch": 0.24560348714865474, + "flos": 14975504271360.0, + "grad_norm": 2.9894793932898804, + "language_loss": 0.86884457, + "learning_rate": 3.531866337826471e-06, + "loss": 0.89008677, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.52734375, + "step": 4085, + "time_per_iteration": 2.41023850440979 + }, + { + "auxiliary_loss_clip": 0.01082699, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_clip": 1.02498722, + "balance_loss_mlp": 1.02895534, + "epoch": 0.2456636104013227, + "flos": 22674202041600.0, + "grad_norm": 1.832318984215963, + "language_loss": 0.8073504, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.82881248, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5390625, + "step": 4086, + "time_per_iteration": 2.4116299152374268 + }, + { + "auxiliary_loss_clip": 0.01077715, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.01752186, + "balance_loss_mlp": 1.02709103, + "epoch": 0.2457237336539907, + "flos": 27416629238400.0, + "grad_norm": 1.541543970042599, + "language_loss": 0.7648536, + "learning_rate": 3.531365436099496e-06, + "loss": 0.78614736, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4087, + "time_per_iteration": 2.4473538398742676 + }, + { + "auxiliary_loss_clip": 0.01084066, + "auxiliary_loss_mlp": 0.01057891, + "balance_loss_clip": 1.02005363, + "balance_loss_mlp": 1.02905893, + "epoch": 0.24578385690665866, + "flos": 20411341447680.0, + "grad_norm": 2.7007281656740476, + "language_loss": 0.80600071, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.82742029, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.546875, + "step": 4088, + "time_per_iteration": 3.862217903137207 + }, + { + "auxiliary_loss_clip": 0.01075357, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_clip": 1.01480079, + "balance_loss_mlp": 1.02610743, + "epoch": 0.24584398015932662, + "flos": 23914247178240.0, + "grad_norm": 1.506149924242245, + "language_loss": 0.78525472, + "learning_rate": 3.5308643020944e-06, + "loss": 0.80648792, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49414062, + "step": 4089, + "time_per_iteration": 2.4388489723205566 + }, + { + "auxiliary_loss_clip": 0.01082393, + "auxiliary_loss_mlp": 0.01056873, + "balance_loss_clip": 1.01841617, + "balance_loss_mlp": 1.0282172, + "epoch": 0.2459041034119946, + "flos": 41494866768000.0, + "grad_norm": 1.9237033947739768, + "language_loss": 0.83088112, + "learning_rate": 3.530613648011309e-06, + "loss": 0.85227376, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5390625, + "step": 4090, + "time_per_iteration": 2.5869576930999756 + }, + { + "auxiliary_loss_clip": 0.01082128, + "auxiliary_loss_mlp": 0.01065169, + "balance_loss_clip": 1.02370811, + "balance_loss_mlp": 1.02802157, + "epoch": 0.24596422666466256, + "flos": 19935824463360.0, + "grad_norm": 1.6647342848811633, + "language_loss": 0.75150383, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.77297676, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.54296875, + "step": 4091, + "time_per_iteration": 3.8456361293792725 + }, + { + "auxiliary_loss_clip": 0.01081312, + "auxiliary_loss_mlp": 0.01064705, + "balance_loss_clip": 1.02672505, + "balance_loss_mlp": 1.02817321, + "epoch": 0.24602434991733052, + "flos": 21543295415040.0, + "grad_norm": 2.3545421574834777, + "language_loss": 0.78910434, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.81056452, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.53125, + "step": 4092, + "time_per_iteration": 3.804647445678711 + }, + { + "auxiliary_loss_clip": 0.01081915, + "auxiliary_loss_mlp": 0.01058369, + "balance_loss_clip": 1.02105665, + "balance_loss_mlp": 1.02526999, + "epoch": 0.24608447316999849, + "flos": 23183968936320.0, + "grad_norm": 3.349861538877092, + "language_loss": 0.83658987, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.85799271, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.56640625, + "step": 4093, + "time_per_iteration": 3.78472638130188 + }, + { + "auxiliary_loss_clip": 0.01082956, + "auxiliary_loss_mlp": 0.0106724, + "balance_loss_clip": 1.02864003, + "balance_loss_mlp": 1.02630091, + "epoch": 0.24614459642266648, + "flos": 19641052920960.0, + "grad_norm": 1.8889598990676102, + "language_loss": 0.89378333, + "learning_rate": 3.529610451363797e-06, + "loss": 0.91528523, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.56640625, + "step": 4094, + "time_per_iteration": 2.4280807971954346 + }, + { + "auxiliary_loss_clip": 0.01027151, + "auxiliary_loss_mlp": 0.01005687, + "balance_loss_clip": 0.99963087, + "balance_loss_mlp": 1.01376557, + "epoch": 0.24620471967533444, + "flos": 61736913699840.0, + "grad_norm": 0.7478623501955826, + "language_loss": 0.57607472, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59640312, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.06054688, + "router_z_loss_mlp": 0.13378906, + "step": 4095, + "time_per_iteration": 3.112612724304199 + }, + { + "auxiliary_loss_clip": 0.01019396, + "auxiliary_loss_mlp": 0.01006798, + "balance_loss_clip": 1.00040829, + "balance_loss_mlp": 1.00658584, + "epoch": 0.2462648429280024, + "flos": 69151103867520.0, + "grad_norm": 0.6510964876685286, + "language_loss": 0.56325638, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58351827, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.06396484, + "router_z_loss_mlp": 0.12792969, + "step": 4096, + "time_per_iteration": 3.164193630218506 + }, + { + "auxiliary_loss_clip": 0.01075914, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_clip": 1.01398408, + "balance_loss_mlp": 1.02262449, + "epoch": 0.24632496618067037, + "flos": 29458350080640.0, + "grad_norm": 1.708619716713177, + "language_loss": 0.78881979, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.81007004, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53125, + "step": 4097, + "time_per_iteration": 2.4562301635742188 + }, + { + "auxiliary_loss_clip": 0.01080699, + "auxiliary_loss_mlp": 0.01070734, + "balance_loss_clip": 1.02896285, + "balance_loss_mlp": 1.02435327, + "epoch": 0.24638508943333834, + "flos": 24315294499200.0, + "grad_norm": 1.8512357099687724, + "language_loss": 0.77553487, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.79704916, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.5625, + "step": 4098, + "time_per_iteration": 2.441605806350708 + }, + { + "auxiliary_loss_clip": 0.01080359, + "auxiliary_loss_mlp": 0.01060187, + "balance_loss_clip": 1.02442384, + "balance_loss_mlp": 1.02522182, + "epoch": 0.2464452126860063, + "flos": 26612090801280.0, + "grad_norm": 1.899677103919735, + "language_loss": 0.7021755, + "learning_rate": 3.528355150558764e-06, + "loss": 0.72358096, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5546875, + "step": 4099, + "time_per_iteration": 2.4292664527893066 + }, + { + "auxiliary_loss_clip": 0.0107624, + "auxiliary_loss_mlp": 0.01056977, + "balance_loss_clip": 1.0230267, + "balance_loss_mlp": 1.02600443, + "epoch": 0.24650533593867427, + "flos": 31211059754880.0, + "grad_norm": 2.2452633704015854, + "language_loss": 0.67956233, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.70089447, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.50390625, + "step": 4100, + "time_per_iteration": 2.508368492126465 + }, + { + "auxiliary_loss_clip": 0.01020668, + "auxiliary_loss_mlp": 0.01009206, + "balance_loss_clip": 1.00398421, + "balance_loss_mlp": 1.00767159, + "epoch": 0.24656545919134226, + "flos": 68490791723520.0, + "grad_norm": 0.7191769112386236, + "language_loss": 0.61504519, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63534391, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.13085938, + "step": 4101, + "time_per_iteration": 3.119555950164795 + }, + { + "auxiliary_loss_clip": 0.01078061, + "auxiliary_loss_mlp": 0.0106332, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.02574253, + "epoch": 0.24662558244401023, + "flos": 20083157867520.0, + "grad_norm": 2.1960740621879773, + "language_loss": 0.7417109, + "learning_rate": 3.527601274535012e-06, + "loss": 0.7631247, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5234375, + "step": 4102, + "time_per_iteration": 2.412973165512085 + }, + { + "auxiliary_loss_clip": 0.01080593, + "auxiliary_loss_mlp": 0.01064842, + "balance_loss_clip": 1.02581286, + "balance_loss_mlp": 1.02676773, + "epoch": 0.2466857056966782, + "flos": 30700036051200.0, + "grad_norm": 2.256476779632115, + "language_loss": 0.7643671, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78582144, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5390625, + "step": 4103, + "time_per_iteration": 2.4640629291534424 + }, + { + "auxiliary_loss_clip": 0.01082929, + "auxiliary_loss_mlp": 0.01056949, + "balance_loss_clip": 1.01734722, + "balance_loss_mlp": 1.02812374, + "epoch": 0.24674582894934616, + "flos": 22527427219200.0, + "grad_norm": 2.5089721577435586, + "language_loss": 0.80122852, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.82262731, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.546875, + "step": 4104, + "time_per_iteration": 2.428025484085083 + }, + { + "auxiliary_loss_clip": 0.01087706, + "auxiliary_loss_mlp": 0.01054585, + "balance_loss_clip": 1.014817, + "balance_loss_mlp": 1.03403044, + "epoch": 0.24680595220201412, + "flos": 20703250258560.0, + "grad_norm": 1.8701561339186086, + "language_loss": 0.85148472, + "learning_rate": 3.526846877170133e-06, + "loss": 0.87290764, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5390625, + "step": 4105, + "time_per_iteration": 2.4216604232788086 + }, + { + "auxiliary_loss_clip": 0.01087152, + "auxiliary_loss_mlp": 0.01056883, + "balance_loss_clip": 1.0183543, + "balance_loss_mlp": 1.03238368, + "epoch": 0.2468660754546821, + "flos": 21830211901440.0, + "grad_norm": 1.8284901216229983, + "language_loss": 0.78235996, + "learning_rate": 3.52659529557275e-06, + "loss": 0.80380028, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.546875, + "step": 4106, + "time_per_iteration": 2.445884943008423 + }, + { + "auxiliary_loss_clip": 0.01086203, + "auxiliary_loss_mlp": 0.01061523, + "balance_loss_clip": 1.0216831, + "balance_loss_mlp": 1.03107238, + "epoch": 0.24692619870735008, + "flos": 15266819589120.0, + "grad_norm": 2.5019238857695614, + "language_loss": 0.74762279, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.76910007, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.55078125, + "step": 4107, + "time_per_iteration": 2.388078212738037 + }, + { + "auxiliary_loss_clip": 0.01087629, + "auxiliary_loss_mlp": 0.01064677, + "balance_loss_clip": 1.0255053, + "balance_loss_mlp": 1.03245938, + "epoch": 0.24698632196001805, + "flos": 29678791605120.0, + "grad_norm": 1.62429019251575, + "language_loss": 0.67317533, + "learning_rate": 3.526091958721587e-06, + "loss": 0.69469845, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.55078125, + "step": 4108, + "time_per_iteration": 2.542109251022339 + }, + { + "auxiliary_loss_clip": 0.01086527, + "auxiliary_loss_mlp": 0.01070262, + "balance_loss_clip": 1.02825236, + "balance_loss_mlp": 1.0300622, + "epoch": 0.247046445212686, + "flos": 39163925289600.0, + "grad_norm": 2.1495606279939428, + "language_loss": 0.74320567, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.76477355, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.56640625, + "step": 4109, + "time_per_iteration": 2.5963244438171387 + }, + { + "auxiliary_loss_clip": 0.01083853, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.02197909, + "balance_loss_mlp": 1.02764714, + "epoch": 0.24710656846535398, + "flos": 22997847144960.0, + "grad_norm": 1.9904973572082112, + "language_loss": 0.80545163, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.82691574, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.5625, + "step": 4110, + "time_per_iteration": 2.4469001293182373 + }, + { + "auxiliary_loss_clip": 0.01084, + "auxiliary_loss_mlp": 0.01067321, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.02796149, + "epoch": 0.24716669171802194, + "flos": 26431589738880.0, + "grad_norm": 2.0165848248137452, + "language_loss": 0.8284595, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.84997267, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.5625, + "step": 4111, + "time_per_iteration": 2.417048692703247 + }, + { + "auxiliary_loss_clip": 0.01081531, + "auxiliary_loss_mlp": 0.01058163, + "balance_loss_clip": 1.01922953, + "balance_loss_mlp": 1.02667725, + "epoch": 0.2472268149706899, + "flos": 23328788722560.0, + "grad_norm": 2.2985209595084077, + "language_loss": 0.76875716, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.7901541, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.55078125, + "step": 4112, + "time_per_iteration": 2.437804698944092 + }, + { + "auxiliary_loss_clip": 0.01080643, + "auxiliary_loss_mlp": 0.01068369, + "balance_loss_clip": 1.02988863, + "balance_loss_mlp": 1.02494001, + "epoch": 0.24728693822335787, + "flos": 23767612001280.0, + "grad_norm": 1.868060448676148, + "language_loss": 0.84033823, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.86182833, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5546875, + "step": 4113, + "time_per_iteration": 2.4234378337860107 + }, + { + "auxiliary_loss_clip": 0.01078447, + "auxiliary_loss_mlp": 0.0105939, + "balance_loss_clip": 1.01749957, + "balance_loss_mlp": 1.02360177, + "epoch": 0.24734706147602586, + "flos": 19316500122240.0, + "grad_norm": 2.1282645429704092, + "language_loss": 0.89845634, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.91983473, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.55078125, + "step": 4114, + "time_per_iteration": 2.4025440216064453 + }, + { + "auxiliary_loss_clip": 0.01079366, + "auxiliary_loss_mlp": 0.0106359, + "balance_loss_clip": 1.02413142, + "balance_loss_mlp": 1.02330959, + "epoch": 0.24740718472869383, + "flos": 28035709200000.0, + "grad_norm": 1.7229729758228118, + "language_loss": 0.76758415, + "learning_rate": 3.524328457352734e-06, + "loss": 0.78901374, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5625, + "step": 4115, + "time_per_iteration": 2.4383676052093506 + }, + { + "auxiliary_loss_clip": 0.01019029, + "auxiliary_loss_mlp": 0.01020626, + "balance_loss_clip": 1.01461792, + "balance_loss_mlp": 1.00556958, + "epoch": 0.2474673079813618, + "flos": 68103953326080.0, + "grad_norm": 0.6968632919127377, + "language_loss": 0.58345836, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60385495, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.06005859, + "router_z_loss_mlp": 0.13476562, + "step": 4116, + "time_per_iteration": 3.1503050327301025 + }, + { + "auxiliary_loss_clip": 0.01080053, + "auxiliary_loss_mlp": 0.0106698, + "balance_loss_clip": 1.02313423, + "balance_loss_mlp": 1.02346921, + "epoch": 0.24752743123402976, + "flos": 29460793875840.0, + "grad_norm": 1.466037532565512, + "language_loss": 0.84507513, + "learning_rate": 3.523824079451235e-06, + "loss": 0.86654544, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.56640625, + "step": 4117, + "time_per_iteration": 2.4562833309173584 + }, + { + "auxiliary_loss_clip": 0.01019571, + "auxiliary_loss_mlp": 0.01009339, + "balance_loss_clip": 1.00368834, + "balance_loss_mlp": 1.0062201, + "epoch": 0.24758755448669773, + "flos": 58347545310720.0, + "grad_norm": 0.9060165209000494, + "language_loss": 0.63467872, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.6549679, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.1328125, + "step": 4118, + "time_per_iteration": 2.913499355316162 + }, + { + "auxiliary_loss_clip": 0.01078727, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_clip": 1.02374148, + "balance_loss_mlp": 1.02500451, + "epoch": 0.2476476777393657, + "flos": 20483402227200.0, + "grad_norm": 1.5908131387188509, + "language_loss": 0.80583549, + "learning_rate": 3.523319470415491e-06, + "loss": 0.82725811, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5390625, + "step": 4119, + "time_per_iteration": 2.3861520290374756 + }, + { + "auxiliary_loss_clip": 0.01079377, + "auxiliary_loss_mlp": 0.01060273, + "balance_loss_clip": 1.02403378, + "balance_loss_mlp": 1.025141, + "epoch": 0.24770780099203366, + "flos": 20484798681600.0, + "grad_norm": 1.509772473337409, + "language_loss": 0.76656902, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.78796548, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.54296875, + "step": 4120, + "time_per_iteration": 2.4449617862701416 + }, + { + "auxiliary_loss_clip": 0.01083638, + "auxiliary_loss_mlp": 0.01067152, + "balance_loss_clip": 1.02261591, + "balance_loss_mlp": 1.02677441, + "epoch": 0.24776792424470165, + "flos": 15152653843200.0, + "grad_norm": 11.134287198808924, + "language_loss": 0.90499443, + "learning_rate": 3.522814630322041e-06, + "loss": 0.92650229, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 0.5703125, + "step": 4121, + "time_per_iteration": 2.365471839904785 + }, + { + "auxiliary_loss_clip": 0.01085718, + "auxiliary_loss_mlp": 0.01066251, + "balance_loss_clip": 1.02157116, + "balance_loss_mlp": 1.02824259, + "epoch": 0.2478280474973696, + "flos": 21724389970560.0, + "grad_norm": 5.729419427803232, + "language_loss": 0.71628761, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.73780727, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 0.57421875, + "step": 4122, + "time_per_iteration": 2.4486865997314453 + }, + { + "auxiliary_loss_clip": 0.01086587, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_clip": 1.02408934, + "balance_loss_mlp": 1.02828908, + "epoch": 0.24788817075003758, + "flos": 20411166890880.0, + "grad_norm": 2.248662068703609, + "language_loss": 0.81648588, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.83801723, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.58203125, + "step": 4123, + "time_per_iteration": 2.4198250770568848 + }, + { + "auxiliary_loss_clip": 0.01084221, + "auxiliary_loss_mlp": 0.01065897, + "balance_loss_clip": 1.02636755, + "balance_loss_mlp": 1.02788866, + "epoch": 0.24794829400270554, + "flos": 22593553067520.0, + "grad_norm": 1.891888499671389, + "language_loss": 0.7606473, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.78214848, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.5625, + "step": 4124, + "time_per_iteration": 2.463684558868408 + }, + { + "auxiliary_loss_clip": 0.01085033, + "auxiliary_loss_mlp": 0.01056914, + "balance_loss_clip": 1.01888657, + "balance_loss_mlp": 1.03119683, + "epoch": 0.2480084172553735, + "flos": 39674495145600.0, + "grad_norm": 1.7910322761784299, + "language_loss": 0.75037318, + "learning_rate": 3.521804257268357e-06, + "loss": 0.77179265, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5390625, + "step": 4125, + "time_per_iteration": 2.5674262046813965 + }, + { + "auxiliary_loss_clip": 0.01087655, + "auxiliary_loss_mlp": 0.01069042, + "balance_loss_clip": 1.02722323, + "balance_loss_mlp": 1.02826595, + "epoch": 0.24806854050804147, + "flos": 22052643373440.0, + "grad_norm": 1.9917358192151446, + "language_loss": 0.72184068, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.74340761, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.59375, + "step": 4126, + "time_per_iteration": 2.490355968475342 + }, + { + "auxiliary_loss_clip": 0.0108877, + "auxiliary_loss_mlp": 0.01063158, + "balance_loss_clip": 1.02126813, + "balance_loss_mlp": 1.03041625, + "epoch": 0.24812866376070947, + "flos": 15485864659200.0, + "grad_norm": 2.1266833865996544, + "language_loss": 0.83596849, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.8574878, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.5859375, + "step": 4127, + "time_per_iteration": 3.8470637798309326 + }, + { + "auxiliary_loss_clip": 0.01087437, + "auxiliary_loss_mlp": 0.01060213, + "balance_loss_clip": 1.01918089, + "balance_loss_mlp": 1.03049409, + "epoch": 0.24818878701337743, + "flos": 14756529024000.0, + "grad_norm": 2.278020913814939, + "language_loss": 0.86644912, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.88792562, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.5703125, + "step": 4128, + "time_per_iteration": 2.4354209899902344 + }, + { + "auxiliary_loss_clip": 0.01084819, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.02046907, + "balance_loss_mlp": 1.02823496, + "epoch": 0.2482489102660454, + "flos": 27088271101440.0, + "grad_norm": 2.4797213969908225, + "language_loss": 0.67561781, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.69706333, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.56640625, + "step": 4129, + "time_per_iteration": 2.452214241027832 + }, + { + "auxiliary_loss_clip": 0.0108129, + "auxiliary_loss_mlp": 0.01063552, + "balance_loss_clip": 1.0235697, + "balance_loss_mlp": 1.02740574, + "epoch": 0.24830903351871336, + "flos": 26466363319680.0, + "grad_norm": 1.7706383951366746, + "language_loss": 0.76961535, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.79106379, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.5390625, + "step": 4130, + "time_per_iteration": 3.9370250701904297 + }, + { + "auxiliary_loss_clip": 0.01083123, + "auxiliary_loss_mlp": 0.01063436, + "balance_loss_clip": 1.02233255, + "balance_loss_mlp": 1.02564478, + "epoch": 0.24836915677138133, + "flos": 10227805459200.0, + "grad_norm": 2.516836443065612, + "language_loss": 0.79092675, + "learning_rate": 3.520286966670535e-06, + "loss": 0.81239235, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.57421875, + "step": 4131, + "time_per_iteration": 3.715996503829956 + }, + { + "auxiliary_loss_clip": 0.0107723, + "auxiliary_loss_mlp": 0.01057675, + "balance_loss_clip": 1.02176881, + "balance_loss_mlp": 1.02322292, + "epoch": 0.2484292800240493, + "flos": 30079140698880.0, + "grad_norm": 1.5122974581930986, + "language_loss": 0.84776473, + "learning_rate": 3.520033883075255e-06, + "loss": 0.8691138, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5390625, + "step": 4132, + "time_per_iteration": 3.8950085639953613 + }, + { + "auxiliary_loss_clip": 0.01078606, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_clip": 1.01792717, + "balance_loss_mlp": 1.02287126, + "epoch": 0.24848940327671726, + "flos": 13442118958080.0, + "grad_norm": 1.7468271133996423, + "language_loss": 0.72457433, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.7459662, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.5546875, + "step": 4133, + "time_per_iteration": 2.3726863861083984 + }, + { + "auxiliary_loss_clip": 0.01084237, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.01922512, + "balance_loss_mlp": 1.02450764, + "epoch": 0.24854952652938525, + "flos": 19969341235200.0, + "grad_norm": 2.5180858646530684, + "language_loss": 0.6394037, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.66095561, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.59375, + "step": 4134, + "time_per_iteration": 2.3851380348205566 + }, + { + "auxiliary_loss_clip": 0.01078761, + "auxiliary_loss_mlp": 0.01055928, + "balance_loss_clip": 1.0173521, + "balance_loss_mlp": 1.02212906, + "epoch": 0.24860964978205322, + "flos": 18149213992320.0, + "grad_norm": 2.7729579342113273, + "language_loss": 0.79950857, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.8208555, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.56640625, + "step": 4135, + "time_per_iteration": 2.36698317527771 + }, + { + "auxiliary_loss_clip": 0.01078939, + "auxiliary_loss_mlp": 0.01057331, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.02425933, + "epoch": 0.24866977303472118, + "flos": 11727848557440.0, + "grad_norm": 2.5513006750789025, + "language_loss": 0.84207606, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.86343873, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.546875, + "step": 4136, + "time_per_iteration": 2.3955652713775635 + }, + { + "auxiliary_loss_clip": 0.0108012, + "auxiliary_loss_mlp": 0.0106016, + "balance_loss_clip": 1.02072513, + "balance_loss_mlp": 1.02413154, + "epoch": 0.24872989628738915, + "flos": 34822161388800.0, + "grad_norm": 1.8330838453202505, + "language_loss": 0.7264992, + "learning_rate": 3.518767600693314e-06, + "loss": 0.74790198, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.55859375, + "step": 4137, + "time_per_iteration": 2.484415292739868 + }, + { + "auxiliary_loss_clip": 0.01077692, + "auxiliary_loss_mlp": 0.01054987, + "balance_loss_clip": 1.01660144, + "balance_loss_mlp": 1.02179027, + "epoch": 0.2487900195400571, + "flos": 13698486138240.0, + "grad_norm": 2.529355796803447, + "language_loss": 0.70562118, + "learning_rate": 3.518514171403042e-06, + "loss": 0.72694802, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.55859375, + "step": 4138, + "time_per_iteration": 2.391927719116211 + }, + { + "auxiliary_loss_clip": 0.0107762, + "auxiliary_loss_mlp": 0.01053332, + "balance_loss_clip": 1.01706862, + "balance_loss_mlp": 1.02353692, + "epoch": 0.24885014279272508, + "flos": 25336643679360.0, + "grad_norm": 1.873073594662746, + "language_loss": 0.85325813, + "learning_rate": 3.51826068453056e-06, + "loss": 0.87456775, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.54296875, + "step": 4139, + "time_per_iteration": 2.414858818054199 + }, + { + "auxiliary_loss_clip": 0.01079862, + "auxiliary_loss_mlp": 0.01060154, + "balance_loss_clip": 1.01828778, + "balance_loss_mlp": 1.02373469, + "epoch": 0.24891026604539307, + "flos": 20630386517760.0, + "grad_norm": 1.4989153600431897, + "language_loss": 0.80510312, + "learning_rate": 3.518007140085481e-06, + "loss": 0.82650328, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.5625, + "step": 4140, + "time_per_iteration": 2.4023523330688477 + }, + { + "auxiliary_loss_clip": 0.01018482, + "auxiliary_loss_mlp": 0.01007378, + "balance_loss_clip": 1.00175142, + "balance_loss_mlp": 1.00529659, + "epoch": 0.24897038929806103, + "flos": 66957162030720.0, + "grad_norm": 0.9456506251621527, + "language_loss": 0.61130095, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63155955, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.05615234, + "router_z_loss_mlp": 0.13183594, + "step": 4141, + "time_per_iteration": 3.0560102462768555 + }, + { + "auxiliary_loss_clip": 0.01084525, + "auxiliary_loss_mlp": 0.01062217, + "balance_loss_clip": 1.02290154, + "balance_loss_mlp": 1.02794456, + "epoch": 0.249030512550729, + "flos": 36391088332800.0, + "grad_norm": 1.7733729439896249, + "language_loss": 0.73407769, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75554514, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.56640625, + "step": 4142, + "time_per_iteration": 2.542952060699463 + }, + { + "auxiliary_loss_clip": 0.01080373, + "auxiliary_loss_mlp": 0.01061381, + "balance_loss_clip": 1.02213693, + "balance_loss_mlp": 1.02570081, + "epoch": 0.24909063580339696, + "flos": 20153612724480.0, + "grad_norm": 1.9388110043515512, + "language_loss": 0.83226115, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.8536787, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.546875, + "step": 4143, + "time_per_iteration": 2.386887550354004 + }, + { + "auxiliary_loss_clip": 0.01074951, + "auxiliary_loss_mlp": 0.01049364, + "balance_loss_clip": 1.01527011, + "balance_loss_mlp": 1.02360392, + "epoch": 0.24915075905606493, + "flos": 26395349880960.0, + "grad_norm": 1.9773641439897351, + "language_loss": 0.60292566, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.62416881, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51171875, + "step": 4144, + "time_per_iteration": 2.538722515106201 + }, + { + "auxiliary_loss_clip": 0.01079047, + "auxiliary_loss_mlp": 0.01054867, + "balance_loss_clip": 1.0203439, + "balance_loss_mlp": 1.02563858, + "epoch": 0.2492108823087329, + "flos": 27525977216640.0, + "grad_norm": 1.8067875286259487, + "language_loss": 0.79835415, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81969333, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.53515625, + "step": 4145, + "time_per_iteration": 2.4495434761047363 + }, + { + "auxiliary_loss_clip": 0.01085811, + "auxiliary_loss_mlp": 0.01066715, + "balance_loss_clip": 1.02253544, + "balance_loss_mlp": 1.02613711, + "epoch": 0.24927100556140086, + "flos": 16690437987840.0, + "grad_norm": 2.2430370491417073, + "language_loss": 0.68957031, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.71109557, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.59765625, + "step": 4146, + "time_per_iteration": 2.408151149749756 + }, + { + "auxiliary_loss_clip": 0.01020467, + "auxiliary_loss_mlp": 0.01007011, + "balance_loss_clip": 1.0011934, + "balance_loss_mlp": 1.0065248, + "epoch": 0.24933112881406885, + "flos": 62769225047040.0, + "grad_norm": 0.9379186542297425, + "language_loss": 0.67292118, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69319594, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.05810547, + "router_z_loss_mlp": 0.13867188, + "step": 4147, + "time_per_iteration": 3.120908737182617 + }, + { + "auxiliary_loss_clip": 0.01079976, + "auxiliary_loss_mlp": 0.01056397, + "balance_loss_clip": 1.01846504, + "balance_loss_mlp": 1.02688241, + "epoch": 0.24939125206673682, + "flos": 26650669720320.0, + "grad_norm": 1.7050143163163691, + "language_loss": 0.90759265, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.92895633, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.53125, + "step": 4148, + "time_per_iteration": 2.4532992839813232 + }, + { + "auxiliary_loss_clip": 0.01082735, + "auxiliary_loss_mlp": 0.01062415, + "balance_loss_clip": 1.01947618, + "balance_loss_mlp": 1.02610159, + "epoch": 0.24945137531940478, + "flos": 20703285169920.0, + "grad_norm": 1.8315167876952603, + "language_loss": 0.70225871, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.72371024, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 0.56640625, + "step": 4149, + "time_per_iteration": 2.4139347076416016 + }, + { + "auxiliary_loss_clip": 0.01075764, + "auxiliary_loss_mlp": 0.01055298, + "balance_loss_clip": 1.01989293, + "balance_loss_mlp": 1.0234375, + "epoch": 0.24951149857207275, + "flos": 23767542178560.0, + "grad_norm": 1.5483054578651156, + "language_loss": 0.72304076, + "learning_rate": 3.515468531258095e-06, + "loss": 0.74435139, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5234375, + "step": 4150, + "time_per_iteration": 2.418304443359375 + }, + { + "auxiliary_loss_clip": 0.01078297, + "auxiliary_loss_mlp": 0.01063491, + "balance_loss_clip": 1.02303135, + "balance_loss_mlp": 1.02362752, + "epoch": 0.2495716218247407, + "flos": 15664096483200.0, + "grad_norm": 1.750338162435281, + "language_loss": 0.74243599, + "learning_rate": 3.515214354149478e-06, + "loss": 0.76385385, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.546875, + "step": 4151, + "time_per_iteration": 2.3468077182769775 + }, + { + "auxiliary_loss_clip": 0.01083698, + "auxiliary_loss_mlp": 0.01073497, + "balance_loss_clip": 1.03139234, + "balance_loss_mlp": 1.02571785, + "epoch": 0.24963174507740868, + "flos": 24051595933440.0, + "grad_norm": 2.8058552693259378, + "language_loss": 0.66364336, + "learning_rate": 3.514960119583781e-06, + "loss": 0.68521529, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.578125, + "step": 4152, + "time_per_iteration": 2.4043893814086914 + }, + { + "auxiliary_loss_clip": 0.01075103, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.02421331, + "balance_loss_mlp": 1.0234642, + "epoch": 0.24969186833007664, + "flos": 21798405786240.0, + "grad_norm": 1.8896346355671247, + "language_loss": 0.78764451, + "learning_rate": 3.514705827570645e-06, + "loss": 0.80897653, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.515625, + "step": 4153, + "time_per_iteration": 2.3852810859680176 + }, + { + "auxiliary_loss_clip": 0.01074078, + "auxiliary_loss_mlp": 0.01058192, + "balance_loss_clip": 1.02300191, + "balance_loss_mlp": 1.0224632, + "epoch": 0.24975199158274464, + "flos": 19937116183680.0, + "grad_norm": 2.226582114660258, + "language_loss": 0.78315294, + "learning_rate": 3.514451478119711e-06, + "loss": 0.80447567, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.515625, + "step": 4154, + "time_per_iteration": 2.3911995887756348 + }, + { + "auxiliary_loss_clip": 0.01079533, + "auxiliary_loss_mlp": 0.01061862, + "balance_loss_clip": 1.01801717, + "balance_loss_mlp": 1.0221777, + "epoch": 0.2498121148354126, + "flos": 25337202261120.0, + "grad_norm": 2.1221823890438634, + "language_loss": 0.73134553, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.75275946, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 0.57421875, + "step": 4155, + "time_per_iteration": 2.398071527481079 + }, + { + "auxiliary_loss_clip": 0.01077804, + "auxiliary_loss_mlp": 0.01062904, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.02203465, + "epoch": 0.24987223808808057, + "flos": 20557732245120.0, + "grad_norm": 1.6500431000907563, + "language_loss": 0.76256275, + "learning_rate": 3.513942606943036e-06, + "loss": 0.78396982, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.55859375, + "step": 4156, + "time_per_iteration": 2.3931994438171387 + }, + { + "auxiliary_loss_clip": 0.01074532, + "auxiliary_loss_mlp": 0.01054426, + "balance_loss_clip": 1.01794839, + "balance_loss_mlp": 1.02064347, + "epoch": 0.24993236134074853, + "flos": 19748201483520.0, + "grad_norm": 2.001688407110972, + "language_loss": 0.78130579, + "learning_rate": 3.513688085236591e-06, + "loss": 0.80259538, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5390625, + "step": 4157, + "time_per_iteration": 2.374446392059326 + }, + { + "auxiliary_loss_clip": 0.01077262, + "auxiliary_loss_mlp": 0.01056289, + "balance_loss_clip": 1.01873851, + "balance_loss_mlp": 1.02270269, + "epoch": 0.2499924845934165, + "flos": 18769306383360.0, + "grad_norm": 1.531215933859981, + "language_loss": 0.82760155, + "learning_rate": 3.513433506130942e-06, + "loss": 0.84893709, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.54296875, + "step": 4158, + "time_per_iteration": 2.388986110687256 + }, + { + "auxiliary_loss_clip": 0.01077669, + "auxiliary_loss_mlp": 0.01052939, + "balance_loss_clip": 1.01481581, + "balance_loss_mlp": 1.0225122, + "epoch": 0.25005260784608446, + "flos": 16871288163840.0, + "grad_norm": 1.8322834356388111, + "language_loss": 0.77446276, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.79576886, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.55078125, + "step": 4159, + "time_per_iteration": 2.345208168029785 + }, + { + "auxiliary_loss_clip": 0.01078562, + "auxiliary_loss_mlp": 0.0105665, + "balance_loss_clip": 1.01766884, + "balance_loss_mlp": 1.02234149, + "epoch": 0.2501127310987524, + "flos": 22123901191680.0, + "grad_norm": 1.874001779434463, + "language_loss": 0.72067797, + "learning_rate": 3.512924175760649e-06, + "loss": 0.74203002, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5625, + "step": 4160, + "time_per_iteration": 2.3951282501220703 + }, + { + "auxiliary_loss_clip": 0.01031434, + "auxiliary_loss_mlp": 0.01009431, + "balance_loss_clip": 1.00280273, + "balance_loss_mlp": 1.01480794, + "epoch": 0.2501728543514204, + "flos": 69454393781760.0, + "grad_norm": 0.7553054937286073, + "language_loss": 0.56776309, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58817172, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.06640625, + "router_z_loss_mlp": 0.16601562, + "step": 4161, + "time_per_iteration": 3.0608177185058594 + }, + { + "auxiliary_loss_clip": 0.01085247, + "auxiliary_loss_mlp": 0.01061679, + "balance_loss_clip": 1.02005148, + "balance_loss_mlp": 1.02610159, + "epoch": 0.25023297760408836, + "flos": 16289041553280.0, + "grad_norm": 1.846200933393925, + "language_loss": 0.83402824, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.85549748, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.59375, + "step": 4162, + "time_per_iteration": 2.411557197570801 + }, + { + "auxiliary_loss_clip": 0.01081511, + "auxiliary_loss_mlp": 0.01062081, + "balance_loss_clip": 1.02155006, + "balance_loss_mlp": 1.02367759, + "epoch": 0.2502931008567563, + "flos": 12237231427200.0, + "grad_norm": 2.3183482785020955, + "language_loss": 0.89269984, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.91413581, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.578125, + "step": 4163, + "time_per_iteration": 2.3530325889587402 + }, + { + "auxiliary_loss_clip": 0.01078885, + "auxiliary_loss_mlp": 0.01061278, + "balance_loss_clip": 1.02627826, + "balance_loss_mlp": 1.02362609, + "epoch": 0.25035322410942434, + "flos": 23180861825280.0, + "grad_norm": 1.6610205674405634, + "language_loss": 0.84875709, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.87015873, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.55078125, + "step": 4164, + "time_per_iteration": 2.421299934387207 + }, + { + "auxiliary_loss_clip": 0.01075523, + "auxiliary_loss_mlp": 0.01061645, + "balance_loss_clip": 1.02815902, + "balance_loss_mlp": 1.02427268, + "epoch": 0.2504133473620923, + "flos": 20916639688320.0, + "grad_norm": 1.7701499616047314, + "language_loss": 0.7506597, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.77203137, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.515625, + "step": 4165, + "time_per_iteration": 2.400285482406616 + }, + { + "auxiliary_loss_clip": 0.01080834, + "auxiliary_loss_mlp": 0.01075113, + "balance_loss_clip": 1.03386688, + "balance_loss_mlp": 1.02321649, + "epoch": 0.2504734706147603, + "flos": 20775520506240.0, + "grad_norm": 2.701473086840755, + "language_loss": 0.75123852, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.77279794, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.578125, + "step": 4166, + "time_per_iteration": 3.8191535472869873 + }, + { + "auxiliary_loss_clip": 0.01073291, + "auxiliary_loss_mlp": 0.010693, + "balance_loss_clip": 1.03377521, + "balance_loss_mlp": 1.02124238, + "epoch": 0.25053359386742824, + "flos": 24348322512000.0, + "grad_norm": 1.7467253704422765, + "language_loss": 0.83047152, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.85189748, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5234375, + "step": 4167, + "time_per_iteration": 2.40356183052063 + }, + { + "auxiliary_loss_clip": 0.01073703, + "auxiliary_loss_mlp": 0.01065158, + "balance_loss_clip": 1.02825117, + "balance_loss_mlp": 1.02098083, + "epoch": 0.2505937171200962, + "flos": 21213296444160.0, + "grad_norm": 2.46109888360696, + "language_loss": 0.81836879, + "learning_rate": 3.51088456024312e-06, + "loss": 0.83975744, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.52734375, + "step": 4168, + "time_per_iteration": 2.4010775089263916 + }, + { + "auxiliary_loss_clip": 0.01078181, + "auxiliary_loss_mlp": 0.01065821, + "balance_loss_clip": 1.02421689, + "balance_loss_mlp": 1.02225661, + "epoch": 0.25065384037276417, + "flos": 41425633808640.0, + "grad_norm": 2.1546978891637503, + "language_loss": 0.7214148, + "learning_rate": 3.510629350383849e-06, + "loss": 0.74285477, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 0.55859375, + "step": 4169, + "time_per_iteration": 2.5698513984680176 + }, + { + "auxiliary_loss_clip": 0.01071056, + "auxiliary_loss_mlp": 0.01061456, + "balance_loss_clip": 1.02729082, + "balance_loss_mlp": 1.02061069, + "epoch": 0.25071396362543213, + "flos": 26101241654400.0, + "grad_norm": 1.8729608955481363, + "language_loss": 0.79113519, + "learning_rate": 3.510374083241361e-06, + "loss": 0.8124603, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5078125, + "step": 4170, + "time_per_iteration": 3.9379851818084717 + }, + { + "auxiliary_loss_clip": 0.01073945, + "auxiliary_loss_mlp": 0.01062969, + "balance_loss_clip": 1.02818394, + "balance_loss_mlp": 1.02061081, + "epoch": 0.2507740868781001, + "flos": 19097978722560.0, + "grad_norm": 2.3545331759156354, + "language_loss": 0.78458071, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.80594987, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53125, + "step": 4171, + "time_per_iteration": 3.7507925033569336 + }, + { + "auxiliary_loss_clip": 0.01019914, + "auxiliary_loss_mlp": 0.01016879, + "balance_loss_clip": 1.01168168, + "balance_loss_mlp": 1.00511932, + "epoch": 0.25083421013076806, + "flos": 64338570927360.0, + "grad_norm": 0.8608151312430881, + "language_loss": 0.60186553, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62223351, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.05200195, + "router_z_loss_mlp": 0.1484375, + "step": 4172, + "time_per_iteration": 4.454200983047485 + }, + { + "auxiliary_loss_clip": 0.01073981, + "auxiliary_loss_mlp": 0.01065744, + "balance_loss_clip": 1.02962375, + "balance_loss_mlp": 1.02106094, + "epoch": 0.25089433338343603, + "flos": 24278461148160.0, + "grad_norm": 2.1862998908383515, + "language_loss": 0.80328882, + "learning_rate": 3.509607938211409e-06, + "loss": 0.82468605, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.52734375, + "step": 4173, + "time_per_iteration": 2.397447109222412 + }, + { + "auxiliary_loss_clip": 0.01076815, + "auxiliary_loss_mlp": 0.01054859, + "balance_loss_clip": 1.0186193, + "balance_loss_mlp": 1.0239414, + "epoch": 0.250954456636104, + "flos": 14720568456960.0, + "grad_norm": 1.9552884421112848, + "language_loss": 0.85587633, + "learning_rate": 3.509352442032875e-06, + "loss": 0.87719309, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.53125, + "step": 4174, + "time_per_iteration": 2.3786256313323975 + }, + { + "auxiliary_loss_clip": 0.01080544, + "auxiliary_loss_mlp": 0.01056319, + "balance_loss_clip": 1.02031755, + "balance_loss_mlp": 1.02575529, + "epoch": 0.25101457988877196, + "flos": 22272491404800.0, + "grad_norm": 2.2616243253516215, + "language_loss": 0.72302437, + "learning_rate": 3.509096888619545e-06, + "loss": 0.74439299, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.546875, + "step": 4175, + "time_per_iteration": 2.3921878337860107 + }, + { + "auxiliary_loss_clip": 0.01079127, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_clip": 1.01546526, + "balance_loss_mlp": 1.02446485, + "epoch": 0.2510747031414399, + "flos": 25187843998080.0, + "grad_norm": 2.084760146076791, + "language_loss": 0.82263529, + "learning_rate": 3.50884127798111e-06, + "loss": 0.84395051, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.546875, + "step": 4176, + "time_per_iteration": 2.432888984680176 + }, + { + "auxiliary_loss_clip": 0.01081941, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_clip": 1.01434827, + "balance_loss_mlp": 1.02807522, + "epoch": 0.25113482639410795, + "flos": 20703145524480.0, + "grad_norm": 2.1438036750250977, + "language_loss": 0.85197479, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8733061, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5390625, + "step": 4177, + "time_per_iteration": 2.3856217861175537 + }, + { + "auxiliary_loss_clip": 0.01085234, + "auxiliary_loss_mlp": 0.01054694, + "balance_loss_clip": 1.01978946, + "balance_loss_mlp": 1.03099251, + "epoch": 0.2511949496467759, + "flos": 21505868570880.0, + "grad_norm": 2.314804170315218, + "language_loss": 0.84918904, + "learning_rate": 3.508329885067698e-06, + "loss": 0.87058842, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.54296875, + "step": 4178, + "time_per_iteration": 2.4285356998443604 + }, + { + "auxiliary_loss_clip": 0.01081923, + "auxiliary_loss_mlp": 0.01052578, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.0304656, + "epoch": 0.2512550728994439, + "flos": 20701015931520.0, + "grad_norm": 2.326151775682438, + "language_loss": 0.76979637, + "learning_rate": 3.508074102812112e-06, + "loss": 0.79114139, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.515625, + "step": 4179, + "time_per_iteration": 2.3736279010772705 + }, + { + "auxiliary_loss_clip": 0.01085636, + "auxiliary_loss_mlp": 0.01061521, + "balance_loss_clip": 1.02704549, + "balance_loss_mlp": 1.03042579, + "epoch": 0.25131519615211184, + "flos": 18477641952000.0, + "grad_norm": 2.2080230806160204, + "language_loss": 0.73433203, + "learning_rate": 3.507818263370206e-06, + "loss": 0.75580359, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5546875, + "step": 4180, + "time_per_iteration": 2.398108959197998 + }, + { + "auxiliary_loss_clip": 0.01084017, + "auxiliary_loss_mlp": 0.01056374, + "balance_loss_clip": 1.02306676, + "balance_loss_mlp": 1.03106308, + "epoch": 0.2513753194047798, + "flos": 20483925897600.0, + "grad_norm": 2.0996582655720677, + "language_loss": 0.87329769, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.8947016, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.53125, + "step": 4181, + "time_per_iteration": 2.403031349182129 + }, + { + "auxiliary_loss_clip": 0.01081248, + "auxiliary_loss_mlp": 0.01061628, + "balance_loss_clip": 1.02891707, + "balance_loss_mlp": 1.02904284, + "epoch": 0.25143544265744777, + "flos": 37668560313600.0, + "grad_norm": 4.843528119638757, + "language_loss": 0.69405603, + "learning_rate": 3.507306412966238e-06, + "loss": 0.71548474, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5234375, + "step": 4182, + "time_per_iteration": 2.556149959564209 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01010968, + "balance_loss_clip": 1.00596154, + "balance_loss_mlp": 1.03313959, + "epoch": 0.25149556591011574, + "flos": 69364283829120.0, + "grad_norm": 0.8628053291772059, + "language_loss": 0.70236641, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72296435, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.05004883, + "router_z_loss_mlp": 0.15625, + "step": 4183, + "time_per_iteration": 3.039799451828003 + }, + { + "auxiliary_loss_clip": 0.01077612, + "auxiliary_loss_mlp": 0.01063695, + "balance_loss_clip": 1.02731252, + "balance_loss_mlp": 1.02335048, + "epoch": 0.2515556891627837, + "flos": 13989557076480.0, + "grad_norm": 2.286502126723049, + "language_loss": 0.75399435, + "learning_rate": 3.506794333933431e-06, + "loss": 0.77540737, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.54296875, + "step": 4184, + "time_per_iteration": 2.3901941776275635 + }, + { + "auxiliary_loss_clip": 0.01076076, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.03306985, + "balance_loss_mlp": 1.02456498, + "epoch": 0.25161581241545167, + "flos": 22162445199360.0, + "grad_norm": 2.003941475333182, + "language_loss": 0.85097128, + "learning_rate": 3.506538208705484e-06, + "loss": 0.87241399, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.515625, + "step": 4185, + "time_per_iteration": 2.396497964859009 + }, + { + "auxiliary_loss_clip": 0.01027217, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.03970206, + "balance_loss_mlp": 1.010777, + "epoch": 0.25167593566811963, + "flos": 69355486166400.0, + "grad_norm": 1.4341687882489893, + "language_loss": 0.61573112, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63644981, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.04956055, + "router_z_loss_mlp": 0.1640625, + "step": 4186, + "time_per_iteration": 2.9387497901916504 + }, + { + "auxiliary_loss_clip": 0.01075147, + "auxiliary_loss_mlp": 0.01070569, + "balance_loss_clip": 1.03490126, + "balance_loss_mlp": 1.02181792, + "epoch": 0.2517360589207876, + "flos": 13260605466240.0, + "grad_norm": 4.6285064656986155, + "language_loss": 0.81753963, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.83899677, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.53125, + "step": 4187, + "time_per_iteration": 2.3818259239196777 + }, + { + "auxiliary_loss_clip": 0.01075801, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.03388786, + "balance_loss_mlp": 1.02251029, + "epoch": 0.25179618217345556, + "flos": 20375764905600.0, + "grad_norm": 1.455657455157059, + "language_loss": 0.80829567, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82973415, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.53125, + "step": 4188, + "time_per_iteration": 2.4196341037750244 + }, + { + "auxiliary_loss_clip": 0.01075298, + "auxiliary_loss_mlp": 0.01060683, + "balance_loss_clip": 1.02716136, + "balance_loss_mlp": 1.02477348, + "epoch": 0.25185630542612353, + "flos": 27663709996800.0, + "grad_norm": 2.3083509965040374, + "language_loss": 0.75245166, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.77381152, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.50390625, + "step": 4189, + "time_per_iteration": 2.4350953102111816 + }, + { + "auxiliary_loss_clip": 0.01078248, + "auxiliary_loss_mlp": 0.01048704, + "balance_loss_clip": 1.0193783, + "balance_loss_mlp": 1.02839398, + "epoch": 0.25191642867879155, + "flos": 20995368537600.0, + "grad_norm": 2.181154716852697, + "language_loss": 0.86537719, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.88664675, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.49804688, + "step": 4190, + "time_per_iteration": 2.448875665664673 + }, + { + "auxiliary_loss_clip": 0.01083641, + "auxiliary_loss_mlp": 0.01058783, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.03057444, + "epoch": 0.2519765519314595, + "flos": 21104611781760.0, + "grad_norm": 2.171910836275233, + "language_loss": 0.76840949, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.78983366, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.53125, + "step": 4191, + "time_per_iteration": 2.409207582473755 + }, + { + "auxiliary_loss_clip": 0.01044479, + "auxiliary_loss_mlp": 0.01047885, + "balance_loss_clip": 1.04325962, + "balance_loss_mlp": 1.02838373, + "epoch": 0.2520366751841275, + "flos": 62741503560960.0, + "grad_norm": 0.7682086639001763, + "language_loss": 0.57276005, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59368366, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.04614258, + "router_z_loss_mlp": 0.16015625, + "step": 4192, + "time_per_iteration": 3.1230616569519043 + }, + { + "auxiliary_loss_clip": 0.0108982, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.01848364, + "balance_loss_mlp": 1.03732848, + "epoch": 0.25209679843679544, + "flos": 22229792945280.0, + "grad_norm": 2.001803912591678, + "language_loss": 0.77661043, + "learning_rate": 3.504487151087323e-06, + "loss": 0.79802072, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5234375, + "step": 4193, + "time_per_iteration": 2.4132697582244873 + }, + { + "auxiliary_loss_clip": 0.01094692, + "auxiliary_loss_mlp": 0.01059853, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.03993523, + "epoch": 0.2521569216894634, + "flos": 12165833963520.0, + "grad_norm": 3.2985968236305334, + "language_loss": 0.86110628, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.88265175, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.546875, + "step": 4194, + "time_per_iteration": 2.5010087490081787 + }, + { + "auxiliary_loss_clip": 0.01093195, + "auxiliary_loss_mlp": 0.01075285, + "balance_loss_clip": 1.0414772, + "balance_loss_mlp": 1.03754926, + "epoch": 0.2522170449421314, + "flos": 23698553598720.0, + "grad_norm": 1.4335774638891599, + "language_loss": 0.88889956, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.91058433, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5546875, + "step": 4195, + "time_per_iteration": 2.485008716583252 + }, + { + "auxiliary_loss_clip": 0.01095704, + "auxiliary_loss_mlp": 0.01071143, + "balance_loss_clip": 1.03442621, + "balance_loss_mlp": 1.04045689, + "epoch": 0.25227716819479934, + "flos": 20954520380160.0, + "grad_norm": 2.0259413998903066, + "language_loss": 0.87445676, + "learning_rate": 3.503717062883053e-06, + "loss": 0.8961252, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5546875, + "step": 4196, + "time_per_iteration": 2.4339165687561035 + }, + { + "auxiliary_loss_clip": 0.01093425, + "auxiliary_loss_mlp": 0.01092019, + "balance_loss_clip": 1.05749631, + "balance_loss_mlp": 1.03880382, + "epoch": 0.2523372914474673, + "flos": 23330220088320.0, + "grad_norm": 1.6780720249747834, + "language_loss": 0.84735811, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.86921251, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.546875, + "step": 4197, + "time_per_iteration": 2.4338364601135254 + }, + { + "auxiliary_loss_clip": 0.01093963, + "auxiliary_loss_mlp": 0.01084517, + "balance_loss_clip": 1.04698944, + "balance_loss_mlp": 1.03811133, + "epoch": 0.25239741470013527, + "flos": 36969005934720.0, + "grad_norm": 3.0854388323660116, + "language_loss": 0.74601471, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.7677995, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.55859375, + "step": 4198, + "time_per_iteration": 2.5597028732299805 + }, + { + "auxiliary_loss_clip": 0.01090204, + "auxiliary_loss_mlp": 0.0108388, + "balance_loss_clip": 1.04661489, + "balance_loss_mlp": 1.03447425, + "epoch": 0.25245753795280323, + "flos": 18514754593920.0, + "grad_norm": 2.2318758492946555, + "language_loss": 0.787907, + "learning_rate": 3.50294646148888e-06, + "loss": 0.80964786, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.55859375, + "step": 4199, + "time_per_iteration": 2.397740602493286 + }, + { + "auxiliary_loss_clip": 0.01088522, + "auxiliary_loss_mlp": 0.01087506, + "balance_loss_clip": 1.05443776, + "balance_loss_mlp": 1.03438842, + "epoch": 0.2525176612054712, + "flos": 32343467569920.0, + "grad_norm": 1.9674569056283773, + "language_loss": 0.74132341, + "learning_rate": 3.502689480360739e-06, + "loss": 0.7630837, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.54296875, + "step": 4200, + "time_per_iteration": 2.5268242359161377 + }, + { + "auxiliary_loss_clip": 0.01086461, + "auxiliary_loss_mlp": 0.0107585, + "balance_loss_clip": 1.04280555, + "balance_loss_mlp": 1.03273678, + "epoch": 0.25257778445813917, + "flos": 45256513651200.0, + "grad_norm": 1.749173320480223, + "language_loss": 0.83707917, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.8587023, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.53515625, + "step": 4201, + "time_per_iteration": 2.6112546920776367 + }, + { + "auxiliary_loss_clip": 0.01084894, + "auxiliary_loss_mlp": 0.01073967, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.02960825, + "epoch": 0.25263790771080713, + "flos": 23366669414400.0, + "grad_norm": 1.9906087121825033, + "language_loss": 0.76071811, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.78230673, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5546875, + "step": 4202, + "time_per_iteration": 2.4609363079071045 + }, + { + "auxiliary_loss_clip": 0.01077461, + "auxiliary_loss_mlp": 0.01059709, + "balance_loss_clip": 1.02556777, + "balance_loss_mlp": 1.0254724, + "epoch": 0.25269803096347515, + "flos": 18514056366720.0, + "grad_norm": 2.1953459030125866, + "language_loss": 0.74941266, + "learning_rate": 3.501918195122491e-06, + "loss": 0.77078438, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51953125, + "step": 4203, + "time_per_iteration": 2.3874549865722656 + }, + { + "auxiliary_loss_clip": 0.01078476, + "auxiliary_loss_mlp": 0.01065233, + "balance_loss_clip": 1.02894533, + "balance_loss_mlp": 1.02459693, + "epoch": 0.2527581542161431, + "flos": 24609332903040.0, + "grad_norm": 1.51174367499265, + "language_loss": 0.78882349, + "learning_rate": 3.501660986124297e-06, + "loss": 0.81026053, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5390625, + "step": 4204, + "time_per_iteration": 2.5023257732391357 + }, + { + "auxiliary_loss_clip": 0.01075051, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_clip": 1.02182126, + "balance_loss_mlp": 1.02212751, + "epoch": 0.2528182774688111, + "flos": 12640443252480.0, + "grad_norm": 2.0982503706454714, + "language_loss": 0.74243271, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.76373047, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.53125, + "step": 4205, + "time_per_iteration": 2.359304666519165 + }, + { + "auxiliary_loss_clip": 0.010724, + "auxiliary_loss_mlp": 0.01047904, + "balance_loss_clip": 1.01736259, + "balance_loss_mlp": 1.02320468, + "epoch": 0.25287840072147905, + "flos": 46935032952960.0, + "grad_norm": 1.6262159650039363, + "language_loss": 0.77150667, + "learning_rate": 3.50114639730826e-06, + "loss": 0.79270971, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4921875, + "step": 4206, + "time_per_iteration": 4.134872198104858 + }, + { + "auxiliary_loss_clip": 0.01075341, + "auxiliary_loss_mlp": 0.01051361, + "balance_loss_clip": 1.01514506, + "balance_loss_mlp": 1.02315211, + "epoch": 0.252938523974147, + "flos": 18878724184320.0, + "grad_norm": 1.7281346679705978, + "language_loss": 0.80136329, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.82263029, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5234375, + "step": 4207, + "time_per_iteration": 2.400001049041748 + }, + { + "auxiliary_loss_clip": 0.01076288, + "auxiliary_loss_mlp": 0.01056464, + "balance_loss_clip": 1.02171493, + "balance_loss_mlp": 1.02475381, + "epoch": 0.252998647226815, + "flos": 21433633234560.0, + "grad_norm": 2.5193797022491413, + "language_loss": 0.77810645, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.79943395, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.515625, + "step": 4208, + "time_per_iteration": 2.3930201530456543 + }, + { + "auxiliary_loss_clip": 0.01074465, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.0157783, + "balance_loss_mlp": 1.02351665, + "epoch": 0.25305877047948294, + "flos": 25441138978560.0, + "grad_norm": 3.2356188731244564, + "language_loss": 0.71822143, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.73943794, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.51171875, + "step": 4209, + "time_per_iteration": 2.4395506381988525 + }, + { + "auxiliary_loss_clip": 0.01025824, + "auxiliary_loss_mlp": 0.01006672, + "balance_loss_clip": 1.00238085, + "balance_loss_mlp": 1.01089144, + "epoch": 0.2531188937321509, + "flos": 60182335324800.0, + "grad_norm": 0.7627831080885793, + "language_loss": 0.5519464, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57227135, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.04296875, + "router_z_loss_mlp": 0.1484375, + "step": 4210, + "time_per_iteration": 4.598435640335083 + }, + { + "auxiliary_loss_clip": 0.01080071, + "auxiliary_loss_mlp": 0.01050382, + "balance_loss_clip": 1.01829123, + "balance_loss_mlp": 1.02632594, + "epoch": 0.25317901698481887, + "flos": 19681377408000.0, + "grad_norm": 2.2624597791804386, + "language_loss": 0.81174088, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.83304536, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5390625, + "step": 4211, + "time_per_iteration": 3.7780921459198 + }, + { + "auxiliary_loss_clip": 0.01079596, + "auxiliary_loss_mlp": 0.01052049, + "balance_loss_clip": 1.01995826, + "balance_loss_mlp": 1.02891338, + "epoch": 0.25323914023748684, + "flos": 24423246023040.0, + "grad_norm": 2.5516090590190235, + "language_loss": 0.79461837, + "learning_rate": 3.499601265005622e-06, + "loss": 0.8159349, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.5078125, + "step": 4212, + "time_per_iteration": 3.8367197513580322 + }, + { + "auxiliary_loss_clip": 0.01079682, + "auxiliary_loss_mlp": 0.01052, + "balance_loss_clip": 1.01683331, + "balance_loss_mlp": 1.02612257, + "epoch": 0.2532992634901548, + "flos": 25446270948480.0, + "grad_norm": 8.9593650125193, + "language_loss": 0.55548352, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.57680035, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5390625, + "step": 4213, + "time_per_iteration": 2.430173397064209 + }, + { + "auxiliary_loss_clip": 0.01080654, + "auxiliary_loss_mlp": 0.01054722, + "balance_loss_clip": 1.01762366, + "balance_loss_mlp": 1.02645326, + "epoch": 0.25335938674282277, + "flos": 18879527145600.0, + "grad_norm": 2.5672288647055144, + "language_loss": 0.66811097, + "learning_rate": 3.499085765880308e-06, + "loss": 0.68946475, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.54296875, + "step": 4214, + "time_per_iteration": 2.4070982933044434 + }, + { + "auxiliary_loss_clip": 0.01024137, + "auxiliary_loss_mlp": 0.01007202, + "balance_loss_clip": 1.00310123, + "balance_loss_mlp": 1.00936604, + "epoch": 0.25341950999549073, + "flos": 53059809588480.0, + "grad_norm": 0.8368001895756312, + "language_loss": 0.58142912, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60174251, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.1484375, + "step": 4215, + "time_per_iteration": 2.8031153678894043 + }, + { + "auxiliary_loss_clip": 0.01077465, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.01639807, + "balance_loss_mlp": 1.02462792, + "epoch": 0.2534796332481587, + "flos": 39018686567040.0, + "grad_norm": 2.0872360687700238, + "language_loss": 0.84890956, + "learning_rate": 3.498570039373066e-06, + "loss": 0.87021369, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.52734375, + "step": 4216, + "time_per_iteration": 2.538862705230713 + }, + { + "auxiliary_loss_clip": 0.01079177, + "auxiliary_loss_mlp": 0.01053292, + "balance_loss_clip": 1.01750529, + "balance_loss_mlp": 1.02618313, + "epoch": 0.2535397565008267, + "flos": 23585854129920.0, + "grad_norm": 2.388306225907289, + "language_loss": 0.81676215, + "learning_rate": 3.498312090875666e-06, + "loss": 0.83808684, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.53125, + "step": 4217, + "time_per_iteration": 2.427696704864502 + }, + { + "auxiliary_loss_clip": 0.01076765, + "auxiliary_loss_mlp": 0.01050875, + "balance_loss_clip": 1.01473069, + "balance_loss_mlp": 1.02397919, + "epoch": 0.2535998797534947, + "flos": 19280364998400.0, + "grad_norm": 2.4135125028091076, + "language_loss": 0.77307284, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.79434919, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.52734375, + "step": 4218, + "time_per_iteration": 2.3871657848358154 + }, + { + "auxiliary_loss_clip": 0.01077302, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.01611733, + "balance_loss_mlp": 1.02251899, + "epoch": 0.25366000300616265, + "flos": 24023246042880.0, + "grad_norm": 2.2065231473986207, + "language_loss": 0.7571975, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.77848351, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 4219, + "time_per_iteration": 2.406975507736206 + }, + { + "auxiliary_loss_clip": 0.01078842, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_clip": 1.02051711, + "balance_loss_mlp": 1.02452946, + "epoch": 0.2537201262588306, + "flos": 16288448060160.0, + "grad_norm": 2.3959429438860083, + "language_loss": 0.83107066, + "learning_rate": 3.497537904525736e-06, + "loss": 0.85242116, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.54296875, + "step": 4220, + "time_per_iteration": 2.3811001777648926 + }, + { + "auxiliary_loss_clip": 0.01078409, + "auxiliary_loss_mlp": 0.01051849, + "balance_loss_clip": 1.01527548, + "balance_loss_mlp": 1.02407897, + "epoch": 0.2537802495114986, + "flos": 23293561294080.0, + "grad_norm": 2.112156310622191, + "language_loss": 0.72528195, + "learning_rate": 3.497279728822468e-06, + "loss": 0.74658453, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.54296875, + "step": 4221, + "time_per_iteration": 2.410367250442505 + }, + { + "auxiliary_loss_clip": 0.01076133, + "auxiliary_loss_mlp": 0.01054893, + "balance_loss_clip": 1.01696074, + "balance_loss_mlp": 1.02340126, + "epoch": 0.25384037276416654, + "flos": 17638190288640.0, + "grad_norm": 2.2467507118204004, + "language_loss": 0.63598317, + "learning_rate": 3.497021496342202e-06, + "loss": 0.65729344, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.52734375, + "step": 4222, + "time_per_iteration": 2.3821916580200195 + }, + { + "auxiliary_loss_clip": 0.01078024, + "auxiliary_loss_mlp": 0.01060016, + "balance_loss_clip": 1.02043819, + "balance_loss_mlp": 1.02270341, + "epoch": 0.2539004960168345, + "flos": 21505973304960.0, + "grad_norm": 1.7966814899444752, + "language_loss": 0.76058519, + "learning_rate": 3.496763207094731e-06, + "loss": 0.78196555, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5546875, + "step": 4223, + "time_per_iteration": 2.377962350845337 + }, + { + "auxiliary_loss_clip": 0.01074196, + "auxiliary_loss_mlp": 0.01055211, + "balance_loss_clip": 1.02030659, + "balance_loss_mlp": 1.02262318, + "epoch": 0.2539606192695025, + "flos": 23949788808960.0, + "grad_norm": 1.8184141238450242, + "language_loss": 0.81672072, + "learning_rate": 3.49650486108985e-06, + "loss": 0.83801478, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.515625, + "step": 4224, + "time_per_iteration": 2.459200382232666 + }, + { + "auxiliary_loss_clip": 0.01073418, + "auxiliary_loss_mlp": 0.0105391, + "balance_loss_clip": 1.01979196, + "balance_loss_mlp": 1.0220263, + "epoch": 0.25402074252217044, + "flos": 24168659322240.0, + "grad_norm": 1.605796869984486, + "language_loss": 0.78215897, + "learning_rate": 3.496246458337354e-06, + "loss": 0.80343223, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.515625, + "step": 4225, + "time_per_iteration": 2.418851852416992 + }, + { + "auxiliary_loss_clip": 0.01075765, + "auxiliary_loss_mlp": 0.01066223, + "balance_loss_clip": 1.02938747, + "balance_loss_mlp": 1.02152991, + "epoch": 0.2540808657748384, + "flos": 22302831242880.0, + "grad_norm": 1.6278799069999246, + "language_loss": 0.86017859, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.88159847, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.54296875, + "step": 4226, + "time_per_iteration": 2.416271924972534 + }, + { + "auxiliary_loss_clip": 0.01073384, + "auxiliary_loss_mlp": 0.01061288, + "balance_loss_clip": 1.02473795, + "balance_loss_mlp": 1.02136099, + "epoch": 0.25414098902750637, + "flos": 27598317287040.0, + "grad_norm": 1.818038049451049, + "language_loss": 0.72167826, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.74302495, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.51953125, + "step": 4227, + "time_per_iteration": 2.4422335624694824 + }, + { + "auxiliary_loss_clip": 0.01028033, + "auxiliary_loss_mlp": 0.01022101, + "balance_loss_clip": 1.01780951, + "balance_loss_mlp": 1.01336694, + "epoch": 0.25420111228017434, + "flos": 58167847209600.0, + "grad_norm": 0.9872389784882549, + "language_loss": 0.61832708, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.6388284, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.04296875, + "router_z_loss_mlp": 0.14648438, + "step": 4228, + "time_per_iteration": 2.8637194633483887 + }, + { + "auxiliary_loss_clip": 0.01075, + "auxiliary_loss_mlp": 0.01057377, + "balance_loss_clip": 1.01965928, + "balance_loss_mlp": 1.02197385, + "epoch": 0.2542612355328423, + "flos": 11463870700800.0, + "grad_norm": 2.6015135021238214, + "language_loss": 0.88437462, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.90569836, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.53125, + "step": 4229, + "time_per_iteration": 2.3503832817077637 + }, + { + "auxiliary_loss_clip": 0.01076023, + "auxiliary_loss_mlp": 0.01059523, + "balance_loss_clip": 1.02023208, + "balance_loss_mlp": 1.02237415, + "epoch": 0.2543213587855103, + "flos": 22964784220800.0, + "grad_norm": 2.6268718272387352, + "language_loss": 0.78569907, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.80705452, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.5390625, + "step": 4230, + "time_per_iteration": 2.4031782150268555 + }, + { + "auxiliary_loss_clip": 0.01073883, + "auxiliary_loss_mlp": 0.01052579, + "balance_loss_clip": 1.01614916, + "balance_loss_mlp": 1.02172852, + "epoch": 0.2543814820381783, + "flos": 18252382659840.0, + "grad_norm": 2.236641616598609, + "language_loss": 0.77245843, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.79372311, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51953125, + "step": 4231, + "time_per_iteration": 2.3790180683135986 + }, + { + "auxiliary_loss_clip": 0.01073682, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.02079129, + "epoch": 0.25444160529084625, + "flos": 15631801608960.0, + "grad_norm": 2.8088845554065474, + "language_loss": 0.75599998, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.77726734, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.52734375, + "step": 4232, + "time_per_iteration": 2.3696837425231934 + }, + { + "auxiliary_loss_clip": 0.01074432, + "auxiliary_loss_mlp": 0.01055277, + "balance_loss_clip": 1.01763058, + "balance_loss_mlp": 1.02224135, + "epoch": 0.2545017285435142, + "flos": 24600639974400.0, + "grad_norm": 2.339636349626308, + "language_loss": 0.87682462, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.89812171, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5234375, + "step": 4233, + "time_per_iteration": 2.4111387729644775 + }, + { + "auxiliary_loss_clip": 0.01070348, + "auxiliary_loss_mlp": 0.01049752, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.02030301, + "epoch": 0.2545618517961822, + "flos": 24677972369280.0, + "grad_norm": 1.8053586146770977, + "language_loss": 0.75730693, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77850789, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.5, + "step": 4234, + "time_per_iteration": 2.4393951892852783 + }, + { + "auxiliary_loss_clip": 0.01074865, + "auxiliary_loss_mlp": 0.01051418, + "balance_loss_clip": 1.0175631, + "balance_loss_mlp": 1.02133608, + "epoch": 0.25462197504885015, + "flos": 23913967887360.0, + "grad_norm": 1.5268444188308696, + "language_loss": 0.76347482, + "learning_rate": 3.493659311850379e-06, + "loss": 0.78473771, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.53515625, + "step": 4235, + "time_per_iteration": 2.4027724266052246 + }, + { + "auxiliary_loss_clip": 0.01079846, + "auxiliary_loss_mlp": 0.01058806, + "balance_loss_clip": 1.01803708, + "balance_loss_mlp": 1.02221537, + "epoch": 0.2546820983015181, + "flos": 24788262954240.0, + "grad_norm": 2.3929245568302573, + "language_loss": 0.66879594, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.69018245, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.57421875, + "step": 4236, + "time_per_iteration": 2.428398609161377 + }, + { + "auxiliary_loss_clip": 0.01072863, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.01518595, + "balance_loss_mlp": 1.02180004, + "epoch": 0.2547422215541861, + "flos": 18733136348160.0, + "grad_norm": 1.611765520439152, + "language_loss": 0.68003571, + "learning_rate": 3.493141202562354e-06, + "loss": 0.70122612, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.51171875, + "step": 4237, + "time_per_iteration": 2.3625876903533936 + }, + { + "auxiliary_loss_clip": 0.01073095, + "auxiliary_loss_mlp": 0.01054144, + "balance_loss_clip": 1.01942992, + "balance_loss_mlp": 1.02093077, + "epoch": 0.25480234480685404, + "flos": 21031398927360.0, + "grad_norm": 2.0056956362019123, + "language_loss": 0.76623166, + "learning_rate": 3.492882062983333e-06, + "loss": 0.78750402, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5234375, + "step": 4238, + "time_per_iteration": 2.4061968326568604 + }, + { + "auxiliary_loss_clip": 0.01073569, + "auxiliary_loss_mlp": 0.01049756, + "balance_loss_clip": 1.01389802, + "balance_loss_mlp": 1.02122903, + "epoch": 0.254862468059522, + "flos": 25081009637760.0, + "grad_norm": 2.9001407981046787, + "language_loss": 0.8179217, + "learning_rate": 3.492622866794074e-06, + "loss": 0.83915496, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 4239, + "time_per_iteration": 2.4281363487243652 + }, + { + "auxiliary_loss_clip": 0.01073132, + "auxiliary_loss_mlp": 0.01044332, + "balance_loss_clip": 1.01187181, + "balance_loss_mlp": 1.02209997, + "epoch": 0.25492259131219, + "flos": 20557348220160.0, + "grad_norm": 1.9015545876714797, + "language_loss": 0.79197937, + "learning_rate": 3.492363614004407e-06, + "loss": 0.81315398, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51171875, + "step": 4240, + "time_per_iteration": 2.3879575729370117 + }, + { + "auxiliary_loss_clip": 0.01076192, + "auxiliary_loss_mlp": 0.01052943, + "balance_loss_clip": 1.01491547, + "balance_loss_mlp": 1.02124119, + "epoch": 0.25498271456485794, + "flos": 25041418289280.0, + "grad_norm": 1.8488801357307783, + "language_loss": 0.84481716, + "learning_rate": 3.492104304624162e-06, + "loss": 0.86610842, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.55078125, + "step": 4241, + "time_per_iteration": 2.3948469161987305 + }, + { + "auxiliary_loss_clip": 0.01074975, + "auxiliary_loss_mlp": 0.01057039, + "balance_loss_clip": 1.02191997, + "balance_loss_mlp": 1.02221251, + "epoch": 0.2550428378175259, + "flos": 26177177594880.0, + "grad_norm": 3.899051064071679, + "language_loss": 0.74997211, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.77129221, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.52734375, + "step": 4242, + "time_per_iteration": 2.4260313510894775 + }, + { + "auxiliary_loss_clip": 0.01073218, + "auxiliary_loss_mlp": 0.01051268, + "balance_loss_clip": 1.01785398, + "balance_loss_mlp": 1.02031958, + "epoch": 0.2551029610701939, + "flos": 15266295918720.0, + "grad_norm": 2.6260894762084375, + "language_loss": 0.75479132, + "learning_rate": 3.491585516131273e-06, + "loss": 0.7760362, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.53125, + "step": 4243, + "time_per_iteration": 2.33795166015625 + }, + { + "auxiliary_loss_clip": 0.01071864, + "auxiliary_loss_mlp": 0.01050879, + "balance_loss_clip": 1.01716638, + "balance_loss_mlp": 1.01989067, + "epoch": 0.2551630843228619, + "flos": 18111263477760.0, + "grad_norm": 1.6914865599684927, + "language_loss": 0.82899624, + "learning_rate": 3.491326037038301e-06, + "loss": 0.85022366, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.51953125, + "step": 4244, + "time_per_iteration": 2.376638412475586 + }, + { + "auxiliary_loss_clip": 0.01024, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.02717936, + "balance_loss_mlp": 1.00986123, + "epoch": 0.25522320757552985, + "flos": 70516381651200.0, + "grad_norm": 0.703523570983836, + "language_loss": 0.57700312, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5975616, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.04663086, + "router_z_loss_mlp": 0.14160156, + "step": 4245, + "time_per_iteration": 3.171160936355591 + }, + { + "auxiliary_loss_clip": 0.01074919, + "auxiliary_loss_mlp": 0.01059321, + "balance_loss_clip": 1.022843, + "balance_loss_mlp": 1.02157497, + "epoch": 0.2552833308281978, + "flos": 22891990302720.0, + "grad_norm": 2.2263416549745956, + "language_loss": 0.67205805, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.69340044, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.53125, + "step": 4246, + "time_per_iteration": 3.8074207305908203 + }, + { + "auxiliary_loss_clip": 0.0107087, + "auxiliary_loss_mlp": 0.010523, + "balance_loss_clip": 1.0207696, + "balance_loss_mlp": 1.02052116, + "epoch": 0.2553434540808658, + "flos": 22052538639360.0, + "grad_norm": 1.806780632926359, + "language_loss": 0.82441473, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.8456465, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.50390625, + "step": 4247, + "time_per_iteration": 2.3782925605773926 + }, + { + "auxiliary_loss_clip": 0.01078697, + "auxiliary_loss_mlp": 0.01063497, + "balance_loss_clip": 1.02120209, + "balance_loss_mlp": 1.02155399, + "epoch": 0.25540357733353375, + "flos": 16543279140480.0, + "grad_norm": 2.4428008515100044, + "language_loss": 0.85376298, + "learning_rate": 3.490287555252514e-06, + "loss": 0.87518489, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.5703125, + "step": 4248, + "time_per_iteration": 2.3580310344696045 + }, + { + "auxiliary_loss_clip": 0.01075163, + "auxiliary_loss_mlp": 0.01050073, + "balance_loss_clip": 1.01609826, + "balance_loss_mlp": 1.02235413, + "epoch": 0.2554637005862017, + "flos": 17564104650240.0, + "grad_norm": 2.3830630995979254, + "language_loss": 0.85323852, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.87449086, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.52734375, + "step": 4249, + "time_per_iteration": 2.356660842895508 + }, + { + "auxiliary_loss_clip": 0.01020356, + "auxiliary_loss_mlp": 0.01010562, + "balance_loss_clip": 1.00612736, + "balance_loss_mlp": 1.00685477, + "epoch": 0.2555238238388697, + "flos": 72241650996480.0, + "grad_norm": 0.7670595583513616, + "language_loss": 0.56323379, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58354294, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.04443359, + "router_z_loss_mlp": 0.13476562, + "step": 4250, + "time_per_iteration": 5.815124273300171 + }, + { + "auxiliary_loss_clip": 0.01075943, + "auxiliary_loss_mlp": 0.01053674, + "balance_loss_clip": 1.01767266, + "balance_loss_mlp": 1.02132988, + "epoch": 0.25558394709153764, + "flos": 24388262974080.0, + "grad_norm": 3.308284214456833, + "language_loss": 0.82193714, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.84323335, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.546875, + "step": 4251, + "time_per_iteration": 3.852959632873535 + }, + { + "auxiliary_loss_clip": 0.01018219, + "auxiliary_loss_mlp": 0.01016391, + "balance_loss_clip": 1.01155066, + "balance_loss_mlp": 1.00466299, + "epoch": 0.2556440703442056, + "flos": 69227772946560.0, + "grad_norm": 0.8050938264999051, + "language_loss": 0.66093105, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68127716, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.04833984, + "router_z_loss_mlp": 0.13574219, + "step": 4252, + "time_per_iteration": 3.0473873615264893 + }, + { + "auxiliary_loss_clip": 0.01075939, + "auxiliary_loss_mlp": 0.01057233, + "balance_loss_clip": 1.02540445, + "balance_loss_mlp": 1.02315128, + "epoch": 0.2557041935968736, + "flos": 24862732617600.0, + "grad_norm": 1.975591373014621, + "language_loss": 0.75268579, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.77401751, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.52734375, + "step": 4253, + "time_per_iteration": 2.439805030822754 + }, + { + "auxiliary_loss_clip": 0.01079401, + "auxiliary_loss_mlp": 0.0106021, + "balance_loss_clip": 1.02408922, + "balance_loss_mlp": 1.02637506, + "epoch": 0.25576431684954154, + "flos": 22491012804480.0, + "grad_norm": 2.0975399868434788, + "language_loss": 0.73943925, + "learning_rate": 3.488728137415357e-06, + "loss": 0.76083541, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 4254, + "time_per_iteration": 2.421982765197754 + }, + { + "auxiliary_loss_clip": 0.01082292, + "auxiliary_loss_mlp": 0.01053551, + "balance_loss_clip": 1.01571441, + "balance_loss_mlp": 1.02788973, + "epoch": 0.2558244401022095, + "flos": 19825778257920.0, + "grad_norm": 1.8300949877272896, + "language_loss": 0.82018912, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.84154749, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.54296875, + "step": 4255, + "time_per_iteration": 2.3822181224823 + }, + { + "auxiliary_loss_clip": 0.01083759, + "auxiliary_loss_mlp": 0.01057442, + "balance_loss_clip": 1.02201259, + "balance_loss_mlp": 1.03045225, + "epoch": 0.2558845633548775, + "flos": 23219405832960.0, + "grad_norm": 2.5111253924531516, + "language_loss": 0.86731452, + "learning_rate": 3.488207879742721e-06, + "loss": 0.88872647, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53515625, + "step": 4256, + "time_per_iteration": 2.4200334548950195 + }, + { + "auxiliary_loss_clip": 0.010839, + "auxiliary_loss_mlp": 0.01061762, + "balance_loss_clip": 1.02139759, + "balance_loss_mlp": 1.02821481, + "epoch": 0.2559446866075455, + "flos": 16836898608000.0, + "grad_norm": 1.8178068337876152, + "language_loss": 0.76714194, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.78859854, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5546875, + "step": 4257, + "time_per_iteration": 2.434225082397461 + }, + { + "auxiliary_loss_clip": 0.01027668, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.0271672, + "balance_loss_mlp": 1.01349211, + "epoch": 0.25600480986021346, + "flos": 57590627834880.0, + "grad_norm": 0.8153965182966899, + "language_loss": 0.65376866, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.6743592, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.04223633, + "router_z_loss_mlp": 0.14160156, + "step": 4258, + "time_per_iteration": 2.953946113586426 + }, + { + "auxiliary_loss_clip": 0.01080788, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.01574588, + "balance_loss_mlp": 1.02996993, + "epoch": 0.2560649331128814, + "flos": 27818270052480.0, + "grad_norm": 2.015320111075788, + "language_loss": 0.78186166, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.80316341, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5078125, + "step": 4259, + "time_per_iteration": 2.456165075302124 + }, + { + "auxiliary_loss_clip": 0.01029476, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.02606964, + "balance_loss_mlp": 1.01584721, + "epoch": 0.2561250563655494, + "flos": 70946896026240.0, + "grad_norm": 0.7905459937992877, + "language_loss": 0.58512831, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60573167, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.04785156, + "router_z_loss_mlp": 0.13671875, + "step": 4260, + "time_per_iteration": 3.121680736541748 + }, + { + "auxiliary_loss_clip": 0.01082973, + "auxiliary_loss_mlp": 0.01059966, + "balance_loss_clip": 1.02551413, + "balance_loss_mlp": 1.02874196, + "epoch": 0.25618517961821735, + "flos": 27011217997440.0, + "grad_norm": 1.9425785878902948, + "language_loss": 0.78015232, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.80158162, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.54296875, + "step": 4261, + "time_per_iteration": 2.455806255340576 + }, + { + "auxiliary_loss_clip": 0.0107899, + "auxiliary_loss_mlp": 0.01057792, + "balance_loss_clip": 1.02260137, + "balance_loss_mlp": 1.02658391, + "epoch": 0.2562453028708853, + "flos": 23067394306560.0, + "grad_norm": 1.9787884227091022, + "language_loss": 0.84256053, + "learning_rate": 3.486645752648842e-06, + "loss": 0.86392844, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5234375, + "step": 4262, + "time_per_iteration": 2.4539594650268555 + }, + { + "auxiliary_loss_clip": 0.01082447, + "auxiliary_loss_mlp": 0.0107765, + "balance_loss_clip": 1.03573573, + "balance_loss_mlp": 1.02535319, + "epoch": 0.2563054261235533, + "flos": 15120079678080.0, + "grad_norm": 2.1563108248189176, + "language_loss": 0.75761569, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.77921665, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.5703125, + "step": 4263, + "time_per_iteration": 2.4302217960357666 + }, + { + "auxiliary_loss_clip": 0.01079607, + "auxiliary_loss_mlp": 0.01071016, + "balance_loss_clip": 1.03434718, + "balance_loss_mlp": 1.02694726, + "epoch": 0.25636554937622125, + "flos": 27853637126400.0, + "grad_norm": 1.831008434595662, + "language_loss": 0.83279991, + "learning_rate": 3.486124592522163e-06, + "loss": 0.8543061, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.52734375, + "step": 4264, + "time_per_iteration": 2.4578843116760254 + }, + { + "auxiliary_loss_clip": 0.01078659, + "auxiliary_loss_mlp": 0.01082939, + "balance_loss_clip": 1.04421949, + "balance_loss_mlp": 1.02547669, + "epoch": 0.2564256726288892, + "flos": 28905430878720.0, + "grad_norm": 29.95908713009559, + "language_loss": 0.76675332, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.7883693, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.53125, + "step": 4265, + "time_per_iteration": 2.481652021408081 + }, + { + "auxiliary_loss_clip": 0.01074275, + "auxiliary_loss_mlp": 0.01073804, + "balance_loss_clip": 1.04161799, + "balance_loss_mlp": 1.02186966, + "epoch": 0.2564857958815572, + "flos": 18513951632640.0, + "grad_norm": 1.8090801899114544, + "language_loss": 0.83472002, + "learning_rate": 3.485603206979513e-06, + "loss": 0.85620081, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5234375, + "step": 4266, + "time_per_iteration": 2.4632465839385986 + }, + { + "auxiliary_loss_clip": 0.01073356, + "auxiliary_loss_mlp": 0.01068437, + "balance_loss_clip": 1.0355587, + "balance_loss_mlp": 1.02273524, + "epoch": 0.25654591913422514, + "flos": 25807203250560.0, + "grad_norm": 1.6783862094478106, + "language_loss": 0.80796039, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.82937831, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5078125, + "step": 4267, + "time_per_iteration": 2.4630885124206543 + }, + { + "auxiliary_loss_clip": 0.01073657, + "auxiliary_loss_mlp": 0.01079358, + "balance_loss_clip": 1.04173529, + "balance_loss_mlp": 1.02242923, + "epoch": 0.2566060423868931, + "flos": 19098642038400.0, + "grad_norm": 1.783321124905324, + "language_loss": 0.80595112, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.82748127, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.515625, + "step": 4268, + "time_per_iteration": 2.453575372695923 + }, + { + "auxiliary_loss_clip": 0.01077456, + "auxiliary_loss_mlp": 0.01083816, + "balance_loss_clip": 1.04819655, + "balance_loss_mlp": 1.02462232, + "epoch": 0.25666616563956113, + "flos": 23841523082880.0, + "grad_norm": 2.799405429202468, + "language_loss": 0.69599712, + "learning_rate": 3.484820706183595e-06, + "loss": 0.71760982, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.52734375, + "step": 4269, + "time_per_iteration": 2.4554498195648193 + }, + { + "auxiliary_loss_clip": 0.01081965, + "auxiliary_loss_mlp": 0.01072945, + "balance_loss_clip": 1.03520322, + "balance_loss_mlp": 1.02710819, + "epoch": 0.2567262888922291, + "flos": 14603574890880.0, + "grad_norm": 3.2057594178216293, + "language_loss": 0.82185954, + "learning_rate": 3.484559759962666e-06, + "loss": 0.84340858, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.55078125, + "step": 4270, + "time_per_iteration": 2.4059133529663086 + }, + { + "auxiliary_loss_clip": 0.01086131, + "auxiliary_loss_mlp": 0.0107765, + "balance_loss_clip": 1.0361414, + "balance_loss_mlp": 1.02836537, + "epoch": 0.25678641214489706, + "flos": 32921839019520.0, + "grad_norm": 4.0736051116778755, + "language_loss": 0.70268995, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.72432774, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.578125, + "step": 4271, + "time_per_iteration": 2.5596084594726562 + }, + { + "auxiliary_loss_clip": 0.01080911, + "auxiliary_loss_mlp": 0.0106466, + "balance_loss_clip": 1.02901649, + "balance_loss_mlp": 1.02565157, + "epoch": 0.256846535397565, + "flos": 24097750617600.0, + "grad_norm": 1.5163567847736132, + "language_loss": 0.88257134, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.90402704, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5546875, + "step": 4272, + "time_per_iteration": 2.467536687850952 + }, + { + "auxiliary_loss_clip": 0.01085235, + "auxiliary_loss_mlp": 0.01057126, + "balance_loss_clip": 1.02014732, + "balance_loss_mlp": 1.0310595, + "epoch": 0.256906658650233, + "flos": 19717442709120.0, + "grad_norm": 1.9756020959708118, + "language_loss": 0.83407664, + "learning_rate": 3.483776583571541e-06, + "loss": 0.85550022, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5390625, + "step": 4273, + "time_per_iteration": 2.3916962146759033 + }, + { + "auxiliary_loss_clip": 0.0108074, + "auxiliary_loss_mlp": 0.01050553, + "balance_loss_clip": 1.0176754, + "balance_loss_mlp": 1.02970731, + "epoch": 0.25696678190290095, + "flos": 22925018315520.0, + "grad_norm": 1.5446316796228075, + "language_loss": 0.78278291, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.80409586, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.51171875, + "step": 4274, + "time_per_iteration": 2.4484987258911133 + }, + { + "auxiliary_loss_clip": 0.01081847, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.01427817, + "balance_loss_mlp": 1.03109252, + "epoch": 0.2570269051555689, + "flos": 27306617944320.0, + "grad_norm": 1.708233099023805, + "language_loss": 0.85230851, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.87360954, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4275, + "time_per_iteration": 2.4452574253082275 + }, + { + "auxiliary_loss_clip": 0.01085576, + "auxiliary_loss_mlp": 0.01052944, + "balance_loss_clip": 1.01587033, + "balance_loss_mlp": 1.0306251, + "epoch": 0.2570870284082369, + "flos": 27562182163200.0, + "grad_norm": 1.9089706259764554, + "language_loss": 0.80175793, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.82314312, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.546875, + "step": 4276, + "time_per_iteration": 2.4899728298187256 + }, + { + "auxiliary_loss_clip": 0.01085685, + "auxiliary_loss_mlp": 0.010584, + "balance_loss_clip": 1.0233283, + "balance_loss_mlp": 1.03219938, + "epoch": 0.25714715166090485, + "flos": 28729573027200.0, + "grad_norm": 1.697217589466997, + "language_loss": 0.81146967, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.83291054, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.53515625, + "step": 4277, + "time_per_iteration": 2.4556496143341064 + }, + { + "auxiliary_loss_clip": 0.01082347, + "auxiliary_loss_mlp": 0.01056501, + "balance_loss_clip": 1.02250218, + "balance_loss_mlp": 1.02949142, + "epoch": 0.2572072749135728, + "flos": 20115243273600.0, + "grad_norm": 2.023292266868303, + "language_loss": 0.80322701, + "learning_rate": 3.482470164419295e-06, + "loss": 0.82461548, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.52734375, + "step": 4278, + "time_per_iteration": 2.4086005687713623 + }, + { + "auxiliary_loss_clip": 0.01088526, + "auxiliary_loss_mlp": 0.0105833, + "balance_loss_clip": 1.02495158, + "balance_loss_mlp": 1.03265941, + "epoch": 0.2572673981662408, + "flos": 26029669633920.0, + "grad_norm": 1.7604429142794675, + "language_loss": 0.76211679, + "learning_rate": 3.482208711902952e-06, + "loss": 0.78358543, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.55859375, + "step": 4279, + "time_per_iteration": 2.455392837524414 + }, + { + "auxiliary_loss_clip": 0.01084845, + "auxiliary_loss_mlp": 0.01063502, + "balance_loss_clip": 1.02843034, + "balance_loss_mlp": 1.02919197, + "epoch": 0.25732752141890874, + "flos": 16105712670720.0, + "grad_norm": 2.1840913083891245, + "language_loss": 0.88097709, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.90246058, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5546875, + "step": 4280, + "time_per_iteration": 2.403960704803467 + }, + { + "auxiliary_loss_clip": 0.01082884, + "auxiliary_loss_mlp": 0.01065631, + "balance_loss_clip": 1.03134632, + "balance_loss_mlp": 1.02858973, + "epoch": 0.2573876446715767, + "flos": 22523447324160.0, + "grad_norm": 2.358089911094378, + "language_loss": 0.81389737, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.83538258, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.54296875, + "step": 4281, + "time_per_iteration": 2.4034228324890137 + }, + { + "auxiliary_loss_clip": 0.01080557, + "auxiliary_loss_mlp": 0.01062731, + "balance_loss_clip": 1.02892327, + "balance_loss_mlp": 1.02781153, + "epoch": 0.2574477679242447, + "flos": 23949718986240.0, + "grad_norm": 2.064327533783522, + "language_loss": 0.87976706, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.90119994, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.52734375, + "step": 4282, + "time_per_iteration": 2.445082187652588 + }, + { + "auxiliary_loss_clip": 0.01082973, + "auxiliary_loss_mlp": 0.0106525, + "balance_loss_clip": 1.03125167, + "balance_loss_mlp": 1.02799535, + "epoch": 0.2575078911769127, + "flos": 21980617505280.0, + "grad_norm": 1.4812189318027709, + "language_loss": 0.71907818, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.74056041, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.55078125, + "step": 4283, + "time_per_iteration": 2.411618947982788 + }, + { + "auxiliary_loss_clip": 0.01075335, + "auxiliary_loss_mlp": 0.01060008, + "balance_loss_clip": 1.03068256, + "balance_loss_mlp": 1.02601779, + "epoch": 0.25756801442958066, + "flos": 21944307824640.0, + "grad_norm": 1.7486191456999747, + "language_loss": 0.81591856, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.83727199, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4921875, + "step": 4284, + "time_per_iteration": 2.4249141216278076 + }, + { + "auxiliary_loss_clip": 0.01077024, + "auxiliary_loss_mlp": 0.01053497, + "balance_loss_clip": 1.02197838, + "balance_loss_mlp": 1.02500796, + "epoch": 0.2576281376822486, + "flos": 35260530819840.0, + "grad_norm": 5.0290470536187115, + "language_loss": 0.72834831, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.74965358, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.51953125, + "step": 4285, + "time_per_iteration": 3.9808921813964844 + }, + { + "auxiliary_loss_clip": 0.01077469, + "auxiliary_loss_mlp": 0.01054593, + "balance_loss_clip": 1.02486229, + "balance_loss_mlp": 1.02561808, + "epoch": 0.2576882609349166, + "flos": 14131549042560.0, + "grad_norm": 1.7757241573689395, + "language_loss": 0.61003494, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.63135558, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.51953125, + "step": 4286, + "time_per_iteration": 2.3627216815948486 + }, + { + "auxiliary_loss_clip": 0.01079658, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.02956831, + "balance_loss_mlp": 1.02462351, + "epoch": 0.25774838418758456, + "flos": 23257216702080.0, + "grad_norm": 1.5883821348488658, + "language_loss": 0.66645157, + "learning_rate": 3.480115069207354e-06, + "loss": 0.68789101, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.55078125, + "step": 4287, + "time_per_iteration": 2.453338623046875 + }, + { + "auxiliary_loss_clip": 0.01080199, + "auxiliary_loss_mlp": 0.01057085, + "balance_loss_clip": 1.01970077, + "balance_loss_mlp": 1.02458453, + "epoch": 0.2578085074402525, + "flos": 22600640073600.0, + "grad_norm": 2.5709339843878287, + "language_loss": 0.72757232, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74894512, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5546875, + "step": 4288, + "time_per_iteration": 2.3893046379089355 + }, + { + "auxiliary_loss_clip": 0.01074467, + "auxiliary_loss_mlp": 0.01051579, + "balance_loss_clip": 1.0191896, + "balance_loss_mlp": 1.0236181, + "epoch": 0.2578686306929205, + "flos": 24570684161280.0, + "grad_norm": 1.4564573970265737, + "language_loss": 0.78285396, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.8041144, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.5078125, + "step": 4289, + "time_per_iteration": 3.878025531768799 + }, + { + "auxiliary_loss_clip": 0.0107451, + "auxiliary_loss_mlp": 0.01049946, + "balance_loss_clip": 1.01701999, + "balance_loss_mlp": 1.02231395, + "epoch": 0.25792875394558845, + "flos": 18112974134400.0, + "grad_norm": 2.0132656818653056, + "language_loss": 0.86195374, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.88319826, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5234375, + "step": 4290, + "time_per_iteration": 2.3555593490600586 + }, + { + "auxiliary_loss_clip": 0.01079415, + "auxiliary_loss_mlp": 0.01064891, + "balance_loss_clip": 1.02617216, + "balance_loss_mlp": 1.02396917, + "epoch": 0.2579888771982564, + "flos": 17711926813440.0, + "grad_norm": 2.156421465686289, + "language_loss": 0.74150693, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.76294994, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5546875, + "step": 4291, + "time_per_iteration": 3.763063907623291 + }, + { + "auxiliary_loss_clip": 0.01080982, + "auxiliary_loss_mlp": 0.01055242, + "balance_loss_clip": 1.0183351, + "balance_loss_mlp": 1.02642679, + "epoch": 0.2580490004509244, + "flos": 16433966073600.0, + "grad_norm": 2.501981961390261, + "language_loss": 0.83413827, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.85550046, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.546875, + "step": 4292, + "time_per_iteration": 2.4080584049224854 + }, + { + "auxiliary_loss_clip": 0.01082294, + "auxiliary_loss_mlp": 0.01056374, + "balance_loss_clip": 1.01934719, + "balance_loss_mlp": 1.02717042, + "epoch": 0.25810912370359235, + "flos": 33833840221440.0, + "grad_norm": 2.480162463713019, + "language_loss": 0.69386786, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.71525455, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.55078125, + "step": 4293, + "time_per_iteration": 2.5038866996765137 + }, + { + "auxiliary_loss_clip": 0.01076025, + "auxiliary_loss_mlp": 0.01046063, + "balance_loss_clip": 1.0139246, + "balance_loss_mlp": 1.02382469, + "epoch": 0.2581692469562603, + "flos": 25191020931840.0, + "grad_norm": 1.8233350376311408, + "language_loss": 0.77085698, + "learning_rate": 3.478280185054542e-06, + "loss": 0.7920779, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5234375, + "step": 4294, + "time_per_iteration": 2.4974441528320312 + }, + { + "auxiliary_loss_clip": 0.01078371, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_clip": 1.01787138, + "balance_loss_mlp": 1.0257678, + "epoch": 0.2582293702089283, + "flos": 34930811139840.0, + "grad_norm": 2.6860045759973494, + "language_loss": 0.82166612, + "learning_rate": 3.478017834441318e-06, + "loss": 0.84298623, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.52734375, + "step": 4295, + "time_per_iteration": 2.5160601139068604 + }, + { + "auxiliary_loss_clip": 0.01081424, + "auxiliary_loss_mlp": 0.01054695, + "balance_loss_clip": 1.01845574, + "balance_loss_mlp": 1.02506804, + "epoch": 0.2582894934615963, + "flos": 26832532325760.0, + "grad_norm": 2.099819476058287, + "language_loss": 0.73788536, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.75924653, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5625, + "step": 4296, + "time_per_iteration": 2.4435131549835205 + }, + { + "auxiliary_loss_clip": 0.01084763, + "auxiliary_loss_mlp": 0.01054907, + "balance_loss_clip": 1.01702225, + "balance_loss_mlp": 1.02905989, + "epoch": 0.25834961671426426, + "flos": 23514072641280.0, + "grad_norm": 1.613740084083968, + "language_loss": 0.87267774, + "learning_rate": 3.477492965085067e-06, + "loss": 0.89407444, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5546875, + "step": 4297, + "time_per_iteration": 2.430133581161499 + }, + { + "auxiliary_loss_clip": 0.01078622, + "auxiliary_loss_mlp": 0.01054891, + "balance_loss_clip": 1.02065432, + "balance_loss_mlp": 1.02508116, + "epoch": 0.25840973996693223, + "flos": 22450059912960.0, + "grad_norm": 11.69624748768489, + "language_loss": 0.86250389, + "learning_rate": 3.477230446361943e-06, + "loss": 0.88383907, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.53515625, + "step": 4298, + "time_per_iteration": 2.409881591796875 + }, + { + "auxiliary_loss_clip": 0.01078353, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_clip": 1.02067792, + "balance_loss_mlp": 1.02547216, + "epoch": 0.2584698632196002, + "flos": 11290072619520.0, + "grad_norm": 2.4091422014281187, + "language_loss": 0.85478818, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.87611401, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.53125, + "step": 4299, + "time_per_iteration": 2.384657621383667 + }, + { + "auxiliary_loss_clip": 0.0107658, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0153321, + "balance_loss_mlp": 1.0256474, + "epoch": 0.25852998647226816, + "flos": 17929051758720.0, + "grad_norm": 2.3671988414562306, + "language_loss": 0.84647644, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.86768281, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.5078125, + "step": 4300, + "time_per_iteration": 2.3479833602905273 + }, + { + "auxiliary_loss_clip": 0.01079464, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.02187228, + "balance_loss_mlp": 1.02391315, + "epoch": 0.2585901097249361, + "flos": 33254700721920.0, + "grad_norm": 1.975697322289114, + "language_loss": 0.69756126, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.71892387, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5546875, + "step": 4301, + "time_per_iteration": 2.517148971557617 + }, + { + "auxiliary_loss_clip": 0.0108036, + "auxiliary_loss_mlp": 0.01055845, + "balance_loss_clip": 1.02113104, + "balance_loss_mlp": 1.02428532, + "epoch": 0.2586502329776041, + "flos": 18440319841920.0, + "grad_norm": 2.779357530837604, + "language_loss": 0.84289163, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.86425364, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5625, + "step": 4302, + "time_per_iteration": 2.3479785919189453 + }, + { + "auxiliary_loss_clip": 0.01078988, + "auxiliary_loss_mlp": 0.01055101, + "balance_loss_clip": 1.02188993, + "balance_loss_mlp": 1.02549314, + "epoch": 0.25871035623027205, + "flos": 17967141918720.0, + "grad_norm": 1.9129157406621091, + "language_loss": 0.93693447, + "learning_rate": 3.475917012694595e-06, + "loss": 0.95827538, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.53125, + "step": 4303, + "time_per_iteration": 2.3682994842529297 + }, + { + "auxiliary_loss_clip": 0.01077102, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.01723945, + "balance_loss_mlp": 1.02414727, + "epoch": 0.25877047948294, + "flos": 27776618933760.0, + "grad_norm": 2.181178004725439, + "language_loss": 0.68600273, + "learning_rate": 3.475654158020507e-06, + "loss": 0.70728755, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.53125, + "step": 4304, + "time_per_iteration": 2.420952796936035 + }, + { + "auxiliary_loss_clip": 0.01076297, + "auxiliary_loss_mlp": 0.01055128, + "balance_loss_clip": 1.01936483, + "balance_loss_mlp": 1.02266169, + "epoch": 0.258830602735608, + "flos": 27124615693440.0, + "grad_norm": 3.0748785583992606, + "language_loss": 0.74593812, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.76725233, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.53515625, + "step": 4305, + "time_per_iteration": 2.441922664642334 + }, + { + "auxiliary_loss_clip": 0.01077771, + "auxiliary_loss_mlp": 0.01058685, + "balance_loss_clip": 1.02416205, + "balance_loss_mlp": 1.02269244, + "epoch": 0.25889072598827595, + "flos": 17890612485120.0, + "grad_norm": 2.057286559420386, + "language_loss": 0.77135795, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.79272246, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.546875, + "step": 4306, + "time_per_iteration": 2.3484416007995605 + }, + { + "auxiliary_loss_clip": 0.01028255, + "auxiliary_loss_mlp": 0.010082, + "balance_loss_clip": 1.00171506, + "balance_loss_mlp": 1.01027, + "epoch": 0.2589508492409439, + "flos": 53932184530560.0, + "grad_norm": 0.85194137655323, + "language_loss": 0.57224, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59260458, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.06494141, + "router_z_loss_mlp": 0.1796875, + "step": 4307, + "time_per_iteration": 2.961881399154663 + }, + { + "auxiliary_loss_clip": 0.010739, + "auxiliary_loss_mlp": 0.01052426, + "balance_loss_clip": 1.02112174, + "balance_loss_mlp": 1.02252841, + "epoch": 0.2590109724936119, + "flos": 22124739064320.0, + "grad_norm": 1.9048236111702301, + "language_loss": 0.73339623, + "learning_rate": 3.474602179854327e-06, + "loss": 0.75465947, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.51171875, + "step": 4308, + "time_per_iteration": 2.3959856033325195 + }, + { + "auxiliary_loss_clip": 0.01076503, + "auxiliary_loss_mlp": 0.01057708, + "balance_loss_clip": 1.02432907, + "balance_loss_mlp": 1.02251339, + "epoch": 0.2590710957462799, + "flos": 13473610871040.0, + "grad_norm": 1.8991488174520352, + "language_loss": 0.85895729, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.88029939, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5390625, + "step": 4309, + "time_per_iteration": 2.3701188564300537 + }, + { + "auxiliary_loss_clip": 0.0107448, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.01806831, + "balance_loss_mlp": 1.02249098, + "epoch": 0.25913121899894787, + "flos": 22306077999360.0, + "grad_norm": 1.8632425074194043, + "language_loss": 0.85792696, + "learning_rate": 3.474075855228966e-06, + "loss": 0.87917101, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.51953125, + "step": 4310, + "time_per_iteration": 2.3835034370422363 + }, + { + "auxiliary_loss_clip": 0.01077441, + "auxiliary_loss_mlp": 0.0106008, + "balance_loss_clip": 1.02517533, + "balance_loss_mlp": 1.02280116, + "epoch": 0.25919134225161583, + "flos": 25810554741120.0, + "grad_norm": 1.9857849584725553, + "language_loss": 0.78891599, + "learning_rate": 3.473812609065639e-06, + "loss": 0.81029117, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.546875, + "step": 4311, + "time_per_iteration": 2.4389965534210205 + }, + { + "auxiliary_loss_clip": 0.01074712, + "auxiliary_loss_mlp": 0.01060278, + "balance_loss_clip": 1.02806783, + "balance_loss_mlp": 1.02120852, + "epoch": 0.2592514655042838, + "flos": 31210920109440.0, + "grad_norm": 1.8730700469754076, + "language_loss": 0.73415285, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.75550276, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5390625, + "step": 4312, + "time_per_iteration": 2.4606926441192627 + }, + { + "auxiliary_loss_clip": 0.010753, + "auxiliary_loss_mlp": 0.0106053, + "balance_loss_clip": 1.0257926, + "balance_loss_mlp": 1.02332914, + "epoch": 0.25931158875695176, + "flos": 18474115904640.0, + "grad_norm": 2.9162153715125, + "language_loss": 0.72596979, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.74732804, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51953125, + "step": 4313, + "time_per_iteration": 2.372936248779297 + }, + { + "auxiliary_loss_clip": 0.01073779, + "auxiliary_loss_mlp": 0.01057046, + "balance_loss_clip": 1.02667141, + "balance_loss_mlp": 1.02237725, + "epoch": 0.2593717120096197, + "flos": 19206942675840.0, + "grad_norm": 1.959846323404861, + "language_loss": 0.81772751, + "learning_rate": 3.473022535292867e-06, + "loss": 0.83903575, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.515625, + "step": 4314, + "time_per_iteration": 2.431692123413086 + }, + { + "auxiliary_loss_clip": 0.01077686, + "auxiliary_loss_mlp": 0.01062009, + "balance_loss_clip": 1.02658057, + "balance_loss_mlp": 1.02181053, + "epoch": 0.2594318352622877, + "flos": 31246775942400.0, + "grad_norm": 1.9997072996560323, + "language_loss": 0.67968279, + "learning_rate": 3.472759065640968e-06, + "loss": 0.70107973, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.55859375, + "step": 4315, + "time_per_iteration": 2.4657020568847656 + }, + { + "auxiliary_loss_clip": 0.01073769, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.02110469, + "balance_loss_mlp": 1.02238154, + "epoch": 0.25949195851495566, + "flos": 22236042078720.0, + "grad_norm": 3.3355826127461725, + "language_loss": 0.80552888, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.82678962, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.515625, + "step": 4316, + "time_per_iteration": 2.3831307888031006 + }, + { + "auxiliary_loss_clip": 0.01074309, + "auxiliary_loss_mlp": 0.01055821, + "balance_loss_clip": 1.02017689, + "balance_loss_mlp": 1.02131653, + "epoch": 0.2595520817676236, + "flos": 28074427764480.0, + "grad_norm": 1.6447468009322173, + "language_loss": 0.78872555, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.81002688, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.53125, + "step": 4317, + "time_per_iteration": 2.4524190425872803 + }, + { + "auxiliary_loss_clip": 0.01075491, + "auxiliary_loss_mlp": 0.01061875, + "balance_loss_clip": 1.02654147, + "balance_loss_mlp": 1.0236156, + "epoch": 0.2596122050202916, + "flos": 20189992227840.0, + "grad_norm": 2.215793343012506, + "language_loss": 0.7896806, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.81105429, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.51953125, + "step": 4318, + "time_per_iteration": 2.3714840412139893 + }, + { + "auxiliary_loss_clip": 0.01074131, + "auxiliary_loss_mlp": 0.01049109, + "balance_loss_clip": 1.0146575, + "balance_loss_mlp": 1.02261424, + "epoch": 0.25967232827295955, + "flos": 22526868637440.0, + "grad_norm": 1.741170785961475, + "language_loss": 0.77653337, + "learning_rate": 3.471704628661598e-06, + "loss": 0.79776579, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4319, + "time_per_iteration": 2.447252035140991 + }, + { + "auxiliary_loss_clip": 0.01070707, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.01963925, + "balance_loss_mlp": 1.02034926, + "epoch": 0.2597324515256275, + "flos": 21067219848960.0, + "grad_norm": 2.243173585820958, + "language_loss": 0.77908164, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.80029118, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.50390625, + "step": 4320, + "time_per_iteration": 2.3880057334899902 + }, + { + "auxiliary_loss_clip": 0.01075448, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.02012193, + "balance_loss_mlp": 1.02245069, + "epoch": 0.2597925747782955, + "flos": 22049047503360.0, + "grad_norm": 6.458865626837107, + "language_loss": 0.72355795, + "learning_rate": 3.471177075288801e-06, + "loss": 0.74483371, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.53125, + "step": 4321, + "time_per_iteration": 2.395052909851074 + }, + { + "auxiliary_loss_clip": 0.0107745, + "auxiliary_loss_mlp": 0.01055753, + "balance_loss_clip": 1.01975167, + "balance_loss_mlp": 1.02268028, + "epoch": 0.2598526980309635, + "flos": 19535929217280.0, + "grad_norm": 2.5338826503220275, + "language_loss": 0.76287216, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.78420419, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.546875, + "step": 4322, + "time_per_iteration": 2.376047134399414 + }, + { + "auxiliary_loss_clip": 0.01074656, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.01992202, + "balance_loss_mlp": 1.02162707, + "epoch": 0.25991282128363147, + "flos": 24494154727680.0, + "grad_norm": 2.118315046902038, + "language_loss": 0.7482574, + "learning_rate": 3.470649298767278e-06, + "loss": 0.76957762, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.53125, + "step": 4323, + "time_per_iteration": 2.429680347442627 + }, + { + "auxiliary_loss_clip": 0.01078464, + "auxiliary_loss_mlp": 0.01053606, + "balance_loss_clip": 1.01624537, + "balance_loss_mlp": 1.02164388, + "epoch": 0.25997294453629943, + "flos": 24200465437440.0, + "grad_norm": 1.830521610321595, + "language_loss": 0.68545103, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.70677173, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5703125, + "step": 4324, + "time_per_iteration": 3.7968106269836426 + }, + { + "auxiliary_loss_clip": 0.01073215, + "auxiliary_loss_mlp": 0.01051081, + "balance_loss_clip": 1.01857305, + "balance_loss_mlp": 1.02189898, + "epoch": 0.2600330677889674, + "flos": 31430104824960.0, + "grad_norm": 2.0957276456776612, + "language_loss": 0.72408247, + "learning_rate": 3.470121299177082e-06, + "loss": 0.74532539, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51171875, + "step": 4325, + "time_per_iteration": 2.4652199745178223 + }, + { + "auxiliary_loss_clip": 0.01073369, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_clip": 1.02138686, + "balance_loss_mlp": 1.02027774, + "epoch": 0.26009319104163536, + "flos": 32265262391040.0, + "grad_norm": 2.472866592451999, + "language_loss": 0.74793828, + "learning_rate": 3.469857215756257e-06, + "loss": 0.76922512, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.53125, + "step": 4326, + "time_per_iteration": 2.461482048034668 + }, + { + "auxiliary_loss_clip": 0.01070659, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_clip": 1.01472414, + "balance_loss_mlp": 1.02038574, + "epoch": 0.26015331429430333, + "flos": 26285548055040.0, + "grad_norm": 1.793825952644206, + "language_loss": 0.88048708, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.90163654, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.50390625, + "step": 4327, + "time_per_iteration": 2.442293167114258 + }, + { + "auxiliary_loss_clip": 0.0107843, + "auxiliary_loss_mlp": 0.0106121, + "balance_loss_clip": 1.02091765, + "balance_loss_mlp": 1.02286947, + "epoch": 0.2602134375469713, + "flos": 21141270576000.0, + "grad_norm": 2.058602752696256, + "language_loss": 0.81928498, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.84068143, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5546875, + "step": 4328, + "time_per_iteration": 2.384488821029663 + }, + { + "auxiliary_loss_clip": 0.01071583, + "auxiliary_loss_mlp": 0.01048263, + "balance_loss_clip": 1.01710176, + "balance_loss_mlp": 1.02037001, + "epoch": 0.26027356079963926, + "flos": 25920147098880.0, + "grad_norm": 1.5834764506403771, + "language_loss": 0.89003396, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.91123241, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.51171875, + "step": 4329, + "time_per_iteration": 3.944563627243042 + }, + { + "auxiliary_loss_clip": 0.01072424, + "auxiliary_loss_mlp": 0.01045391, + "balance_loss_clip": 1.01493263, + "balance_loss_mlp": 1.02175605, + "epoch": 0.2603336840523072, + "flos": 26358027770880.0, + "grad_norm": 2.0251589679589266, + "language_loss": 0.78649986, + "learning_rate": 3.468800324801802e-06, + "loss": 0.80767804, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.5078125, + "step": 4330, + "time_per_iteration": 2.440662384033203 + }, + { + "auxiliary_loss_clip": 0.01079927, + "auxiliary_loss_mlp": 0.01054726, + "balance_loss_clip": 1.02056062, + "balance_loss_mlp": 1.02399313, + "epoch": 0.2603938073049752, + "flos": 23512536541440.0, + "grad_norm": 1.5510334898907268, + "language_loss": 0.76541388, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.78676045, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5625, + "step": 4331, + "time_per_iteration": 3.794708728790283 + }, + { + "auxiliary_loss_clip": 0.010733, + "auxiliary_loss_mlp": 0.01054805, + "balance_loss_clip": 1.0239892, + "balance_loss_mlp": 1.02238512, + "epoch": 0.26045393055764315, + "flos": 25373127916800.0, + "grad_norm": 1.4035966223016234, + "language_loss": 0.7041899, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.72547096, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.5078125, + "step": 4332, + "time_per_iteration": 2.4303629398345947 + }, + { + "auxiliary_loss_clip": 0.01075134, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_clip": 1.01844633, + "balance_loss_mlp": 1.02149844, + "epoch": 0.2605140538103111, + "flos": 27634068385920.0, + "grad_norm": 2.454597704683902, + "language_loss": 0.81336772, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.83465707, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53515625, + "step": 4333, + "time_per_iteration": 2.427786111831665 + }, + { + "auxiliary_loss_clip": 0.01072847, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.01577163, + "balance_loss_mlp": 1.02106941, + "epoch": 0.2605741770629791, + "flos": 13769045729280.0, + "grad_norm": 1.9362733487493158, + "language_loss": 0.81186533, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83307409, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.515625, + "step": 4334, + "time_per_iteration": 2.3746626377105713 + }, + { + "auxiliary_loss_clip": 0.01074322, + "auxiliary_loss_mlp": 0.01051257, + "balance_loss_clip": 1.01647139, + "balance_loss_mlp": 1.02155924, + "epoch": 0.26063430031564705, + "flos": 26030472595200.0, + "grad_norm": 1.7592378062730563, + "language_loss": 0.80796647, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82922232, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53125, + "step": 4335, + "time_per_iteration": 2.455803871154785 + }, + { + "auxiliary_loss_clip": 0.01023571, + "auxiliary_loss_mlp": 0.01011124, + "balance_loss_clip": 1.00640333, + "balance_loss_mlp": 1.00821912, + "epoch": 0.26069442356831507, + "flos": 62440587619200.0, + "grad_norm": 0.8304028826942695, + "language_loss": 0.60752141, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62786841, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.04711914, + "router_z_loss_mlp": 0.15332031, + "step": 4336, + "time_per_iteration": 3.0014219284057617 + }, + { + "auxiliary_loss_clip": 0.01074725, + "auxiliary_loss_mlp": 0.01054559, + "balance_loss_clip": 1.02172852, + "balance_loss_mlp": 1.02164316, + "epoch": 0.26075454682098304, + "flos": 13625517663360.0, + "grad_norm": 1.7855990358468834, + "language_loss": 0.79169303, + "learning_rate": 3.46694862168102e-06, + "loss": 0.8129859, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.53125, + "step": 4337, + "time_per_iteration": 2.3511011600494385 + }, + { + "auxiliary_loss_clip": 0.01076179, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.01846409, + "balance_loss_mlp": 1.02160656, + "epoch": 0.260814670073651, + "flos": 12125823678720.0, + "grad_norm": 2.347252809973351, + "language_loss": 0.76598531, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.78728414, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 4338, + "time_per_iteration": 2.3604750633239746 + }, + { + "auxiliary_loss_clip": 0.01076979, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.02058506, + "balance_loss_mlp": 1.02260697, + "epoch": 0.26087479332631897, + "flos": 15121615777920.0, + "grad_norm": 2.1654072602600523, + "language_loss": 0.81942666, + "learning_rate": 3.466419062854447e-06, + "loss": 0.8407644, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.54296875, + "step": 4339, + "time_per_iteration": 2.359541654586792 + }, + { + "auxiliary_loss_clip": 0.01072547, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.01473713, + "balance_loss_mlp": 1.02153528, + "epoch": 0.26093491657898693, + "flos": 24679787760000.0, + "grad_norm": 1.7265116503646307, + "language_loss": 0.77139068, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.79257655, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.5078125, + "step": 4340, + "time_per_iteration": 2.4139230251312256 + }, + { + "auxiliary_loss_clip": 0.01076319, + "auxiliary_loss_mlp": 0.01052298, + "balance_loss_clip": 1.0182755, + "balance_loss_mlp": 1.02203894, + "epoch": 0.2609950398316549, + "flos": 25115050080000.0, + "grad_norm": 1.677140175040068, + "language_loss": 0.83773458, + "learning_rate": 3.465889281600845e-06, + "loss": 0.85902083, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.54296875, + "step": 4341, + "time_per_iteration": 2.4035019874572754 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.0105677, + "balance_loss_clip": 1.02081656, + "balance_loss_mlp": 1.02171755, + "epoch": 0.26105516308432286, + "flos": 28547326396800.0, + "grad_norm": 1.8356666906858434, + "language_loss": 0.77731442, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79861969, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.51953125, + "step": 4342, + "time_per_iteration": 2.445312023162842 + }, + { + "auxiliary_loss_clip": 0.0107335, + "auxiliary_loss_mlp": 0.01053596, + "balance_loss_clip": 1.02012241, + "balance_loss_mlp": 1.02118945, + "epoch": 0.2611152863369908, + "flos": 39529046954880.0, + "grad_norm": 1.8529670231957953, + "language_loss": 0.67540729, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.69667673, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5234375, + "step": 4343, + "time_per_iteration": 2.5378615856170654 + }, + { + "auxiliary_loss_clip": 0.0107292, + "auxiliary_loss_mlp": 0.01054901, + "balance_loss_clip": 1.02178502, + "balance_loss_mlp": 1.01954246, + "epoch": 0.2611754095896588, + "flos": 13734481616640.0, + "grad_norm": 2.3268496488028405, + "language_loss": 0.75760335, + "learning_rate": 3.465094192845553e-06, + "loss": 0.77888155, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.53515625, + "step": 4344, + "time_per_iteration": 2.369339942932129 + }, + { + "auxiliary_loss_clip": 0.01074775, + "auxiliary_loss_mlp": 0.01056119, + "balance_loss_clip": 1.01918781, + "balance_loss_mlp": 1.0217073, + "epoch": 0.26123553284232676, + "flos": 21505589280000.0, + "grad_norm": 2.131038828039069, + "language_loss": 0.88215935, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.90346837, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53125, + "step": 4345, + "time_per_iteration": 2.381333351135254 + }, + { + "auxiliary_loss_clip": 0.01071246, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.0174334, + "balance_loss_mlp": 1.02142692, + "epoch": 0.2612956560949947, + "flos": 21138791869440.0, + "grad_norm": 1.9422444996717085, + "language_loss": 0.77984381, + "learning_rate": 3.464563855876015e-06, + "loss": 0.80103219, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.49804688, + "step": 4346, + "time_per_iteration": 2.441781997680664 + }, + { + "auxiliary_loss_clip": 0.01072756, + "auxiliary_loss_mlp": 0.01054135, + "balance_loss_clip": 1.02054191, + "balance_loss_mlp": 1.02086449, + "epoch": 0.2613557793476627, + "flos": 25117842988800.0, + "grad_norm": 2.158721893466456, + "language_loss": 0.77314299, + "learning_rate": 3.464298604081606e-06, + "loss": 0.79441184, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51953125, + "step": 4347, + "time_per_iteration": 2.4101173877716064 + }, + { + "auxiliary_loss_clip": 0.01071502, + "auxiliary_loss_mlp": 0.01046581, + "balance_loss_clip": 1.01549101, + "balance_loss_mlp": 1.02074325, + "epoch": 0.26141590260033065, + "flos": 26066502984960.0, + "grad_norm": 1.4060562024447225, + "language_loss": 0.74727488, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.76845562, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.5078125, + "step": 4348, + "time_per_iteration": 2.471334457397461 + }, + { + "auxiliary_loss_clip": 0.01073311, + "auxiliary_loss_mlp": 0.01051702, + "balance_loss_clip": 1.01818085, + "balance_loss_mlp": 1.01989734, + "epoch": 0.2614760258529987, + "flos": 25700368890240.0, + "grad_norm": 2.01920030646507, + "language_loss": 0.93448657, + "learning_rate": 3.463767933923799e-06, + "loss": 0.95573664, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.53125, + "step": 4349, + "time_per_iteration": 2.4081010818481445 + }, + { + "auxiliary_loss_clip": 0.01071766, + "auxiliary_loss_mlp": 0.01044168, + "balance_loss_clip": 1.01243472, + "balance_loss_mlp": 1.02092195, + "epoch": 0.26153614910566664, + "flos": 17456188037760.0, + "grad_norm": 2.9429252964090558, + "language_loss": 0.81996727, + "learning_rate": 3.463502515580524e-06, + "loss": 0.84112668, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.5078125, + "step": 4350, + "time_per_iteration": 2.3827712535858154 + }, + { + "auxiliary_loss_clip": 0.01070446, + "auxiliary_loss_mlp": 0.01050832, + "balance_loss_clip": 1.02019525, + "balance_loss_mlp": 1.02010953, + "epoch": 0.2615962723583346, + "flos": 17711856990720.0, + "grad_norm": 5.252955689955155, + "language_loss": 0.65117335, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.67238617, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.50390625, + "step": 4351, + "time_per_iteration": 2.3749914169311523 + }, + { + "auxiliary_loss_clip": 0.01073416, + "auxiliary_loss_mlp": 0.01055108, + "balance_loss_clip": 1.01822484, + "balance_loss_mlp": 1.02055144, + "epoch": 0.26165639561100257, + "flos": 23256623208960.0, + "grad_norm": 1.9978575640111786, + "language_loss": 0.85868651, + "learning_rate": 3.462971512415555e-06, + "loss": 0.87997174, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53125, + "step": 4352, + "time_per_iteration": 2.4563987255096436 + }, + { + "auxiliary_loss_clip": 0.01023088, + "auxiliary_loss_mlp": 0.01008059, + "balance_loss_clip": 1.00250423, + "balance_loss_mlp": 1.00789118, + "epoch": 0.26171651886367053, + "flos": 66734940026880.0, + "grad_norm": 0.7960481905605304, + "language_loss": 0.7066282, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72693968, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.15234375, + "step": 4353, + "time_per_iteration": 2.9228315353393555 + }, + { + "auxiliary_loss_clip": 0.01071141, + "auxiliary_loss_mlp": 0.01058067, + "balance_loss_clip": 1.02406907, + "balance_loss_mlp": 1.01957369, + "epoch": 0.2617766421163385, + "flos": 22348392433920.0, + "grad_norm": 1.9106782608450519, + "language_loss": 0.79149318, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.81278527, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.515625, + "step": 4354, + "time_per_iteration": 2.4554953575134277 + }, + { + "auxiliary_loss_clip": 0.01075195, + "auxiliary_loss_mlp": 0.01055758, + "balance_loss_clip": 1.01892233, + "balance_loss_mlp": 1.02012491, + "epoch": 0.26183676536900646, + "flos": 26065944403200.0, + "grad_norm": 2.37505933932333, + "language_loss": 0.70129329, + "learning_rate": 3.462174591623085e-06, + "loss": 0.72260278, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.55078125, + "step": 4355, + "time_per_iteration": 2.4110467433929443 + }, + { + "auxiliary_loss_clip": 0.0107322, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.01147342, + "balance_loss_mlp": 1.02161407, + "epoch": 0.26189688862167443, + "flos": 20995403448960.0, + "grad_norm": 2.0618172212860983, + "language_loss": 0.69353861, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.71472579, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.515625, + "step": 4356, + "time_per_iteration": 2.435774564743042 + }, + { + "auxiliary_loss_clip": 0.01020801, + "auxiliary_loss_mlp": 0.01005474, + "balance_loss_clip": 0.99991935, + "balance_loss_mlp": 1.00536466, + "epoch": 0.2619570118743424, + "flos": 65795007870720.0, + "grad_norm": 0.6817224665486104, + "language_loss": 0.53214431, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55240709, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.15429688, + "step": 4357, + "time_per_iteration": 2.9298925399780273 + }, + { + "auxiliary_loss_clip": 0.01073918, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_clip": 1.02298093, + "balance_loss_mlp": 1.02068233, + "epoch": 0.26201713512701036, + "flos": 28765568505600.0, + "grad_norm": 1.9020245968283926, + "language_loss": 0.8606267, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.88193095, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.53125, + "step": 4358, + "time_per_iteration": 2.434086561203003 + }, + { + "auxiliary_loss_clip": 0.01076251, + "auxiliary_loss_mlp": 0.01057127, + "balance_loss_clip": 1.02181685, + "balance_loss_mlp": 1.02151835, + "epoch": 0.2620772583796783, + "flos": 26431310448000.0, + "grad_norm": 1.9928014143877297, + "language_loss": 0.68779707, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.70913082, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.546875, + "step": 4359, + "time_per_iteration": 2.4158825874328613 + }, + { + "auxiliary_loss_clip": 0.01072657, + "auxiliary_loss_mlp": 0.01049641, + "balance_loss_clip": 1.01809788, + "balance_loss_mlp": 1.02093148, + "epoch": 0.2621373816323463, + "flos": 20155532849280.0, + "grad_norm": 1.8390268760174922, + "language_loss": 0.79267991, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.81390297, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.51953125, + "step": 4360, + "time_per_iteration": 2.4023942947387695 + }, + { + "auxiliary_loss_clip": 0.01071282, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.0165658, + "balance_loss_mlp": 1.02061868, + "epoch": 0.26219750488501425, + "flos": 28619980669440.0, + "grad_norm": 1.7844327970749396, + "language_loss": 0.69250047, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.71367979, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.5078125, + "step": 4361, + "time_per_iteration": 2.429924249649048 + }, + { + "auxiliary_loss_clip": 0.01073374, + "auxiliary_loss_mlp": 0.01056513, + "balance_loss_clip": 1.02153707, + "balance_loss_mlp": 1.02107918, + "epoch": 0.2622576281376823, + "flos": 15041839587840.0, + "grad_norm": 1.9532582427800615, + "language_loss": 0.85483241, + "learning_rate": 3.46031316964119e-06, + "loss": 0.8761313, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5234375, + "step": 4362, + "time_per_iteration": 2.394083261489868 + }, + { + "auxiliary_loss_clip": 0.0107293, + "auxiliary_loss_mlp": 0.01054282, + "balance_loss_clip": 1.01999736, + "balance_loss_mlp": 1.02152908, + "epoch": 0.26231775139035024, + "flos": 26394965856000.0, + "grad_norm": 2.8871697402762777, + "language_loss": 0.66678905, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.68806118, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 4363, + "time_per_iteration": 2.4192521572113037 + }, + { + "auxiliary_loss_clip": 0.01018608, + "auxiliary_loss_mlp": 0.01012827, + "balance_loss_clip": 1.00696206, + "balance_loss_mlp": 1.00333714, + "epoch": 0.2623778746430182, + "flos": 65405341653120.0, + "grad_norm": 0.8948577204897447, + "language_loss": 0.61236382, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63267815, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.15234375, + "step": 4364, + "time_per_iteration": 4.489120721817017 + }, + { + "auxiliary_loss_clip": 0.01075468, + "auxiliary_loss_mlp": 0.01053017, + "balance_loss_clip": 1.0181365, + "balance_loss_mlp": 1.02231574, + "epoch": 0.26243799789568617, + "flos": 12603400433280.0, + "grad_norm": 2.6068395661194175, + "language_loss": 0.74981117, + "learning_rate": 3.459514586533184e-06, + "loss": 0.77109599, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.53125, + "step": 4365, + "time_per_iteration": 2.3775956630706787 + }, + { + "auxiliary_loss_clip": 0.01073562, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_clip": 1.01991415, + "balance_loss_mlp": 1.02181947, + "epoch": 0.26249812114835414, + "flos": 28622494287360.0, + "grad_norm": 1.678796955104702, + "language_loss": 0.7851603, + "learning_rate": 3.459248281460509e-06, + "loss": 0.80641454, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.515625, + "step": 4366, + "time_per_iteration": 2.4507999420166016 + }, + { + "auxiliary_loss_clip": 0.01074393, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.0170604, + "balance_loss_mlp": 1.02195668, + "epoch": 0.2625582444010221, + "flos": 14464515479040.0, + "grad_norm": 1.8096582470263507, + "language_loss": 0.77484095, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.79608214, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5234375, + "step": 4367, + "time_per_iteration": 2.370509386062622 + }, + { + "auxiliary_loss_clip": 0.01071435, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.01231027, + "balance_loss_mlp": 1.02135015, + "epoch": 0.26261836765369007, + "flos": 16612372454400.0, + "grad_norm": 2.551111189414717, + "language_loss": 0.71039283, + "learning_rate": 3.458715505320736e-06, + "loss": 0.73153204, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.5, + "step": 4368, + "time_per_iteration": 5.412498235702515 + }, + { + "auxiliary_loss_clip": 0.01071168, + "auxiliary_loss_mlp": 0.01051672, + "balance_loss_clip": 1.01683927, + "balance_loss_mlp": 1.02039039, + "epoch": 0.26267849090635803, + "flos": 20518943857920.0, + "grad_norm": 4.33302994961259, + "language_loss": 0.80292642, + "learning_rate": 3.458449034273841e-06, + "loss": 0.82415485, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5078125, + "step": 4369, + "time_per_iteration": 2.4068639278411865 + }, + { + "auxiliary_loss_clip": 0.01070505, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_clip": 1.01602495, + "balance_loss_mlp": 1.02002275, + "epoch": 0.262738614159026, + "flos": 21322888801920.0, + "grad_norm": 1.884842959364299, + "language_loss": 0.84700143, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.86819875, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.50390625, + "step": 4370, + "time_per_iteration": 3.8132314682006836 + }, + { + "auxiliary_loss_clip": 0.01076014, + "auxiliary_loss_mlp": 0.01054178, + "balance_loss_clip": 1.01924992, + "balance_loss_mlp": 1.02244735, + "epoch": 0.26279873741169396, + "flos": 17602613746560.0, + "grad_norm": 1.6993829999798655, + "language_loss": 0.72578192, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.74708378, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5390625, + "step": 4371, + "time_per_iteration": 2.377772808074951 + }, + { + "auxiliary_loss_clip": 0.01018091, + "auxiliary_loss_mlp": 0.01015393, + "balance_loss_clip": 1.01000428, + "balance_loss_mlp": 1.00286913, + "epoch": 0.2628588606643619, + "flos": 60946514363520.0, + "grad_norm": 0.7044636279822712, + "language_loss": 0.56634283, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58667767, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.05395508, + "router_z_loss_mlp": 0.15234375, + "step": 4372, + "time_per_iteration": 3.132427453994751 + }, + { + "auxiliary_loss_clip": 0.01069466, + "auxiliary_loss_mlp": 0.01048252, + "balance_loss_clip": 1.01691175, + "balance_loss_mlp": 1.02094471, + "epoch": 0.2629189839170299, + "flos": 27015093158400.0, + "grad_norm": 1.556597330961399, + "language_loss": 0.78949457, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.81067181, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.484375, + "step": 4373, + "time_per_iteration": 2.425274610519409 + }, + { + "auxiliary_loss_clip": 0.01070972, + "auxiliary_loss_mlp": 0.01048908, + "balance_loss_clip": 1.01736569, + "balance_loss_mlp": 1.02056718, + "epoch": 0.26297910716969786, + "flos": 17018900858880.0, + "grad_norm": 2.545545797527707, + "language_loss": 0.72815669, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.74935544, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.50390625, + "step": 4374, + "time_per_iteration": 2.4075911045074463 + }, + { + "auxiliary_loss_clip": 0.01071875, + "auxiliary_loss_mlp": 0.01054828, + "balance_loss_clip": 1.02056718, + "balance_loss_mlp": 1.02003694, + "epoch": 0.2630392304223659, + "flos": 24896284300800.0, + "grad_norm": 1.7072455381322775, + "language_loss": 0.82015687, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.84142387, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4375, + "time_per_iteration": 2.440300703048706 + }, + { + "auxiliary_loss_clip": 0.01069186, + "auxiliary_loss_mlp": 0.01050676, + "balance_loss_clip": 1.01801252, + "balance_loss_mlp": 1.02030897, + "epoch": 0.26309935367503384, + "flos": 32852640971520.0, + "grad_norm": 2.1664880913367304, + "language_loss": 0.68038678, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.70158529, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.48828125, + "step": 4376, + "time_per_iteration": 2.508558511734009 + }, + { + "auxiliary_loss_clip": 0.01071573, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_clip": 1.02166569, + "balance_loss_mlp": 1.02057719, + "epoch": 0.2631594769277018, + "flos": 15887051625600.0, + "grad_norm": 1.857856961149813, + "language_loss": 0.70849031, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.72973359, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.5078125, + "step": 4377, + "time_per_iteration": 2.3727734088897705 + }, + { + "auxiliary_loss_clip": 0.01071186, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.01570153, + "balance_loss_mlp": 1.02024055, + "epoch": 0.2632196001803698, + "flos": 50803060348800.0, + "grad_norm": 2.188057773850793, + "language_loss": 0.81357157, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.83479059, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5078125, + "step": 4378, + "time_per_iteration": 2.6476705074310303 + }, + { + "auxiliary_loss_clip": 0.01071592, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_clip": 1.02078009, + "balance_loss_mlp": 1.02148581, + "epoch": 0.26327972343303774, + "flos": 13732247289600.0, + "grad_norm": 2.115085212185895, + "language_loss": 0.78831565, + "learning_rate": 3.455781283723846e-06, + "loss": 0.80955589, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.5, + "step": 4379, + "time_per_iteration": 2.3553049564361572 + }, + { + "auxiliary_loss_clip": 0.01074975, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_clip": 1.0238297, + "balance_loss_mlp": 1.02036619, + "epoch": 0.2633398466857057, + "flos": 23767926203520.0, + "grad_norm": 2.2618575531424305, + "language_loss": 0.80257642, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.82395005, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.546875, + "step": 4380, + "time_per_iteration": 2.411187171936035 + }, + { + "auxiliary_loss_clip": 0.01072493, + "auxiliary_loss_mlp": 0.01048887, + "balance_loss_clip": 1.01596117, + "balance_loss_mlp": 1.01996279, + "epoch": 0.26339996993837367, + "flos": 27598980602880.0, + "grad_norm": 1.987610887800834, + "language_loss": 0.65933168, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.68054545, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.52734375, + "step": 4381, + "time_per_iteration": 2.4205918312072754 + }, + { + "auxiliary_loss_clip": 0.0107033, + "auxiliary_loss_mlp": 0.01049403, + "balance_loss_clip": 1.01974356, + "balance_loss_mlp": 1.02046156, + "epoch": 0.26346009319104163, + "flos": 16945373802240.0, + "grad_norm": 2.336024924939081, + "language_loss": 0.83574873, + "learning_rate": 3.454979881632595e-06, + "loss": 0.85694611, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.49804688, + "step": 4382, + "time_per_iteration": 2.3898627758026123 + }, + { + "auxiliary_loss_clip": 0.01074245, + "auxiliary_loss_mlp": 0.01053144, + "balance_loss_clip": 1.01792955, + "balance_loss_mlp": 1.01964223, + "epoch": 0.2635202164437096, + "flos": 37230714552960.0, + "grad_norm": 1.9475356981253675, + "language_loss": 0.71328592, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.73455977, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.546875, + "step": 4383, + "time_per_iteration": 2.5041112899780273 + }, + { + "auxiliary_loss_clip": 0.01073093, + "auxiliary_loss_mlp": 0.01048298, + "balance_loss_clip": 1.01697028, + "balance_loss_mlp": 1.02107728, + "epoch": 0.26358033969637756, + "flos": 20995298714880.0, + "grad_norm": 2.4514888713762324, + "language_loss": 0.71068275, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.73189664, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.51953125, + "step": 4384, + "time_per_iteration": 2.396869659423828 + }, + { + "auxiliary_loss_clip": 0.01069869, + "auxiliary_loss_mlp": 0.01053486, + "balance_loss_clip": 1.02231228, + "balance_loss_mlp": 1.02000129, + "epoch": 0.26364046294904553, + "flos": 27744847729920.0, + "grad_norm": 2.4552271821809684, + "language_loss": 0.71779841, + "learning_rate": 3.45417798298451e-06, + "loss": 0.73903197, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.49804688, + "step": 4385, + "time_per_iteration": 2.4241960048675537 + }, + { + "auxiliary_loss_clip": 0.01073885, + "auxiliary_loss_mlp": 0.01047898, + "balance_loss_clip": 1.01563978, + "balance_loss_mlp": 1.02189493, + "epoch": 0.2637005862017135, + "flos": 22891990302720.0, + "grad_norm": 1.8630245294285674, + "language_loss": 0.85936046, + "learning_rate": 3.453910573136482e-06, + "loss": 0.88057828, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51953125, + "step": 4386, + "time_per_iteration": 2.402496337890625 + }, + { + "auxiliary_loss_clip": 0.01074296, + "auxiliary_loss_mlp": 0.01052495, + "balance_loss_clip": 1.01720881, + "balance_loss_mlp": 1.02113962, + "epoch": 0.26376070945438146, + "flos": 15047949075840.0, + "grad_norm": 2.317443098218713, + "language_loss": 0.78721803, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.80848587, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53125, + "step": 4387, + "time_per_iteration": 2.3532235622406006 + }, + { + "auxiliary_loss_clip": 0.0107399, + "auxiliary_loss_mlp": 0.0105448, + "balance_loss_clip": 1.02048147, + "balance_loss_mlp": 1.02366519, + "epoch": 0.2638208327070494, + "flos": 21140781816960.0, + "grad_norm": 1.913303795235619, + "language_loss": 0.77323854, + "learning_rate": 3.453375588053264e-06, + "loss": 0.79452324, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.50390625, + "step": 4388, + "time_per_iteration": 2.411665916442871 + }, + { + "auxiliary_loss_clip": 0.01072425, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.01644444, + "balance_loss_mlp": 1.02069473, + "epoch": 0.26388095595971744, + "flos": 21724529616000.0, + "grad_norm": 2.0344393221731614, + "language_loss": 0.87365133, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.89487481, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.515625, + "step": 4389, + "time_per_iteration": 2.3712828159332275 + }, + { + "auxiliary_loss_clip": 0.010176, + "auxiliary_loss_mlp": 0.01007884, + "balance_loss_clip": 1.00325847, + "balance_loss_mlp": 1.00271702, + "epoch": 0.2639410792123854, + "flos": 65512036368000.0, + "grad_norm": 0.8087847824361902, + "language_loss": 0.60417962, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62443447, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.04614258, + "router_z_loss_mlp": 0.1484375, + "step": 4390, + "time_per_iteration": 3.0851640701293945 + }, + { + "auxiliary_loss_clip": 0.01075767, + "auxiliary_loss_mlp": 0.01052959, + "balance_loss_clip": 1.01882911, + "balance_loss_mlp": 1.02262926, + "epoch": 0.2640012024650534, + "flos": 23947519570560.0, + "grad_norm": 1.6112758985339612, + "language_loss": 0.78939849, + "learning_rate": 3.4525726971127e-06, + "loss": 0.81068575, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.53125, + "step": 4391, + "time_per_iteration": 2.4139206409454346 + }, + { + "auxiliary_loss_clip": 0.01017085, + "auxiliary_loss_mlp": 0.01004083, + "balance_loss_clip": 0.99900419, + "balance_loss_mlp": 1.00304842, + "epoch": 0.26406132571772134, + "flos": 56437620451200.0, + "grad_norm": 0.8816296268494779, + "language_loss": 0.58848631, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60869801, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.140625, + "step": 4392, + "time_per_iteration": 3.013625383377075 + }, + { + "auxiliary_loss_clip": 0.01075899, + "auxiliary_loss_mlp": 0.01056363, + "balance_loss_clip": 1.0224483, + "balance_loss_mlp": 1.02265227, + "epoch": 0.2641214489703893, + "flos": 22089476724480.0, + "grad_norm": 1.768621624007672, + "language_loss": 0.70007789, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.7214005, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.53125, + "step": 4393, + "time_per_iteration": 2.4076552391052246 + }, + { + "auxiliary_loss_clip": 0.01077326, + "auxiliary_loss_mlp": 0.01063317, + "balance_loss_clip": 1.02533698, + "balance_loss_mlp": 1.02198005, + "epoch": 0.26418157222305727, + "flos": 16543837722240.0, + "grad_norm": 4.900983563578649, + "language_loss": 0.85495836, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.87636483, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5546875, + "step": 4394, + "time_per_iteration": 2.4367597103118896 + }, + { + "auxiliary_loss_clip": 0.01080281, + "auxiliary_loss_mlp": 0.01062346, + "balance_loss_clip": 1.02374578, + "balance_loss_mlp": 1.02344549, + "epoch": 0.26424169547572524, + "flos": 18001566385920.0, + "grad_norm": 3.0673260240288918, + "language_loss": 0.71700317, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.73842943, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5703125, + "step": 4395, + "time_per_iteration": 2.367868185043335 + }, + { + "auxiliary_loss_clip": 0.01074548, + "auxiliary_loss_mlp": 0.01059416, + "balance_loss_clip": 1.0248816, + "balance_loss_mlp": 1.02255666, + "epoch": 0.2643018187283932, + "flos": 16982207153280.0, + "grad_norm": 1.779207680020714, + "language_loss": 0.87638766, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89772725, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.51953125, + "step": 4396, + "time_per_iteration": 2.4258487224578857 + }, + { + "auxiliary_loss_clip": 0.01017505, + "auxiliary_loss_mlp": 0.01008365, + "balance_loss_clip": 1.00261879, + "balance_loss_mlp": 1.0031302, + "epoch": 0.26436194198106117, + "flos": 59661396794880.0, + "grad_norm": 0.7840055535939273, + "language_loss": 0.55092192, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57118058, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.05737305, + "router_z_loss_mlp": 0.14355469, + "step": 4397, + "time_per_iteration": 2.796813488006592 + }, + { + "auxiliary_loss_clip": 0.01073692, + "auxiliary_loss_mlp": 0.0106266, + "balance_loss_clip": 1.02794623, + "balance_loss_mlp": 1.02209532, + "epoch": 0.26442206523372913, + "flos": 32920093451520.0, + "grad_norm": 2.462275416721509, + "language_loss": 0.79551864, + "learning_rate": 3.450697357532435e-06, + "loss": 0.81688207, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.515625, + "step": 4398, + "time_per_iteration": 2.5414960384368896 + }, + { + "auxiliary_loss_clip": 0.01077684, + "auxiliary_loss_mlp": 0.01059664, + "balance_loss_clip": 1.0235672, + "balance_loss_mlp": 1.02437735, + "epoch": 0.2644821884863971, + "flos": 21030281763840.0, + "grad_norm": 1.8795839434736703, + "language_loss": 0.68680739, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.7081809, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.53125, + "step": 4399, + "time_per_iteration": 2.3847877979278564 + }, + { + "auxiliary_loss_clip": 0.01067514, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_clip": 1.01803637, + "balance_loss_mlp": 1.01967084, + "epoch": 0.26454231173906506, + "flos": 20775764885760.0, + "grad_norm": 1.7245315170562143, + "language_loss": 0.8779701, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.89912528, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4765625, + "step": 4400, + "time_per_iteration": 2.4151861667633057 + }, + { + "auxiliary_loss_clip": 0.01074036, + "auxiliary_loss_mlp": 0.01063056, + "balance_loss_clip": 1.02276313, + "balance_loss_mlp": 1.02075052, + "epoch": 0.264602434991733, + "flos": 16617713892480.0, + "grad_norm": 2.186771841995937, + "language_loss": 0.78020489, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.80157584, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.53515625, + "step": 4401, + "time_per_iteration": 2.3639345169067383 + }, + { + "auxiliary_loss_clip": 0.01075111, + "auxiliary_loss_mlp": 0.0105754, + "balance_loss_clip": 1.01893985, + "balance_loss_mlp": 1.02072692, + "epoch": 0.26466255824440105, + "flos": 19061669041920.0, + "grad_norm": 1.7835480232152183, + "language_loss": 0.8918519, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.91317844, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.54296875, + "step": 4402, + "time_per_iteration": 2.4081978797912598 + }, + { + "auxiliary_loss_clip": 0.01074722, + "auxiliary_loss_mlp": 0.01048177, + "balance_loss_clip": 1.01229477, + "balance_loss_mlp": 1.02064991, + "epoch": 0.264722681497069, + "flos": 22637438513280.0, + "grad_norm": 1.5945054664744287, + "language_loss": 0.79567623, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.8169052, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5390625, + "step": 4403, + "time_per_iteration": 2.3940067291259766 + }, + { + "auxiliary_loss_clip": 0.01074963, + "auxiliary_loss_mlp": 0.01049149, + "balance_loss_clip": 1.01433945, + "balance_loss_mlp": 1.02096355, + "epoch": 0.264782804749737, + "flos": 22491152449920.0, + "grad_norm": 2.251026592938621, + "language_loss": 0.89896894, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.92021012, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5390625, + "step": 4404, + "time_per_iteration": 3.8373467922210693 + }, + { + "auxiliary_loss_clip": 0.01073727, + "auxiliary_loss_mlp": 0.01053911, + "balance_loss_clip": 1.02284527, + "balance_loss_mlp": 1.02107954, + "epoch": 0.26484292800240494, + "flos": 16799332118400.0, + "grad_norm": 1.7512179372268681, + "language_loss": 0.76998001, + "learning_rate": 3.448819322433709e-06, + "loss": 0.79125643, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.52734375, + "step": 4405, + "time_per_iteration": 2.367297887802124 + }, + { + "auxiliary_loss_clip": 0.01073361, + "auxiliary_loss_mlp": 0.0104682, + "balance_loss_clip": 1.01263118, + "balance_loss_mlp": 1.02177894, + "epoch": 0.2649030512550729, + "flos": 20448523912320.0, + "grad_norm": 1.8566187587673808, + "language_loss": 0.71482456, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.73602641, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.515625, + "step": 4406, + "time_per_iteration": 2.3857016563415527 + }, + { + "auxiliary_loss_clip": 0.01072359, + "auxiliary_loss_mlp": 0.01048427, + "balance_loss_clip": 1.01306939, + "balance_loss_mlp": 1.0210526, + "epoch": 0.2649631745077409, + "flos": 22415111775360.0, + "grad_norm": 1.9747583401611728, + "language_loss": 0.8521868, + "learning_rate": 3.448282246369912e-06, + "loss": 0.87339461, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.51171875, + "step": 4407, + "time_per_iteration": 2.4085209369659424 + }, + { + "auxiliary_loss_clip": 0.0107254, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.01132739, + "balance_loss_mlp": 1.0209434, + "epoch": 0.26502329776040884, + "flos": 35114663692800.0, + "grad_norm": 1.6418963007304697, + "language_loss": 0.77838743, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.79955524, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.515625, + "step": 4408, + "time_per_iteration": 4.6377458572387695 + }, + { + "auxiliary_loss_clip": 0.01071699, + "auxiliary_loss_mlp": 0.01047874, + "balance_loss_clip": 1.01459098, + "balance_loss_mlp": 1.02136874, + "epoch": 0.2650834210130768, + "flos": 38686069244160.0, + "grad_norm": 2.004287653102171, + "language_loss": 0.72387266, + "learning_rate": 3.447744950630084e-06, + "loss": 0.74506843, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5, + "step": 4409, + "time_per_iteration": 2.5407700538635254 + }, + { + "auxiliary_loss_clip": 0.01076913, + "auxiliary_loss_mlp": 0.01055827, + "balance_loss_clip": 1.02182817, + "balance_loss_mlp": 1.02410507, + "epoch": 0.26514354426574477, + "flos": 24715713415680.0, + "grad_norm": 1.9199239050013146, + "language_loss": 0.74880129, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.77012873, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.52734375, + "step": 4410, + "time_per_iteration": 3.8978984355926514 + }, + { + "auxiliary_loss_clip": 0.01077251, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.01429951, + "balance_loss_mlp": 1.02376747, + "epoch": 0.26520366751841273, + "flos": 20339001377280.0, + "grad_norm": 1.8900529293633495, + "language_loss": 0.74886668, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.77012646, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.53515625, + "step": 4411, + "time_per_iteration": 2.3954174518585205 + }, + { + "auxiliary_loss_clip": 0.01074289, + "auxiliary_loss_mlp": 0.01054588, + "balance_loss_clip": 1.02023232, + "balance_loss_mlp": 1.02279973, + "epoch": 0.2652637907710807, + "flos": 22342841527680.0, + "grad_norm": 2.218615436977912, + "language_loss": 0.83401418, + "learning_rate": 3.446938595306071e-06, + "loss": 0.85530299, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4412, + "time_per_iteration": 2.4159350395202637 + }, + { + "auxiliary_loss_clip": 0.01074394, + "auxiliary_loss_mlp": 0.01063184, + "balance_loss_clip": 1.03125942, + "balance_loss_mlp": 1.02262282, + "epoch": 0.26532391402374866, + "flos": 19353228739200.0, + "grad_norm": 2.0289900916496015, + "language_loss": 0.75693786, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.77831364, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.515625, + "step": 4413, + "time_per_iteration": 2.385753870010376 + }, + { + "auxiliary_loss_clip": 0.01024476, + "auxiliary_loss_mlp": 0.01007897, + "balance_loss_clip": 1.00339067, + "balance_loss_mlp": 1.01096463, + "epoch": 0.26538403727641663, + "flos": 44784800138880.0, + "grad_norm": 0.8872707873771541, + "language_loss": 0.56966448, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58998823, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.04516602, + "router_z_loss_mlp": 0.13476562, + "step": 4414, + "time_per_iteration": 2.9851279258728027 + }, + { + "auxiliary_loss_clip": 0.01070528, + "auxiliary_loss_mlp": 0.01054276, + "balance_loss_clip": 1.02347219, + "balance_loss_mlp": 1.02178693, + "epoch": 0.26544416052908465, + "flos": 28180913011200.0, + "grad_norm": 1.9745228663093242, + "language_loss": 0.75394487, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.77519286, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48632812, + "step": 4415, + "time_per_iteration": 2.4561593532562256 + }, + { + "auxiliary_loss_clip": 0.01074639, + "auxiliary_loss_mlp": 0.01062094, + "balance_loss_clip": 1.02330303, + "balance_loss_mlp": 1.02116525, + "epoch": 0.2655042837817526, + "flos": 17564349029760.0, + "grad_norm": 2.759697729214561, + "language_loss": 0.88711846, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.90848577, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.53515625, + "step": 4416, + "time_per_iteration": 2.404308319091797 + }, + { + "auxiliary_loss_clip": 0.01074088, + "auxiliary_loss_mlp": 0.01056198, + "balance_loss_clip": 1.02160299, + "balance_loss_mlp": 1.02150428, + "epoch": 0.2655644070344206, + "flos": 23403502765440.0, + "grad_norm": 1.8502565706068381, + "language_loss": 0.78403437, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.80533731, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5234375, + "step": 4417, + "time_per_iteration": 2.420186758041382 + }, + { + "auxiliary_loss_clip": 0.01071369, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.02462959, + "balance_loss_mlp": 1.02081275, + "epoch": 0.26562453028708854, + "flos": 26467271015040.0, + "grad_norm": 1.6302959759438682, + "language_loss": 0.81383401, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.83514524, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.50390625, + "step": 4418, + "time_per_iteration": 2.495574474334717 + }, + { + "auxiliary_loss_clip": 0.01074878, + "auxiliary_loss_mlp": 0.01069858, + "balance_loss_clip": 1.03206873, + "balance_loss_mlp": 1.02226973, + "epoch": 0.2656846535397565, + "flos": 19206593562240.0, + "grad_norm": 2.346353920187919, + "language_loss": 0.68998063, + "learning_rate": 3.445055179644071e-06, + "loss": 0.71142799, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5234375, + "step": 4419, + "time_per_iteration": 2.402921438217163 + }, + { + "auxiliary_loss_clip": 0.01074616, + "auxiliary_loss_mlp": 0.01062022, + "balance_loss_clip": 1.02600896, + "balance_loss_mlp": 1.02175593, + "epoch": 0.2657447767924245, + "flos": 30550119206400.0, + "grad_norm": 1.7564905694905417, + "language_loss": 0.80253053, + "learning_rate": 3.444785900995585e-06, + "loss": 0.82389688, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 4420, + "time_per_iteration": 2.478043794631958 + }, + { + "auxiliary_loss_clip": 0.01075533, + "auxiliary_loss_mlp": 0.01059594, + "balance_loss_clip": 1.02175736, + "balance_loss_mlp": 1.0220356, + "epoch": 0.26580490004509244, + "flos": 20921701835520.0, + "grad_norm": 1.9267863479778327, + "language_loss": 0.82908463, + "learning_rate": 3.444516567560673e-06, + "loss": 0.85043597, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.53515625, + "step": 4421, + "time_per_iteration": 2.3880460262298584 + }, + { + "auxiliary_loss_clip": 0.01071816, + "auxiliary_loss_mlp": 0.01055726, + "balance_loss_clip": 1.0253756, + "balance_loss_mlp": 1.02150404, + "epoch": 0.2658650232977604, + "flos": 43943988798720.0, + "grad_norm": 1.552903800572558, + "language_loss": 0.67600667, + "learning_rate": 3.444247179349548e-06, + "loss": 0.69728208, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.50390625, + "step": 4422, + "time_per_iteration": 2.614163637161255 + }, + { + "auxiliary_loss_clip": 0.01075837, + "auxiliary_loss_mlp": 0.01054379, + "balance_loss_clip": 1.01887822, + "balance_loss_mlp": 1.0220325, + "epoch": 0.26592514655042837, + "flos": 29715136197120.0, + "grad_norm": 2.654681800080909, + "language_loss": 0.7659986, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.78730071, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5390625, + "step": 4423, + "time_per_iteration": 2.4544618129730225 + }, + { + "auxiliary_loss_clip": 0.01073284, + "auxiliary_loss_mlp": 0.01053484, + "balance_loss_clip": 1.01755476, + "balance_loss_mlp": 1.02083123, + "epoch": 0.26598526980309634, + "flos": 46676082332160.0, + "grad_norm": 1.7945235670677797, + "language_loss": 0.79370296, + "learning_rate": 3.443708238639522e-06, + "loss": 0.81497061, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 4424, + "time_per_iteration": 2.609229803085327 + }, + { + "auxiliary_loss_clip": 0.01075593, + "auxiliary_loss_mlp": 0.01051831, + "balance_loss_clip": 1.01823831, + "balance_loss_mlp": 1.02316391, + "epoch": 0.2660453930557643, + "flos": 11508663841920.0, + "grad_norm": 2.994664917255243, + "language_loss": 0.80922914, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.83050334, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5234375, + "step": 4425, + "time_per_iteration": 2.352877378463745 + }, + { + "auxiliary_loss_clip": 0.01074698, + "auxiliary_loss_mlp": 0.01049209, + "balance_loss_clip": 1.0179044, + "balance_loss_mlp": 1.02443743, + "epoch": 0.26610551630843227, + "flos": 24790392547200.0, + "grad_norm": 1.6550601586860452, + "language_loss": 0.82597041, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.84720945, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.50390625, + "step": 4426, + "time_per_iteration": 2.455880880355835 + }, + { + "auxiliary_loss_clip": 0.01079919, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_clip": 1.02104044, + "balance_loss_mlp": 1.0272522, + "epoch": 0.26616563956110023, + "flos": 27635150638080.0, + "grad_norm": 1.5485942954486374, + "language_loss": 0.78378457, + "learning_rate": 3.442899417008333e-06, + "loss": 0.80515325, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 4427, + "time_per_iteration": 2.4552979469299316 + }, + { + "auxiliary_loss_clip": 0.01073168, + "auxiliary_loss_mlp": 0.0104355, + "balance_loss_clip": 1.01138711, + "balance_loss_mlp": 1.02380967, + "epoch": 0.26622576281376825, + "flos": 28361728275840.0, + "grad_norm": 2.0327135138317836, + "language_loss": 0.7775349, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79870206, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.49414062, + "step": 4428, + "time_per_iteration": 2.4775850772857666 + }, + { + "auxiliary_loss_clip": 0.0107767, + "auxiliary_loss_mlp": 0.01050879, + "balance_loss_clip": 1.01807213, + "balance_loss_mlp": 1.02430558, + "epoch": 0.2662858860664362, + "flos": 18040354773120.0, + "grad_norm": 2.0649971471335435, + "language_loss": 0.85101199, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.87229753, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.53125, + "step": 4429, + "time_per_iteration": 2.3745672702789307 + }, + { + "auxiliary_loss_clip": 0.01073933, + "auxiliary_loss_mlp": 0.01050148, + "balance_loss_clip": 1.01672196, + "balance_loss_mlp": 1.02361727, + "epoch": 0.2663460093191042, + "flos": 22744761632640.0, + "grad_norm": 1.683707102786659, + "language_loss": 0.74085599, + "learning_rate": 3.442090102943143e-06, + "loss": 0.76209676, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.50390625, + "step": 4430, + "time_per_iteration": 2.4466476440429688 + }, + { + "auxiliary_loss_clip": 0.01078158, + "auxiliary_loss_mlp": 0.01061735, + "balance_loss_clip": 1.02623463, + "balance_loss_mlp": 1.02445412, + "epoch": 0.26640613257177215, + "flos": 16507842243840.0, + "grad_norm": 2.4170130626429827, + "language_loss": 0.83435285, + "learning_rate": 3.441820222206035e-06, + "loss": 0.85575181, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.53515625, + "step": 4431, + "time_per_iteration": 2.372795581817627 + }, + { + "auxiliary_loss_clip": 0.01080335, + "auxiliary_loss_mlp": 0.01057496, + "balance_loss_clip": 1.02226925, + "balance_loss_mlp": 1.02573395, + "epoch": 0.2664662558244401, + "flos": 23074830426240.0, + "grad_norm": 2.737064242022605, + "language_loss": 0.78595757, + "learning_rate": 3.44155028679496e-06, + "loss": 0.80733585, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.546875, + "step": 4432, + "time_per_iteration": 2.440993309020996 + }, + { + "auxiliary_loss_clip": 0.01075815, + "auxiliary_loss_mlp": 0.01047164, + "balance_loss_clip": 1.01280808, + "balance_loss_mlp": 1.02452064, + "epoch": 0.2665263790771081, + "flos": 23768135671680.0, + "grad_norm": 3.250746465679421, + "language_loss": 0.84568697, + "learning_rate": 3.441280296720154e-06, + "loss": 0.86691678, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 4433, + "time_per_iteration": 2.411165475845337 + }, + { + "auxiliary_loss_clip": 0.01074335, + "auxiliary_loss_mlp": 0.01059823, + "balance_loss_clip": 1.02648067, + "balance_loss_mlp": 1.0246104, + "epoch": 0.26658650232977604, + "flos": 28000027923840.0, + "grad_norm": 1.9573983007248246, + "language_loss": 0.7815389, + "learning_rate": 3.441010251991854e-06, + "loss": 0.80288053, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.49609375, + "step": 4434, + "time_per_iteration": 2.463564872741699 + }, + { + "auxiliary_loss_clip": 0.01074167, + "auxiliary_loss_mlp": 0.01059514, + "balance_loss_clip": 1.02556348, + "balance_loss_mlp": 1.02247226, + "epoch": 0.266646625582444, + "flos": 22162549933440.0, + "grad_norm": 1.8337714829246643, + "language_loss": 0.84080172, + "learning_rate": 3.440740152620301e-06, + "loss": 0.86213857, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.515625, + "step": 4435, + "time_per_iteration": 2.3972725868225098 + }, + { + "auxiliary_loss_clip": 0.01077376, + "auxiliary_loss_mlp": 0.01066691, + "balance_loss_clip": 1.02847242, + "balance_loss_mlp": 1.02363467, + "epoch": 0.266706748835112, + "flos": 27852345406080.0, + "grad_norm": 2.187311084003557, + "language_loss": 0.90432, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.92576063, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5390625, + "step": 4436, + "time_per_iteration": 2.4430606365203857 + }, + { + "auxiliary_loss_clip": 0.01075429, + "auxiliary_loss_mlp": 0.01059241, + "balance_loss_clip": 1.02481389, + "balance_loss_mlp": 1.02281094, + "epoch": 0.26676687208777994, + "flos": 25810938766080.0, + "grad_norm": 1.3288860175014947, + "language_loss": 0.79482567, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81617242, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.52734375, + "step": 4437, + "time_per_iteration": 2.4560909271240234 + }, + { + "auxiliary_loss_clip": 0.01074356, + "auxiliary_loss_mlp": 0.01057928, + "balance_loss_clip": 1.02493072, + "balance_loss_mlp": 1.02272642, + "epoch": 0.2668269953404479, + "flos": 36063114220800.0, + "grad_norm": 2.2494981972854404, + "language_loss": 0.66014171, + "learning_rate": 3.439929526748556e-06, + "loss": 0.68146455, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.515625, + "step": 4438, + "time_per_iteration": 2.521247625350952 + }, + { + "auxiliary_loss_clip": 0.01073864, + "auxiliary_loss_mlp": 0.01061089, + "balance_loss_clip": 1.0264945, + "balance_loss_mlp": 1.02229762, + "epoch": 0.26688711859311587, + "flos": 26569985834880.0, + "grad_norm": 1.7495985651751287, + "language_loss": 0.77063626, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.79198581, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.515625, + "step": 4439, + "time_per_iteration": 2.4206268787384033 + }, + { + "auxiliary_loss_clip": 0.01073698, + "auxiliary_loss_mlp": 0.01054294, + "balance_loss_clip": 1.0179354, + "balance_loss_mlp": 1.02175355, + "epoch": 0.26694724184578383, + "flos": 26760331900800.0, + "grad_norm": 1.6381865952208705, + "language_loss": 0.7312457, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.75252557, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51953125, + "step": 4440, + "time_per_iteration": 2.4655959606170654 + }, + { + "auxiliary_loss_clip": 0.01076438, + "auxiliary_loss_mlp": 0.01063172, + "balance_loss_clip": 1.02855372, + "balance_loss_mlp": 1.02334976, + "epoch": 0.2670073650984518, + "flos": 20958535186560.0, + "grad_norm": 1.9375921722768221, + "language_loss": 0.68463135, + "learning_rate": 3.439118409456376e-06, + "loss": 0.70602745, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53125, + "step": 4441, + "time_per_iteration": 2.39318585395813 + }, + { + "auxiliary_loss_clip": 0.01073786, + "auxiliary_loss_mlp": 0.01058619, + "balance_loss_clip": 1.02145004, + "balance_loss_mlp": 1.02222848, + "epoch": 0.2670674883511198, + "flos": 28364800475520.0, + "grad_norm": 3.3109606620341627, + "language_loss": 0.77490914, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.79623324, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.515625, + "step": 4442, + "time_per_iteration": 2.461061716079712 + }, + { + "auxiliary_loss_clip": 0.01026944, + "auxiliary_loss_mlp": 0.01017329, + "balance_loss_clip": 1.01284671, + "balance_loss_mlp": 1.01342559, + "epoch": 0.2671276116037878, + "flos": 58968370840320.0, + "grad_norm": 0.9336348364823988, + "language_loss": 0.61309135, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63353407, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.13476562, + "step": 4443, + "time_per_iteration": 2.966085433959961 + }, + { + "auxiliary_loss_clip": 0.01073006, + "auxiliary_loss_mlp": 0.01053569, + "balance_loss_clip": 1.01907039, + "balance_loss_mlp": 1.02146423, + "epoch": 0.26718773485645575, + "flos": 43943395305600.0, + "grad_norm": 1.5314309560568364, + "language_loss": 0.77793545, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.79920125, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.515625, + "step": 4444, + "time_per_iteration": 4.022387266159058 + }, + { + "auxiliary_loss_clip": 0.01076122, + "auxiliary_loss_mlp": 0.01050261, + "balance_loss_clip": 1.01633406, + "balance_loss_mlp": 1.02389503, + "epoch": 0.2672478581091237, + "flos": 25227156055680.0, + "grad_norm": 1.727499854727591, + "language_loss": 0.81925899, + "learning_rate": 3.438036155780158e-06, + "loss": 0.84052277, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51953125, + "step": 4445, + "time_per_iteration": 2.430450677871704 + }, + { + "auxiliary_loss_clip": 0.01076681, + "auxiliary_loss_mlp": 0.01051722, + "balance_loss_clip": 1.01581645, + "balance_loss_mlp": 1.0235095, + "epoch": 0.2673079813617917, + "flos": 15267273436800.0, + "grad_norm": 2.0308988675332786, + "language_loss": 0.91210425, + "learning_rate": 3.43776545600926e-06, + "loss": 0.93338835, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 4446, + "time_per_iteration": 2.380800724029541 + }, + { + "auxiliary_loss_clip": 0.01077024, + "auxiliary_loss_mlp": 0.01061186, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.02514422, + "epoch": 0.26736810461445965, + "flos": 25811532259200.0, + "grad_norm": 2.2042589045568275, + "language_loss": 0.69300926, + "learning_rate": 3.437494701718153e-06, + "loss": 0.71439135, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.51953125, + "step": 4447, + "time_per_iteration": 3.8453664779663086 + }, + { + "auxiliary_loss_clip": 0.01079396, + "auxiliary_loss_mlp": 0.01060789, + "balance_loss_clip": 1.02586031, + "balance_loss_mlp": 1.02532041, + "epoch": 0.2674282278671276, + "flos": 24311663717760.0, + "grad_norm": 2.5769621217621386, + "language_loss": 0.84273052, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.8641324, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5390625, + "step": 4448, + "time_per_iteration": 3.9560627937316895 + }, + { + "auxiliary_loss_clip": 0.01076408, + "auxiliary_loss_mlp": 0.0107401, + "balance_loss_clip": 1.04051208, + "balance_loss_mlp": 1.02504373, + "epoch": 0.2674883511197956, + "flos": 22814553173760.0, + "grad_norm": 1.695952249700283, + "language_loss": 0.85830009, + "learning_rate": 3.436953029616378e-06, + "loss": 0.87980425, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.515625, + "step": 4449, + "time_per_iteration": 3.826409339904785 + }, + { + "auxiliary_loss_clip": 0.01080747, + "auxiliary_loss_mlp": 0.01066122, + "balance_loss_clip": 1.02492332, + "balance_loss_mlp": 1.02448535, + "epoch": 0.26754847437246354, + "flos": 25369113110400.0, + "grad_norm": 1.6772338586879099, + "language_loss": 0.85174739, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.87321609, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.5625, + "step": 4450, + "time_per_iteration": 2.4365384578704834 + }, + { + "auxiliary_loss_clip": 0.01074248, + "auxiliary_loss_mlp": 0.01057021, + "balance_loss_clip": 1.02359438, + "balance_loss_mlp": 1.02365041, + "epoch": 0.2676085976251315, + "flos": 20229374108160.0, + "grad_norm": 1.7679951003885779, + "language_loss": 0.81655002, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83786273, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5078125, + "step": 4451, + "time_per_iteration": 2.398487091064453 + }, + { + "auxiliary_loss_clip": 0.01076089, + "auxiliary_loss_mlp": 0.01061989, + "balance_loss_clip": 1.02913499, + "balance_loss_mlp": 1.02547097, + "epoch": 0.26766872087779947, + "flos": 28036966008960.0, + "grad_norm": 1.6172904477095364, + "language_loss": 0.87669969, + "learning_rate": 3.436140112818882e-06, + "loss": 0.89808053, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.50390625, + "step": 4452, + "time_per_iteration": 2.4805595874786377 + }, + { + "auxiliary_loss_clip": 0.01078851, + "auxiliary_loss_mlp": 0.01065783, + "balance_loss_clip": 1.02489471, + "balance_loss_mlp": 1.02411878, + "epoch": 0.26772884413046744, + "flos": 18324408528000.0, + "grad_norm": 2.3303143714187105, + "language_loss": 0.85209334, + "learning_rate": 3.435869031622194e-06, + "loss": 0.87353969, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.546875, + "step": 4453, + "time_per_iteration": 2.385587692260742 + }, + { + "auxiliary_loss_clip": 0.0107913, + "auxiliary_loss_mlp": 0.01072828, + "balance_loss_clip": 1.03580189, + "balance_loss_mlp": 1.02596557, + "epoch": 0.2677889673831354, + "flos": 22126414809600.0, + "grad_norm": 1.5232494554021099, + "language_loss": 0.80689251, + "learning_rate": 3.435597895977208e-06, + "loss": 0.82841206, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53125, + "step": 4454, + "time_per_iteration": 2.424654245376587 + }, + { + "auxiliary_loss_clip": 0.01077221, + "auxiliary_loss_mlp": 0.01059077, + "balance_loss_clip": 1.02503037, + "balance_loss_mlp": 1.02315319, + "epoch": 0.2678490906358034, + "flos": 23728649057280.0, + "grad_norm": 1.5613152582371714, + "language_loss": 0.73622394, + "learning_rate": 3.435326705894206e-06, + "loss": 0.75758696, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5390625, + "step": 4455, + "time_per_iteration": 2.4213528633117676 + }, + { + "auxiliary_loss_clip": 0.01072984, + "auxiliary_loss_mlp": 0.01053904, + "balance_loss_clip": 1.02081132, + "balance_loss_mlp": 1.02242017, + "epoch": 0.2679092138884714, + "flos": 21761781903360.0, + "grad_norm": 1.6779732380198227, + "language_loss": 0.74429232, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.50390625, + "step": 4456, + "time_per_iteration": 2.4432196617126465 + }, + { + "auxiliary_loss_clip": 0.01077355, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_clip": 1.02017975, + "balance_loss_mlp": 1.02231002, + "epoch": 0.26796933714113935, + "flos": 19860272547840.0, + "grad_norm": 3.384174829339199, + "language_loss": 0.72601771, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.74737358, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.55078125, + "step": 4457, + "time_per_iteration": 2.392247438430786 + }, + { + "auxiliary_loss_clip": 0.01076928, + "auxiliary_loss_mlp": 0.01055336, + "balance_loss_clip": 1.01776135, + "balance_loss_mlp": 1.02344418, + "epoch": 0.2680294603938073, + "flos": 20046848186880.0, + "grad_norm": 2.331580297025153, + "language_loss": 0.8047809, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.82610351, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.53515625, + "step": 4458, + "time_per_iteration": 2.4061574935913086 + }, + { + "auxiliary_loss_clip": 0.01018423, + "auxiliary_loss_mlp": 0.01022965, + "balance_loss_clip": 1.01836383, + "balance_loss_mlp": 1.00553381, + "epoch": 0.2680895836464753, + "flos": 72110237172480.0, + "grad_norm": 0.8859083138563465, + "language_loss": 0.58771718, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60813105, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.04589844, + "router_z_loss_mlp": 0.12890625, + "step": 4459, + "time_per_iteration": 3.0255489349365234 + }, + { + "auxiliary_loss_clip": 0.01074195, + "auxiliary_loss_mlp": 0.01054024, + "balance_loss_clip": 1.01899993, + "balance_loss_mlp": 1.02200794, + "epoch": 0.26814970689914325, + "flos": 20448000241920.0, + "grad_norm": 2.3141396385503206, + "language_loss": 0.8615886, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.88287079, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5234375, + "step": 4460, + "time_per_iteration": 2.4130585193634033 + }, + { + "auxiliary_loss_clip": 0.01076231, + "auxiliary_loss_mlp": 0.01054698, + "balance_loss_clip": 1.01767194, + "balance_loss_mlp": 1.02306306, + "epoch": 0.2682098301518112, + "flos": 17565710572800.0, + "grad_norm": 1.914329414146881, + "language_loss": 0.69021732, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.71152663, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.53125, + "step": 4461, + "time_per_iteration": 2.3903214931488037 + }, + { + "auxiliary_loss_clip": 0.01076994, + "auxiliary_loss_mlp": 0.01062374, + "balance_loss_clip": 1.02515686, + "balance_loss_mlp": 1.02425134, + "epoch": 0.2682699534044792, + "flos": 18332263584000.0, + "grad_norm": 1.5639224771243898, + "language_loss": 0.69220281, + "learning_rate": 3.43342685191282e-06, + "loss": 0.71359646, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.52734375, + "step": 4462, + "time_per_iteration": 2.4480509757995605 + }, + { + "auxiliary_loss_clip": 0.01080018, + "auxiliary_loss_mlp": 0.0105767, + "balance_loss_clip": 1.02076316, + "balance_loss_mlp": 1.02702665, + "epoch": 0.26833007665714714, + "flos": 25300124530560.0, + "grad_norm": 1.8127175192101044, + "language_loss": 0.70817077, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.72954762, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.53125, + "step": 4463, + "time_per_iteration": 2.438394069671631 + }, + { + "auxiliary_loss_clip": 0.01085028, + "auxiliary_loss_mlp": 0.0105985, + "balance_loss_clip": 1.02294326, + "balance_loss_mlp": 1.02934098, + "epoch": 0.2683901999098151, + "flos": 16099044600960.0, + "grad_norm": 2.428844848218226, + "language_loss": 0.79834807, + "learning_rate": 3.432883547133931e-06, + "loss": 0.81979692, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5546875, + "step": 4464, + "time_per_iteration": 2.372387409210205 + }, + { + "auxiliary_loss_clip": 0.01080816, + "auxiliary_loss_mlp": 0.01053587, + "balance_loss_clip": 1.01892102, + "balance_loss_mlp": 1.02788556, + "epoch": 0.2684503231624831, + "flos": 27306827412480.0, + "grad_norm": 1.7586771045861913, + "language_loss": 0.72028899, + "learning_rate": 3.432611813236704e-06, + "loss": 0.74163294, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53125, + "step": 4465, + "time_per_iteration": 2.471996545791626 + }, + { + "auxiliary_loss_clip": 0.01028311, + "auxiliary_loss_mlp": 0.01005431, + "balance_loss_clip": 1.001616, + "balance_loss_mlp": 1.01575828, + "epoch": 0.26851044641515104, + "flos": 71854498396800.0, + "grad_norm": 0.6997889838820115, + "language_loss": 0.53265154, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55298895, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.03808594, + "router_z_loss_mlp": 0.125, + "step": 4466, + "time_per_iteration": 3.1587204933166504 + }, + { + "auxiliary_loss_clip": 0.01081618, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.03211176, + "balance_loss_mlp": 1.02966428, + "epoch": 0.268570569667819, + "flos": 18732787234560.0, + "grad_norm": 2.0038914978239992, + "language_loss": 0.75368607, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.77517933, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.51953125, + "step": 4467, + "time_per_iteration": 2.387817859649658 + }, + { + "auxiliary_loss_clip": 0.01080591, + "auxiliary_loss_mlp": 0.01067379, + "balance_loss_clip": 1.02932787, + "balance_loss_mlp": 1.02597451, + "epoch": 0.268630692920487, + "flos": 18177633705600.0, + "grad_norm": 2.2426441861507493, + "language_loss": 0.82623041, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.84771013, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.546875, + "step": 4468, + "time_per_iteration": 2.3790152072906494 + }, + { + "auxiliary_loss_clip": 0.01022344, + "auxiliary_loss_mlp": 0.01006427, + "balance_loss_clip": 1.00189745, + "balance_loss_mlp": 1.00919986, + "epoch": 0.268690816173155, + "flos": 68728025612160.0, + "grad_norm": 0.8554368110100365, + "language_loss": 0.59697372, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61726141, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.04541016, + "router_z_loss_mlp": 0.13085938, + "step": 4469, + "time_per_iteration": 3.117133378982544 + }, + { + "auxiliary_loss_clip": 0.01079794, + "auxiliary_loss_mlp": 0.01071111, + "balance_loss_clip": 1.03286827, + "balance_loss_mlp": 1.02656472, + "epoch": 0.26875093942582295, + "flos": 23292548864640.0, + "grad_norm": 2.5505715315572544, + "language_loss": 0.83655512, + "learning_rate": 3.431252329084972e-06, + "loss": 0.85806417, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.53125, + "step": 4470, + "time_per_iteration": 2.4219741821289062 + }, + { + "auxiliary_loss_clip": 0.01071548, + "auxiliary_loss_mlp": 0.01060694, + "balance_loss_clip": 1.02548003, + "balance_loss_mlp": 1.0214268, + "epoch": 0.2688110626784909, + "flos": 21542387719680.0, + "grad_norm": 2.343131633392283, + "language_loss": 0.84107673, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.8623991, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5, + "step": 4471, + "time_per_iteration": 2.408945322036743 + }, + { + "auxiliary_loss_clip": 0.01072913, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_clip": 1.02791131, + "balance_loss_mlp": 1.02242017, + "epoch": 0.2688711859311589, + "flos": 28399399499520.0, + "grad_norm": 2.020575159230114, + "language_loss": 0.71991956, + "learning_rate": 3.43070815543947e-06, + "loss": 0.74125254, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.50390625, + "step": 4472, + "time_per_iteration": 2.497464418411255 + }, + { + "auxiliary_loss_clip": 0.01073874, + "auxiliary_loss_mlp": 0.01067186, + "balance_loss_clip": 1.03433156, + "balance_loss_mlp": 1.02280831, + "epoch": 0.26893130918382685, + "flos": 25993743978240.0, + "grad_norm": 2.415709645356201, + "language_loss": 0.69308305, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.71449363, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.51171875, + "step": 4473, + "time_per_iteration": 2.4493463039398193 + }, + { + "auxiliary_loss_clip": 0.01073068, + "auxiliary_loss_mlp": 0.01063133, + "balance_loss_clip": 1.02791834, + "balance_loss_mlp": 1.02249312, + "epoch": 0.2689914324364948, + "flos": 20338582440960.0, + "grad_norm": 1.8119295038376493, + "language_loss": 0.85421842, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.87558043, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5078125, + "step": 4474, + "time_per_iteration": 2.4423153400421143 + }, + { + "auxiliary_loss_clip": 0.01072493, + "auxiliary_loss_mlp": 0.01058728, + "balance_loss_clip": 1.0254209, + "balance_loss_mlp": 1.02226901, + "epoch": 0.2690515556891628, + "flos": 19463519324160.0, + "grad_norm": 2.76222023987949, + "language_loss": 0.72076267, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.74207485, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.50390625, + "step": 4475, + "time_per_iteration": 2.3792662620544434 + }, + { + "auxiliary_loss_clip": 0.01075785, + "auxiliary_loss_mlp": 0.01063422, + "balance_loss_clip": 1.02880383, + "balance_loss_mlp": 1.02338982, + "epoch": 0.26911167894183075, + "flos": 18145757767680.0, + "grad_norm": 1.9595815190776056, + "language_loss": 0.74868435, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.77007639, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5234375, + "step": 4476, + "time_per_iteration": 2.3939402103424072 + }, + { + "auxiliary_loss_clip": 0.01074089, + "auxiliary_loss_mlp": 0.01052672, + "balance_loss_clip": 1.01993704, + "balance_loss_mlp": 1.02321672, + "epoch": 0.2691718021944987, + "flos": 19974089180160.0, + "grad_norm": 1.5765396288246891, + "language_loss": 0.81733245, + "learning_rate": 3.429346772085922e-06, + "loss": 0.83860004, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5078125, + "step": 4477, + "time_per_iteration": 2.4235072135925293 + }, + { + "auxiliary_loss_clip": 0.01078389, + "auxiliary_loss_mlp": 0.01054993, + "balance_loss_clip": 1.0207088, + "balance_loss_mlp": 1.0244801, + "epoch": 0.2692319254471667, + "flos": 37445814639360.0, + "grad_norm": 1.704638653942128, + "language_loss": 0.67204201, + "learning_rate": 3.429074332770984e-06, + "loss": 0.69337583, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5390625, + "step": 4478, + "time_per_iteration": 2.5652217864990234 + }, + { + "auxiliary_loss_clip": 0.01080727, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.01537347, + "balance_loss_mlp": 1.02854323, + "epoch": 0.26929204869983464, + "flos": 22126694100480.0, + "grad_norm": 3.2210995123961093, + "language_loss": 0.82587636, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.84717202, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5234375, + "step": 4479, + "time_per_iteration": 2.423214912414551 + }, + { + "auxiliary_loss_clip": 0.01081965, + "auxiliary_loss_mlp": 0.0105682, + "balance_loss_clip": 1.02260745, + "balance_loss_mlp": 1.02865601, + "epoch": 0.2693521719525026, + "flos": 19791772727040.0, + "grad_norm": 2.9552735491991955, + "language_loss": 0.82037961, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.84176755, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.53125, + "step": 4480, + "time_per_iteration": 2.4368138313293457 + }, + { + "auxiliary_loss_clip": 0.01077264, + "auxiliary_loss_mlp": 0.01042853, + "balance_loss_clip": 1.01376545, + "balance_loss_mlp": 1.02781487, + "epoch": 0.2694122952051706, + "flos": 20993378590080.0, + "grad_norm": 1.7845991042314484, + "language_loss": 0.78819978, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80940092, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.49414062, + "step": 4481, + "time_per_iteration": 2.4556541442871094 + }, + { + "auxiliary_loss_clip": 0.01083016, + "auxiliary_loss_mlp": 0.01055658, + "balance_loss_clip": 1.02075386, + "balance_loss_mlp": 1.03005326, + "epoch": 0.2694724184578386, + "flos": 25848086319360.0, + "grad_norm": 1.7632311172867987, + "language_loss": 0.7543394, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.77572608, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.53125, + "step": 4482, + "time_per_iteration": 2.5267765522003174 + }, + { + "auxiliary_loss_clip": 0.01086978, + "auxiliary_loss_mlp": 0.01049001, + "balance_loss_clip": 1.01348829, + "balance_loss_mlp": 1.03258836, + "epoch": 0.26953254171050656, + "flos": 21725856247680.0, + "grad_norm": 4.110515363310741, + "language_loss": 0.73834264, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.75970244, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.546875, + "step": 4483, + "time_per_iteration": 2.4514079093933105 + }, + { + "auxiliary_loss_clip": 0.0108564, + "auxiliary_loss_mlp": 0.01060549, + "balance_loss_clip": 1.02156734, + "balance_loss_mlp": 1.03053987, + "epoch": 0.2695926649631745, + "flos": 19681901078400.0, + "grad_norm": 2.66010145519583, + "language_loss": 0.88473737, + "learning_rate": 3.427438559239605e-06, + "loss": 0.90619928, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5546875, + "step": 4484, + "time_per_iteration": 3.8702521324157715 + }, + { + "auxiliary_loss_clip": 0.01084432, + "auxiliary_loss_mlp": 0.01052531, + "balance_loss_clip": 1.01755512, + "balance_loss_mlp": 1.03027821, + "epoch": 0.2696527882158425, + "flos": 32885319870720.0, + "grad_norm": 1.4716760769728843, + "language_loss": 0.68070889, + "learning_rate": 3.427165740807239e-06, + "loss": 0.70207852, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.54296875, + "step": 4485, + "time_per_iteration": 2.5176010131835938 + }, + { + "auxiliary_loss_clip": 0.01083507, + "auxiliary_loss_mlp": 0.01062697, + "balance_loss_clip": 1.02600431, + "balance_loss_mlp": 1.02957714, + "epoch": 0.26971291146851045, + "flos": 12124182844800.0, + "grad_norm": 9.037358991890008, + "language_loss": 0.7522223, + "learning_rate": 3.426892868256604e-06, + "loss": 0.77368432, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5390625, + "step": 4486, + "time_per_iteration": 2.42329478263855 + }, + { + "auxiliary_loss_clip": 0.01087311, + "auxiliary_loss_mlp": 0.01057575, + "balance_loss_clip": 1.02066755, + "balance_loss_mlp": 1.0308125, + "epoch": 0.2697730347211784, + "flos": 22633458618240.0, + "grad_norm": 1.910318987145019, + "language_loss": 0.85289031, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.87433916, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.56640625, + "step": 4487, + "time_per_iteration": 5.267353296279907 + }, + { + "auxiliary_loss_clip": 0.01081948, + "auxiliary_loss_mlp": 0.01059956, + "balance_loss_clip": 1.02266765, + "balance_loss_mlp": 1.02810609, + "epoch": 0.2698331579738464, + "flos": 23511943048320.0, + "grad_norm": 2.205742482182975, + "language_loss": 0.74087733, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.76229644, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5390625, + "step": 4488, + "time_per_iteration": 2.444610118865967 + }, + { + "auxiliary_loss_clip": 0.01081115, + "auxiliary_loss_mlp": 0.0105983, + "balance_loss_clip": 1.0229466, + "balance_loss_mlp": 1.02765942, + "epoch": 0.26989328122651435, + "flos": 24639986943360.0, + "grad_norm": 1.7270330148667488, + "language_loss": 0.84642744, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86783695, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53515625, + "step": 4489, + "time_per_iteration": 3.884119749069214 + }, + { + "auxiliary_loss_clip": 0.01079627, + "auxiliary_loss_mlp": 0.01059643, + "balance_loss_clip": 1.02316475, + "balance_loss_mlp": 1.02570987, + "epoch": 0.2699534044791823, + "flos": 10771996821120.0, + "grad_norm": 2.80775985968407, + "language_loss": 0.91354632, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.93493903, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5390625, + "step": 4490, + "time_per_iteration": 2.378265619277954 + }, + { + "auxiliary_loss_clip": 0.01074776, + "auxiliary_loss_mlp": 0.01052924, + "balance_loss_clip": 1.02083325, + "balance_loss_mlp": 1.02428544, + "epoch": 0.2700135277318503, + "flos": 36170192960640.0, + "grad_norm": 1.766931230042687, + "language_loss": 0.74646431, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.76774138, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.50390625, + "step": 4491, + "time_per_iteration": 2.5096938610076904 + }, + { + "auxiliary_loss_clip": 0.01078741, + "auxiliary_loss_mlp": 0.01056906, + "balance_loss_clip": 1.02125072, + "balance_loss_mlp": 1.02590334, + "epoch": 0.27007365098451824, + "flos": 17417713852800.0, + "grad_norm": 2.1684216070802793, + "language_loss": 0.75891864, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.78027511, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.52734375, + "step": 4492, + "time_per_iteration": 2.414543867111206 + }, + { + "auxiliary_loss_clip": 0.01074656, + "auxiliary_loss_mlp": 0.01055445, + "balance_loss_clip": 1.02108872, + "balance_loss_mlp": 1.02340496, + "epoch": 0.2701337742371862, + "flos": 23184562429440.0, + "grad_norm": 1.895995877568906, + "language_loss": 0.90445864, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.92575967, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 4493, + "time_per_iteration": 2.46675705909729 + }, + { + "auxiliary_loss_clip": 0.01074662, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.0241704, + "epoch": 0.2701938974898542, + "flos": 24388297885440.0, + "grad_norm": 1.6250921841986714, + "language_loss": 0.72315037, + "learning_rate": 3.424707940835998e-06, + "loss": 0.74439692, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.50390625, + "step": 4494, + "time_per_iteration": 2.4872848987579346 + }, + { + "auxiliary_loss_clip": 0.01071786, + "auxiliary_loss_mlp": 0.01047835, + "balance_loss_clip": 1.01476693, + "balance_loss_mlp": 1.02084494, + "epoch": 0.2702540207425222, + "flos": 26213103250560.0, + "grad_norm": 1.9829476952183305, + "language_loss": 0.879803, + "learning_rate": 3.42443458168683e-06, + "loss": 0.90099919, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.5078125, + "step": 4495, + "time_per_iteration": 2.446254014968872 + }, + { + "auxiliary_loss_clip": 0.01073262, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.02234316, + "balance_loss_mlp": 1.02271485, + "epoch": 0.27031414399519016, + "flos": 22925367429120.0, + "grad_norm": 2.982129866319449, + "language_loss": 0.77635396, + "learning_rate": 3.424161168522959e-06, + "loss": 0.79765403, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 4496, + "time_per_iteration": 2.419271469116211 + }, + { + "auxiliary_loss_clip": 0.01020045, + "auxiliary_loss_mlp": 0.01003056, + "balance_loss_clip": 0.99943256, + "balance_loss_mlp": 1.00717783, + "epoch": 0.2703742672478581, + "flos": 63016759653120.0, + "grad_norm": 0.6918861217157642, + "language_loss": 0.50203735, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52226835, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.03613281, + "router_z_loss_mlp": 0.12890625, + "step": 4497, + "time_per_iteration": 3.0774805545806885 + }, + { + "auxiliary_loss_clip": 0.01077812, + "auxiliary_loss_mlp": 0.01053876, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.02604985, + "epoch": 0.2704343905005261, + "flos": 18839900885760.0, + "grad_norm": 1.682706776823039, + "language_loss": 0.74006504, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.76138198, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.515625, + "step": 4498, + "time_per_iteration": 2.4216818809509277 + }, + { + "auxiliary_loss_clip": 0.01016139, + "auxiliary_loss_mlp": 0.01003936, + "balance_loss_clip": 1.00057387, + "balance_loss_mlp": 1.00361872, + "epoch": 0.27049451375319405, + "flos": 71230042085760.0, + "grad_norm": 0.8910268459121352, + "language_loss": 0.59243906, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61263978, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.125, + "step": 4499, + "time_per_iteration": 3.0479917526245117 + }, + { + "auxiliary_loss_clip": 0.01078042, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.01516044, + "balance_loss_mlp": 1.02662921, + "epoch": 0.270554637005862, + "flos": 24277483630080.0, + "grad_norm": 1.931993567951779, + "language_loss": 0.74916184, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.77043933, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.515625, + "step": 4500, + "time_per_iteration": 2.4671826362609863 + }, + { + "auxiliary_loss_clip": 0.01074717, + "auxiliary_loss_mlp": 0.01051005, + "balance_loss_clip": 1.01905644, + "balance_loss_mlp": 1.02319431, + "epoch": 0.27061476025853, + "flos": 17631557130240.0, + "grad_norm": 2.5575588375060825, + "language_loss": 0.83810151, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.85935879, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.515625, + "step": 4501, + "time_per_iteration": 2.403182029724121 + }, + { + "auxiliary_loss_clip": 0.01077232, + "auxiliary_loss_mlp": 0.01051335, + "balance_loss_clip": 1.01767039, + "balance_loss_mlp": 1.02515996, + "epoch": 0.27067488351119795, + "flos": 22709045445120.0, + "grad_norm": 1.710333638194561, + "language_loss": 0.7375862, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75887185, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51953125, + "step": 4502, + "time_per_iteration": 2.41780686378479 + }, + { + "auxiliary_loss_clip": 0.01079017, + "auxiliary_loss_mlp": 0.0105623, + "balance_loss_clip": 1.02008581, + "balance_loss_mlp": 1.02509737, + "epoch": 0.2707350067638659, + "flos": 41717996467200.0, + "grad_norm": 1.8494952723065698, + "language_loss": 0.69741744, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.71876997, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5390625, + "step": 4503, + "time_per_iteration": 2.5837082862854004 + }, + { + "auxiliary_loss_clip": 0.01082196, + "auxiliary_loss_mlp": 0.01050135, + "balance_loss_clip": 1.01738811, + "balance_loss_mlp": 1.02953386, + "epoch": 0.2707951300165339, + "flos": 20192017086720.0, + "grad_norm": 1.9089302828992918, + "language_loss": 0.69690812, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.71823138, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.52734375, + "step": 4504, + "time_per_iteration": 2.4206793308258057 + }, + { + "auxiliary_loss_clip": 0.01079783, + "auxiliary_loss_mlp": 0.01063174, + "balance_loss_clip": 1.0268867, + "balance_loss_mlp": 1.02898526, + "epoch": 0.27085525326920185, + "flos": 21432900096000.0, + "grad_norm": 1.8145306230033325, + "language_loss": 0.7702781, + "learning_rate": 3.421698021097902e-06, + "loss": 0.79170763, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5078125, + "step": 4505, + "time_per_iteration": 2.4123997688293457 + }, + { + "auxiliary_loss_clip": 0.01083769, + "auxiliary_loss_mlp": 0.01059484, + "balance_loss_clip": 1.02364945, + "balance_loss_mlp": 1.02801538, + "epoch": 0.2709153765218698, + "flos": 17674290501120.0, + "grad_norm": 2.5958396934877848, + "language_loss": 0.75593239, + "learning_rate": 3.42142406835758e-06, + "loss": 0.77736497, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.55859375, + "step": 4506, + "time_per_iteration": 2.403510332107544 + }, + { + "auxiliary_loss_clip": 0.01080021, + "auxiliary_loss_mlp": 0.01052819, + "balance_loss_clip": 1.01684129, + "balance_loss_mlp": 1.02641344, + "epoch": 0.2709754997745378, + "flos": 24455261606400.0, + "grad_norm": 1.9094190771261172, + "language_loss": 0.81652892, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83785737, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53515625, + "step": 4507, + "time_per_iteration": 2.4387030601501465 + }, + { + "auxiliary_loss_clip": 0.01025602, + "auxiliary_loss_mlp": 0.01016443, + "balance_loss_clip": 1.01191342, + "balance_loss_mlp": 1.01312232, + "epoch": 0.2710356230272058, + "flos": 65207664201600.0, + "grad_norm": 0.7363581958317527, + "language_loss": 0.50947499, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52989548, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.04541016, + "router_z_loss_mlp": 0.125, + "step": 4508, + "time_per_iteration": 2.931016683578491 + }, + { + "auxiliary_loss_clip": 0.01079671, + "auxiliary_loss_mlp": 0.01050071, + "balance_loss_clip": 1.0195775, + "balance_loss_mlp": 1.02962184, + "epoch": 0.27109574627987376, + "flos": 25483243944960.0, + "grad_norm": 1.822730300280252, + "language_loss": 0.76877695, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.79007435, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.5, + "step": 4509, + "time_per_iteration": 2.4285154342651367 + }, + { + "auxiliary_loss_clip": 0.01074396, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02076674, + "balance_loss_mlp": 1.02619195, + "epoch": 0.2711558695325417, + "flos": 19681761432960.0, + "grad_norm": 1.8908230522729756, + "language_loss": 0.7290684, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.75030541, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.48242188, + "step": 4510, + "time_per_iteration": 2.4095330238342285 + }, + { + "auxiliary_loss_clip": 0.01077008, + "auxiliary_loss_mlp": 0.0105392, + "balance_loss_clip": 1.02287769, + "balance_loss_mlp": 1.02815747, + "epoch": 0.2712159927852097, + "flos": 18586780462080.0, + "grad_norm": 3.072833760295191, + "language_loss": 0.72004628, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.74135554, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.48828125, + "step": 4511, + "time_per_iteration": 2.386728048324585 + }, + { + "auxiliary_loss_clip": 0.01079318, + "auxiliary_loss_mlp": 0.01054646, + "balance_loss_clip": 1.02067173, + "balance_loss_mlp": 1.0276897, + "epoch": 0.27127611603787766, + "flos": 25629041249280.0, + "grad_norm": 2.0577878869843813, + "language_loss": 0.82825398, + "learning_rate": 3.419779220367979e-06, + "loss": 0.84959364, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.515625, + "step": 4512, + "time_per_iteration": 2.5171661376953125 + }, + { + "auxiliary_loss_clip": 0.01075303, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.02461588, + "balance_loss_mlp": 1.0268147, + "epoch": 0.2713362392905456, + "flos": 23147833812480.0, + "grad_norm": 1.7933555223172841, + "language_loss": 0.8205657, + "learning_rate": 3.419504890542124e-06, + "loss": 0.84186506, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.484375, + "step": 4513, + "time_per_iteration": 2.401294231414795 + }, + { + "auxiliary_loss_clip": 0.01078061, + "auxiliary_loss_mlp": 0.01067517, + "balance_loss_clip": 1.03738046, + "balance_loss_mlp": 1.02585375, + "epoch": 0.2713963625432136, + "flos": 18365151951360.0, + "grad_norm": 3.743266820654981, + "language_loss": 0.90130258, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.92275834, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.5234375, + "step": 4514, + "time_per_iteration": 2.4261319637298584 + }, + { + "auxiliary_loss_clip": 0.01074818, + "auxiliary_loss_mlp": 0.01068169, + "balance_loss_clip": 1.03768682, + "balance_loss_mlp": 1.02528644, + "epoch": 0.27145648579588155, + "flos": 22490663690880.0, + "grad_norm": 1.672920709130401, + "language_loss": 0.93105006, + "learning_rate": 3.418956069417517e-06, + "loss": 0.95247996, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.49609375, + "step": 4515, + "time_per_iteration": 2.401991128921509 + }, + { + "auxiliary_loss_clip": 0.01077396, + "auxiliary_loss_mlp": 0.01086759, + "balance_loss_clip": 1.04863632, + "balance_loss_mlp": 1.02416754, + "epoch": 0.2715166090485495, + "flos": 19238329854720.0, + "grad_norm": 2.5977939703651862, + "language_loss": 0.76070333, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.78234494, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.53125, + "step": 4516, + "time_per_iteration": 2.39780592918396 + }, + { + "auxiliary_loss_clip": 0.01073903, + "auxiliary_loss_mlp": 0.01074457, + "balance_loss_clip": 1.04072094, + "balance_loss_mlp": 1.023211, + "epoch": 0.2715767323012175, + "flos": 17708714968320.0, + "grad_norm": 2.0632615351501036, + "language_loss": 0.77473772, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.79622126, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5078125, + "step": 4517, + "time_per_iteration": 2.3561458587646484 + }, + { + "auxiliary_loss_clip": 0.01072683, + "auxiliary_loss_mlp": 0.01067538, + "balance_loss_clip": 1.03377807, + "balance_loss_mlp": 1.02249765, + "epoch": 0.27163685555388545, + "flos": 22381734648960.0, + "grad_norm": 2.496365147501928, + "language_loss": 0.80259621, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.82399845, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5, + "step": 4518, + "time_per_iteration": 2.422555446624756 + }, + { + "auxiliary_loss_clip": 0.01073405, + "auxiliary_loss_mlp": 0.01065717, + "balance_loss_clip": 1.03174233, + "balance_loss_mlp": 1.0236094, + "epoch": 0.2716969788065534, + "flos": 22345599525120.0, + "grad_norm": 1.704977610776915, + "language_loss": 0.70343524, + "learning_rate": 3.41785778156811e-06, + "loss": 0.72482646, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49804688, + "step": 4519, + "time_per_iteration": 2.4186031818389893 + }, + { + "auxiliary_loss_clip": 0.01073087, + "auxiliary_loss_mlp": 0.01055701, + "balance_loss_clip": 1.02463543, + "balance_loss_mlp": 1.02304232, + "epoch": 0.2717571020592214, + "flos": 25227295701120.0, + "grad_norm": 1.867034231520625, + "language_loss": 0.76696157, + "learning_rate": 3.417583075166451e-06, + "loss": 0.78824937, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.5, + "step": 4520, + "time_per_iteration": 2.5162932872772217 + }, + { + "auxiliary_loss_clip": 0.01077464, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_clip": 1.02853489, + "balance_loss_mlp": 1.02491355, + "epoch": 0.2718172253118894, + "flos": 20188840152960.0, + "grad_norm": 2.0569901582916135, + "language_loss": 0.78802919, + "learning_rate": 3.4173083150099e-06, + "loss": 0.80945683, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.52734375, + "step": 4521, + "time_per_iteration": 2.3844425678253174 + }, + { + "auxiliary_loss_clip": 0.01077254, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_clip": 1.02132916, + "balance_loss_mlp": 1.02410066, + "epoch": 0.27187734856455736, + "flos": 14318264327040.0, + "grad_norm": 2.3880216575140705, + "language_loss": 0.76926148, + "learning_rate": 3.417033501108875e-06, + "loss": 0.79060954, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.53125, + "step": 4522, + "time_per_iteration": 2.3921091556549072 + }, + { + "auxiliary_loss_clip": 0.01079402, + "auxiliary_loss_mlp": 0.01061662, + "balance_loss_clip": 1.02566135, + "balance_loss_mlp": 1.0262413, + "epoch": 0.27193747181722533, + "flos": 21106566817920.0, + "grad_norm": 2.2704248152842665, + "language_loss": 0.73483682, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75624752, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 4523, + "time_per_iteration": 2.414151191711426 + }, + { + "auxiliary_loss_clip": 0.01075233, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.02000523, + "balance_loss_mlp": 1.02573907, + "epoch": 0.2719975950698933, + "flos": 19681761432960.0, + "grad_norm": 1.5506806396223207, + "language_loss": 0.76025397, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.78153729, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.49414062, + "step": 4524, + "time_per_iteration": 3.8186089992523193 + }, + { + "auxiliary_loss_clip": 0.01078752, + "auxiliary_loss_mlp": 0.01056242, + "balance_loss_clip": 1.02090847, + "balance_loss_mlp": 1.02692854, + "epoch": 0.27205771832256126, + "flos": 24753314816640.0, + "grad_norm": 1.7764797010703288, + "language_loss": 0.77810705, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.79945695, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.515625, + "step": 4525, + "time_per_iteration": 2.4182326793670654 + }, + { + "auxiliary_loss_clip": 0.01082182, + "auxiliary_loss_mlp": 0.01053792, + "balance_loss_clip": 1.0204134, + "balance_loss_mlp": 1.02999878, + "epoch": 0.2721178415752292, + "flos": 21754694897280.0, + "grad_norm": 1.990814982584223, + "language_loss": 0.8294397, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.85079944, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.51953125, + "step": 4526, + "time_per_iteration": 3.9255146980285645 + }, + { + "auxiliary_loss_clip": 0.01085959, + "auxiliary_loss_mlp": 0.01053482, + "balance_loss_clip": 1.01411951, + "balance_loss_mlp": 1.0298636, + "epoch": 0.2721779648278972, + "flos": 12676019794560.0, + "grad_norm": 6.776218229718864, + "language_loss": 0.79732746, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.81872189, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.5625, + "step": 4527, + "time_per_iteration": 3.936542510986328 + }, + { + "auxiliary_loss_clip": 0.01082677, + "auxiliary_loss_mlp": 0.01062736, + "balance_loss_clip": 1.02461314, + "balance_loss_mlp": 1.02907193, + "epoch": 0.27223808808056515, + "flos": 16252278024960.0, + "grad_norm": 2.0163594122793858, + "language_loss": 0.83486831, + "learning_rate": 3.415383489652503e-06, + "loss": 0.85632241, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5390625, + "step": 4528, + "time_per_iteration": 2.3741657733917236 + }, + { + "auxiliary_loss_clip": 0.01084581, + "auxiliary_loss_mlp": 0.01061499, + "balance_loss_clip": 1.02676105, + "balance_loss_mlp": 1.03191161, + "epoch": 0.2722982113332331, + "flos": 27744568439040.0, + "grad_norm": 1.73664397854751, + "language_loss": 0.78255868, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.80401945, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5234375, + "step": 4529, + "time_per_iteration": 3.897963762283325 + }, + { + "auxiliary_loss_clip": 0.01082755, + "auxiliary_loss_mlp": 0.01066294, + "balance_loss_clip": 1.03141296, + "balance_loss_mlp": 1.02816391, + "epoch": 0.2723583345859011, + "flos": 21725158020480.0, + "grad_norm": 1.894639729751049, + "language_loss": 0.84480059, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.86629105, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.546875, + "step": 4530, + "time_per_iteration": 2.4287378787994385 + }, + { + "auxiliary_loss_clip": 0.01081808, + "auxiliary_loss_mlp": 0.01065314, + "balance_loss_clip": 1.02883637, + "balance_loss_mlp": 1.02884376, + "epoch": 0.27241845783856905, + "flos": 17346316389120.0, + "grad_norm": 2.355760461405491, + "language_loss": 0.92989618, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.95136738, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.53125, + "step": 4531, + "time_per_iteration": 2.3752546310424805 + }, + { + "auxiliary_loss_clip": 0.0108292, + "auxiliary_loss_mlp": 0.01066953, + "balance_loss_clip": 1.03126204, + "balance_loss_mlp": 1.02827084, + "epoch": 0.272478581091237, + "flos": 24753140259840.0, + "grad_norm": 1.8883606464291378, + "language_loss": 0.77884221, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.80034095, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.546875, + "step": 4532, + "time_per_iteration": 2.4509592056274414 + }, + { + "auxiliary_loss_clip": 0.01077989, + "auxiliary_loss_mlp": 0.01057181, + "balance_loss_clip": 1.02299142, + "balance_loss_mlp": 1.02768898, + "epoch": 0.272538704343905, + "flos": 17889774612480.0, + "grad_norm": 2.948359497429654, + "language_loss": 0.90018177, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.92153347, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.50390625, + "step": 4533, + "time_per_iteration": 2.372528553009033 + }, + { + "auxiliary_loss_clip": 0.0107568, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.02029419, + "balance_loss_mlp": 1.02618897, + "epoch": 0.272598827596573, + "flos": 22930848512640.0, + "grad_norm": 1.7758313058288122, + "language_loss": 0.72729278, + "learning_rate": 3.413731546022929e-06, + "loss": 0.74859101, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49609375, + "step": 4534, + "time_per_iteration": 2.449655532836914 + }, + { + "auxiliary_loss_clip": 0.01077707, + "auxiliary_loss_mlp": 0.01057441, + "balance_loss_clip": 1.02012849, + "balance_loss_mlp": 1.02443922, + "epoch": 0.27265895084924097, + "flos": 24237403522560.0, + "grad_norm": 1.6377728823454702, + "language_loss": 0.92334455, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.94469601, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.53125, + "step": 4535, + "time_per_iteration": 2.42824125289917 + }, + { + "auxiliary_loss_clip": 0.01077804, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.02538431, + "balance_loss_mlp": 1.02382863, + "epoch": 0.27271907410190893, + "flos": 27012020958720.0, + "grad_norm": 4.270012466075696, + "language_loss": 0.74073404, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.76215428, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5390625, + "step": 4536, + "time_per_iteration": 2.4511008262634277 + }, + { + "auxiliary_loss_clip": 0.01073062, + "auxiliary_loss_mlp": 0.01049422, + "balance_loss_clip": 1.01644921, + "balance_loss_mlp": 1.02218676, + "epoch": 0.2727791973545769, + "flos": 34451349171840.0, + "grad_norm": 2.0255679489029212, + "language_loss": 0.73427844, + "learning_rate": 3.41290485034781e-06, + "loss": 0.7555033, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.5078125, + "step": 4537, + "time_per_iteration": 2.487445831298828 + }, + { + "auxiliary_loss_clip": 0.01073827, + "auxiliary_loss_mlp": 0.01049396, + "balance_loss_clip": 1.01506364, + "balance_loss_mlp": 1.02224541, + "epoch": 0.27283932060724486, + "flos": 15041036626560.0, + "grad_norm": 2.1225739545966227, + "language_loss": 0.79116029, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.81239247, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4538, + "time_per_iteration": 2.3808610439300537 + }, + { + "auxiliary_loss_clip": 0.01070491, + "auxiliary_loss_mlp": 0.01054308, + "balance_loss_clip": 1.02038074, + "balance_loss_mlp": 1.02007771, + "epoch": 0.2728994438599128, + "flos": 21651351672960.0, + "grad_norm": 1.4811048464126957, + "language_loss": 0.91031158, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9315595, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.50390625, + "step": 4539, + "time_per_iteration": 2.39631724357605 + }, + { + "auxiliary_loss_clip": 0.01072708, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_clip": 1.01654005, + "balance_loss_mlp": 1.02248812, + "epoch": 0.2729595671125808, + "flos": 17487610128000.0, + "grad_norm": 2.04523959865395, + "language_loss": 0.89499307, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.91622293, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5, + "step": 4540, + "time_per_iteration": 2.3650896549224854 + }, + { + "auxiliary_loss_clip": 0.01073516, + "auxiliary_loss_mlp": 0.01053399, + "balance_loss_clip": 1.02066386, + "balance_loss_mlp": 1.02122808, + "epoch": 0.27301969036524876, + "flos": 19317128526720.0, + "grad_norm": 1.9326391379780228, + "language_loss": 0.83551776, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.85678691, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5234375, + "step": 4541, + "time_per_iteration": 2.3827805519104004 + }, + { + "auxiliary_loss_clip": 0.01071894, + "auxiliary_loss_mlp": 0.01052808, + "balance_loss_clip": 1.01892853, + "balance_loss_mlp": 1.02109551, + "epoch": 0.2730798136179167, + "flos": 21064706231040.0, + "grad_norm": 2.0817007098897937, + "language_loss": 0.80788666, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82913363, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4542, + "time_per_iteration": 2.391402006149292 + }, + { + "auxiliary_loss_clip": 0.0107347, + "auxiliary_loss_mlp": 0.01048504, + "balance_loss_clip": 1.01607871, + "balance_loss_mlp": 1.02258825, + "epoch": 0.2731399368705847, + "flos": 19170737729280.0, + "grad_norm": 2.138247718852104, + "language_loss": 0.91592449, + "learning_rate": 3.411250012687582e-06, + "loss": 0.93714428, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51171875, + "step": 4543, + "time_per_iteration": 2.431884288787842 + }, + { + "auxiliary_loss_clip": 0.01075727, + "auxiliary_loss_mlp": 0.01056541, + "balance_loss_clip": 1.02020574, + "balance_loss_mlp": 1.02220595, + "epoch": 0.27320006012325265, + "flos": 18289320744960.0, + "grad_norm": 2.0814661169806428, + "language_loss": 0.65256983, + "learning_rate": 3.410974019048255e-06, + "loss": 0.67389256, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.53515625, + "step": 4544, + "time_per_iteration": 2.357912302017212 + }, + { + "auxiliary_loss_clip": 0.01075215, + "auxiliary_loss_mlp": 0.01056019, + "balance_loss_clip": 1.01930308, + "balance_loss_mlp": 1.02306557, + "epoch": 0.2732601833759206, + "flos": 34859483498880.0, + "grad_norm": 3.1949960012761203, + "language_loss": 0.71193635, + "learning_rate": 3.410697971904651e-06, + "loss": 0.73324865, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.51953125, + "step": 4545, + "time_per_iteration": 2.521660089492798 + }, + { + "auxiliary_loss_clip": 0.01019208, + "auxiliary_loss_mlp": 0.0101022, + "balance_loss_clip": 1.0060004, + "balance_loss_mlp": 1.0059526, + "epoch": 0.2733203066285886, + "flos": 53907709800960.0, + "grad_norm": 0.7357713910684953, + "language_loss": 0.61624885, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63654315, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.04223633, + "router_z_loss_mlp": 0.1328125, + "step": 4546, + "time_per_iteration": 3.0820863246917725 + }, + { + "auxiliary_loss_clip": 0.01078071, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.02107739, + "balance_loss_mlp": 1.02726269, + "epoch": 0.2733804298812566, + "flos": 20659539369600.0, + "grad_norm": 1.772739420437474, + "language_loss": 0.66485536, + "learning_rate": 3.410145717146488e-06, + "loss": 0.68616325, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.5078125, + "step": 4547, + "time_per_iteration": 2.412407398223877 + }, + { + "auxiliary_loss_clip": 0.01073108, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_clip": 1.0194633, + "balance_loss_mlp": 1.02437842, + "epoch": 0.27344055313392457, + "flos": 25883174102400.0, + "grad_norm": 2.2206728760946026, + "language_loss": 0.798527, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.81976116, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48828125, + "step": 4548, + "time_per_iteration": 2.4449100494384766 + }, + { + "auxiliary_loss_clip": 0.01075818, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.02305007, + "balance_loss_mlp": 1.02653646, + "epoch": 0.27350067638659253, + "flos": 22928649096960.0, + "grad_norm": 2.7845662838176697, + "language_loss": 0.84027815, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.86155653, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4921875, + "step": 4549, + "time_per_iteration": 2.414067029953003 + }, + { + "auxiliary_loss_clip": 0.0107839, + "auxiliary_loss_mlp": 0.01059857, + "balance_loss_clip": 1.02416623, + "balance_loss_mlp": 1.02550352, + "epoch": 0.2735607996392605, + "flos": 16574072826240.0, + "grad_norm": 2.03933313003626, + "language_loss": 0.72387505, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.7452575, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.52734375, + "step": 4550, + "time_per_iteration": 2.3573460578918457 + }, + { + "auxiliary_loss_clip": 0.01074531, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_clip": 1.01670384, + "balance_loss_mlp": 1.02525342, + "epoch": 0.27362092289192846, + "flos": 19644299677440.0, + "grad_norm": 2.5161746762160524, + "language_loss": 0.80338997, + "learning_rate": 3.409040566039563e-06, + "loss": 0.82458568, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4921875, + "step": 4551, + "time_per_iteration": 2.4047656059265137 + }, + { + "auxiliary_loss_clip": 0.01076231, + "auxiliary_loss_mlp": 0.01055113, + "balance_loss_clip": 1.02378464, + "balance_loss_mlp": 1.02527523, + "epoch": 0.27368104614459643, + "flos": 17638190288640.0, + "grad_norm": 2.5344629755334007, + "language_loss": 0.72957373, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.7508871, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.5078125, + "step": 4552, + "time_per_iteration": 2.3753767013549805 + }, + { + "auxiliary_loss_clip": 0.01077413, + "auxiliary_loss_mlp": 0.01054606, + "balance_loss_clip": 1.02120364, + "balance_loss_mlp": 1.0259378, + "epoch": 0.2737411693972644, + "flos": 21578941779840.0, + "grad_norm": 1.8920686071096664, + "language_loss": 0.7284193, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74973953, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.515625, + "step": 4553, + "time_per_iteration": 2.4301674365997314 + }, + { + "auxiliary_loss_clip": 0.010736, + "auxiliary_loss_mlp": 0.01056791, + "balance_loss_clip": 1.02410388, + "balance_loss_mlp": 1.02332997, + "epoch": 0.27380129264993236, + "flos": 25482859920000.0, + "grad_norm": 1.8273507903724784, + "language_loss": 0.61243927, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.63374317, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.50390625, + "step": 4554, + "time_per_iteration": 2.4275763034820557 + }, + { + "auxiliary_loss_clip": 0.01076475, + "auxiliary_loss_mlp": 0.01059569, + "balance_loss_clip": 1.02380586, + "balance_loss_mlp": 1.02387857, + "epoch": 0.2738614159026003, + "flos": 18660202784640.0, + "grad_norm": 1.7374347653295796, + "language_loss": 0.75637293, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.77773345, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.52734375, + "step": 4555, + "time_per_iteration": 2.4255292415618896 + }, + { + "auxiliary_loss_clip": 0.01074684, + "auxiliary_loss_mlp": 0.01057038, + "balance_loss_clip": 1.02075124, + "balance_loss_mlp": 1.02366829, + "epoch": 0.2739215391552683, + "flos": 23476017392640.0, + "grad_norm": 1.9683367497327542, + "language_loss": 0.78591603, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80723321, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51171875, + "step": 4556, + "time_per_iteration": 2.4037158489227295 + }, + { + "auxiliary_loss_clip": 0.0108108, + "auxiliary_loss_mlp": 0.01063843, + "balance_loss_clip": 1.02407444, + "balance_loss_mlp": 1.02456486, + "epoch": 0.27398166240793626, + "flos": 17127690255360.0, + "grad_norm": 2.1641177048514293, + "language_loss": 0.83670223, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.85815144, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.56640625, + "step": 4557, + "time_per_iteration": 2.3856043815612793 + }, + { + "auxiliary_loss_clip": 0.01071119, + "auxiliary_loss_mlp": 0.01052964, + "balance_loss_clip": 1.02146864, + "balance_loss_mlp": 1.02172136, + "epoch": 0.2740417856606042, + "flos": 23403607499520.0, + "grad_norm": 1.821617851297972, + "language_loss": 0.75141239, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.77265322, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4921875, + "step": 4558, + "time_per_iteration": 2.3971292972564697 + }, + { + "auxiliary_loss_clip": 0.01072843, + "auxiliary_loss_mlp": 0.0105167, + "balance_loss_clip": 1.01930428, + "balance_loss_mlp": 1.02173519, + "epoch": 0.2741019089132722, + "flos": 12779781955200.0, + "grad_norm": 2.220757140090357, + "language_loss": 0.70826721, + "learning_rate": 3.406827699810819e-06, + "loss": 0.72951239, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51171875, + "step": 4559, + "time_per_iteration": 2.3884692192077637 + }, + { + "auxiliary_loss_clip": 0.01071593, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_clip": 1.0237534, + "balance_loss_mlp": 1.02108932, + "epoch": 0.27416203216594015, + "flos": 20630491251840.0, + "grad_norm": 2.594010823423996, + "language_loss": 0.73177218, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.75305617, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.50390625, + "step": 4560, + "time_per_iteration": 2.392277717590332 + }, + { + "auxiliary_loss_clip": 0.01071294, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.01364028, + "balance_loss_mlp": 1.01989293, + "epoch": 0.27422215541860817, + "flos": 26540379135360.0, + "grad_norm": 1.7226201761427198, + "language_loss": 0.83097512, + "learning_rate": 3.406273949573303e-06, + "loss": 0.85216904, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4561, + "time_per_iteration": 2.476163148880005 + }, + { + "auxiliary_loss_clip": 0.01072274, + "auxiliary_loss_mlp": 0.01050991, + "balance_loss_clip": 1.01859033, + "balance_loss_mlp": 1.02081072, + "epoch": 0.27428227867127614, + "flos": 23330045531520.0, + "grad_norm": 1.7397448701904699, + "language_loss": 0.76757425, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.78880692, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.515625, + "step": 4562, + "time_per_iteration": 2.4096767902374268 + }, + { + "auxiliary_loss_clip": 0.01072227, + "auxiliary_loss_mlp": 0.01050071, + "balance_loss_clip": 1.01778924, + "balance_loss_mlp": 1.02148747, + "epoch": 0.2743424019239441, + "flos": 23034121914240.0, + "grad_norm": 1.5092487370907206, + "language_loss": 0.76199389, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.78321689, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5078125, + "step": 4563, + "time_per_iteration": 2.419037103652954 + }, + { + "auxiliary_loss_clip": 0.01075752, + "auxiliary_loss_mlp": 0.0106109, + "balance_loss_clip": 1.02268124, + "balance_loss_mlp": 1.02166915, + "epoch": 0.27440252517661207, + "flos": 21980024012160.0, + "grad_norm": 1.9367770804965188, + "language_loss": 0.65068376, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.67205215, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.54296875, + "step": 4564, + "time_per_iteration": 3.789053440093994 + }, + { + "auxiliary_loss_clip": 0.01073596, + "auxiliary_loss_mlp": 0.01045926, + "balance_loss_clip": 1.01302445, + "balance_loss_mlp": 1.02195311, + "epoch": 0.27446264842928003, + "flos": 40185867962880.0, + "grad_norm": 1.8240163210558014, + "language_loss": 0.80439126, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.82558644, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.515625, + "step": 4565, + "time_per_iteration": 2.5750885009765625 + }, + { + "auxiliary_loss_clip": 0.01074237, + "auxiliary_loss_mlp": 0.01057269, + "balance_loss_clip": 1.02434325, + "balance_loss_mlp": 1.02349687, + "epoch": 0.274522771681948, + "flos": 13478847575040.0, + "grad_norm": 1.9273681853567755, + "language_loss": 0.70528555, + "learning_rate": 3.404888640957477e-06, + "loss": 0.72660065, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5078125, + "step": 4566, + "time_per_iteration": 3.7810823917388916 + }, + { + "auxiliary_loss_clip": 0.0107195, + "auxiliary_loss_mlp": 0.01052792, + "balance_loss_clip": 1.02127266, + "balance_loss_mlp": 1.02220273, + "epoch": 0.27458289493461596, + "flos": 28620853453440.0, + "grad_norm": 1.7952770104617188, + "language_loss": 0.62620711, + "learning_rate": 3.404611419371723e-06, + "loss": 0.6474545, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.49609375, + "step": 4567, + "time_per_iteration": 3.894662380218506 + }, + { + "auxiliary_loss_clip": 0.01074782, + "auxiliary_loss_mlp": 0.01052599, + "balance_loss_clip": 1.0178616, + "balance_loss_mlp": 1.02377319, + "epoch": 0.2746430181872839, + "flos": 20118804232320.0, + "grad_norm": 1.7794119169808733, + "language_loss": 0.83394372, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.85521758, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51171875, + "step": 4568, + "time_per_iteration": 2.4015963077545166 + }, + { + "auxiliary_loss_clip": 0.01073725, + "auxiliary_loss_mlp": 0.01052685, + "balance_loss_clip": 1.01971197, + "balance_loss_mlp": 1.02215016, + "epoch": 0.2747031414399519, + "flos": 20192436023040.0, + "grad_norm": 2.3073131701439835, + "language_loss": 0.69258225, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.71384633, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.515625, + "step": 4569, + "time_per_iteration": 3.8007290363311768 + }, + { + "auxiliary_loss_clip": 0.01072688, + "auxiliary_loss_mlp": 0.01060094, + "balance_loss_clip": 1.02565432, + "balance_loss_mlp": 1.02142489, + "epoch": 0.27476326469261986, + "flos": 13515506369280.0, + "grad_norm": 2.2911836826202148, + "language_loss": 0.72457778, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.74590564, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 4570, + "time_per_iteration": 2.3502182960510254 + }, + { + "auxiliary_loss_clip": 0.01023824, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 0.99990231, + "balance_loss_mlp": 1.01076174, + "epoch": 0.2748233879452878, + "flos": 65934067282560.0, + "grad_norm": 0.7303904829886919, + "language_loss": 0.55812889, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57841551, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.04931641, + "router_z_loss_mlp": 0.13085938, + "step": 4571, + "time_per_iteration": 3.187305450439453 + }, + { + "auxiliary_loss_clip": 0.01076556, + "auxiliary_loss_mlp": 0.01064788, + "balance_loss_clip": 1.02866769, + "balance_loss_mlp": 1.02318275, + "epoch": 0.2748835111979558, + "flos": 17383254474240.0, + "grad_norm": 2.4809910447790777, + "language_loss": 0.80067712, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.82209063, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.53515625, + "step": 4572, + "time_per_iteration": 2.406135082244873 + }, + { + "auxiliary_loss_clip": 0.01066863, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_clip": 1.01855862, + "balance_loss_mlp": 1.02014899, + "epoch": 0.27494363445062375, + "flos": 23586412711680.0, + "grad_norm": 1.4561912485367179, + "language_loss": 0.82280201, + "learning_rate": 3.402946971702147e-06, + "loss": 0.84394115, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.46875, + "step": 4573, + "time_per_iteration": 2.443777561187744 + }, + { + "auxiliary_loss_clip": 0.01070854, + "auxiliary_loss_mlp": 0.01050024, + "balance_loss_clip": 1.01788485, + "balance_loss_mlp": 1.02147031, + "epoch": 0.2750037577032918, + "flos": 17163650822400.0, + "grad_norm": 2.154897857044085, + "language_loss": 0.80187631, + "learning_rate": 3.402669377496223e-06, + "loss": 0.82308507, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.49414062, + "step": 4574, + "time_per_iteration": 2.3696236610412598 + }, + { + "auxiliary_loss_clip": 0.01071491, + "auxiliary_loss_mlp": 0.01054682, + "balance_loss_clip": 1.02340126, + "balance_loss_mlp": 1.02075946, + "epoch": 0.27506388095595974, + "flos": 24490942882560.0, + "grad_norm": 2.0563741677412204, + "language_loss": 0.75959039, + "learning_rate": 3.402391730100936e-06, + "loss": 0.78085214, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.5078125, + "step": 4575, + "time_per_iteration": 2.414783477783203 + }, + { + "auxiliary_loss_clip": 0.01069297, + "auxiliary_loss_mlp": 0.01058415, + "balance_loss_clip": 1.02687204, + "balance_loss_mlp": 1.02054787, + "epoch": 0.2751240042086277, + "flos": 38763157259520.0, + "grad_norm": 1.6074792643352929, + "language_loss": 0.73112828, + "learning_rate": 3.402114029526814e-06, + "loss": 0.75240541, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.48828125, + "step": 4576, + "time_per_iteration": 2.5387487411499023 + }, + { + "auxiliary_loss_clip": 0.01070832, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_clip": 1.0191381, + "balance_loss_mlp": 1.02160466, + "epoch": 0.27518412746129567, + "flos": 26905815002880.0, + "grad_norm": 1.7616521614086194, + "language_loss": 0.74599075, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.76720446, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4921875, + "step": 4577, + "time_per_iteration": 2.458364486694336 + }, + { + "auxiliary_loss_clip": 0.0107233, + "auxiliary_loss_mlp": 0.01054285, + "balance_loss_clip": 1.02090645, + "balance_loss_mlp": 1.02167392, + "epoch": 0.27524425071396363, + "flos": 24899356500480.0, + "grad_norm": 1.8921720859889564, + "language_loss": 0.78271383, + "learning_rate": 3.401558468884188e-06, + "loss": 0.80397999, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5078125, + "step": 4578, + "time_per_iteration": 2.42012882232666 + }, + { + "auxiliary_loss_clip": 0.01074312, + "auxiliary_loss_mlp": 0.01054084, + "balance_loss_clip": 1.01548421, + "balance_loss_mlp": 1.02108419, + "epoch": 0.2753043739666316, + "flos": 26286804864000.0, + "grad_norm": 1.5836003275866866, + "language_loss": 0.67731154, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.69859552, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.53125, + "step": 4579, + "time_per_iteration": 2.459341287612915 + }, + { + "auxiliary_loss_clip": 0.01074212, + "auxiliary_loss_mlp": 0.01058425, + "balance_loss_clip": 1.02325797, + "balance_loss_mlp": 1.02245367, + "epoch": 0.27536449721929956, + "flos": 24205632318720.0, + "grad_norm": 1.7215787305466974, + "language_loss": 0.8128264, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.83415276, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.51953125, + "step": 4580, + "time_per_iteration": 2.4095144271850586 + }, + { + "auxiliary_loss_clip": 0.01071295, + "auxiliary_loss_mlp": 0.01053659, + "balance_loss_clip": 1.02001834, + "balance_loss_mlp": 1.02133918, + "epoch": 0.27542462047196753, + "flos": 19536243419520.0, + "grad_norm": 1.6043714855069107, + "language_loss": 0.68724155, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.70849109, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5, + "step": 4581, + "time_per_iteration": 2.417628526687622 + }, + { + "auxiliary_loss_clip": 0.01075321, + "auxiliary_loss_mlp": 0.0105105, + "balance_loss_clip": 1.02034211, + "balance_loss_mlp": 1.02372169, + "epoch": 0.2754847437246355, + "flos": 14318299238400.0, + "grad_norm": 2.9109276055221978, + "language_loss": 0.79614592, + "learning_rate": 3.400446709916392e-06, + "loss": 0.81740957, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.515625, + "step": 4582, + "time_per_iteration": 2.3754961490631104 + }, + { + "auxiliary_loss_clip": 0.01072624, + "auxiliary_loss_mlp": 0.01054438, + "balance_loss_clip": 1.02048683, + "balance_loss_mlp": 1.02206349, + "epoch": 0.27554486697730346, + "flos": 18837910938240.0, + "grad_norm": 1.6692080629293908, + "language_loss": 0.8586036, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.87987423, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4583, + "time_per_iteration": 2.433675765991211 + }, + { + "auxiliary_loss_clip": 0.0107544, + "auxiliary_loss_mlp": 0.01051011, + "balance_loss_clip": 1.01856256, + "balance_loss_mlp": 1.02342057, + "epoch": 0.2756049902299714, + "flos": 22381210978560.0, + "grad_norm": 1.818314884236316, + "language_loss": 0.68698597, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.70825052, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51953125, + "step": 4584, + "time_per_iteration": 2.3907196521759033 + }, + { + "auxiliary_loss_clip": 0.01070223, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.02265334, + "balance_loss_mlp": 1.0215981, + "epoch": 0.2756651134826394, + "flos": 19572867302400.0, + "grad_norm": 2.284960384035675, + "language_loss": 0.78693902, + "learning_rate": 3.399612333050327e-06, + "loss": 0.80817884, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.48632812, + "step": 4585, + "time_per_iteration": 2.421934127807617 + }, + { + "auxiliary_loss_clip": 0.01076635, + "auxiliary_loss_mlp": 0.0105595, + "balance_loss_clip": 1.02099776, + "balance_loss_mlp": 1.02403045, + "epoch": 0.27572523673530736, + "flos": 23585435193600.0, + "grad_norm": 1.780813417738699, + "language_loss": 0.73844254, + "learning_rate": 3.399334101267362e-06, + "loss": 0.75976837, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.52734375, + "step": 4586, + "time_per_iteration": 2.415919780731201 + }, + { + "auxiliary_loss_clip": 0.01072378, + "auxiliary_loss_mlp": 0.01051792, + "balance_loss_clip": 1.019701, + "balance_loss_mlp": 1.02228904, + "epoch": 0.2757853599879754, + "flos": 22819021827840.0, + "grad_norm": 1.4763840780036939, + "language_loss": 0.82013714, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.84137887, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.5, + "step": 4587, + "time_per_iteration": 2.4464282989501953 + }, + { + "auxiliary_loss_clip": 0.01070832, + "auxiliary_loss_mlp": 0.01060473, + "balance_loss_clip": 1.026999, + "balance_loss_mlp": 1.02146506, + "epoch": 0.27584548324064334, + "flos": 18550715160960.0, + "grad_norm": 1.8397972968792529, + "language_loss": 0.84037447, + "learning_rate": 3.398777478523316e-06, + "loss": 0.86168754, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 4588, + "time_per_iteration": 2.3718390464782715 + }, + { + "auxiliary_loss_clip": 0.01068748, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_clip": 1.01678872, + "balance_loss_mlp": 1.02019536, + "epoch": 0.2759056064933113, + "flos": 23768729164800.0, + "grad_norm": 1.4066858647829044, + "language_loss": 0.76754636, + "learning_rate": 3.398499087583342e-06, + "loss": 0.78870785, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.48632812, + "step": 4589, + "time_per_iteration": 2.4657483100891113 + }, + { + "auxiliary_loss_clip": 0.01069449, + "auxiliary_loss_mlp": 0.0105432, + "balance_loss_clip": 1.022753, + "balance_loss_mlp": 1.01999938, + "epoch": 0.27596572974597927, + "flos": 24280695475200.0, + "grad_norm": 1.6318565636427187, + "language_loss": 0.90064037, + "learning_rate": 3.398220643612143e-06, + "loss": 0.92187804, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.49414062, + "step": 4590, + "time_per_iteration": 2.4347591400146484 + }, + { + "auxiliary_loss_clip": 0.01069565, + "auxiliary_loss_mlp": 0.01051522, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.01897311, + "epoch": 0.27602585299864724, + "flos": 35039600536320.0, + "grad_norm": 1.6303374409188511, + "language_loss": 0.72947192, + "learning_rate": 3.397942146620277e-06, + "loss": 0.75068277, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.50390625, + "step": 4591, + "time_per_iteration": 2.516845226287842 + }, + { + "auxiliary_loss_clip": 0.01072856, + "auxiliary_loss_mlp": 0.01055848, + "balance_loss_clip": 1.02099085, + "balance_loss_mlp": 1.02175951, + "epoch": 0.2760859762513152, + "flos": 24308451872640.0, + "grad_norm": 1.900805149490003, + "language_loss": 0.81643236, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.83771944, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51171875, + "step": 4592, + "time_per_iteration": 2.414559841156006 + }, + { + "auxiliary_loss_clip": 0.0101835, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.02435982, + "balance_loss_mlp": 1.00640607, + "epoch": 0.27614609950398317, + "flos": 71257623926400.0, + "grad_norm": 0.7312760779356849, + "language_loss": 0.61681879, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63729072, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.11914062, + "step": 4593, + "time_per_iteration": 2.9908697605133057 + }, + { + "auxiliary_loss_clip": 0.01070144, + "auxiliary_loss_mlp": 0.01051478, + "balance_loss_clip": 1.01552474, + "balance_loss_mlp": 1.01931095, + "epoch": 0.27620622275665113, + "flos": 29673694546560.0, + "grad_norm": 2.0341018840279057, + "language_loss": 0.78189397, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.80311018, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5078125, + "step": 4594, + "time_per_iteration": 2.450208902359009 + }, + { + "auxiliary_loss_clip": 0.01072062, + "auxiliary_loss_mlp": 0.01046429, + "balance_loss_clip": 1.01574445, + "balance_loss_mlp": 1.02274632, + "epoch": 0.2762663460093191, + "flos": 15377145085440.0, + "grad_norm": 1.5062076677730314, + "language_loss": 0.92738408, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.94856894, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4921875, + "step": 4595, + "time_per_iteration": 2.433159589767456 + }, + { + "auxiliary_loss_clip": 0.01073408, + "auxiliary_loss_mlp": 0.01052912, + "balance_loss_clip": 1.01955748, + "balance_loss_mlp": 1.02249408, + "epoch": 0.27632646926198706, + "flos": 20703040790400.0, + "grad_norm": 1.9070236656025337, + "language_loss": 0.70678794, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.72805119, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5078125, + "step": 4596, + "time_per_iteration": 2.389512300491333 + }, + { + "auxiliary_loss_clip": 0.0107729, + "auxiliary_loss_mlp": 0.01054762, + "balance_loss_clip": 1.01945233, + "balance_loss_mlp": 1.02323878, + "epoch": 0.276386592514655, + "flos": 32812107016320.0, + "grad_norm": 1.6690693131531837, + "language_loss": 0.65438986, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.67571038, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5390625, + "step": 4597, + "time_per_iteration": 2.5295517444610596 + }, + { + "auxiliary_loss_clip": 0.01071334, + "auxiliary_loss_mlp": 0.01051628, + "balance_loss_clip": 1.02127755, + "balance_loss_mlp": 1.02348852, + "epoch": 0.276446715767323, + "flos": 18550715160960.0, + "grad_norm": 1.948476302116572, + "language_loss": 0.87418848, + "learning_rate": 3.395991183985887e-06, + "loss": 0.89541817, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47851562, + "step": 4598, + "time_per_iteration": 2.3935153484344482 + }, + { + "auxiliary_loss_clip": 0.01073474, + "auxiliary_loss_mlp": 0.01055072, + "balance_loss_clip": 1.02250409, + "balance_loss_mlp": 1.02295184, + "epoch": 0.27650683901999096, + "flos": 22818533068800.0, + "grad_norm": 2.518017860763197, + "language_loss": 0.81706989, + "learning_rate": 3.395712263209037e-06, + "loss": 0.83835536, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.50390625, + "step": 4599, + "time_per_iteration": 2.427734375 + }, + { + "auxiliary_loss_clip": 0.01075369, + "auxiliary_loss_mlp": 0.01060059, + "balance_loss_clip": 1.02615571, + "balance_loss_mlp": 1.0231806, + "epoch": 0.276566962272659, + "flos": 21360455291520.0, + "grad_norm": 1.8989281736837642, + "language_loss": 0.80745113, + "learning_rate": 3.395433289506639e-06, + "loss": 0.82880545, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51953125, + "step": 4600, + "time_per_iteration": 2.412236213684082 + }, + { + "auxiliary_loss_clip": 0.01079002, + "auxiliary_loss_mlp": 0.01074545, + "balance_loss_clip": 1.03739929, + "balance_loss_mlp": 1.02451432, + "epoch": 0.27662708552532694, + "flos": 17709692486400.0, + "grad_norm": 2.0205282498912194, + "language_loss": 0.74233866, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.76387417, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.54296875, + "step": 4601, + "time_per_iteration": 2.404066801071167 + }, + { + "auxiliary_loss_clip": 0.01071861, + "auxiliary_loss_mlp": 0.01059218, + "balance_loss_clip": 1.02719843, + "balance_loss_mlp": 1.02170742, + "epoch": 0.2766872087779949, + "flos": 21251630983680.0, + "grad_norm": 1.7836627767919369, + "language_loss": 0.81638932, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.83770013, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.50390625, + "step": 4602, + "time_per_iteration": 2.428396224975586 + }, + { + "auxiliary_loss_clip": 0.01076368, + "auxiliary_loss_mlp": 0.01059441, + "balance_loss_clip": 1.02186632, + "balance_loss_mlp": 1.02212334, + "epoch": 0.2767473320306629, + "flos": 12931095254400.0, + "grad_norm": 2.367602394352554, + "language_loss": 0.78961265, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.81097078, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.54296875, + "step": 4603, + "time_per_iteration": 3.76438570022583 + }, + { + "auxiliary_loss_clip": 0.01070813, + "auxiliary_loss_mlp": 0.01059837, + "balance_loss_clip": 1.02825844, + "balance_loss_mlp": 1.02229571, + "epoch": 0.27680745528333084, + "flos": 15011953597440.0, + "grad_norm": 1.5905051716807581, + "language_loss": 0.82918113, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.85048759, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.484375, + "step": 4604, + "time_per_iteration": 2.355046510696411 + }, + { + "auxiliary_loss_clip": 0.01071546, + "auxiliary_loss_mlp": 0.01054178, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.02070415, + "epoch": 0.2768675785359988, + "flos": 22636740286080.0, + "grad_norm": 1.8548228046838384, + "language_loss": 0.71574402, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.7370013, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5078125, + "step": 4605, + "time_per_iteration": 2.427611827850342 + }, + { + "auxiliary_loss_clip": 0.01021188, + "auxiliary_loss_mlp": 0.0101846, + "balance_loss_clip": 1.01454973, + "balance_loss_mlp": 1.00866485, + "epoch": 0.27692770178866677, + "flos": 66127171345920.0, + "grad_norm": 0.7397887751549002, + "language_loss": 0.57251322, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59290969, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.0390625, + "router_z_loss_mlp": 0.125, + "step": 4606, + "time_per_iteration": 4.516442537307739 + }, + { + "auxiliary_loss_clip": 0.01074575, + "auxiliary_loss_mlp": 0.01054342, + "balance_loss_clip": 1.02229893, + "balance_loss_mlp": 1.02232087, + "epoch": 0.27698782504133473, + "flos": 26463884613120.0, + "grad_norm": 1.9874370868949134, + "language_loss": 0.71575445, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.73704362, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.5234375, + "step": 4607, + "time_per_iteration": 3.9149959087371826 + }, + { + "auxiliary_loss_clip": 0.0106806, + "auxiliary_loss_mlp": 0.01050938, + "balance_loss_clip": 1.0220772, + "balance_loss_mlp": 1.02139056, + "epoch": 0.2770479482940027, + "flos": 25883627950080.0, + "grad_norm": 1.6197845495514234, + "language_loss": 0.71069419, + "learning_rate": 3.393199595837555e-06, + "loss": 0.73188412, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.46679688, + "step": 4608, + "time_per_iteration": 4.009166955947876 + }, + { + "auxiliary_loss_clip": 0.01073815, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.0119257, + "balance_loss_mlp": 1.02270293, + "epoch": 0.27710807154667066, + "flos": 22856134469760.0, + "grad_norm": 1.781093868746056, + "language_loss": 0.74713254, + "learning_rate": 3.392920146281499e-06, + "loss": 0.76830494, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.51171875, + "step": 4609, + "time_per_iteration": 2.4308862686157227 + }, + { + "auxiliary_loss_clip": 0.01073996, + "auxiliary_loss_mlp": 0.0105275, + "balance_loss_clip": 1.01856053, + "balance_loss_mlp": 1.02353454, + "epoch": 0.27716819479933863, + "flos": 17710146334080.0, + "grad_norm": 2.1958786035047906, + "language_loss": 0.85422409, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.8754915, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5078125, + "step": 4610, + "time_per_iteration": 2.355536460876465 + }, + { + "auxiliary_loss_clip": 0.01082873, + "auxiliary_loss_mlp": 0.0106416, + "balance_loss_clip": 1.02730095, + "balance_loss_mlp": 1.02922428, + "epoch": 0.2772283180520066, + "flos": 19645032816000.0, + "grad_norm": 2.0030321497172774, + "language_loss": 0.71375048, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.73522079, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.53515625, + "step": 4611, + "time_per_iteration": 2.416422128677368 + }, + { + "auxiliary_loss_clip": 0.01075013, + "auxiliary_loss_mlp": 0.01042949, + "balance_loss_clip": 1.01257443, + "balance_loss_mlp": 1.02795208, + "epoch": 0.27728844130467456, + "flos": 21031573484160.0, + "grad_norm": 1.7150717490139142, + "language_loss": 0.76009667, + "learning_rate": 3.392081480737698e-06, + "loss": 0.78127629, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47070312, + "step": 4612, + "time_per_iteration": 2.399803876876831 + }, + { + "auxiliary_loss_clip": 0.01077805, + "auxiliary_loss_mlp": 0.01051842, + "balance_loss_clip": 1.01953626, + "balance_loss_mlp": 1.02724504, + "epoch": 0.2773485645573425, + "flos": 18988211808000.0, + "grad_norm": 2.3710040314568026, + "language_loss": 0.67849261, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.69978905, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.50390625, + "step": 4613, + "time_per_iteration": 2.4346001148223877 + }, + { + "auxiliary_loss_clip": 0.01076504, + "auxiliary_loss_mlp": 0.01052753, + "balance_loss_clip": 1.02337933, + "balance_loss_mlp": 1.02719545, + "epoch": 0.27740868781001055, + "flos": 21467429297280.0, + "grad_norm": 1.604830070836584, + "language_loss": 0.81069005, + "learning_rate": 3.39152210641815e-06, + "loss": 0.83198255, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4921875, + "step": 4614, + "time_per_iteration": 2.461322069168091 + }, + { + "auxiliary_loss_clip": 0.01078711, + "auxiliary_loss_mlp": 0.01052419, + "balance_loss_clip": 1.02022028, + "balance_loss_mlp": 1.02760363, + "epoch": 0.2774688110626785, + "flos": 19826825598720.0, + "grad_norm": 2.7187713797783273, + "language_loss": 0.83482063, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.85613191, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.51171875, + "step": 4615, + "time_per_iteration": 2.393500566482544 + }, + { + "auxiliary_loss_clip": 0.01081586, + "auxiliary_loss_mlp": 0.01056996, + "balance_loss_clip": 1.02385581, + "balance_loss_mlp": 1.02923822, + "epoch": 0.2775289343153465, + "flos": 18215444574720.0, + "grad_norm": 2.4516283905090135, + "language_loss": 0.65835458, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.67974043, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5234375, + "step": 4616, + "time_per_iteration": 2.359668493270874 + }, + { + "auxiliary_loss_clip": 0.01076496, + "auxiliary_loss_mlp": 0.01054194, + "balance_loss_clip": 1.02324748, + "balance_loss_mlp": 1.02733541, + "epoch": 0.27758905756801444, + "flos": 16471532563200.0, + "grad_norm": 2.5912449066217853, + "language_loss": 0.84606594, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.86737287, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4921875, + "step": 4617, + "time_per_iteration": 2.4042820930480957 + }, + { + "auxiliary_loss_clip": 0.01076571, + "auxiliary_loss_mlp": 0.01054896, + "balance_loss_clip": 1.02338934, + "balance_loss_mlp": 1.02528608, + "epoch": 0.2776491808206824, + "flos": 18727410885120.0, + "grad_norm": 1.9638553825747433, + "language_loss": 0.78000677, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.80132139, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.51171875, + "step": 4618, + "time_per_iteration": 2.3831894397735596 + }, + { + "auxiliary_loss_clip": 0.010779, + "auxiliary_loss_mlp": 0.01053519, + "balance_loss_clip": 1.02121305, + "balance_loss_mlp": 1.0266726, + "epoch": 0.27770930407335037, + "flos": 28036931097600.0, + "grad_norm": 1.9707115643701252, + "language_loss": 0.85655093, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87786508, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.51171875, + "step": 4619, + "time_per_iteration": 2.5510616302490234 + }, + { + "auxiliary_loss_clip": 0.01074125, + "auxiliary_loss_mlp": 0.01058424, + "balance_loss_clip": 1.029814, + "balance_loss_mlp": 1.02464676, + "epoch": 0.27776942732601834, + "flos": 23548706576640.0, + "grad_norm": 1.4126479189903516, + "language_loss": 0.78116155, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.80248702, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.49609375, + "step": 4620, + "time_per_iteration": 2.4771981239318848 + }, + { + "auxiliary_loss_clip": 0.01070684, + "auxiliary_loss_mlp": 0.01059985, + "balance_loss_clip": 1.02637959, + "balance_loss_mlp": 1.02196527, + "epoch": 0.2778295505786863, + "flos": 23907753665280.0, + "grad_norm": 1.7919381526307359, + "language_loss": 0.79709995, + "learning_rate": 3.389562634707122e-06, + "loss": 0.81840664, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.48828125, + "step": 4621, + "time_per_iteration": 2.4520630836486816 + }, + { + "auxiliary_loss_clip": 0.0107285, + "auxiliary_loss_mlp": 0.01062429, + "balance_loss_clip": 1.02918172, + "balance_loss_mlp": 1.02177715, + "epoch": 0.27788967383135427, + "flos": 25553454422400.0, + "grad_norm": 5.605483084362551, + "language_loss": 0.89297593, + "learning_rate": 3.389282499322611e-06, + "loss": 0.91432875, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5078125, + "step": 4622, + "time_per_iteration": 2.407270669937134 + }, + { + "auxiliary_loss_clip": 0.01071907, + "auxiliary_loss_mlp": 0.0105954, + "balance_loss_clip": 1.02799726, + "balance_loss_mlp": 1.02156377, + "epoch": 0.27794979708402223, + "flos": 16251719443200.0, + "grad_norm": 1.9778230753638553, + "language_loss": 0.83154505, + "learning_rate": 3.389002311256369e-06, + "loss": 0.85285962, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.50390625, + "step": 4623, + "time_per_iteration": 2.3918397426605225 + }, + { + "auxiliary_loss_clip": 0.01073699, + "auxiliary_loss_mlp": 0.01059356, + "balance_loss_clip": 1.02883828, + "balance_loss_mlp": 1.02356315, + "epoch": 0.2780099203366902, + "flos": 20666591464320.0, + "grad_norm": 1.9955116853345316, + "language_loss": 0.83334768, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.85467815, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.5, + "step": 4624, + "time_per_iteration": 2.4015495777130127 + }, + { + "auxiliary_loss_clip": 0.0106973, + "auxiliary_loss_mlp": 0.01059722, + "balance_loss_clip": 1.02684367, + "balance_loss_mlp": 1.02049232, + "epoch": 0.27807004358935816, + "flos": 17738880249600.0, + "grad_norm": 2.2491409988623645, + "language_loss": 0.78172791, + "learning_rate": 3.388441777121191e-06, + "loss": 0.80302244, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 4625, + "time_per_iteration": 2.376760244369507 + }, + { + "auxiliary_loss_clip": 0.01073185, + "auxiliary_loss_mlp": 0.01060735, + "balance_loss_clip": 1.02808309, + "balance_loss_mlp": 1.0234971, + "epoch": 0.2781301668420261, + "flos": 16726189086720.0, + "grad_norm": 2.115294245861918, + "language_loss": 0.71735889, + "learning_rate": 3.388161431073511e-06, + "loss": 0.73869812, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.49804688, + "step": 4626, + "time_per_iteration": 2.409721612930298 + }, + { + "auxiliary_loss_clip": 0.01076668, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.02312386, + "epoch": 0.27819029009469415, + "flos": 13843899417600.0, + "grad_norm": 2.171775768468512, + "language_loss": 0.94348454, + "learning_rate": 3.38788103238661e-06, + "loss": 0.96481687, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53515625, + "step": 4627, + "time_per_iteration": 2.37099289894104 + }, + { + "auxiliary_loss_clip": 0.01075817, + "auxiliary_loss_mlp": 0.01049237, + "balance_loss_clip": 1.01719379, + "balance_loss_mlp": 1.02544546, + "epoch": 0.2782504133473621, + "flos": 27088061633280.0, + "grad_norm": 1.8231731043383834, + "language_loss": 0.86904883, + "learning_rate": 3.387600581071121e-06, + "loss": 0.89029944, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.50390625, + "step": 4628, + "time_per_iteration": 2.495032548904419 + }, + { + "auxiliary_loss_clip": 0.01072104, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.01863682, + "balance_loss_mlp": 1.02307642, + "epoch": 0.2783105366000301, + "flos": 21067778430720.0, + "grad_norm": 1.9001713172620474, + "language_loss": 0.80904675, + "learning_rate": 3.387320077137679e-06, + "loss": 0.83026433, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4921875, + "step": 4629, + "time_per_iteration": 2.3927762508392334 + }, + { + "auxiliary_loss_clip": 0.0107271, + "auxiliary_loss_mlp": 0.01044101, + "balance_loss_clip": 1.01482368, + "balance_loss_mlp": 1.02568579, + "epoch": 0.27837065985269804, + "flos": 26500717964160.0, + "grad_norm": 1.4965784396963704, + "language_loss": 0.85588849, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.8770566, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.47070312, + "step": 4630, + "time_per_iteration": 2.4466190338134766 + }, + { + "auxiliary_loss_clip": 0.01076984, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.01380014, + "balance_loss_mlp": 1.02629125, + "epoch": 0.278430783105366, + "flos": 20222356924800.0, + "grad_norm": 2.1523739721966813, + "language_loss": 0.83486271, + "learning_rate": 3.386758911459485e-06, + "loss": 0.85611361, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 4631, + "time_per_iteration": 2.3849973678588867 + }, + { + "auxiliary_loss_clip": 0.01082283, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.01310134, + "balance_loss_mlp": 1.02948833, + "epoch": 0.278490906358034, + "flos": 25591719139200.0, + "grad_norm": 1.9252694112719715, + "language_loss": 0.72817087, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74948454, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 4632, + "time_per_iteration": 2.4917008876800537 + }, + { + "auxiliary_loss_clip": 0.01074353, + "auxiliary_loss_mlp": 0.01050191, + "balance_loss_clip": 1.01790905, + "balance_loss_mlp": 1.02711952, + "epoch": 0.27855102961070194, + "flos": 16170861000960.0, + "grad_norm": 1.7463738748973807, + "language_loss": 0.83630258, + "learning_rate": 3.386197535437145e-06, + "loss": 0.857548, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47265625, + "step": 4633, + "time_per_iteration": 2.3542466163635254 + }, + { + "auxiliary_loss_clip": 0.01078003, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.01438963, + "balance_loss_mlp": 1.02733207, + "epoch": 0.2786111528633699, + "flos": 22926554415360.0, + "grad_norm": 1.6838674909701292, + "language_loss": 0.88985443, + "learning_rate": 3.385916768573529e-06, + "loss": 0.9111191, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4634, + "time_per_iteration": 2.399946928024292 + }, + { + "auxiliary_loss_clip": 0.01079702, + "auxiliary_loss_mlp": 0.01051518, + "balance_loss_clip": 1.01616085, + "balance_loss_mlp": 1.02798867, + "epoch": 0.27867127611603787, + "flos": 23403083829120.0, + "grad_norm": 1.5224078005921926, + "language_loss": 0.77592874, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79724091, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.515625, + "step": 4635, + "time_per_iteration": 2.4442081451416016 + }, + { + "auxiliary_loss_clip": 0.01076313, + "auxiliary_loss_mlp": 0.01055833, + "balance_loss_clip": 1.0211674, + "balance_loss_mlp": 1.02563119, + "epoch": 0.27873139936870583, + "flos": 19827977673600.0, + "grad_norm": 1.6835297636683026, + "language_loss": 0.66851783, + "learning_rate": 3.385355077194637e-06, + "loss": 0.68983924, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5078125, + "step": 4636, + "time_per_iteration": 2.413301944732666 + }, + { + "auxiliary_loss_clip": 0.01079341, + "auxiliary_loss_mlp": 0.01054852, + "balance_loss_clip": 1.01997137, + "balance_loss_mlp": 1.02609921, + "epoch": 0.2787915226213738, + "flos": 17706829754880.0, + "grad_norm": 2.649595170574568, + "language_loss": 0.86198997, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.88333189, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53125, + "step": 4637, + "time_per_iteration": 2.3539958000183105 + }, + { + "auxiliary_loss_clip": 0.01071889, + "auxiliary_loss_mlp": 0.01054259, + "balance_loss_clip": 1.0236938, + "balance_loss_mlp": 1.02397346, + "epoch": 0.27885164587404176, + "flos": 22089476724480.0, + "grad_norm": 1.4495973520316157, + "language_loss": 0.77773446, + "learning_rate": 3.384793175684533e-06, + "loss": 0.79899585, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47851562, + "step": 4638, + "time_per_iteration": 2.446038246154785 + }, + { + "auxiliary_loss_clip": 0.0107398, + "auxiliary_loss_mlp": 0.0105901, + "balance_loss_clip": 1.02505875, + "balance_loss_mlp": 1.02369976, + "epoch": 0.27891176912670973, + "flos": 19206698296320.0, + "grad_norm": 1.4502198639346005, + "language_loss": 0.72858274, + "learning_rate": 3.38451214615691e-06, + "loss": 0.74991262, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.50390625, + "step": 4639, + "time_per_iteration": 2.3728268146514893 + }, + { + "auxiliary_loss_clip": 0.0107156, + "auxiliary_loss_mlp": 0.01051987, + "balance_loss_clip": 1.01879942, + "balance_loss_mlp": 1.02174222, + "epoch": 0.27897189237937775, + "flos": 27598771134720.0, + "grad_norm": 2.8913667347213607, + "language_loss": 0.66902852, + "learning_rate": 3.384231064128447e-06, + "loss": 0.69026399, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49804688, + "step": 4640, + "time_per_iteration": 2.460193634033203 + }, + { + "auxiliary_loss_clip": 0.01072704, + "auxiliary_loss_mlp": 0.01050607, + "balance_loss_clip": 1.01765728, + "balance_loss_mlp": 1.02228975, + "epoch": 0.2790320156320457, + "flos": 21177161320320.0, + "grad_norm": 2.5881800263575405, + "language_loss": 0.73256922, + "learning_rate": 3.383949929609804e-06, + "loss": 0.75380236, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.50390625, + "step": 4641, + "time_per_iteration": 2.3921053409576416 + }, + { + "auxiliary_loss_clip": 0.01073793, + "auxiliary_loss_mlp": 0.01057322, + "balance_loss_clip": 1.02132046, + "balance_loss_mlp": 1.02215958, + "epoch": 0.2790921388847137, + "flos": 22782816881280.0, + "grad_norm": 1.5850970656942875, + "language_loss": 0.77033806, + "learning_rate": 3.383668742611641e-06, + "loss": 0.7916491, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.515625, + "step": 4642, + "time_per_iteration": 3.8508524894714355 + }, + { + "auxiliary_loss_clip": 0.01073842, + "auxiliary_loss_mlp": 0.01053597, + "balance_loss_clip": 1.02048075, + "balance_loss_mlp": 1.02306485, + "epoch": 0.27915226213738165, + "flos": 23399627604480.0, + "grad_norm": 2.317147592076058, + "language_loss": 0.87279326, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.89406765, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5078125, + "step": 4643, + "time_per_iteration": 2.3984670639038086 + }, + { + "auxiliary_loss_clip": 0.01072733, + "auxiliary_loss_mlp": 0.01057108, + "balance_loss_clip": 1.02246547, + "balance_loss_mlp": 1.02242696, + "epoch": 0.2792123853900496, + "flos": 22746681757440.0, + "grad_norm": 1.805655091986533, + "language_loss": 0.84260941, + "learning_rate": 3.383106211219407e-06, + "loss": 0.86390775, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.50390625, + "step": 4644, + "time_per_iteration": 2.425086736679077 + }, + { + "auxiliary_loss_clip": 0.01073375, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.0181222, + "balance_loss_mlp": 1.02208591, + "epoch": 0.2792725086427176, + "flos": 15048472746240.0, + "grad_norm": 1.8788181501255627, + "language_loss": 0.8046416, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.8258822, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.515625, + "step": 4645, + "time_per_iteration": 2.370615005493164 + }, + { + "auxiliary_loss_clip": 0.01018563, + "auxiliary_loss_mlp": 0.01011556, + "balance_loss_clip": 1.00759828, + "balance_loss_mlp": 1.00647724, + "epoch": 0.27933263189538554, + "flos": 62541871073280.0, + "grad_norm": 0.7825148058184128, + "language_loss": 0.62396514, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64426637, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.12109375, + "step": 4646, + "time_per_iteration": 5.810253858566284 + }, + { + "auxiliary_loss_clip": 0.01071375, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.02049387, + "balance_loss_mlp": 1.02366567, + "epoch": 0.2793927551480535, + "flos": 25117214584320.0, + "grad_norm": 1.5719970074648184, + "language_loss": 0.90405607, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.92527944, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4765625, + "step": 4647, + "time_per_iteration": 2.4505696296691895 + }, + { + "auxiliary_loss_clip": 0.0107752, + "auxiliary_loss_mlp": 0.01053217, + "balance_loss_clip": 1.01890898, + "balance_loss_mlp": 1.02595472, + "epoch": 0.27945287840072147, + "flos": 21323517206400.0, + "grad_norm": 1.7811598880785062, + "language_loss": 0.87785876, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89916611, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4648, + "time_per_iteration": 3.850947856903076 + }, + { + "auxiliary_loss_clip": 0.01081056, + "auxiliary_loss_mlp": 0.01045949, + "balance_loss_clip": 1.01259387, + "balance_loss_mlp": 1.02755475, + "epoch": 0.27951300165338944, + "flos": 27449412871680.0, + "grad_norm": 2.1169970092315817, + "language_loss": 0.74778318, + "learning_rate": 3.38169896509385e-06, + "loss": 0.76905322, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.53515625, + "step": 4649, + "time_per_iteration": 2.4620604515075684 + }, + { + "auxiliary_loss_clip": 0.01076113, + "auxiliary_loss_mlp": 0.01053917, + "balance_loss_clip": 1.01696253, + "balance_loss_mlp": 1.02569866, + "epoch": 0.2795731249060574, + "flos": 15158100015360.0, + "grad_norm": 2.2102141439751137, + "language_loss": 0.82275057, + "learning_rate": 3.381417358643549e-06, + "loss": 0.84405077, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.50390625, + "step": 4650, + "time_per_iteration": 2.4198408126831055 + }, + { + "auxiliary_loss_clip": 0.01022799, + "auxiliary_loss_mlp": 0.01004804, + "balance_loss_clip": 1.00075078, + "balance_loss_mlp": 1.01027346, + "epoch": 0.27963324815872537, + "flos": 60116628412800.0, + "grad_norm": 0.8287267065210739, + "language_loss": 0.58853072, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60880673, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.04052734, + "router_z_loss_mlp": 0.125, + "step": 4651, + "time_per_iteration": 3.057788848876953 + }, + { + "auxiliary_loss_clip": 0.01079829, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_clip": 1.01666403, + "balance_loss_mlp": 1.02601457, + "epoch": 0.27969337141139333, + "flos": 21764784280320.0, + "grad_norm": 1.7186968341322653, + "language_loss": 0.75266051, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.77401334, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5390625, + "step": 4652, + "time_per_iteration": 2.447598457336426 + }, + { + "auxiliary_loss_clip": 0.010801, + "auxiliary_loss_mlp": 0.01054507, + "balance_loss_clip": 1.01988852, + "balance_loss_mlp": 1.02685976, + "epoch": 0.27975349466406135, + "flos": 39850038794880.0, + "grad_norm": 2.129726512659265, + "language_loss": 0.80194092, + "learning_rate": 3.380572225034461e-06, + "loss": 0.82328695, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.53125, + "step": 4653, + "time_per_iteration": 2.5436012744903564 + }, + { + "auxiliary_loss_clip": 0.01078584, + "auxiliary_loss_mlp": 0.01055928, + "balance_loss_clip": 1.02076077, + "balance_loss_mlp": 1.02723503, + "epoch": 0.2798136179167293, + "flos": 21578732311680.0, + "grad_norm": 1.97922260636658, + "language_loss": 0.80019343, + "learning_rate": 3.380290409114312e-06, + "loss": 0.82153857, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.51171875, + "step": 4654, + "time_per_iteration": 2.451482057571411 + }, + { + "auxiliary_loss_clip": 0.01080403, + "auxiliary_loss_mlp": 0.01059467, + "balance_loss_clip": 1.02229714, + "balance_loss_mlp": 1.02651644, + "epoch": 0.2798737411693973, + "flos": 21536766990720.0, + "grad_norm": 1.9419464234738197, + "language_loss": 0.82176769, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.84316641, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5390625, + "step": 4655, + "time_per_iteration": 2.3884999752044678 + }, + { + "auxiliary_loss_clip": 0.01078656, + "auxiliary_loss_mlp": 0.0105812, + "balance_loss_clip": 1.02280974, + "balance_loss_mlp": 1.02560842, + "epoch": 0.27993386442206525, + "flos": 26979795907200.0, + "grad_norm": 1.873400977843817, + "language_loss": 0.83283681, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.85420454, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.53125, + "step": 4656, + "time_per_iteration": 2.464714288711548 + }, + { + "auxiliary_loss_clip": 0.01077136, + "auxiliary_loss_mlp": 0.01063631, + "balance_loss_clip": 1.02536488, + "balance_loss_mlp": 1.02458024, + "epoch": 0.2799939876747332, + "flos": 24348811271040.0, + "grad_norm": 1.6471747766486, + "language_loss": 0.83953416, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.86094189, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.52734375, + "step": 4657, + "time_per_iteration": 2.414790153503418 + }, + { + "auxiliary_loss_clip": 0.01075602, + "auxiliary_loss_mlp": 0.01056796, + "balance_loss_clip": 1.02365613, + "balance_loss_mlp": 1.02389479, + "epoch": 0.2800541109274012, + "flos": 33655573486080.0, + "grad_norm": 1.9235403287445256, + "language_loss": 0.65368879, + "learning_rate": 3.379162622133105e-06, + "loss": 0.67501271, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.515625, + "step": 4658, + "time_per_iteration": 2.5225112438201904 + }, + { + "auxiliary_loss_clip": 0.01075384, + "auxiliary_loss_mlp": 0.01063709, + "balance_loss_clip": 1.02675366, + "balance_loss_mlp": 1.02358103, + "epoch": 0.28011423418006914, + "flos": 21613401158400.0, + "grad_norm": 1.7802344930872605, + "language_loss": 0.80216265, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.82355356, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.51953125, + "step": 4659, + "time_per_iteration": 2.4269728660583496 + }, + { + "auxiliary_loss_clip": 0.01076984, + "auxiliary_loss_mlp": 0.0106831, + "balance_loss_clip": 1.03073478, + "balance_loss_mlp": 1.02368283, + "epoch": 0.2801743574327371, + "flos": 23111314663680.0, + "grad_norm": 1.8942537733874574, + "language_loss": 0.81089187, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.83234477, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.53125, + "step": 4660, + "time_per_iteration": 2.402141571044922 + }, + { + "auxiliary_loss_clip": 0.01071538, + "auxiliary_loss_mlp": 0.01055238, + "balance_loss_clip": 1.02202594, + "balance_loss_mlp": 1.02227724, + "epoch": 0.2802344806854051, + "flos": 12640582897920.0, + "grad_norm": 2.1666504065612084, + "language_loss": 0.82090926, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.84217697, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4921875, + "step": 4661, + "time_per_iteration": 2.362389326095581 + }, + { + "auxiliary_loss_clip": 0.01075862, + "auxiliary_loss_mlp": 0.01061799, + "balance_loss_clip": 1.02703714, + "balance_loss_mlp": 1.0249455, + "epoch": 0.28029460393807304, + "flos": 37266395829120.0, + "grad_norm": 1.804585638079182, + "language_loss": 0.7989859, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.82036245, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5078125, + "step": 4662, + "time_per_iteration": 2.528850793838501 + }, + { + "auxiliary_loss_clip": 0.01078547, + "auxiliary_loss_mlp": 0.01061908, + "balance_loss_clip": 1.02099514, + "balance_loss_mlp": 1.02345061, + "epoch": 0.280354727190741, + "flos": 20740048698240.0, + "grad_norm": 1.6966514247449367, + "language_loss": 0.71169984, + "learning_rate": 3.377751711782227e-06, + "loss": 0.73310441, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.55078125, + "step": 4663, + "time_per_iteration": 2.379404067993164 + }, + { + "auxiliary_loss_clip": 0.01077526, + "auxiliary_loss_mlp": 0.01062694, + "balance_loss_clip": 1.02142382, + "balance_loss_mlp": 1.02412999, + "epoch": 0.28041485044340897, + "flos": 21469942915200.0, + "grad_norm": 1.7421386372719467, + "language_loss": 0.80264735, + "learning_rate": 3.377469372935791e-06, + "loss": 0.82404959, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.53515625, + "step": 4664, + "time_per_iteration": 2.440117120742798 + }, + { + "auxiliary_loss_clip": 0.01072514, + "auxiliary_loss_mlp": 0.01060878, + "balance_loss_clip": 1.02432847, + "balance_loss_mlp": 1.02172208, + "epoch": 0.28047497369607693, + "flos": 14793362375040.0, + "grad_norm": 1.7605160689979775, + "language_loss": 0.81202066, + "learning_rate": 3.377186981855578e-06, + "loss": 0.83335459, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5078125, + "step": 4665, + "time_per_iteration": 2.3460710048675537 + }, + { + "auxiliary_loss_clip": 0.01077168, + "auxiliary_loss_mlp": 0.01058519, + "balance_loss_clip": 1.02256548, + "balance_loss_mlp": 1.02541518, + "epoch": 0.2805350969487449, + "flos": 23069768279040.0, + "grad_norm": 1.7531591754793212, + "language_loss": 0.81929469, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.84065151, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.51953125, + "step": 4666, + "time_per_iteration": 2.4438018798828125 + }, + { + "auxiliary_loss_clip": 0.0108217, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.01916528, + "balance_loss_mlp": 1.02743387, + "epoch": 0.2805952202014129, + "flos": 20478968484480.0, + "grad_norm": 2.3256609352915207, + "language_loss": 0.8728075, + "learning_rate": 3.376622043036658e-06, + "loss": 0.89419043, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.546875, + "step": 4667, + "time_per_iteration": 2.460062265396118 + }, + { + "auxiliary_loss_clip": 0.01080352, + "auxiliary_loss_mlp": 0.01064537, + "balance_loss_clip": 1.0255084, + "balance_loss_mlp": 1.02610743, + "epoch": 0.2806553434540809, + "flos": 27416105568000.0, + "grad_norm": 2.1448869192435636, + "language_loss": 0.81216931, + "learning_rate": 3.376339495319373e-06, + "loss": 0.83361816, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.54296875, + "step": 4668, + "time_per_iteration": 2.4775025844573975 + }, + { + "auxiliary_loss_clip": 0.01077652, + "auxiliary_loss_mlp": 0.01057375, + "balance_loss_clip": 1.02092075, + "balance_loss_mlp": 1.0236702, + "epoch": 0.28071546670674885, + "flos": 26503825075200.0, + "grad_norm": 1.3900241586094073, + "language_loss": 0.77166843, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.7930187, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5390625, + "step": 4669, + "time_per_iteration": 2.4621458053588867 + }, + { + "auxiliary_loss_clip": 0.01080603, + "auxiliary_loss_mlp": 0.01064544, + "balance_loss_clip": 1.02837622, + "balance_loss_mlp": 1.02643049, + "epoch": 0.2807755899594168, + "flos": 20557627511040.0, + "grad_norm": 2.3032031758222735, + "language_loss": 0.81161356, + "learning_rate": 3.375774243322725e-06, + "loss": 0.83306503, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.54296875, + "step": 4670, + "time_per_iteration": 2.422586441040039 + }, + { + "auxiliary_loss_clip": 0.01083178, + "auxiliary_loss_mlp": 0.01065899, + "balance_loss_clip": 1.0280149, + "balance_loss_mlp": 1.02788544, + "epoch": 0.2808357132120848, + "flos": 24312257210880.0, + "grad_norm": 2.0140508194992917, + "language_loss": 0.81115055, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.83264136, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5546875, + "step": 4671, + "time_per_iteration": 2.4333016872406006 + }, + { + "auxiliary_loss_clip": 0.01076685, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_clip": 1.01668501, + "balance_loss_mlp": 1.02664101, + "epoch": 0.28089583646475275, + "flos": 26431205713920.0, + "grad_norm": 1.8505935922550074, + "language_loss": 0.76447517, + "learning_rate": 3.37520878264809e-06, + "loss": 0.78574193, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5, + "step": 4672, + "time_per_iteration": 2.4659483432769775 + }, + { + "auxiliary_loss_clip": 0.01084693, + "auxiliary_loss_mlp": 0.01068439, + "balance_loss_clip": 1.02709711, + "balance_loss_mlp": 1.0293107, + "epoch": 0.2809559597174207, + "flos": 23110721170560.0, + "grad_norm": 2.630489905644986, + "language_loss": 0.77324677, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.79477805, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.5546875, + "step": 4673, + "time_per_iteration": 2.3980906009674072 + }, + { + "auxiliary_loss_clip": 0.01077794, + "auxiliary_loss_mlp": 0.01056871, + "balance_loss_clip": 1.01922464, + "balance_loss_mlp": 1.02434015, + "epoch": 0.2810160829700887, + "flos": 20922434974080.0, + "grad_norm": 2.248175506604307, + "language_loss": 0.73431802, + "learning_rate": 3.374643113381237e-06, + "loss": 0.75566465, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.53515625, + "step": 4674, + "time_per_iteration": 2.4272267818450928 + }, + { + "auxiliary_loss_clip": 0.01084078, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_clip": 1.02358818, + "balance_loss_mlp": 1.02846932, + "epoch": 0.28107620622275664, + "flos": 14355027855360.0, + "grad_norm": 2.460728561415191, + "language_loss": 0.79073429, + "learning_rate": 3.374360200552541e-06, + "loss": 0.81219977, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5546875, + "step": 4675, + "time_per_iteration": 2.364560127258301 + }, + { + "auxiliary_loss_clip": 0.01078683, + "auxiliary_loss_mlp": 0.01059344, + "balance_loss_clip": 1.02196026, + "balance_loss_mlp": 1.0246352, + "epoch": 0.2811363294754246, + "flos": 20918140876800.0, + "grad_norm": 2.1454887605451876, + "language_loss": 0.71479356, + "learning_rate": 3.374077235607968e-06, + "loss": 0.73617381, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.54296875, + "step": 4676, + "time_per_iteration": 2.4208877086639404 + }, + { + "auxiliary_loss_clip": 0.01073907, + "auxiliary_loss_mlp": 0.01055509, + "balance_loss_clip": 1.01919782, + "balance_loss_mlp": 1.02456164, + "epoch": 0.28119645272809257, + "flos": 20593797546240.0, + "grad_norm": 1.4984206123770427, + "language_loss": 0.72127086, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.74256504, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.4921875, + "step": 4677, + "time_per_iteration": 2.3764734268188477 + }, + { + "auxiliary_loss_clip": 0.01076891, + "auxiliary_loss_mlp": 0.01055143, + "balance_loss_clip": 1.01910651, + "balance_loss_mlp": 1.02358627, + "epoch": 0.28125657598076054, + "flos": 25336259654400.0, + "grad_norm": 3.0408568614982183, + "language_loss": 0.64009249, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.66141278, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.53125, + "step": 4678, + "time_per_iteration": 2.433130979537964 + }, + { + "auxiliary_loss_clip": 0.01075383, + "auxiliary_loss_mlp": 0.01060978, + "balance_loss_clip": 1.02256835, + "balance_loss_mlp": 1.0226202, + "epoch": 0.2813166992334285, + "flos": 24825934177920.0, + "grad_norm": 1.5747668625149234, + "language_loss": 0.71840012, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.73976374, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.52734375, + "step": 4679, + "time_per_iteration": 2.4161243438720703 + }, + { + "auxiliary_loss_clip": 0.0107562, + "auxiliary_loss_mlp": 0.01059454, + "balance_loss_clip": 1.02180743, + "balance_loss_mlp": 1.02272987, + "epoch": 0.2813768224860965, + "flos": 21759722133120.0, + "grad_norm": 1.9146496257083523, + "language_loss": 0.75607181, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77742255, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.52734375, + "step": 4680, + "time_per_iteration": 2.4204468727111816 + }, + { + "auxiliary_loss_clip": 0.01077649, + "auxiliary_loss_mlp": 0.01055174, + "balance_loss_clip": 1.01895833, + "balance_loss_mlp": 1.02411509, + "epoch": 0.2814369457387645, + "flos": 24315643612800.0, + "grad_norm": 1.5876086403023753, + "language_loss": 0.78479981, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.80612808, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.53515625, + "step": 4681, + "time_per_iteration": 2.4202840328216553 + }, + { + "auxiliary_loss_clip": 0.0107626, + "auxiliary_loss_mlp": 0.01058012, + "balance_loss_clip": 1.01924539, + "balance_loss_mlp": 1.02262545, + "epoch": 0.28149706899143245, + "flos": 18514335657600.0, + "grad_norm": 2.1904869768750346, + "language_loss": 0.76306874, + "learning_rate": 3.372378352108146e-06, + "loss": 0.78441149, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.53515625, + "step": 4682, + "time_per_iteration": 3.7881886959075928 + }, + { + "auxiliary_loss_clip": 0.01074142, + "auxiliary_loss_mlp": 0.01055862, + "balance_loss_clip": 1.01866913, + "balance_loss_mlp": 1.02219307, + "epoch": 0.2815571922441004, + "flos": 24862104213120.0, + "grad_norm": 2.070927810340737, + "language_loss": 0.8209852, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.84228522, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.51953125, + "step": 4683, + "time_per_iteration": 2.4148483276367188 + }, + { + "auxiliary_loss_clip": 0.01076493, + "auxiliary_loss_mlp": 0.01056769, + "balance_loss_clip": 1.02174532, + "balance_loss_mlp": 1.02350736, + "epoch": 0.2816173154967684, + "flos": 19900597034880.0, + "grad_norm": 2.4858029449364287, + "language_loss": 0.77324909, + "learning_rate": 3.371811641167852e-06, + "loss": 0.79458171, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.53125, + "step": 4684, + "time_per_iteration": 2.392758846282959 + }, + { + "auxiliary_loss_clip": 0.01073065, + "auxiliary_loss_mlp": 0.01049314, + "balance_loss_clip": 1.01591182, + "balance_loss_mlp": 1.02213931, + "epoch": 0.28167743874943635, + "flos": 17490437948160.0, + "grad_norm": 1.6853096583050504, + "language_loss": 0.7749542, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.79617798, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5078125, + "step": 4685, + "time_per_iteration": 3.8161308765411377 + }, + { + "auxiliary_loss_clip": 0.01070713, + "auxiliary_loss_mlp": 0.01050004, + "balance_loss_clip": 1.01610076, + "balance_loss_mlp": 1.02125263, + "epoch": 0.2817375620021043, + "flos": 25300927491840.0, + "grad_norm": 1.507850484658547, + "language_loss": 0.77059197, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.79179919, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49414062, + "step": 4686, + "time_per_iteration": 3.965028762817383 + }, + { + "auxiliary_loss_clip": 0.01076811, + "auxiliary_loss_mlp": 0.01059403, + "balance_loss_clip": 1.02063632, + "balance_loss_mlp": 1.02340198, + "epoch": 0.2817976852547723, + "flos": 18692358013440.0, + "grad_norm": 3.042565515878938, + "language_loss": 0.65233904, + "learning_rate": 3.370961184640025e-06, + "loss": 0.67370117, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.53515625, + "step": 4687, + "time_per_iteration": 2.4101359844207764 + }, + { + "auxiliary_loss_clip": 0.01077388, + "auxiliary_loss_mlp": 0.01060127, + "balance_loss_clip": 1.0236485, + "balance_loss_mlp": 1.02533913, + "epoch": 0.28185780850744024, + "flos": 22741305408000.0, + "grad_norm": 2.5664833465295387, + "language_loss": 0.78177023, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.80314541, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51953125, + "step": 4688, + "time_per_iteration": 3.8244881629943848 + }, + { + "auxiliary_loss_clip": 0.01074161, + "auxiliary_loss_mlp": 0.01053045, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.02339292, + "epoch": 0.2819179317601082, + "flos": 14933189836800.0, + "grad_norm": 1.9077793850065246, + "language_loss": 0.80201823, + "learning_rate": 3.37039395366863e-06, + "loss": 0.82329029, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5078125, + "step": 4689, + "time_per_iteration": 2.3489952087402344 + }, + { + "auxiliary_loss_clip": 0.01075698, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_clip": 1.01372695, + "balance_loss_mlp": 1.02441537, + "epoch": 0.2819780550127762, + "flos": 23144307765120.0, + "grad_norm": 1.6093650302615685, + "language_loss": 0.78959715, + "learning_rate": 3.37011026022934e-06, + "loss": 0.81083977, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51171875, + "step": 4690, + "time_per_iteration": 2.4115638732910156 + }, + { + "auxiliary_loss_clip": 0.01075653, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.01641369, + "balance_loss_mlp": 1.02306056, + "epoch": 0.28203817826544414, + "flos": 21615286371840.0, + "grad_norm": 3.3055515073188517, + "language_loss": 0.88851738, + "learning_rate": 3.369826514835332e-06, + "loss": 0.90977836, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5234375, + "step": 4691, + "time_per_iteration": 2.384486198425293 + }, + { + "auxiliary_loss_clip": 0.01079977, + "auxiliary_loss_mlp": 0.01060821, + "balance_loss_clip": 1.02706087, + "balance_loss_mlp": 1.02653646, + "epoch": 0.2820983015181121, + "flos": 24025585104000.0, + "grad_norm": 1.7103776242309447, + "language_loss": 0.83197814, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.85338604, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.53125, + "step": 4692, + "time_per_iteration": 2.4489409923553467 + }, + { + "auxiliary_loss_clip": 0.01076642, + "auxiliary_loss_mlp": 0.01060059, + "balance_loss_clip": 1.02350938, + "balance_loss_mlp": 1.02468419, + "epoch": 0.2821584247707801, + "flos": 30006626071680.0, + "grad_norm": 1.5725828492621323, + "language_loss": 0.75767666, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.77904367, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.51953125, + "step": 4693, + "time_per_iteration": 2.4895682334899902 + }, + { + "auxiliary_loss_clip": 0.01075775, + "auxiliary_loss_mlp": 0.01049423, + "balance_loss_clip": 1.01456583, + "balance_loss_mlp": 1.0232619, + "epoch": 0.2822185480234481, + "flos": 21395717631360.0, + "grad_norm": 1.7299452960929549, + "language_loss": 0.79036152, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.8116135, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5234375, + "step": 4694, + "time_per_iteration": 2.4006242752075195 + }, + { + "auxiliary_loss_clip": 0.010745, + "auxiliary_loss_mlp": 0.01049439, + "balance_loss_clip": 1.0169183, + "balance_loss_mlp": 1.02518725, + "epoch": 0.28227867127611606, + "flos": 27451612287360.0, + "grad_norm": 1.7353932559278333, + "language_loss": 0.6827755, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.7040149, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.49414062, + "step": 4695, + "time_per_iteration": 2.4656476974487305 + }, + { + "auxiliary_loss_clip": 0.01077869, + "auxiliary_loss_mlp": 0.01058895, + "balance_loss_clip": 1.02277446, + "balance_loss_mlp": 1.0245676, + "epoch": 0.282338794528784, + "flos": 22592854840320.0, + "grad_norm": 2.08248830040284, + "language_loss": 0.76893044, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.79029804, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.53125, + "step": 4696, + "time_per_iteration": 2.378814220428467 + }, + { + "auxiliary_loss_clip": 0.01078481, + "auxiliary_loss_mlp": 0.01055785, + "balance_loss_clip": 1.02076149, + "balance_loss_mlp": 1.02664435, + "epoch": 0.282398917781452, + "flos": 42009311784960.0, + "grad_norm": 1.8841364001543182, + "language_loss": 0.6343829, + "learning_rate": 3.368122952024877e-06, + "loss": 0.6557256, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.515625, + "step": 4697, + "time_per_iteration": 2.596346855163574 + }, + { + "auxiliary_loss_clip": 0.01070532, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_clip": 1.01797473, + "balance_loss_mlp": 1.02230275, + "epoch": 0.28245904103411995, + "flos": 23223525373440.0, + "grad_norm": 1.606275432149893, + "language_loss": 0.74177808, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.76296085, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.48242188, + "step": 4698, + "time_per_iteration": 2.4619977474212646 + }, + { + "auxiliary_loss_clip": 0.0107196, + "auxiliary_loss_mlp": 0.01051307, + "balance_loss_clip": 1.02086139, + "balance_loss_mlp": 1.02303231, + "epoch": 0.2825191642867879, + "flos": 25373442119040.0, + "grad_norm": 3.3876386134487246, + "language_loss": 0.77114475, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.79237747, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.48828125, + "step": 4699, + "time_per_iteration": 2.459455728530884 + }, + { + "auxiliary_loss_clip": 0.01076197, + "auxiliary_loss_mlp": 0.01055288, + "balance_loss_clip": 1.01797545, + "balance_loss_mlp": 1.02395022, + "epoch": 0.2825792875394559, + "flos": 17235886158720.0, + "grad_norm": 6.897366929514765, + "language_loss": 0.83459896, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.85591388, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5234375, + "step": 4700, + "time_per_iteration": 2.3565571308135986 + }, + { + "auxiliary_loss_clip": 0.01068034, + "auxiliary_loss_mlp": 0.01045582, + "balance_loss_clip": 1.01687622, + "balance_loss_mlp": 1.02163625, + "epoch": 0.28263941079212385, + "flos": 26722765411200.0, + "grad_norm": 1.726302704720677, + "language_loss": 0.82934475, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.85048085, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.46484375, + "step": 4701, + "time_per_iteration": 2.4480061531066895 + }, + { + "auxiliary_loss_clip": 0.01071317, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.01810932, + "balance_loss_mlp": 1.02115655, + "epoch": 0.2826995340447918, + "flos": 25920147098880.0, + "grad_norm": 2.3050024689287363, + "language_loss": 0.75630534, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.77752, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.5, + "step": 4702, + "time_per_iteration": 2.4229376316070557 + }, + { + "auxiliary_loss_clip": 0.01070087, + "auxiliary_loss_mlp": 0.01047948, + "balance_loss_clip": 1.0168817, + "balance_loss_mlp": 1.02223134, + "epoch": 0.2827596572974598, + "flos": 22378697360640.0, + "grad_norm": 1.8226248368435236, + "language_loss": 0.79917049, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.82035077, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.47851562, + "step": 4703, + "time_per_iteration": 2.439295530319214 + }, + { + "auxiliary_loss_clip": 0.01070917, + "auxiliary_loss_mlp": 0.0105216, + "balance_loss_clip": 1.01980639, + "balance_loss_mlp": 1.02139246, + "epoch": 0.28281978055012774, + "flos": 33545736748800.0, + "grad_norm": 1.8523219666451411, + "language_loss": 0.70716125, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.728392, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.49609375, + "step": 4704, + "time_per_iteration": 2.487262725830078 + }, + { + "auxiliary_loss_clip": 0.01071736, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.02359414, + "balance_loss_mlp": 1.02353096, + "epoch": 0.2828799038027957, + "flos": 23439742623360.0, + "grad_norm": 2.825825832611166, + "language_loss": 0.71874326, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.74002647, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.48242188, + "step": 4705, + "time_per_iteration": 2.449760675430298 + }, + { + "auxiliary_loss_clip": 0.01018686, + "auxiliary_loss_mlp": 0.01008896, + "balance_loss_clip": 1.00462878, + "balance_loss_mlp": 1.00686038, + "epoch": 0.2829400270554637, + "flos": 69870629410560.0, + "grad_norm": 0.7216904597050283, + "language_loss": 0.59359223, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61386806, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.11816406, + "step": 4706, + "time_per_iteration": 3.1052396297454834 + }, + { + "auxiliary_loss_clip": 0.0106837, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_clip": 1.0167532, + "balance_loss_mlp": 1.02169228, + "epoch": 0.2830001503081317, + "flos": 24787913840640.0, + "grad_norm": 1.5544511839354211, + "language_loss": 0.82715386, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84828091, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.46875, + "step": 4707, + "time_per_iteration": 2.4354686737060547 + }, + { + "auxiliary_loss_clip": 0.0107426, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.01578188, + "balance_loss_mlp": 1.02262449, + "epoch": 0.28306027356079966, + "flos": 27668248473600.0, + "grad_norm": 1.8709330592639417, + "language_loss": 0.81738544, + "learning_rate": 3.36499490449902e-06, + "loss": 0.83862662, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.515625, + "step": 4708, + "time_per_iteration": 2.444035053253174 + }, + { + "auxiliary_loss_clip": 0.01020949, + "auxiliary_loss_mlp": 0.01025797, + "balance_loss_clip": 1.02145791, + "balance_loss_mlp": 1.00918674, + "epoch": 0.2831203968134676, + "flos": 60525288276480.0, + "grad_norm": 0.9020369355131317, + "language_loss": 0.62869424, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.6491617, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.04345703, + "router_z_loss_mlp": 0.1171875, + "step": 4709, + "time_per_iteration": 2.939412832260132 + }, + { + "auxiliary_loss_clip": 0.01074097, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_clip": 1.01565623, + "balance_loss_mlp": 1.02544761, + "epoch": 0.2831805200661356, + "flos": 22053690714240.0, + "grad_norm": 1.479323472615513, + "language_loss": 0.75264448, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.77385437, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.48632812, + "step": 4710, + "time_per_iteration": 2.4848527908325195 + }, + { + "auxiliary_loss_clip": 0.01076303, + "auxiliary_loss_mlp": 0.01060904, + "balance_loss_clip": 1.02585673, + "balance_loss_mlp": 1.02395868, + "epoch": 0.28324064331880355, + "flos": 22599592732800.0, + "grad_norm": 1.8142361819399593, + "language_loss": 0.81385469, + "learning_rate": 3.364140713048579e-06, + "loss": 0.83522677, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5234375, + "step": 4711, + "time_per_iteration": 2.405076265335083 + }, + { + "auxiliary_loss_clip": 0.01076509, + "auxiliary_loss_mlp": 0.01052689, + "balance_loss_clip": 1.02117026, + "balance_loss_mlp": 1.02468729, + "epoch": 0.2833007665714715, + "flos": 30402960359040.0, + "grad_norm": 2.1213666931100867, + "language_loss": 0.72170973, + "learning_rate": 3.363855879093996e-06, + "loss": 0.7430017, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.515625, + "step": 4712, + "time_per_iteration": 2.4853057861328125 + }, + { + "auxiliary_loss_clip": 0.01075558, + "auxiliary_loss_mlp": 0.01062017, + "balance_loss_clip": 1.02851939, + "balance_loss_mlp": 1.0246048, + "epoch": 0.2833608898241395, + "flos": 23548392374400.0, + "grad_norm": 1.7952975578431338, + "language_loss": 0.83828217, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.85965788, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5078125, + "step": 4713, + "time_per_iteration": 2.4018986225128174 + }, + { + "auxiliary_loss_clip": 0.01075767, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.02492702, + "balance_loss_mlp": 1.0261538, + "epoch": 0.28342101307680745, + "flos": 20265683788800.0, + "grad_norm": 1.758811263825591, + "language_loss": 0.77426279, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.79558754, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.49609375, + "step": 4714, + "time_per_iteration": 2.4213197231292725 + }, + { + "auxiliary_loss_clip": 0.0107499, + "auxiliary_loss_mlp": 0.01061152, + "balance_loss_clip": 1.03056252, + "balance_loss_mlp": 1.02551889, + "epoch": 0.2834811363294754, + "flos": 30845728621440.0, + "grad_norm": 1.4416170518889595, + "language_loss": 0.79435378, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.81571519, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.49609375, + "step": 4715, + "time_per_iteration": 2.513606309890747 + }, + { + "auxiliary_loss_clip": 0.01074895, + "auxiliary_loss_mlp": 0.01059316, + "balance_loss_clip": 1.02572334, + "balance_loss_mlp": 1.02502453, + "epoch": 0.2835412595821434, + "flos": 22709918229120.0, + "grad_norm": 1.7031063393484447, + "language_loss": 0.74794328, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.76928544, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5, + "step": 4716, + "time_per_iteration": 2.461986780166626 + }, + { + "auxiliary_loss_clip": 0.01076971, + "auxiliary_loss_mlp": 0.01069454, + "balance_loss_clip": 1.03206968, + "balance_loss_mlp": 1.02255106, + "epoch": 0.28360138283481134, + "flos": 18076734276480.0, + "grad_norm": 2.2729664851207403, + "language_loss": 0.76687682, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.78834105, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.546875, + "step": 4717, + "time_per_iteration": 2.3891706466674805 + }, + { + "auxiliary_loss_clip": 0.01075213, + "auxiliary_loss_mlp": 0.01064477, + "balance_loss_clip": 1.02954876, + "balance_loss_mlp": 1.02345192, + "epoch": 0.2836615060874793, + "flos": 17853918779520.0, + "grad_norm": 1.5566477876659615, + "language_loss": 0.68559635, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.70699322, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.51953125, + "step": 4718, + "time_per_iteration": 2.397733688354492 + }, + { + "auxiliary_loss_clip": 0.01076139, + "auxiliary_loss_mlp": 0.01061572, + "balance_loss_clip": 1.02700162, + "balance_loss_mlp": 1.02358508, + "epoch": 0.2837216293401473, + "flos": 25739087454720.0, + "grad_norm": 1.6433678478870297, + "language_loss": 0.74389148, + "learning_rate": 3.361860593925566e-06, + "loss": 0.76526862, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5234375, + "step": 4719, + "time_per_iteration": 2.443596124649048 + }, + { + "auxiliary_loss_clip": 0.01072691, + "auxiliary_loss_mlp": 0.01057427, + "balance_loss_clip": 1.02271295, + "balance_loss_mlp": 1.02343249, + "epoch": 0.2837817525928153, + "flos": 20922469885440.0, + "grad_norm": 1.815107214020504, + "language_loss": 0.81781822, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.83911943, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.4921875, + "step": 4720, + "time_per_iteration": 2.3948445320129395 + }, + { + "auxiliary_loss_clip": 0.01075249, + "auxiliary_loss_mlp": 0.01057191, + "balance_loss_clip": 1.01851988, + "balance_loss_mlp": 1.02322078, + "epoch": 0.28384187584548326, + "flos": 18915697180800.0, + "grad_norm": 1.822250441707057, + "language_loss": 0.8079623, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.82928669, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.51953125, + "step": 4721, + "time_per_iteration": 2.4417712688446045 + }, + { + "auxiliary_loss_clip": 0.01071824, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_clip": 1.01863551, + "balance_loss_mlp": 1.0215528, + "epoch": 0.2839019990981512, + "flos": 27342753068160.0, + "grad_norm": 1.9615977759241718, + "language_loss": 0.84351826, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.86474878, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.50390625, + "step": 4722, + "time_per_iteration": 3.964268684387207 + }, + { + "auxiliary_loss_clip": 0.01073584, + "auxiliary_loss_mlp": 0.0104903, + "balance_loss_clip": 1.01631904, + "balance_loss_mlp": 1.02306306, + "epoch": 0.2839621223508192, + "flos": 18113323248000.0, + "grad_norm": 1.8690806976543404, + "language_loss": 0.71081311, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.73203921, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.50390625, + "step": 4723, + "time_per_iteration": 2.400887966156006 + }, + { + "auxiliary_loss_clip": 0.01075444, + "auxiliary_loss_mlp": 0.01052084, + "balance_loss_clip": 1.01558232, + "balance_loss_mlp": 1.02365446, + "epoch": 0.28402224560348716, + "flos": 26357189898240.0, + "grad_norm": 1.5831876150218114, + "language_loss": 0.79948676, + "learning_rate": 3.360433840760998e-06, + "loss": 0.82076204, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.51953125, + "step": 4724, + "time_per_iteration": 3.8484649658203125 + }, + { + "auxiliary_loss_clip": 0.01076815, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.02014482, + "balance_loss_mlp": 1.02459276, + "epoch": 0.2840823688561551, + "flos": 24059660457600.0, + "grad_norm": 1.8255867507188768, + "language_loss": 0.93966043, + "learning_rate": 3.36014833532143e-06, + "loss": 0.96098959, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 4725, + "time_per_iteration": 2.411259651184082 + }, + { + "auxiliary_loss_clip": 0.01077186, + "auxiliary_loss_mlp": 0.01058698, + "balance_loss_clip": 1.02140963, + "balance_loss_mlp": 1.02478433, + "epoch": 0.2841424921088231, + "flos": 29458559548800.0, + "grad_norm": 2.3997252007971213, + "language_loss": 0.90161264, + "learning_rate": 3.3598627783049e-06, + "loss": 0.92297149, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5234375, + "step": 4726, + "time_per_iteration": 3.947939872741699 + }, + { + "auxiliary_loss_clip": 0.01077161, + "auxiliary_loss_mlp": 0.01059857, + "balance_loss_clip": 1.02523911, + "balance_loss_mlp": 1.02562833, + "epoch": 0.28420261536149105, + "flos": 48098688301440.0, + "grad_norm": 2.0506136916434885, + "language_loss": 0.80552876, + "learning_rate": 3.359577169722238e-06, + "loss": 0.82689893, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.515625, + "step": 4727, + "time_per_iteration": 4.058341026306152 + }, + { + "auxiliary_loss_clip": 0.01073043, + "auxiliary_loss_mlp": 0.01049946, + "balance_loss_clip": 1.01835573, + "balance_loss_mlp": 1.02348638, + "epoch": 0.284262738614159, + "flos": 25664966904960.0, + "grad_norm": 3.6178745915667476, + "language_loss": 0.68747115, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.70870101, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.49609375, + "step": 4728, + "time_per_iteration": 2.4283571243286133 + }, + { + "auxiliary_loss_clip": 0.01074001, + "auxiliary_loss_mlp": 0.01059988, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.02337551, + "epoch": 0.284322861866827, + "flos": 19717966379520.0, + "grad_norm": 3.4989805553700246, + "language_loss": 0.78457457, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.8059144, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.50390625, + "step": 4729, + "time_per_iteration": 2.406920909881592 + }, + { + "auxiliary_loss_clip": 0.01078605, + "auxiliary_loss_mlp": 0.0105751, + "balance_loss_clip": 1.01995945, + "balance_loss_mlp": 1.02596056, + "epoch": 0.28438298511949495, + "flos": 23914107532800.0, + "grad_norm": 1.793091625493373, + "language_loss": 0.67934281, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.70070398, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.52734375, + "step": 4730, + "time_per_iteration": 2.452794313430786 + }, + { + "auxiliary_loss_clip": 0.01077291, + "auxiliary_loss_mlp": 0.01052987, + "balance_loss_clip": 1.01684284, + "balance_loss_mlp": 1.02569306, + "epoch": 0.2844431083721629, + "flos": 26066153871360.0, + "grad_norm": 1.6758702146814628, + "language_loss": 0.76265371, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.78395653, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.515625, + "step": 4731, + "time_per_iteration": 2.4511759281158447 + }, + { + "auxiliary_loss_clip": 0.01074882, + "auxiliary_loss_mlp": 0.01056428, + "balance_loss_clip": 1.02290678, + "balance_loss_mlp": 1.02397084, + "epoch": 0.2845032316248309, + "flos": 25809297932160.0, + "grad_norm": 1.491089932463519, + "language_loss": 0.84510922, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.8664223, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5078125, + "step": 4732, + "time_per_iteration": 2.466252326965332 + }, + { + "auxiliary_loss_clip": 0.01077078, + "auxiliary_loss_mlp": 0.01064418, + "balance_loss_clip": 1.0260092, + "balance_loss_mlp": 1.02444088, + "epoch": 0.2845633548774989, + "flos": 19822322033280.0, + "grad_norm": 1.6431155688000156, + "language_loss": 0.80677259, + "learning_rate": 3.357862435944109e-06, + "loss": 0.82818758, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.52734375, + "step": 4733, + "time_per_iteration": 2.3997511863708496 + }, + { + "auxiliary_loss_clip": 0.01079883, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.0244031, + "balance_loss_mlp": 1.02512479, + "epoch": 0.28462347813016686, + "flos": 23181769520640.0, + "grad_norm": 2.427114045750969, + "language_loss": 0.73544776, + "learning_rate": 3.357576466701875e-06, + "loss": 0.7569052, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.546875, + "step": 4734, + "time_per_iteration": 2.4254510402679443 + }, + { + "auxiliary_loss_clip": 0.01072561, + "auxiliary_loss_mlp": 0.01053659, + "balance_loss_clip": 1.01837289, + "balance_loss_mlp": 1.02162051, + "epoch": 0.2846836013828348, + "flos": 18659504557440.0, + "grad_norm": 3.440772847853321, + "language_loss": 0.75363761, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.77489984, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5078125, + "step": 4735, + "time_per_iteration": 2.3891022205352783 + }, + { + "auxiliary_loss_clip": 0.01073673, + "auxiliary_loss_mlp": 0.01061956, + "balance_loss_clip": 1.02881598, + "balance_loss_mlp": 1.02296472, + "epoch": 0.2847437246355028, + "flos": 14172641579520.0, + "grad_norm": 2.3785281303374175, + "language_loss": 0.81087649, + "learning_rate": 3.357004373789946e-06, + "loss": 0.83223271, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5078125, + "step": 4736, + "time_per_iteration": 2.383725881576538 + }, + { + "auxiliary_loss_clip": 0.01076747, + "auxiliary_loss_mlp": 0.01058753, + "balance_loss_clip": 1.02034414, + "balance_loss_mlp": 1.02455556, + "epoch": 0.28480384788817076, + "flos": 29277080968320.0, + "grad_norm": 1.9269392644904917, + "language_loss": 0.61018932, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.63154429, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5234375, + "step": 4737, + "time_per_iteration": 2.4753646850585938 + }, + { + "auxiliary_loss_clip": 0.01071016, + "auxiliary_loss_mlp": 0.01062245, + "balance_loss_clip": 1.02579129, + "balance_loss_mlp": 1.02085793, + "epoch": 0.2848639711408387, + "flos": 22600221137280.0, + "grad_norm": 2.0520696799436013, + "language_loss": 0.87928224, + "learning_rate": 3.356432075047052e-06, + "loss": 0.90061492, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5, + "step": 4738, + "time_per_iteration": 2.4211652278900146 + }, + { + "auxiliary_loss_clip": 0.01078696, + "auxiliary_loss_mlp": 0.01069069, + "balance_loss_clip": 1.02820432, + "balance_loss_mlp": 1.02354622, + "epoch": 0.2849240943935067, + "flos": 17598598940160.0, + "grad_norm": 2.0235511349570143, + "language_loss": 0.91971934, + "learning_rate": 3.356145848516118e-06, + "loss": 0.94119704, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.55078125, + "step": 4739, + "time_per_iteration": 2.378560781478882 + }, + { + "auxiliary_loss_clip": 0.01075701, + "auxiliary_loss_mlp": 0.0105085, + "balance_loss_clip": 1.01756728, + "balance_loss_mlp": 1.0239377, + "epoch": 0.28498421764617465, + "flos": 24861440897280.0, + "grad_norm": 1.6689619181131328, + "language_loss": 0.72650218, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74776763, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.51953125, + "step": 4740, + "time_per_iteration": 2.4405720233917236 + }, + { + "auxiliary_loss_clip": 0.01072659, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.01111579, + "balance_loss_mlp": 1.02291703, + "epoch": 0.2850443408988426, + "flos": 22781490249600.0, + "grad_norm": 1.509469855394795, + "language_loss": 0.79439133, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.81558764, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.49609375, + "step": 4741, + "time_per_iteration": 2.420545816421509 + }, + { + "auxiliary_loss_clip": 0.01075667, + "auxiliary_loss_mlp": 0.01057342, + "balance_loss_clip": 1.01816988, + "balance_loss_mlp": 1.02132845, + "epoch": 0.2851044641515106, + "flos": 18843042908160.0, + "grad_norm": 2.049237337244661, + "language_loss": 0.77733159, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.79866171, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.54296875, + "step": 4742, + "time_per_iteration": 2.395918130874634 + }, + { + "auxiliary_loss_clip": 0.01079579, + "auxiliary_loss_mlp": 0.01065705, + "balance_loss_clip": 1.02529359, + "balance_loss_mlp": 1.0236094, + "epoch": 0.28516458740417855, + "flos": 18879492234240.0, + "grad_norm": 2.182973752867177, + "language_loss": 0.5945363, + "learning_rate": 3.355000428249086e-06, + "loss": 0.61598915, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.55859375, + "step": 4743, + "time_per_iteration": 2.3966941833496094 + }, + { + "auxiliary_loss_clip": 0.01079592, + "auxiliary_loss_mlp": 0.01058454, + "balance_loss_clip": 1.01983023, + "balance_loss_mlp": 1.02499247, + "epoch": 0.2852247106568465, + "flos": 25298693164800.0, + "grad_norm": 1.816825386539203, + "language_loss": 0.75601912, + "learning_rate": 3.354713944700797e-06, + "loss": 0.77739966, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.546875, + "step": 4744, + "time_per_iteration": 2.428297996520996 + }, + { + "auxiliary_loss_clip": 0.01074194, + "auxiliary_loss_mlp": 0.01047269, + "balance_loss_clip": 1.01322305, + "balance_loss_mlp": 1.02283525, + "epoch": 0.2852848339095145, + "flos": 11654600791680.0, + "grad_norm": 2.3758193824985763, + "language_loss": 0.78478295, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.80599761, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51171875, + "step": 4745, + "time_per_iteration": 2.378159523010254 + }, + { + "auxiliary_loss_clip": 0.01072266, + "auxiliary_loss_mlp": 0.01056973, + "balance_loss_clip": 1.02221107, + "balance_loss_mlp": 1.02376986, + "epoch": 0.2853449571621825, + "flos": 12932386974720.0, + "grad_norm": 1.8563503100045373, + "language_loss": 0.83576071, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.8570531, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48632812, + "step": 4746, + "time_per_iteration": 2.3764760494232178 + }, + { + "auxiliary_loss_clip": 0.01077584, + "auxiliary_loss_mlp": 0.01060662, + "balance_loss_clip": 1.01982141, + "balance_loss_mlp": 1.02247214, + "epoch": 0.28540508041485046, + "flos": 20009560988160.0, + "grad_norm": 1.6407601435637755, + "language_loss": 0.81606013, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.83744264, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.55078125, + "step": 4747, + "time_per_iteration": 2.419158697128296 + }, + { + "auxiliary_loss_clip": 0.01026891, + "auxiliary_loss_mlp": 0.01027562, + "balance_loss_clip": 1.02274549, + "balance_loss_mlp": 1.01432717, + "epoch": 0.28546520366751843, + "flos": 68135864175360.0, + "grad_norm": 0.7951255599807986, + "language_loss": 0.60512269, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62566721, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.0480957, + "router_z_loss_mlp": 0.125, + "step": 4748, + "time_per_iteration": 3.037266731262207 + }, + { + "auxiliary_loss_clip": 0.01073853, + "auxiliary_loss_mlp": 0.01057747, + "balance_loss_clip": 1.01855099, + "balance_loss_mlp": 1.02143073, + "epoch": 0.2855253269201864, + "flos": 13250969930880.0, + "grad_norm": 2.948606844728039, + "language_loss": 0.82145447, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.84277046, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.5234375, + "step": 4749, + "time_per_iteration": 2.367530107498169 + }, + { + "auxiliary_loss_clip": 0.010736, + "auxiliary_loss_mlp": 0.01056597, + "balance_loss_clip": 1.02169299, + "balance_loss_mlp": 1.02161503, + "epoch": 0.28558545017285436, + "flos": 28619631555840.0, + "grad_norm": 1.9824025116408432, + "language_loss": 0.7258451, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.74714708, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.51953125, + "step": 4750, + "time_per_iteration": 2.465240716934204 + }, + { + "auxiliary_loss_clip": 0.01072928, + "auxiliary_loss_mlp": 0.01065172, + "balance_loss_clip": 1.02657199, + "balance_loss_mlp": 1.02192104, + "epoch": 0.2856455734255223, + "flos": 34129065611520.0, + "grad_norm": 1.675595733420184, + "language_loss": 0.83028674, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.85166776, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.51171875, + "step": 4751, + "time_per_iteration": 2.5410549640655518 + }, + { + "auxiliary_loss_clip": 0.01074466, + "auxiliary_loss_mlp": 0.01057854, + "balance_loss_clip": 1.02104175, + "balance_loss_mlp": 1.02259719, + "epoch": 0.2857056966781903, + "flos": 39784576262400.0, + "grad_norm": 2.022613939167455, + "language_loss": 0.8128733, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.83419651, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.51953125, + "step": 4752, + "time_per_iteration": 2.553499937057495 + }, + { + "auxiliary_loss_clip": 0.01073005, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.0183171, + "balance_loss_mlp": 1.02098489, + "epoch": 0.28576581993085826, + "flos": 21871199704320.0, + "grad_norm": 1.751244306578235, + "language_loss": 0.80196536, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.82324719, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5234375, + "step": 4753, + "time_per_iteration": 2.41487193107605 + }, + { + "auxiliary_loss_clip": 0.01077244, + "auxiliary_loss_mlp": 0.0106178, + "balance_loss_clip": 1.0207243, + "balance_loss_mlp": 1.02284491, + "epoch": 0.2858259431835262, + "flos": 19090856805120.0, + "grad_norm": 2.576943761525006, + "language_loss": 0.9171409, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.93853116, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 0.546875, + "step": 4754, + "time_per_iteration": 2.3717336654663086 + }, + { + "auxiliary_loss_clip": 0.01073195, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_clip": 1.01845288, + "balance_loss_mlp": 1.02294278, + "epoch": 0.2858860664361942, + "flos": 20333415559680.0, + "grad_norm": 1.663027878056912, + "language_loss": 0.82831073, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84958547, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5, + "step": 4755, + "time_per_iteration": 2.4186153411865234 + }, + { + "auxiliary_loss_clip": 0.01074341, + "auxiliary_loss_mlp": 0.01054076, + "balance_loss_clip": 1.01733589, + "balance_loss_mlp": 1.0219245, + "epoch": 0.28594618968886215, + "flos": 24460603044480.0, + "grad_norm": 1.5397807391454041, + "language_loss": 0.85035586, + "learning_rate": 3.351272138300922e-06, + "loss": 0.87164009, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5234375, + "step": 4756, + "time_per_iteration": 2.4371719360351562 + }, + { + "auxiliary_loss_clip": 0.01017336, + "auxiliary_loss_mlp": 0.01018424, + "balance_loss_clip": 1.01468062, + "balance_loss_mlp": 1.00537157, + "epoch": 0.2860063129415301, + "flos": 71648964023040.0, + "grad_norm": 0.8796171416364948, + "language_loss": 0.61102045, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63137805, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.03735352, + "router_z_loss_mlp": 0.11914062, + "step": 4757, + "time_per_iteration": 3.175727367401123 + }, + { + "auxiliary_loss_clip": 0.01074646, + "auxiliary_loss_mlp": 0.01055235, + "balance_loss_clip": 1.02290595, + "balance_loss_mlp": 1.02427244, + "epoch": 0.2860664361941981, + "flos": 20557627511040.0, + "grad_norm": 1.8601619301397203, + "language_loss": 0.66947848, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.6907773, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.50390625, + "step": 4758, + "time_per_iteration": 2.392843246459961 + }, + { + "auxiliary_loss_clip": 0.01075873, + "auxiliary_loss_mlp": 0.01065197, + "balance_loss_clip": 1.02678728, + "balance_loss_mlp": 1.0234673, + "epoch": 0.2861265594468661, + "flos": 35994788956800.0, + "grad_norm": 1.6247413411596905, + "language_loss": 0.64364958, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.66506028, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5234375, + "step": 4759, + "time_per_iteration": 2.542724132537842 + }, + { + "auxiliary_loss_clip": 0.010754, + "auxiliary_loss_mlp": 0.01057911, + "balance_loss_clip": 1.02033639, + "balance_loss_mlp": 1.02364874, + "epoch": 0.28618668269953407, + "flos": 20046394339200.0, + "grad_norm": 1.732512556786511, + "language_loss": 0.75493592, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.77626902, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.515625, + "step": 4760, + "time_per_iteration": 2.414698600769043 + }, + { + "auxiliary_loss_clip": 0.01073642, + "auxiliary_loss_mlp": 0.01055705, + "balance_loss_clip": 1.02580714, + "balance_loss_mlp": 1.025208, + "epoch": 0.28624680595220203, + "flos": 24970719052800.0, + "grad_norm": 2.508640551028646, + "language_loss": 0.74067581, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.76196933, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.484375, + "step": 4761, + "time_per_iteration": 3.996635913848877 + }, + { + "auxiliary_loss_clip": 0.01075039, + "auxiliary_loss_mlp": 0.0105621, + "balance_loss_clip": 1.02259266, + "balance_loss_mlp": 1.02321172, + "epoch": 0.28630692920487, + "flos": 22491152449920.0, + "grad_norm": 2.191047007432471, + "language_loss": 0.76072901, + "learning_rate": 3.349548466945793e-06, + "loss": 0.78204149, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51953125, + "step": 4762, + "time_per_iteration": 2.4520647525787354 + }, + { + "auxiliary_loss_clip": 0.01073848, + "auxiliary_loss_mlp": 0.01059448, + "balance_loss_clip": 1.02556872, + "balance_loss_mlp": 1.02453709, + "epoch": 0.28636705245753796, + "flos": 21248872986240.0, + "grad_norm": 1.4318592317331356, + "language_loss": 0.76385844, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78519142, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4921875, + "step": 4763, + "time_per_iteration": 3.7698569297790527 + }, + { + "auxiliary_loss_clip": 0.01073399, + "auxiliary_loss_mlp": 0.01064028, + "balance_loss_clip": 1.02714455, + "balance_loss_mlp": 1.02215338, + "epoch": 0.28642717571020593, + "flos": 24094678417920.0, + "grad_norm": 1.621004060134999, + "language_loss": 0.78813422, + "learning_rate": 3.348973500311086e-06, + "loss": 0.8095085, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.51171875, + "step": 4764, + "time_per_iteration": 2.465522527694702 + }, + { + "auxiliary_loss_clip": 0.01075302, + "auxiliary_loss_mlp": 0.01058303, + "balance_loss_clip": 1.01812911, + "balance_loss_mlp": 1.02322674, + "epoch": 0.2864872989628739, + "flos": 22600290960000.0, + "grad_norm": 1.8653646912986528, + "language_loss": 0.73012084, + "learning_rate": 3.348685940258466e-06, + "loss": 0.75145686, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5234375, + "step": 4765, + "time_per_iteration": 3.9146671295166016 + }, + { + "auxiliary_loss_clip": 0.01070442, + "auxiliary_loss_mlp": 0.01051525, + "balance_loss_clip": 1.01964903, + "balance_loss_mlp": 1.0215838, + "epoch": 0.28654742221554186, + "flos": 32743677018240.0, + "grad_norm": 2.0041492775482013, + "language_loss": 0.76617008, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78738976, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.48828125, + "step": 4766, + "time_per_iteration": 4.07109522819519 + }, + { + "auxiliary_loss_clip": 0.01071233, + "auxiliary_loss_mlp": 0.01052172, + "balance_loss_clip": 1.01776814, + "balance_loss_mlp": 1.02217388, + "epoch": 0.2866075454682098, + "flos": 26980354488960.0, + "grad_norm": 1.5847343336297581, + "language_loss": 0.79499912, + "learning_rate": 3.348110666737214e-06, + "loss": 0.81623316, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49023438, + "step": 4767, + "time_per_iteration": 2.4637036323547363 + }, + { + "auxiliary_loss_clip": 0.0107299, + "auxiliary_loss_mlp": 0.01053293, + "balance_loss_clip": 1.02010477, + "balance_loss_mlp": 1.0223068, + "epoch": 0.2866676687208778, + "flos": 23252852782080.0, + "grad_norm": 1.961315836485499, + "language_loss": 0.66522348, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.68648636, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5078125, + "step": 4768, + "time_per_iteration": 2.4347009658813477 + }, + { + "auxiliary_loss_clip": 0.01074385, + "auxiliary_loss_mlp": 0.01059645, + "balance_loss_clip": 1.02383423, + "balance_loss_mlp": 1.02187777, + "epoch": 0.28672779197354575, + "flos": 21578662488960.0, + "grad_norm": 1.6471189362521488, + "language_loss": 0.7170741, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.73841441, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5234375, + "step": 4769, + "time_per_iteration": 2.40556263923645 + }, + { + "auxiliary_loss_clip": 0.0107208, + "auxiliary_loss_mlp": 0.01049696, + "balance_loss_clip": 1.01450586, + "balance_loss_mlp": 1.02149248, + "epoch": 0.2867879152262137, + "flos": 19864531733760.0, + "grad_norm": 1.732430666231737, + "language_loss": 0.76071501, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.78193277, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5078125, + "step": 4770, + "time_per_iteration": 2.443547487258911 + }, + { + "auxiliary_loss_clip": 0.01074499, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.01484156, + "balance_loss_mlp": 1.02241778, + "epoch": 0.2868480384788817, + "flos": 28212265278720.0, + "grad_norm": 2.214327611548951, + "language_loss": 0.6876415, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.70887852, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51953125, + "step": 4771, + "time_per_iteration": 2.466397285461426 + }, + { + "auxiliary_loss_clip": 0.01015794, + "auxiliary_loss_mlp": 0.01002973, + "balance_loss_clip": 0.99953985, + "balance_loss_mlp": 1.00422299, + "epoch": 0.2869081617315497, + "flos": 65421298010880.0, + "grad_norm": 0.7773311393434161, + "language_loss": 0.57007879, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.59026647, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.03442383, + "router_z_loss_mlp": 0.11572266, + "step": 4772, + "time_per_iteration": 3.000842332839966 + }, + { + "auxiliary_loss_clip": 0.01073066, + "auxiliary_loss_mlp": 0.01061988, + "balance_loss_clip": 1.02481842, + "balance_loss_mlp": 1.02130222, + "epoch": 0.28696828498421767, + "flos": 18659748936960.0, + "grad_norm": 2.3686432129747157, + "language_loss": 0.84776151, + "learning_rate": 3.346383619630856e-06, + "loss": 0.86911201, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.51953125, + "step": 4773, + "time_per_iteration": 2.409834146499634 + }, + { + "auxiliary_loss_clip": 0.01073431, + "auxiliary_loss_mlp": 0.01051297, + "balance_loss_clip": 1.0155108, + "balance_loss_mlp": 1.02207756, + "epoch": 0.28702840823688563, + "flos": 23658613136640.0, + "grad_norm": 2.380348133002109, + "language_loss": 0.78890514, + "learning_rate": 3.34609559969027e-06, + "loss": 0.81015247, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.515625, + "step": 4774, + "time_per_iteration": 2.4460296630859375 + }, + { + "auxiliary_loss_clip": 0.01071819, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_clip": 1.02040541, + "balance_loss_mlp": 1.02175379, + "epoch": 0.2870885314895536, + "flos": 13803993866880.0, + "grad_norm": 1.6776004519968446, + "language_loss": 0.74832326, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.76959479, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5, + "step": 4775, + "time_per_iteration": 2.3969686031341553 + }, + { + "auxiliary_loss_clip": 0.01077407, + "auxiliary_loss_mlp": 0.01053388, + "balance_loss_clip": 1.01981866, + "balance_loss_mlp": 1.02501225, + "epoch": 0.28714865474222157, + "flos": 17785768072320.0, + "grad_norm": 1.7000665405246873, + "language_loss": 0.89065868, + "learning_rate": 3.34551940668778e-06, + "loss": 0.91196662, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5234375, + "step": 4776, + "time_per_iteration": 2.3873684406280518 + }, + { + "auxiliary_loss_clip": 0.01073301, + "auxiliary_loss_mlp": 0.01053193, + "balance_loss_clip": 1.02045846, + "balance_loss_mlp": 1.02340293, + "epoch": 0.28720877799488953, + "flos": 15996574160640.0, + "grad_norm": 1.8469120895615572, + "language_loss": 0.7589196, + "learning_rate": 3.345231233647726e-06, + "loss": 0.78018457, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5, + "step": 4777, + "time_per_iteration": 2.4053878784179688 + }, + { + "auxiliary_loss_clip": 0.01078807, + "auxiliary_loss_mlp": 0.01064628, + "balance_loss_clip": 1.02230835, + "balance_loss_mlp": 1.02438879, + "epoch": 0.2872689012475575, + "flos": 20922085860480.0, + "grad_norm": 1.8780643257407983, + "language_loss": 0.81915498, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.84058934, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 0.546875, + "step": 4778, + "time_per_iteration": 2.4110677242279053 + }, + { + "auxiliary_loss_clip": 0.01071515, + "auxiliary_loss_mlp": 0.01056809, + "balance_loss_clip": 1.02121258, + "balance_loss_mlp": 1.02313912, + "epoch": 0.28732902450022546, + "flos": 21324040876800.0, + "grad_norm": 1.6362716944880653, + "language_loss": 0.76014507, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.78142834, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.484375, + "step": 4779, + "time_per_iteration": 2.4224095344543457 + }, + { + "auxiliary_loss_clip": 0.01073371, + "auxiliary_loss_mlp": 0.01058628, + "balance_loss_clip": 1.02360415, + "balance_loss_mlp": 1.02300239, + "epoch": 0.2873891477528934, + "flos": 20849326853760.0, + "grad_norm": 1.5840545597449143, + "language_loss": 0.77753973, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.79885978, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.50390625, + "step": 4780, + "time_per_iteration": 2.3932180404663086 + }, + { + "auxiliary_loss_clip": 0.01073391, + "auxiliary_loss_mlp": 0.01054954, + "balance_loss_clip": 1.02212358, + "balance_loss_mlp": 1.02332151, + "epoch": 0.2874492710055614, + "flos": 17419110307200.0, + "grad_norm": 1.8467946375664082, + "language_loss": 0.82417935, + "learning_rate": 3.344078031483784e-06, + "loss": 0.8454628, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5, + "step": 4781, + "time_per_iteration": 2.424726963043213 + }, + { + "auxiliary_loss_clip": 0.01074647, + "auxiliary_loss_mlp": 0.01061658, + "balance_loss_clip": 1.02346361, + "balance_loss_mlp": 1.02245021, + "epoch": 0.28750939425822936, + "flos": 13405983834240.0, + "grad_norm": 1.9833192606475072, + "language_loss": 0.88325703, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.90462005, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5234375, + "step": 4782, + "time_per_iteration": 2.381814479827881 + }, + { + "auxiliary_loss_clip": 0.01073162, + "auxiliary_loss_mlp": 0.01052776, + "balance_loss_clip": 1.01776385, + "balance_loss_mlp": 1.02246714, + "epoch": 0.2875695175108973, + "flos": 21869000288640.0, + "grad_norm": 1.6347355370607755, + "language_loss": 0.72600436, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.74726373, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5078125, + "step": 4783, + "time_per_iteration": 2.415754795074463 + }, + { + "auxiliary_loss_clip": 0.01072468, + "auxiliary_loss_mlp": 0.01058328, + "balance_loss_clip": 1.02239788, + "balance_loss_mlp": 1.02192974, + "epoch": 0.2876296407635653, + "flos": 26244385695360.0, + "grad_norm": 1.7004100020637987, + "language_loss": 0.78401566, + "learning_rate": 3.343212594663047e-06, + "loss": 0.80532366, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.50390625, + "step": 4784, + "time_per_iteration": 2.40915584564209 + }, + { + "auxiliary_loss_clip": 0.01072112, + "auxiliary_loss_mlp": 0.01068081, + "balance_loss_clip": 1.03279555, + "balance_loss_mlp": 1.02243412, + "epoch": 0.28768976401623325, + "flos": 25372499512320.0, + "grad_norm": 1.4997893525646806, + "language_loss": 0.7716583, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.79306018, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.49609375, + "step": 4785, + "time_per_iteration": 2.4666945934295654 + }, + { + "auxiliary_loss_clip": 0.01073986, + "auxiliary_loss_mlp": 0.01057448, + "balance_loss_clip": 1.02311599, + "balance_loss_mlp": 1.02390802, + "epoch": 0.28774988726890127, + "flos": 30663063054720.0, + "grad_norm": 1.872304715780382, + "language_loss": 0.83494884, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85626316, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5, + "step": 4786, + "time_per_iteration": 2.4626622200012207 + }, + { + "auxiliary_loss_clip": 0.01072487, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.02178693, + "balance_loss_mlp": 1.02255142, + "epoch": 0.28781001052156924, + "flos": 20594391039360.0, + "grad_norm": 1.7376844880686022, + "language_loss": 0.81106526, + "learning_rate": 3.342346699429516e-06, + "loss": 0.83234823, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5, + "step": 4787, + "time_per_iteration": 2.4181816577911377 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01053539, + "balance_loss_clip": 1.01710844, + "balance_loss_mlp": 1.02122593, + "epoch": 0.2878701337742372, + "flos": 26541112273920.0, + "grad_norm": 1.9013956838924242, + "language_loss": 0.85134947, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.87261164, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.515625, + "step": 4788, + "time_per_iteration": 2.4286539554595947 + }, + { + "auxiliary_loss_clip": 0.01076333, + "auxiliary_loss_mlp": 0.01056138, + "balance_loss_clip": 1.02302194, + "balance_loss_mlp": 1.02448487, + "epoch": 0.28793025702690517, + "flos": 28145615760000.0, + "grad_norm": 1.762044127173841, + "language_loss": 0.75818121, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.77950585, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.515625, + "step": 4789, + "time_per_iteration": 2.5001394748687744 + }, + { + "auxiliary_loss_clip": 0.01068981, + "auxiliary_loss_mlp": 0.01051644, + "balance_loss_clip": 1.01952863, + "balance_loss_mlp": 1.02093577, + "epoch": 0.28799038027957313, + "flos": 23804340618240.0, + "grad_norm": 1.9151625208477214, + "language_loss": 0.85745358, + "learning_rate": 3.341480346078704e-06, + "loss": 0.87865984, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.48046875, + "step": 4790, + "time_per_iteration": 2.392709970474243 + }, + { + "auxiliary_loss_clip": 0.01074319, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.01916385, + "balance_loss_mlp": 1.02390289, + "epoch": 0.2880505035322411, + "flos": 22343085907200.0, + "grad_norm": 2.892576960418631, + "language_loss": 0.79518348, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.81644756, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.50390625, + "step": 4791, + "time_per_iteration": 2.462554693222046 + }, + { + "auxiliary_loss_clip": 0.01075608, + "auxiliary_loss_mlp": 0.01052303, + "balance_loss_clip": 1.01777959, + "balance_loss_mlp": 1.02269506, + "epoch": 0.28811062678490906, + "flos": 18003277042560.0, + "grad_norm": 1.8854170450488141, + "language_loss": 0.73091519, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.75219429, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.53125, + "step": 4792, + "time_per_iteration": 2.36004638671875 + }, + { + "auxiliary_loss_clip": 0.01075175, + "auxiliary_loss_mlp": 0.01052617, + "balance_loss_clip": 1.02019227, + "balance_loss_mlp": 1.02373874, + "epoch": 0.28817075003757703, + "flos": 22089790926720.0, + "grad_norm": 1.7252346707429362, + "language_loss": 0.81450498, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.83578295, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.515625, + "step": 4793, + "time_per_iteration": 2.428710460662842 + }, + { + "auxiliary_loss_clip": 0.01071423, + "auxiliary_loss_mlp": 0.01049683, + "balance_loss_clip": 1.01630473, + "balance_loss_mlp": 1.02382827, + "epoch": 0.288230873290245, + "flos": 41681512229760.0, + "grad_norm": 1.716099649177568, + "language_loss": 0.79554808, + "learning_rate": 3.340324496161797e-06, + "loss": 0.81675911, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4765625, + "step": 4794, + "time_per_iteration": 2.552276372909546 + }, + { + "auxiliary_loss_clip": 0.01077174, + "auxiliary_loss_mlp": 0.01061085, + "balance_loss_clip": 1.02558422, + "balance_loss_mlp": 1.0256083, + "epoch": 0.28829099654291296, + "flos": 18623439256320.0, + "grad_norm": 2.194883403283756, + "language_loss": 0.84690183, + "learning_rate": 3.340035406592074e-06, + "loss": 0.86828434, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.515625, + "step": 4795, + "time_per_iteration": 2.404552459716797 + }, + { + "auxiliary_loss_clip": 0.01070762, + "auxiliary_loss_mlp": 0.01050507, + "balance_loss_clip": 1.01965559, + "balance_loss_mlp": 1.02263403, + "epoch": 0.2883511197955809, + "flos": 24673852828800.0, + "grad_norm": 1.7822672764001202, + "language_loss": 0.75784624, + "learning_rate": 3.339746266208074e-06, + "loss": 0.77905887, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48046875, + "step": 4796, + "time_per_iteration": 2.3984270095825195 + }, + { + "auxiliary_loss_clip": 0.01077316, + "auxiliary_loss_mlp": 0.01066587, + "balance_loss_clip": 1.02684236, + "balance_loss_mlp": 1.0236392, + "epoch": 0.2884112430482489, + "flos": 23111035372800.0, + "grad_norm": 1.8849671974551858, + "language_loss": 0.74277216, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.76421118, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.5390625, + "step": 4797, + "time_per_iteration": 2.4430696964263916 + }, + { + "auxiliary_loss_clip": 0.01074542, + "auxiliary_loss_mlp": 0.01050094, + "balance_loss_clip": 1.01466501, + "balance_loss_mlp": 1.02373171, + "epoch": 0.28847136630091685, + "flos": 16872405327360.0, + "grad_norm": 2.2701752252437557, + "language_loss": 0.75895566, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.78020203, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5078125, + "step": 4798, + "time_per_iteration": 2.376037120819092 + }, + { + "auxiliary_loss_clip": 0.01075096, + "auxiliary_loss_mlp": 0.01056316, + "balance_loss_clip": 1.01816869, + "balance_loss_mlp": 1.02265584, + "epoch": 0.2885314895535849, + "flos": 25656588178560.0, + "grad_norm": 2.5632206552079677, + "language_loss": 0.68572044, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.70703459, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5234375, + "step": 4799, + "time_per_iteration": 2.538046360015869 + }, + { + "auxiliary_loss_clip": 0.01076044, + "auxiliary_loss_mlp": 0.01061831, + "balance_loss_clip": 1.02373183, + "balance_loss_mlp": 1.02361369, + "epoch": 0.28859161280625284, + "flos": 21106147881600.0, + "grad_norm": 1.6742904360003887, + "language_loss": 0.83106464, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.85244346, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5234375, + "step": 4800, + "time_per_iteration": 3.9291372299194336 + }, + { + "auxiliary_loss_clip": 0.01071793, + "auxiliary_loss_mlp": 0.0105297, + "balance_loss_clip": 1.0201869, + "balance_loss_mlp": 1.02247405, + "epoch": 0.2886517360589208, + "flos": 26468318355840.0, + "grad_norm": 1.5994166352621397, + "language_loss": 0.91404307, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93529075, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 4801, + "time_per_iteration": 2.4813859462738037 + }, + { + "auxiliary_loss_clip": 0.01074449, + "auxiliary_loss_mlp": 0.01052867, + "balance_loss_clip": 1.01669931, + "balance_loss_mlp": 1.02330148, + "epoch": 0.28871185931158877, + "flos": 25264094140800.0, + "grad_norm": 2.517070367280696, + "language_loss": 0.74397177, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76524496, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.51171875, + "step": 4802, + "time_per_iteration": 2.4148359298706055 + }, + { + "auxiliary_loss_clip": 0.01018943, + "auxiliary_loss_mlp": 0.01015723, + "balance_loss_clip": 1.01121664, + "balance_loss_mlp": 1.00655103, + "epoch": 0.28877198256425674, + "flos": 66662390488320.0, + "grad_norm": 0.779790091754816, + "language_loss": 0.62979871, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65014535, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.04516602, + "router_z_loss_mlp": 0.12402344, + "step": 4803, + "time_per_iteration": 4.394913673400879 + }, + { + "auxiliary_loss_clip": 0.01072752, + "auxiliary_loss_mlp": 0.01061553, + "balance_loss_clip": 1.02538502, + "balance_loss_mlp": 1.02152371, + "epoch": 0.2888321058169247, + "flos": 20301993469440.0, + "grad_norm": 1.8610799099675213, + "language_loss": 0.72760189, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.74894494, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.51171875, + "step": 4804, + "time_per_iteration": 2.4458436965942383 + }, + { + "auxiliary_loss_clip": 0.0107581, + "auxiliary_loss_mlp": 0.01059876, + "balance_loss_clip": 1.02122855, + "balance_loss_mlp": 1.0225029, + "epoch": 0.28889222906959267, + "flos": 25515643553280.0, + "grad_norm": 1.7385467716163125, + "language_loss": 0.69403988, + "learning_rate": 3.337141717919346e-06, + "loss": 0.71539676, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.53515625, + "step": 4805, + "time_per_iteration": 4.054839611053467 + }, + { + "auxiliary_loss_clip": 0.01074066, + "auxiliary_loss_mlp": 0.01062355, + "balance_loss_clip": 1.02437496, + "balance_loss_mlp": 1.02169561, + "epoch": 0.28895235232226063, + "flos": 32669940493440.0, + "grad_norm": 1.396963792111239, + "language_loss": 0.70882869, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.7301929, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5234375, + "step": 4806, + "time_per_iteration": 3.872166633605957 + }, + { + "auxiliary_loss_clip": 0.01070611, + "auxiliary_loss_mlp": 0.01060892, + "balance_loss_clip": 1.0258441, + "balance_loss_mlp": 1.02124405, + "epoch": 0.2890124755749286, + "flos": 29713425540480.0, + "grad_norm": 1.4284383836015453, + "language_loss": 0.72325987, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7445749, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.49414062, + "step": 4807, + "time_per_iteration": 2.4775710105895996 + }, + { + "auxiliary_loss_clip": 0.01071866, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.02351499, + "balance_loss_mlp": 1.02166462, + "epoch": 0.28907259882759656, + "flos": 22673364168960.0, + "grad_norm": 1.5159217271741365, + "language_loss": 0.82605267, + "learning_rate": 3.336272622079382e-06, + "loss": 0.84736538, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5, + "step": 4808, + "time_per_iteration": 2.3981733322143555 + }, + { + "auxiliary_loss_clip": 0.01069743, + "auxiliary_loss_mlp": 0.01064287, + "balance_loss_clip": 1.02912056, + "balance_loss_mlp": 1.02181685, + "epoch": 0.2891327220802645, + "flos": 22564923886080.0, + "grad_norm": 1.4552747696854553, + "language_loss": 0.79725134, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.81859165, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.47851562, + "step": 4809, + "time_per_iteration": 2.508521556854248 + }, + { + "auxiliary_loss_clip": 0.01077044, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.02215648, + "balance_loss_mlp": 1.02223158, + "epoch": 0.2891928453329325, + "flos": 21651735697920.0, + "grad_norm": 1.8779261297386927, + "language_loss": 0.80362689, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.82499701, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.546875, + "step": 4810, + "time_per_iteration": 2.420687437057495 + }, + { + "auxiliary_loss_clip": 0.01071535, + "auxiliary_loss_mlp": 0.01058747, + "balance_loss_clip": 1.02358055, + "balance_loss_mlp": 1.02232552, + "epoch": 0.28925296858560046, + "flos": 23220976844160.0, + "grad_norm": 1.6192189001275268, + "language_loss": 0.7803086, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.80161142, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.4921875, + "step": 4811, + "time_per_iteration": 2.454054355621338 + }, + { + "auxiliary_loss_clip": 0.01069983, + "auxiliary_loss_mlp": 0.01053326, + "balance_loss_clip": 1.01896977, + "balance_loss_mlp": 1.02097917, + "epoch": 0.2893130918382685, + "flos": 28620399605760.0, + "grad_norm": 1.3775956769239155, + "language_loss": 0.79047394, + "learning_rate": 3.335113118275117e-06, + "loss": 0.81170702, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49023438, + "step": 4812, + "time_per_iteration": 2.4516518115997314 + }, + { + "auxiliary_loss_clip": 0.0101686, + "auxiliary_loss_mlp": 0.01013333, + "balance_loss_clip": 1.00923252, + "balance_loss_mlp": 1.00387549, + "epoch": 0.28937321509093644, + "flos": 72297615772800.0, + "grad_norm": 0.8734953003775844, + "language_loss": 0.60456955, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62487149, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.12988281, + "step": 4813, + "time_per_iteration": 3.192640781402588 + }, + { + "auxiliary_loss_clip": 0.01073581, + "auxiliary_loss_mlp": 0.01047269, + "balance_loss_clip": 1.014606, + "balance_loss_mlp": 1.0240196, + "epoch": 0.2894333383436044, + "flos": 16215479585280.0, + "grad_norm": 3.116159072365966, + "language_loss": 0.83498842, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.85619688, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.49609375, + "step": 4814, + "time_per_iteration": 2.368969440460205 + }, + { + "auxiliary_loss_clip": 0.01077734, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_clip": 1.02898169, + "balance_loss_mlp": 1.02558315, + "epoch": 0.2894934615962724, + "flos": 24827086252800.0, + "grad_norm": 1.6606257000707405, + "language_loss": 0.74661142, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.76803112, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5234375, + "step": 4815, + "time_per_iteration": 2.4814956188201904 + }, + { + "auxiliary_loss_clip": 0.01070772, + "auxiliary_loss_mlp": 0.01056411, + "balance_loss_clip": 1.02541661, + "balance_loss_mlp": 1.02396107, + "epoch": 0.28955358484894034, + "flos": 20448907937280.0, + "grad_norm": 1.5421379597520848, + "language_loss": 0.72432935, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.74560124, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46875, + "step": 4816, + "time_per_iteration": 2.397700548171997 + }, + { + "auxiliary_loss_clip": 0.01077515, + "auxiliary_loss_mlp": 0.01062567, + "balance_loss_clip": 1.02537346, + "balance_loss_mlp": 1.02342248, + "epoch": 0.2896137081016083, + "flos": 22564086013440.0, + "grad_norm": 2.1357712458324873, + "language_loss": 0.77042443, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.79182523, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.54296875, + "step": 4817, + "time_per_iteration": 2.4409806728363037 + }, + { + "auxiliary_loss_clip": 0.0107998, + "auxiliary_loss_mlp": 0.01069305, + "balance_loss_clip": 1.03196907, + "balance_loss_mlp": 1.02685452, + "epoch": 0.28967383135427627, + "flos": 26686735021440.0, + "grad_norm": 1.8567295150786596, + "language_loss": 0.784168, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.80566084, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.53125, + "step": 4818, + "time_per_iteration": 2.440365791320801 + }, + { + "auxiliary_loss_clip": 0.01075655, + "auxiliary_loss_mlp": 0.01058817, + "balance_loss_clip": 1.02536726, + "balance_loss_mlp": 1.02466488, + "epoch": 0.28973395460694423, + "flos": 15557401768320.0, + "grad_norm": 2.588603090438053, + "language_loss": 0.81700897, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.83835369, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51171875, + "step": 4819, + "time_per_iteration": 2.417724132537842 + }, + { + "auxiliary_loss_clip": 0.01076651, + "auxiliary_loss_mlp": 0.01068047, + "balance_loss_clip": 1.03111553, + "balance_loss_mlp": 1.02393651, + "epoch": 0.2897940778596122, + "flos": 18696477553920.0, + "grad_norm": 1.7703697372160547, + "language_loss": 0.80564809, + "learning_rate": 3.332791681244776e-06, + "loss": 0.82709509, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.52734375, + "step": 4820, + "time_per_iteration": 2.3786885738372803 + }, + { + "auxiliary_loss_clip": 0.01076354, + "auxiliary_loss_mlp": 0.0105713, + "balance_loss_clip": 1.02322674, + "balance_loss_mlp": 1.02430773, + "epoch": 0.28985420111228016, + "flos": 18769306383360.0, + "grad_norm": 2.0657199995573867, + "language_loss": 0.75037491, + "learning_rate": 3.332501274072231e-06, + "loss": 0.77170968, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51953125, + "step": 4821, + "time_per_iteration": 2.5142199993133545 + }, + { + "auxiliary_loss_clip": 0.01075543, + "auxiliary_loss_mlp": 0.01060032, + "balance_loss_clip": 1.0254612, + "balance_loss_mlp": 1.02461958, + "epoch": 0.28991432436494813, + "flos": 23068895495040.0, + "grad_norm": 1.8891130077683513, + "language_loss": 0.73827028, + "learning_rate": 3.332210816371104e-06, + "loss": 0.75962603, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5078125, + "step": 4822, + "time_per_iteration": 2.4035074710845947 + }, + { + "auxiliary_loss_clip": 0.01073723, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_clip": 1.03421545, + "balance_loss_mlp": 1.02355075, + "epoch": 0.2899744476176161, + "flos": 17602229721600.0, + "grad_norm": 1.7607369357515252, + "language_loss": 0.67336535, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.69477999, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5, + "step": 4823, + "time_per_iteration": 2.3950703144073486 + }, + { + "auxiliary_loss_clip": 0.01071275, + "auxiliary_loss_mlp": 0.01057437, + "balance_loss_clip": 1.02444017, + "balance_loss_mlp": 1.0224185, + "epoch": 0.29003457087028406, + "flos": 22308277415040.0, + "grad_norm": 1.9338915450665042, + "language_loss": 0.83148879, + "learning_rate": 3.331629749427164e-06, + "loss": 0.85277593, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.48828125, + "step": 4824, + "time_per_iteration": 2.3936192989349365 + }, + { + "auxiliary_loss_clip": 0.01072199, + "auxiliary_loss_mlp": 0.01058554, + "balance_loss_clip": 1.02274394, + "balance_loss_mlp": 1.02192903, + "epoch": 0.2900946941229521, + "flos": 21943888888320.0, + "grad_norm": 2.257098834980242, + "language_loss": 0.74338484, + "learning_rate": 3.331339140206385e-06, + "loss": 0.76469243, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.50390625, + "step": 4825, + "time_per_iteration": 2.4501688480377197 + }, + { + "auxiliary_loss_clip": 0.01074698, + "auxiliary_loss_mlp": 0.01052137, + "balance_loss_clip": 1.01773286, + "balance_loss_mlp": 1.02319407, + "epoch": 0.29015481737562004, + "flos": 17931181351680.0, + "grad_norm": 2.2635394337086567, + "language_loss": 0.75976992, + "learning_rate": 3.331048480501092e-06, + "loss": 0.78103817, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4826, + "time_per_iteration": 2.366119861602783 + }, + { + "auxiliary_loss_clip": 0.01070986, + "auxiliary_loss_mlp": 0.01054655, + "balance_loss_clip": 1.02137196, + "balance_loss_mlp": 1.02134156, + "epoch": 0.290214940628288, + "flos": 22782432856320.0, + "grad_norm": 1.694768873937425, + "language_loss": 0.70475221, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.72600865, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.49609375, + "step": 4827, + "time_per_iteration": 2.5647995471954346 + }, + { + "auxiliary_loss_clip": 0.01075844, + "auxiliary_loss_mlp": 0.01053036, + "balance_loss_clip": 1.01903749, + "balance_loss_mlp": 1.02522469, + "epoch": 0.290275063880956, + "flos": 20005581093120.0, + "grad_norm": 1.7994022040925317, + "language_loss": 0.81390083, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.83518958, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 4828, + "time_per_iteration": 2.4223439693450928 + }, + { + "auxiliary_loss_clip": 0.01073436, + "auxiliary_loss_mlp": 0.01058096, + "balance_loss_clip": 1.02471757, + "balance_loss_mlp": 1.02404261, + "epoch": 0.29033518713362394, + "flos": 22052538639360.0, + "grad_norm": 1.8723298988506072, + "language_loss": 0.81776071, + "learning_rate": 3.33017619858836e-06, + "loss": 0.83907592, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49414062, + "step": 4829, + "time_per_iteration": 2.4619228839874268 + }, + { + "auxiliary_loss_clip": 0.01072167, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_clip": 1.01642776, + "balance_loss_mlp": 1.02366257, + "epoch": 0.2903953103862919, + "flos": 25628866692480.0, + "grad_norm": 1.4492073900328657, + "language_loss": 0.83741641, + "learning_rate": 3.329885337055249e-06, + "loss": 0.85860944, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.48632812, + "step": 4830, + "time_per_iteration": 2.4402098655700684 + }, + { + "auxiliary_loss_clip": 0.01074304, + "auxiliary_loss_mlp": 0.01051683, + "balance_loss_clip": 1.01916313, + "balance_loss_mlp": 1.02435994, + "epoch": 0.29045543363895987, + "flos": 16944919954560.0, + "grad_norm": 2.3300599919391987, + "language_loss": 0.81380785, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.83506775, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.5, + "step": 4831, + "time_per_iteration": 2.4789958000183105 + }, + { + "auxiliary_loss_clip": 0.0107142, + "auxiliary_loss_mlp": 0.01049467, + "balance_loss_clip": 1.02021289, + "balance_loss_mlp": 1.02436423, + "epoch": 0.29051555689162784, + "flos": 26394302540160.0, + "grad_norm": 1.9296052727155317, + "language_loss": 0.75834548, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.77955431, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.46875, + "step": 4832, + "time_per_iteration": 2.428722620010376 + }, + { + "auxiliary_loss_clip": 0.01071331, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.01322722, + "balance_loss_mlp": 1.02352381, + "epoch": 0.2905756801442958, + "flos": 21102866213760.0, + "grad_norm": 1.5656511306715408, + "language_loss": 0.77419889, + "learning_rate": 3.329012449923736e-06, + "loss": 0.79534775, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4765625, + "step": 4833, + "time_per_iteration": 2.471726179122925 + }, + { + "auxiliary_loss_clip": 0.01073317, + "auxiliary_loss_mlp": 0.01046768, + "balance_loss_clip": 1.01451039, + "balance_loss_mlp": 1.0253334, + "epoch": 0.29063580339696377, + "flos": 15705154108800.0, + "grad_norm": 1.5160196133509451, + "language_loss": 0.6642921, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.68549293, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.48046875, + "step": 4834, + "time_per_iteration": 2.401853084564209 + }, + { + "auxiliary_loss_clip": 0.01069633, + "auxiliary_loss_mlp": 0.01044276, + "balance_loss_clip": 1.01442575, + "balance_loss_mlp": 1.0232029, + "epoch": 0.29069592664963173, + "flos": 24643827192960.0, + "grad_norm": 1.5260463333510896, + "language_loss": 0.73305595, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.75419509, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.46484375, + "step": 4835, + "time_per_iteration": 2.4605278968811035 + }, + { + "auxiliary_loss_clip": 0.01070639, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.02012444, + "balance_loss_mlp": 1.02301168, + "epoch": 0.2907560499022997, + "flos": 24972569354880.0, + "grad_norm": 1.5760233865184936, + "language_loss": 0.81034529, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.83155566, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4765625, + "step": 4836, + "time_per_iteration": 2.462496757507324 + }, + { + "auxiliary_loss_clip": 0.01070378, + "auxiliary_loss_mlp": 0.01046846, + "balance_loss_clip": 1.01747298, + "balance_loss_mlp": 1.02356136, + "epoch": 0.29081617315496766, + "flos": 18656606914560.0, + "grad_norm": 1.6347402066325598, + "language_loss": 0.82773226, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.84890455, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.46679688, + "step": 4837, + "time_per_iteration": 2.4614601135253906 + }, + { + "auxiliary_loss_clip": 0.01069727, + "auxiliary_loss_mlp": 0.01047956, + "balance_loss_clip": 1.01512551, + "balance_loss_mlp": 1.02030027, + "epoch": 0.2908762964076356, + "flos": 35329693956480.0, + "grad_norm": 2.4281840639783927, + "language_loss": 0.68271661, + "learning_rate": 3.327556630259381e-06, + "loss": 0.70389342, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.49414062, + "step": 4838, + "time_per_iteration": 2.5122272968292236 + }, + { + "auxiliary_loss_clip": 0.01075079, + "auxiliary_loss_mlp": 0.010554, + "balance_loss_clip": 1.01997054, + "balance_loss_mlp": 1.02445769, + "epoch": 0.29093641966030365, + "flos": 23075179539840.0, + "grad_norm": 1.640599650086532, + "language_loss": 0.7251932, + "learning_rate": 3.327265315259095e-06, + "loss": 0.74649799, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5078125, + "step": 4839, + "time_per_iteration": 2.472647190093994 + }, + { + "auxiliary_loss_clip": 0.01071145, + "auxiliary_loss_mlp": 0.01051697, + "balance_loss_clip": 1.020154, + "balance_loss_mlp": 1.02251196, + "epoch": 0.2909965429129716, + "flos": 35953940799360.0, + "grad_norm": 1.9854029705570404, + "language_loss": 0.77677441, + "learning_rate": 3.326973949928776e-06, + "loss": 0.79800284, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.48632812, + "step": 4840, + "time_per_iteration": 3.988675355911255 + }, + { + "auxiliary_loss_clip": 0.01072052, + "auxiliary_loss_mlp": 0.01052887, + "balance_loss_clip": 1.01998496, + "balance_loss_mlp": 1.02300477, + "epoch": 0.2910566661656396, + "flos": 30879001013760.0, + "grad_norm": 1.796450017146185, + "language_loss": 0.61609578, + "learning_rate": 3.326682534279471e-06, + "loss": 0.63734514, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.49023438, + "step": 4841, + "time_per_iteration": 2.5071892738342285 + }, + { + "auxiliary_loss_clip": 0.01071493, + "auxiliary_loss_mlp": 0.01050884, + "balance_loss_clip": 1.0179584, + "balance_loss_mlp": 1.02227485, + "epoch": 0.29111678941830754, + "flos": 30008825487360.0, + "grad_norm": 1.9291208358891798, + "language_loss": 0.72571898, + "learning_rate": 3.326391068322232e-06, + "loss": 0.74694276, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 4842, + "time_per_iteration": 3.8700199127197266 + }, + { + "auxiliary_loss_clip": 0.01069128, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.01614356, + "balance_loss_mlp": 1.02155626, + "epoch": 0.2911769126709755, + "flos": 22856274115200.0, + "grad_norm": 1.4386335905371743, + "language_loss": 0.75185323, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.77301955, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4765625, + "step": 4843, + "time_per_iteration": 2.4057388305664062 + }, + { + "auxiliary_loss_clip": 0.01069561, + "auxiliary_loss_mlp": 0.01047418, + "balance_loss_clip": 1.01487374, + "balance_loss_mlp": 1.02084136, + "epoch": 0.2912370359236435, + "flos": 21649501370880.0, + "grad_norm": 1.9498740757938409, + "language_loss": 0.59697199, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.61814177, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.48632812, + "step": 4844, + "time_per_iteration": 3.926154851913452 + }, + { + "auxiliary_loss_clip": 0.01073613, + "auxiliary_loss_mlp": 0.0105114, + "balance_loss_clip": 1.01537764, + "balance_loss_mlp": 1.02288723, + "epoch": 0.29129715917631144, + "flos": 22892234682240.0, + "grad_norm": 1.8795667400760883, + "language_loss": 0.88026434, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.90151185, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5078125, + "step": 4845, + "time_per_iteration": 2.4223411083221436 + }, + { + "auxiliary_loss_clip": 0.01072573, + "auxiliary_loss_mlp": 0.01057143, + "balance_loss_clip": 1.01963937, + "balance_loss_mlp": 1.02259171, + "epoch": 0.2913572824289794, + "flos": 22673364168960.0, + "grad_norm": 1.6314729044197387, + "language_loss": 0.6915915, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.71288872, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5, + "step": 4846, + "time_per_iteration": 3.9333624839782715 + }, + { + "auxiliary_loss_clip": 0.01071084, + "auxiliary_loss_mlp": 0.01050121, + "balance_loss_clip": 1.01831567, + "balance_loss_mlp": 1.02316034, + "epoch": 0.29141740568164737, + "flos": 23106427073280.0, + "grad_norm": 1.6654248248235428, + "language_loss": 0.71504289, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.73625493, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.47851562, + "step": 4847, + "time_per_iteration": 2.5234131813049316 + }, + { + "auxiliary_loss_clip": 0.01072629, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_clip": 1.01221609, + "balance_loss_mlp": 1.02349424, + "epoch": 0.29147752893431533, + "flos": 23585889041280.0, + "grad_norm": 1.4993159706789372, + "language_loss": 0.75313497, + "learning_rate": 3.324641216731237e-06, + "loss": 0.77431887, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 4848, + "time_per_iteration": 2.4333927631378174 + }, + { + "auxiliary_loss_clip": 0.01071027, + "auxiliary_loss_mlp": 0.01053241, + "balance_loss_clip": 1.01940989, + "balance_loss_mlp": 1.02208877, + "epoch": 0.2915376521869833, + "flos": 20591004637440.0, + "grad_norm": 2.7211882075607368, + "language_loss": 0.79357147, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.81481409, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.48828125, + "step": 4849, + "time_per_iteration": 2.420400619506836 + }, + { + "auxiliary_loss_clip": 0.01074034, + "auxiliary_loss_mlp": 0.01049937, + "balance_loss_clip": 1.01443648, + "balance_loss_mlp": 1.02355087, + "epoch": 0.29159777543965126, + "flos": 20810503555200.0, + "grad_norm": 1.5777766122832055, + "language_loss": 0.80270684, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.82394648, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.50390625, + "step": 4850, + "time_per_iteration": 2.388314962387085 + }, + { + "auxiliary_loss_clip": 0.01070676, + "auxiliary_loss_mlp": 0.01047199, + "balance_loss_clip": 1.01331997, + "balance_loss_mlp": 1.02213418, + "epoch": 0.29165789869231923, + "flos": 24242989340160.0, + "grad_norm": 1.6652021646208748, + "language_loss": 0.77076858, + "learning_rate": 3.323765612674296e-06, + "loss": 0.79194725, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.484375, + "step": 4851, + "time_per_iteration": 2.4699764251708984 + }, + { + "auxiliary_loss_clip": 0.01071387, + "auxiliary_loss_mlp": 0.01046969, + "balance_loss_clip": 1.01719046, + "balance_loss_mlp": 1.0229665, + "epoch": 0.29171802194498725, + "flos": 28948653008640.0, + "grad_norm": 1.3966409713721095, + "language_loss": 0.78496802, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.80615163, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.484375, + "step": 4852, + "time_per_iteration": 2.4490742683410645 + }, + { + "auxiliary_loss_clip": 0.01070788, + "auxiliary_loss_mlp": 0.0105358, + "balance_loss_clip": 1.01991558, + "balance_loss_mlp": 1.02261329, + "epoch": 0.2917781451976552, + "flos": 22597218760320.0, + "grad_norm": 1.8614846901315991, + "language_loss": 0.78789645, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80914009, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.48046875, + "step": 4853, + "time_per_iteration": 2.485038995742798 + }, + { + "auxiliary_loss_clip": 0.01072619, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.01523995, + "balance_loss_mlp": 1.02197242, + "epoch": 0.2918382684503232, + "flos": 21573530519040.0, + "grad_norm": 2.3236688225356836, + "language_loss": 0.89055943, + "learning_rate": 3.322889556841445e-06, + "loss": 0.91177183, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5078125, + "step": 4854, + "time_per_iteration": 2.378618001937866 + }, + { + "auxiliary_loss_clip": 0.01070569, + "auxiliary_loss_mlp": 0.01057428, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.02120233, + "epoch": 0.29189839170299114, + "flos": 24352337318400.0, + "grad_norm": 1.8639364092714703, + "language_loss": 0.8733449, + "learning_rate": 3.322597437887519e-06, + "loss": 0.89462483, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49414062, + "step": 4855, + "time_per_iteration": 2.4928650856018066 + }, + { + "auxiliary_loss_clip": 0.01017139, + "auxiliary_loss_mlp": 0.01010783, + "balance_loss_clip": 1.00639629, + "balance_loss_mlp": 1.00586987, + "epoch": 0.2919585149556591, + "flos": 71313065032320.0, + "grad_norm": 0.8226205422264361, + "language_loss": 0.60308337, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62336266, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.04394531, + "router_z_loss_mlp": 0.11279297, + "step": 4856, + "time_per_iteration": 3.1482086181640625 + }, + { + "auxiliary_loss_clip": 0.01070308, + "auxiliary_loss_mlp": 0.01049347, + "balance_loss_clip": 1.01854348, + "balance_loss_mlp": 1.02170873, + "epoch": 0.2920186382083271, + "flos": 15632290368000.0, + "grad_norm": 1.6784295241136882, + "language_loss": 0.69830358, + "learning_rate": 3.322013049531664e-06, + "loss": 0.71950018, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48632812, + "step": 4857, + "time_per_iteration": 2.467507839202881 + }, + { + "auxiliary_loss_clip": 0.01071382, + "auxiliary_loss_mlp": 0.01045638, + "balance_loss_clip": 1.01489353, + "balance_loss_mlp": 1.02295709, + "epoch": 0.29207876146099504, + "flos": 28364765564160.0, + "grad_norm": 2.011303578434968, + "language_loss": 0.85622841, + "learning_rate": 3.321720780151895e-06, + "loss": 0.87739861, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.484375, + "step": 4858, + "time_per_iteration": 2.440809488296509 + }, + { + "auxiliary_loss_clip": 0.01071842, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_clip": 1.02237201, + "balance_loss_mlp": 1.02308083, + "epoch": 0.292138884713663, + "flos": 21869907984000.0, + "grad_norm": 1.623429923013787, + "language_loss": 0.79075944, + "learning_rate": 3.321428460652342e-06, + "loss": 0.81202418, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.48828125, + "step": 4859, + "time_per_iteration": 2.414384603500366 + }, + { + "auxiliary_loss_clip": 0.01073159, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_clip": 1.01900613, + "balance_loss_mlp": 1.02229774, + "epoch": 0.29219900796633097, + "flos": 20991598110720.0, + "grad_norm": 2.6628204581679316, + "language_loss": 0.7077781, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.72903091, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5078125, + "step": 4860, + "time_per_iteration": 2.41534686088562 + }, + { + "auxiliary_loss_clip": 0.01070193, + "auxiliary_loss_mlp": 0.01052389, + "balance_loss_clip": 1.02083397, + "balance_loss_mlp": 1.02235961, + "epoch": 0.29225913121899894, + "flos": 35003221032960.0, + "grad_norm": 2.139665364680057, + "language_loss": 0.76568305, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78690886, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.47851562, + "step": 4861, + "time_per_iteration": 2.544222831726074 + }, + { + "auxiliary_loss_clip": 0.01067884, + "auxiliary_loss_mlp": 0.0104972, + "balance_loss_clip": 1.01989377, + "balance_loss_mlp": 1.02146459, + "epoch": 0.2923192544716669, + "flos": 13514843053440.0, + "grad_norm": 2.496752382358247, + "language_loss": 0.93723172, + "learning_rate": 3.320551201545832e-06, + "loss": 0.95840776, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.46484375, + "step": 4862, + "time_per_iteration": 2.362663745880127 + }, + { + "auxiliary_loss_clip": 0.01069886, + "auxiliary_loss_mlp": 0.0104875, + "balance_loss_clip": 1.01885247, + "balance_loss_mlp": 1.02121615, + "epoch": 0.29237937772433487, + "flos": 19462506894720.0, + "grad_norm": 2.2662374111127117, + "language_loss": 0.754219, + "learning_rate": 3.320258681678008e-06, + "loss": 0.77540541, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.48632812, + "step": 4863, + "time_per_iteration": 2.444514274597168 + }, + { + "auxiliary_loss_clip": 0.01068398, + "auxiliary_loss_mlp": 0.01048657, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 1.0232991, + "epoch": 0.29243950097700283, + "flos": 20849536321920.0, + "grad_norm": 1.7105223351914913, + "language_loss": 0.79495978, + "learning_rate": 3.319966111745842e-06, + "loss": 0.81613028, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.453125, + "step": 4864, + "time_per_iteration": 2.4131219387054443 + }, + { + "auxiliary_loss_clip": 0.01072191, + "auxiliary_loss_mlp": 0.0105468, + "balance_loss_clip": 1.01858377, + "balance_loss_mlp": 1.02246499, + "epoch": 0.29249962422967085, + "flos": 23583165955200.0, + "grad_norm": 1.6090769220502443, + "language_loss": 0.82753515, + "learning_rate": 3.319673491760429e-06, + "loss": 0.84880388, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.49804688, + "step": 4865, + "time_per_iteration": 2.4482688903808594 + }, + { + "auxiliary_loss_clip": 0.01071196, + "auxiliary_loss_mlp": 0.01049899, + "balance_loss_clip": 1.01540005, + "balance_loss_mlp": 1.02163339, + "epoch": 0.2925597474823388, + "flos": 22272247025280.0, + "grad_norm": 1.8840959476230292, + "language_loss": 0.87059575, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.89180672, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.49609375, + "step": 4866, + "time_per_iteration": 2.420487403869629 + }, + { + "auxiliary_loss_clip": 0.01070118, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_clip": 1.01583707, + "balance_loss_mlp": 1.02221847, + "epoch": 0.2926198707350068, + "flos": 34454770485120.0, + "grad_norm": 1.596215330631747, + "language_loss": 0.76350784, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.78469002, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47851562, + "step": 4867, + "time_per_iteration": 2.5623457431793213 + }, + { + "auxiliary_loss_clip": 0.01073657, + "auxiliary_loss_mlp": 0.01058701, + "balance_loss_clip": 1.02491713, + "balance_loss_mlp": 1.02275741, + "epoch": 0.29267999398767475, + "flos": 20703110613120.0, + "grad_norm": 1.913639311522078, + "language_loss": 0.74725288, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.76857644, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5078125, + "step": 4868, + "time_per_iteration": 2.382826328277588 + }, + { + "auxiliary_loss_clip": 0.01069967, + "auxiliary_loss_mlp": 0.010403, + "balance_loss_clip": 1.00959158, + "balance_loss_mlp": 1.02296472, + "epoch": 0.2927401172403427, + "flos": 18367700480640.0, + "grad_norm": 1.4192228196538488, + "language_loss": 0.75830472, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.77940738, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.46875, + "step": 4869, + "time_per_iteration": 2.4319021701812744 + }, + { + "auxiliary_loss_clip": 0.01072517, + "auxiliary_loss_mlp": 0.01053297, + "balance_loss_clip": 1.01867843, + "balance_loss_mlp": 1.02307403, + "epoch": 0.2928002404930107, + "flos": 26102847576960.0, + "grad_norm": 1.470710897877159, + "language_loss": 0.78101373, + "learning_rate": 3.318209641423088e-06, + "loss": 0.80227196, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.49414062, + "step": 4870, + "time_per_iteration": 2.4650747776031494 + }, + { + "auxiliary_loss_clip": 0.01073812, + "auxiliary_loss_mlp": 0.01059873, + "balance_loss_clip": 1.02244115, + "balance_loss_mlp": 1.02202988, + "epoch": 0.29286036374567864, + "flos": 21323656851840.0, + "grad_norm": 2.291424017738178, + "language_loss": 0.69248939, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.7138263, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.51953125, + "step": 4871, + "time_per_iteration": 2.4673917293548584 + }, + { + "auxiliary_loss_clip": 0.01070124, + "auxiliary_loss_mlp": 0.01057419, + "balance_loss_clip": 1.02506566, + "balance_loss_mlp": 1.02232099, + "epoch": 0.2929204869983466, + "flos": 29568221729280.0, + "grad_norm": 1.7290370470554173, + "language_loss": 0.78987998, + "learning_rate": 3.317623751303933e-06, + "loss": 0.81115538, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47851562, + "step": 4872, + "time_per_iteration": 2.5025992393493652 + }, + { + "auxiliary_loss_clip": 0.01073132, + "auxiliary_loss_mlp": 0.01056448, + "balance_loss_clip": 1.01808643, + "balance_loss_mlp": 1.02347612, + "epoch": 0.2929806102510146, + "flos": 19057374944640.0, + "grad_norm": 3.6130454317329086, + "language_loss": 0.74260318, + "learning_rate": 3.317330731292164e-06, + "loss": 0.76389897, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.49609375, + "step": 4873, + "time_per_iteration": 2.461566209793091 + }, + { + "auxiliary_loss_clip": 0.01072895, + "auxiliary_loss_mlp": 0.010536, + "balance_loss_clip": 1.01843297, + "balance_loss_mlp": 1.02171183, + "epoch": 0.29304073350368254, + "flos": 21943155749760.0, + "grad_norm": 1.9713303159774065, + "language_loss": 0.79986107, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.82112604, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.51171875, + "step": 4874, + "time_per_iteration": 2.3997695446014404 + }, + { + "auxiliary_loss_clip": 0.01077039, + "auxiliary_loss_mlp": 0.01053078, + "balance_loss_clip": 1.01581359, + "balance_loss_mlp": 1.02475643, + "epoch": 0.2931008567563505, + "flos": 15449904092160.0, + "grad_norm": 2.1095128928403106, + "language_loss": 0.7898097, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.81111085, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5234375, + "step": 4875, + "time_per_iteration": 2.406203269958496 + }, + { + "auxiliary_loss_clip": 0.01075784, + "auxiliary_loss_mlp": 0.01053323, + "balance_loss_clip": 1.01763201, + "balance_loss_mlp": 1.02586794, + "epoch": 0.29316098000901847, + "flos": 16982207153280.0, + "grad_norm": 2.9449864798644336, + "language_loss": 0.70681322, + "learning_rate": 3.316451371581431e-06, + "loss": 0.72810429, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5, + "step": 4876, + "time_per_iteration": 2.3931055068969727 + }, + { + "auxiliary_loss_clip": 0.01071525, + "auxiliary_loss_mlp": 0.0105532, + "balance_loss_clip": 1.0227282, + "balance_loss_mlp": 1.0228765, + "epoch": 0.29322110326168643, + "flos": 16356912969600.0, + "grad_norm": 1.9634521644839524, + "language_loss": 0.84057999, + "learning_rate": 3.316158151823096e-06, + "loss": 0.86184835, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.48632812, + "step": 4877, + "time_per_iteration": 2.422816514968872 + }, + { + "auxiliary_loss_clip": 0.01078558, + "auxiliary_loss_mlp": 0.01059278, + "balance_loss_clip": 1.02268076, + "balance_loss_mlp": 1.02663875, + "epoch": 0.29328122651435445, + "flos": 13990010924160.0, + "grad_norm": 3.8218805808170018, + "language_loss": 0.71173424, + "learning_rate": 3.315864882155911e-06, + "loss": 0.73311263, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.51953125, + "step": 4878, + "time_per_iteration": 2.3955490589141846 + }, + { + "auxiliary_loss_clip": 0.01074607, + "auxiliary_loss_mlp": 0.0106074, + "balance_loss_clip": 1.02385676, + "balance_loss_mlp": 1.02409339, + "epoch": 0.2933413497670224, + "flos": 25263430824960.0, + "grad_norm": 1.8630777652454271, + "language_loss": 0.74581999, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.76717347, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5078125, + "step": 4879, + "time_per_iteration": 2.4634108543395996 + }, + { + "auxiliary_loss_clip": 0.01075968, + "auxiliary_loss_mlp": 0.01068686, + "balance_loss_clip": 1.03318548, + "balance_loss_mlp": 1.02404356, + "epoch": 0.2934014730196904, + "flos": 32122397640960.0, + "grad_norm": 2.1000944207387486, + "language_loss": 0.682441, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.70388758, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.51953125, + "step": 4880, + "time_per_iteration": 3.931900978088379 + }, + { + "auxiliary_loss_clip": 0.01072769, + "auxiliary_loss_mlp": 0.01057478, + "balance_loss_clip": 1.02185822, + "balance_loss_mlp": 1.02212763, + "epoch": 0.29346159627235835, + "flos": 24351359800320.0, + "grad_norm": 20.708060075014668, + "language_loss": 0.73499632, + "learning_rate": 3.314984773812481e-06, + "loss": 0.75629878, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5078125, + "step": 4881, + "time_per_iteration": 2.44140887260437 + }, + { + "auxiliary_loss_clip": 0.01072737, + "auxiliary_loss_mlp": 0.01061236, + "balance_loss_clip": 1.0237323, + "balance_loss_mlp": 1.02187037, + "epoch": 0.2935217195250263, + "flos": 22745669328000.0, + "grad_norm": 1.63244243773188, + "language_loss": 0.8459152, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86725497, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5078125, + "step": 4882, + "time_per_iteration": 3.8535501956939697 + }, + { + "auxiliary_loss_clip": 0.01074145, + "auxiliary_loss_mlp": 0.01056139, + "balance_loss_clip": 1.01739645, + "balance_loss_mlp": 1.02330899, + "epoch": 0.2935818427776943, + "flos": 21724494704640.0, + "grad_norm": 2.1956220755468503, + "language_loss": 0.73960423, + "learning_rate": 3.314397785576548e-06, + "loss": 0.76090711, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.5078125, + "step": 4883, + "time_per_iteration": 2.3880820274353027 + }, + { + "auxiliary_loss_clip": 0.01072391, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.01862621, + "balance_loss_mlp": 1.02203321, + "epoch": 0.29364196603036224, + "flos": 23803851859200.0, + "grad_norm": 2.2670835496753154, + "language_loss": 0.93938446, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.96065319, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.50390625, + "step": 4884, + "time_per_iteration": 3.9766452312469482 + }, + { + "auxiliary_loss_clip": 0.01074587, + "auxiliary_loss_mlp": 0.01057759, + "balance_loss_clip": 1.02230597, + "balance_loss_mlp": 1.02402091, + "epoch": 0.2937020892830302, + "flos": 23469139854720.0, + "grad_norm": 2.316736242430448, + "language_loss": 0.75325751, + "learning_rate": 3.313810597972234e-06, + "loss": 0.77458096, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.50390625, + "step": 4885, + "time_per_iteration": 2.411137580871582 + }, + { + "auxiliary_loss_clip": 0.01069872, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.01873064, + "balance_loss_mlp": 1.02077472, + "epoch": 0.2937622125356982, + "flos": 24271793078400.0, + "grad_norm": 2.02589585953045, + "language_loss": 0.86321092, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.88443124, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4921875, + "step": 4886, + "time_per_iteration": 3.812697649002075 + }, + { + "auxiliary_loss_clip": 0.01072576, + "auxiliary_loss_mlp": 0.01055209, + "balance_loss_clip": 1.02075756, + "balance_loss_mlp": 1.02184367, + "epoch": 0.29382233578836614, + "flos": 20661564228480.0, + "grad_norm": 2.480560310220524, + "language_loss": 0.78955257, + "learning_rate": 3.313223211088603e-06, + "loss": 0.81083047, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 4887, + "time_per_iteration": 2.427464246749878 + }, + { + "auxiliary_loss_clip": 0.01072477, + "auxiliary_loss_mlp": 0.01057146, + "balance_loss_clip": 1.02293324, + "balance_loss_mlp": 1.02183831, + "epoch": 0.2938824590410341, + "flos": 16544117013120.0, + "grad_norm": 2.3209977973659384, + "language_loss": 0.80854034, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.82983661, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5078125, + "step": 4888, + "time_per_iteration": 2.3706562519073486 + }, + { + "auxiliary_loss_clip": 0.01070153, + "auxiliary_loss_mlp": 0.01048503, + "balance_loss_clip": 1.0156486, + "balance_loss_mlp": 1.02175987, + "epoch": 0.29394258229370207, + "flos": 37923949975680.0, + "grad_norm": 4.030520061916835, + "language_loss": 0.56728446, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.588471, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.484375, + "step": 4889, + "time_per_iteration": 2.581333637237549 + }, + { + "auxiliary_loss_clip": 0.01073057, + "auxiliary_loss_mlp": 0.01055146, + "balance_loss_clip": 1.01730871, + "balance_loss_mlp": 1.02242005, + "epoch": 0.29400270554637004, + "flos": 20043741075840.0, + "grad_norm": 1.8805411142663446, + "language_loss": 0.86202985, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.88331187, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 4890, + "time_per_iteration": 2.4000842571258545 + }, + { + "auxiliary_loss_clip": 0.01074029, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_clip": 1.02265227, + "balance_loss_mlp": 1.02325094, + "epoch": 0.294062828799038, + "flos": 15265527868800.0, + "grad_norm": 2.0009550088355703, + "language_loss": 0.74154902, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.76286113, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 4891, + "time_per_iteration": 2.405726432800293 + }, + { + "auxiliary_loss_clip": 0.01073932, + "auxiliary_loss_mlp": 0.01054097, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.02365768, + "epoch": 0.294122952051706, + "flos": 22746053352960.0, + "grad_norm": 2.7955587730523157, + "language_loss": 0.78124285, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.80252314, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.50390625, + "step": 4892, + "time_per_iteration": 2.397629737854004 + }, + { + "auxiliary_loss_clip": 0.01069665, + "auxiliary_loss_mlp": 0.0104934, + "balance_loss_clip": 1.01612842, + "balance_loss_mlp": 1.02111638, + "epoch": 0.294183075304374, + "flos": 24971731482240.0, + "grad_norm": 2.327967692509786, + "language_loss": 0.78952008, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.81071019, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48632812, + "step": 4893, + "time_per_iteration": 2.4765195846557617 + }, + { + "auxiliary_loss_clip": 0.01070998, + "auxiliary_loss_mlp": 0.01055734, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.02199149, + "epoch": 0.29424319855704195, + "flos": 30951760020480.0, + "grad_norm": 1.6083825263218963, + "language_loss": 0.85691649, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87818378, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48828125, + "step": 4894, + "time_per_iteration": 2.4833176136016846 + }, + { + "auxiliary_loss_clip": 0.01072848, + "auxiliary_loss_mlp": 0.01049971, + "balance_loss_clip": 1.01778483, + "balance_loss_mlp": 1.02286887, + "epoch": 0.2943033218097099, + "flos": 15230684465280.0, + "grad_norm": 4.0762549356447115, + "language_loss": 0.9149397, + "learning_rate": 3.310871672543274e-06, + "loss": 0.93616784, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.5, + "step": 4895, + "time_per_iteration": 2.401970148086548 + }, + { + "auxiliary_loss_clip": 0.01073209, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_clip": 1.01744437, + "balance_loss_mlp": 1.02204823, + "epoch": 0.2943634450623779, + "flos": 21724808906880.0, + "grad_norm": 1.7204799501478967, + "language_loss": 0.88888407, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.91015232, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.51171875, + "step": 4896, + "time_per_iteration": 2.4428389072418213 + }, + { + "auxiliary_loss_clip": 0.01073168, + "auxiliary_loss_mlp": 0.01060741, + "balance_loss_clip": 1.02628946, + "balance_loss_mlp": 1.02280641, + "epoch": 0.29442356831504585, + "flos": 22600989187200.0, + "grad_norm": 1.861655127612311, + "language_loss": 0.74606395, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.76740313, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.50390625, + "step": 4897, + "time_per_iteration": 2.4077720642089844 + }, + { + "auxiliary_loss_clip": 0.01076597, + "auxiliary_loss_mlp": 0.01057251, + "balance_loss_clip": 1.01841283, + "balance_loss_mlp": 1.02267504, + "epoch": 0.2944836915677138, + "flos": 20010363949440.0, + "grad_norm": 1.9473793465555769, + "language_loss": 0.7636286, + "learning_rate": 3.309989025093813e-06, + "loss": 0.78496706, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5390625, + "step": 4898, + "time_per_iteration": 2.4418370723724365 + }, + { + "auxiliary_loss_clip": 0.01076061, + "auxiliary_loss_mlp": 0.01066152, + "balance_loss_clip": 1.02736139, + "balance_loss_mlp": 1.02287102, + "epoch": 0.2945438148203818, + "flos": 20044893150720.0, + "grad_norm": 2.882657523578406, + "language_loss": 0.72890675, + "learning_rate": 3.309694709912618e-06, + "loss": 0.75032896, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.53125, + "step": 4899, + "time_per_iteration": 2.3737075328826904 + }, + { + "auxiliary_loss_clip": 0.01072335, + "auxiliary_loss_mlp": 0.01056938, + "balance_loss_clip": 1.02129459, + "balance_loss_mlp": 1.02196217, + "epoch": 0.29460393807304974, + "flos": 23732384572800.0, + "grad_norm": 2.0422618610528254, + "language_loss": 0.80682731, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.82812011, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.50390625, + "step": 4900, + "time_per_iteration": 2.464665412902832 + }, + { + "auxiliary_loss_clip": 0.01070789, + "auxiliary_loss_mlp": 0.01061173, + "balance_loss_clip": 1.02481437, + "balance_loss_mlp": 1.02050233, + "epoch": 0.2946640613257177, + "flos": 14975190069120.0, + "grad_norm": 1.7036865815961801, + "language_loss": 0.81882906, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.84014869, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.50390625, + "step": 4901, + "time_per_iteration": 2.3845183849334717 + }, + { + "auxiliary_loss_clip": 0.01066002, + "auxiliary_loss_mlp": 0.01045335, + "balance_loss_clip": 1.01500797, + "balance_loss_mlp": 1.01999021, + "epoch": 0.2947241845783857, + "flos": 24242744960640.0, + "grad_norm": 1.9926728591297775, + "language_loss": 0.59630316, + "learning_rate": 3.308811466431157e-06, + "loss": 0.6174165, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4609375, + "step": 4902, + "time_per_iteration": 2.4277822971343994 + }, + { + "auxiliary_loss_clip": 0.01072511, + "auxiliary_loss_mlp": 0.01051809, + "balance_loss_clip": 1.01833546, + "balance_loss_mlp": 1.02206349, + "epoch": 0.29478430783105364, + "flos": 19937360563200.0, + "grad_norm": 1.7393942931839688, + "language_loss": 0.76862359, + "learning_rate": 3.308516952661925e-06, + "loss": 0.78986681, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.50390625, + "step": 4903, + "time_per_iteration": 2.4084975719451904 + }, + { + "auxiliary_loss_clip": 0.01073023, + "auxiliary_loss_mlp": 0.01053052, + "balance_loss_clip": 1.01662183, + "balance_loss_mlp": 1.0226469, + "epoch": 0.2948444310837216, + "flos": 27380110089600.0, + "grad_norm": 2.3322529459646644, + "language_loss": 0.63927937, + "learning_rate": 3.3082223892736e-06, + "loss": 0.6605401, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.50390625, + "step": 4904, + "time_per_iteration": 2.4827022552490234 + }, + { + "auxiliary_loss_clip": 0.01072478, + "auxiliary_loss_mlp": 0.01055474, + "balance_loss_clip": 1.01994967, + "balance_loss_mlp": 1.0208993, + "epoch": 0.2949045543363896, + "flos": 23404305726720.0, + "grad_norm": 1.9987314484866203, + "language_loss": 0.74582869, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.7671082, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.515625, + "step": 4905, + "time_per_iteration": 2.3973491191864014 + }, + { + "auxiliary_loss_clip": 0.01071154, + "auxiliary_loss_mlp": 0.01055249, + "balance_loss_clip": 1.01927161, + "balance_loss_mlp": 1.0215739, + "epoch": 0.2949646775890576, + "flos": 23950347390720.0, + "grad_norm": 1.8243963168510884, + "language_loss": 0.82407546, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.84533948, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.49609375, + "step": 4906, + "time_per_iteration": 2.463884115219116 + }, + { + "auxiliary_loss_clip": 0.0106723, + "auxiliary_loss_mlp": 0.0105081, + "balance_loss_clip": 1.01855183, + "balance_loss_mlp": 1.01988578, + "epoch": 0.29502480084172555, + "flos": 22783200906240.0, + "grad_norm": 1.9292996709726093, + "language_loss": 0.88355416, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.90473461, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.47265625, + "step": 4907, + "time_per_iteration": 2.4347665309906006 + }, + { + "auxiliary_loss_clip": 0.01075031, + "auxiliary_loss_mlp": 0.01056443, + "balance_loss_clip": 1.018224, + "balance_loss_mlp": 1.0227021, + "epoch": 0.2950849240943935, + "flos": 19645626309120.0, + "grad_norm": 2.0037143148795864, + "language_loss": 0.84027827, + "learning_rate": 3.307043639752782e-06, + "loss": 0.86159301, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5234375, + "step": 4908, + "time_per_iteration": 2.4218852519989014 + }, + { + "auxiliary_loss_clip": 0.01016229, + "auxiliary_loss_mlp": 0.01007377, + "balance_loss_clip": 1.0026089, + "balance_loss_mlp": 1.00526738, + "epoch": 0.2951450473470615, + "flos": 70999790469120.0, + "grad_norm": 0.7758432712805711, + "language_loss": 0.57343763, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59367371, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.04760742, + "router_z_loss_mlp": 0.109375, + "step": 4909, + "time_per_iteration": 2.930645704269409 + }, + { + "auxiliary_loss_clip": 0.01070016, + "auxiliary_loss_mlp": 0.01058493, + "balance_loss_clip": 1.02525711, + "balance_loss_mlp": 1.02135289, + "epoch": 0.29520517059972945, + "flos": 22965203157120.0, + "grad_norm": 1.6828892703361031, + "language_loss": 0.87812179, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.89940691, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48632812, + "step": 4910, + "time_per_iteration": 2.4146642684936523 + }, + { + "auxiliary_loss_clip": 0.01068003, + "auxiliary_loss_mlp": 0.01052024, + "balance_loss_clip": 1.01979017, + "balance_loss_mlp": 1.02125287, + "epoch": 0.2952652938523974, + "flos": 20484624124800.0, + "grad_norm": 1.9260426355727707, + "language_loss": 0.7508145, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.77201474, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.46679688, + "step": 4911, + "time_per_iteration": 2.3895938396453857 + }, + { + "auxiliary_loss_clip": 0.01068787, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.01498497, + "balance_loss_mlp": 1.02101326, + "epoch": 0.2953254171050654, + "flos": 19645556486400.0, + "grad_norm": 1.6150248287891977, + "language_loss": 0.9094699, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.93063021, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4765625, + "step": 4912, + "time_per_iteration": 2.4016170501708984 + }, + { + "auxiliary_loss_clip": 0.01071249, + "auxiliary_loss_mlp": 0.01058366, + "balance_loss_clip": 1.0247488, + "balance_loss_mlp": 1.02207303, + "epoch": 0.29538554035773334, + "flos": 22746856314240.0, + "grad_norm": 1.5665799681793202, + "language_loss": 0.84850657, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.86980271, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 4913, + "time_per_iteration": 2.3933815956115723 + }, + { + "auxiliary_loss_clip": 0.01071415, + "auxiliary_loss_mlp": 0.01051326, + "balance_loss_clip": 1.01651704, + "balance_loss_mlp": 1.0206145, + "epoch": 0.2954456636104013, + "flos": 21870780768000.0, + "grad_norm": 1.8005118118482717, + "language_loss": 0.78663898, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.80786639, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5078125, + "step": 4914, + "time_per_iteration": 2.4108452796936035 + }, + { + "auxiliary_loss_clip": 0.01069694, + "auxiliary_loss_mlp": 0.0105911, + "balance_loss_clip": 1.02244163, + "balance_loss_mlp": 1.02077579, + "epoch": 0.2955057868630693, + "flos": 40440978334080.0, + "grad_norm": 1.748238031148167, + "language_loss": 0.82918084, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.85046887, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.48828125, + "step": 4915, + "time_per_iteration": 2.5443222522735596 + }, + { + "auxiliary_loss_clip": 0.01074516, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.02560246, + "balance_loss_mlp": 1.02329421, + "epoch": 0.29556591011573724, + "flos": 22563422697600.0, + "grad_norm": 2.4901175114973033, + "language_loss": 0.85699809, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.87837458, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.51171875, + "step": 4916, + "time_per_iteration": 2.417827844619751 + }, + { + "auxiliary_loss_clip": 0.01069864, + "auxiliary_loss_mlp": 0.01049955, + "balance_loss_clip": 1.01724362, + "balance_loss_mlp": 1.02229571, + "epoch": 0.2956260333684052, + "flos": 22088254826880.0, + "grad_norm": 2.0541672440034686, + "language_loss": 0.72090679, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.74210501, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4765625, + "step": 4917, + "time_per_iteration": 2.405491352081299 + }, + { + "auxiliary_loss_clip": 0.01073285, + "auxiliary_loss_mlp": 0.01051252, + "balance_loss_clip": 1.01606154, + "balance_loss_mlp": 1.02350497, + "epoch": 0.2956861566210732, + "flos": 16434559566720.0, + "grad_norm": 2.02610689917768, + "language_loss": 0.92322052, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.94446588, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.49804688, + "step": 4918, + "time_per_iteration": 2.4028449058532715 + }, + { + "auxiliary_loss_clip": 0.0107474, + "auxiliary_loss_mlp": 0.01055703, + "balance_loss_clip": 1.02098966, + "balance_loss_mlp": 1.02430367, + "epoch": 0.2957462798737412, + "flos": 25810903854720.0, + "grad_norm": 2.0603121661759816, + "language_loss": 0.73989892, + "learning_rate": 3.303797991757425e-06, + "loss": 0.76120335, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.50390625, + "step": 4919, + "time_per_iteration": 3.8773398399353027 + }, + { + "auxiliary_loss_clip": 0.01069251, + "auxiliary_loss_mlp": 0.01053329, + "balance_loss_clip": 1.01954544, + "balance_loss_mlp": 1.02120352, + "epoch": 0.29580640312640916, + "flos": 16689914317440.0, + "grad_norm": 2.010343786307566, + "language_loss": 0.77442658, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.79565239, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.48046875, + "step": 4920, + "time_per_iteration": 2.374286413192749 + }, + { + "auxiliary_loss_clip": 0.01073527, + "auxiliary_loss_mlp": 0.0106088, + "balance_loss_clip": 1.02480698, + "balance_loss_mlp": 1.02324927, + "epoch": 0.2958665263790771, + "flos": 23944621927680.0, + "grad_norm": 2.641901891840026, + "language_loss": 0.69957709, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.72092116, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.50390625, + "step": 4921, + "time_per_iteration": 2.4469776153564453 + }, + { + "auxiliary_loss_clip": 0.0107464, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.02188039, + "balance_loss_mlp": 1.02316546, + "epoch": 0.2959266496317451, + "flos": 18477432483840.0, + "grad_norm": 1.9260588407204906, + "language_loss": 0.75770396, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.77902567, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.515625, + "step": 4922, + "time_per_iteration": 3.7966063022613525 + }, + { + "auxiliary_loss_clip": 0.01075219, + "auxiliary_loss_mlp": 0.01050785, + "balance_loss_clip": 1.01493919, + "balance_loss_mlp": 1.02236199, + "epoch": 0.29598677288441305, + "flos": 25956317134080.0, + "grad_norm": 1.9803107240539604, + "language_loss": 0.77720785, + "learning_rate": 3.302616272134737e-06, + "loss": 0.79846793, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.52734375, + "step": 4923, + "time_per_iteration": 2.4408934116363525 + }, + { + "auxiliary_loss_clip": 0.01071541, + "auxiliary_loss_mlp": 0.01052874, + "balance_loss_clip": 1.01870894, + "balance_loss_mlp": 1.02268553, + "epoch": 0.296046896137081, + "flos": 25154815985280.0, + "grad_norm": 1.690597209565533, + "language_loss": 0.87408042, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.89532459, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.48828125, + "step": 4924, + "time_per_iteration": 3.8029675483703613 + }, + { + "auxiliary_loss_clip": 0.01071253, + "auxiliary_loss_mlp": 0.01051231, + "balance_loss_clip": 1.0169704, + "balance_loss_mlp": 1.02319121, + "epoch": 0.296107019389749, + "flos": 21760106158080.0, + "grad_norm": 1.3867499715422766, + "language_loss": 0.82853138, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84975624, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.48046875, + "step": 4925, + "time_per_iteration": 2.3817451000213623 + }, + { + "auxiliary_loss_clip": 0.01073216, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.01692891, + "balance_loss_mlp": 1.02528656, + "epoch": 0.29616714264241695, + "flos": 17959286862720.0, + "grad_norm": 3.0850632983080497, + "language_loss": 0.88759178, + "learning_rate": 3.301729463727452e-06, + "loss": 0.90882415, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48046875, + "step": 4926, + "time_per_iteration": 3.849029541015625 + }, + { + "auxiliary_loss_clip": 0.01072104, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.01965237, + "balance_loss_mlp": 1.02222085, + "epoch": 0.2962272658950849, + "flos": 15011883774720.0, + "grad_norm": 1.9768862123988153, + "language_loss": 0.8768667, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.89810944, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5, + "step": 4927, + "time_per_iteration": 2.34637713432312 + }, + { + "auxiliary_loss_clip": 0.01070448, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.0184567, + "balance_loss_mlp": 1.023561, + "epoch": 0.2962873891477529, + "flos": 14719974963840.0, + "grad_norm": 1.6650179327074452, + "language_loss": 0.81734651, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83853734, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.46875, + "step": 4928, + "time_per_iteration": 2.4015843868255615 + }, + { + "auxiliary_loss_clip": 0.01074564, + "auxiliary_loss_mlp": 0.01061494, + "balance_loss_clip": 1.02172613, + "balance_loss_mlp": 1.02197695, + "epoch": 0.29634751240042084, + "flos": 26722590854400.0, + "grad_norm": 2.6008225094475828, + "language_loss": 0.75659472, + "learning_rate": 3.300842211064773e-06, + "loss": 0.77795529, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.52734375, + "step": 4929, + "time_per_iteration": 2.420578718185425 + }, + { + "auxiliary_loss_clip": 0.01075235, + "auxiliary_loss_mlp": 0.01058544, + "balance_loss_clip": 1.02085018, + "balance_loss_mlp": 1.02380538, + "epoch": 0.2964076356530888, + "flos": 14570511966720.0, + "grad_norm": 2.3251143620708588, + "language_loss": 0.7397933, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.76113117, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.515625, + "step": 4930, + "time_per_iteration": 2.3859140872955322 + }, + { + "auxiliary_loss_clip": 0.01017994, + "auxiliary_loss_mlp": 0.01010924, + "balance_loss_clip": 1.00596523, + "balance_loss_mlp": 1.0059793, + "epoch": 0.29646775890575683, + "flos": 63100969585920.0, + "grad_norm": 0.8118460453780725, + "language_loss": 0.60787827, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62816745, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.04956055, + "router_z_loss_mlp": 0.12011719, + "step": 4931, + "time_per_iteration": 2.968148708343506 + }, + { + "auxiliary_loss_clip": 0.01015881, + "auxiliary_loss_mlp": 0.01010139, + "balance_loss_clip": 1.00568056, + "balance_loss_mlp": 1.00415421, + "epoch": 0.2965278821584248, + "flos": 63064345703040.0, + "grad_norm": 0.7475617868296669, + "language_loss": 0.52471447, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54497457, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.1171875, + "step": 4932, + "time_per_iteration": 2.9332351684570312 + }, + { + "auxiliary_loss_clip": 0.01068926, + "auxiliary_loss_mlp": 0.01055929, + "balance_loss_clip": 1.02438581, + "balance_loss_mlp": 1.02187634, + "epoch": 0.29658800541109276, + "flos": 23767612001280.0, + "grad_norm": 1.822576153423367, + "language_loss": 0.83077341, + "learning_rate": 3.299658516973972e-06, + "loss": 0.85202199, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.47070312, + "step": 4933, + "time_per_iteration": 2.4172708988189697 + }, + { + "auxiliary_loss_clip": 0.01067279, + "auxiliary_loss_mlp": 0.01051815, + "balance_loss_clip": 1.01977205, + "balance_loss_mlp": 1.02147555, + "epoch": 0.2966481286637607, + "flos": 23987390209920.0, + "grad_norm": 2.1605541960150134, + "language_loss": 0.76602173, + "learning_rate": 3.299362470215261e-06, + "loss": 0.78721261, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45703125, + "step": 4934, + "time_per_iteration": 2.4323980808258057 + }, + { + "auxiliary_loss_clip": 0.01069594, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_clip": 1.03270173, + "balance_loss_mlp": 1.02077115, + "epoch": 0.2967082519164287, + "flos": 17164209404160.0, + "grad_norm": 1.7987647537193867, + "language_loss": 0.64107931, + "learning_rate": 3.299066374184594e-06, + "loss": 0.66244662, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48828125, + "step": 4935, + "time_per_iteration": 2.3592231273651123 + }, + { + "auxiliary_loss_clip": 0.01068163, + "auxiliary_loss_mlp": 0.01068427, + "balance_loss_clip": 1.0324738, + "balance_loss_mlp": 1.02128315, + "epoch": 0.29676837516909665, + "flos": 29386428946560.0, + "grad_norm": 1.514062830653207, + "language_loss": 0.80694783, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82831371, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.46875, + "step": 4936, + "time_per_iteration": 2.4798471927642822 + }, + { + "auxiliary_loss_clip": 0.01071307, + "auxiliary_loss_mlp": 0.0106556, + "balance_loss_clip": 1.03058386, + "balance_loss_mlp": 1.02231121, + "epoch": 0.2968284984217646, + "flos": 34749786407040.0, + "grad_norm": 1.4403440175418074, + "language_loss": 0.75601327, + "learning_rate": 3.298474034352309e-06, + "loss": 0.77738202, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49023438, + "step": 4937, + "time_per_iteration": 2.511199951171875 + }, + { + "auxiliary_loss_clip": 0.01068746, + "auxiliary_loss_mlp": 0.01052344, + "balance_loss_clip": 1.01860785, + "balance_loss_mlp": 1.02212822, + "epoch": 0.2968886216744326, + "flos": 21543016124160.0, + "grad_norm": 1.742266291902911, + "language_loss": 0.79571503, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.81692594, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46484375, + "step": 4938, + "time_per_iteration": 2.437199831008911 + }, + { + "auxiliary_loss_clip": 0.01075073, + "auxiliary_loss_mlp": 0.0106454, + "balance_loss_clip": 1.02920663, + "balance_loss_mlp": 1.02450371, + "epoch": 0.29694874492710055, + "flos": 12786484936320.0, + "grad_norm": 2.406652152552255, + "language_loss": 0.7837491, + "learning_rate": 3.297881497566964e-06, + "loss": 0.80514526, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5078125, + "step": 4939, + "time_per_iteration": 2.376600503921509 + }, + { + "auxiliary_loss_clip": 0.01076047, + "auxiliary_loss_mlp": 0.01059316, + "balance_loss_clip": 1.02498341, + "balance_loss_mlp": 1.02439332, + "epoch": 0.2970088681797685, + "flos": 24568868770560.0, + "grad_norm": 1.781395186859976, + "language_loss": 0.80015892, + "learning_rate": 3.297585155344979e-06, + "loss": 0.82151258, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.515625, + "step": 4940, + "time_per_iteration": 2.470235586166382 + }, + { + "auxiliary_loss_clip": 0.01075344, + "auxiliary_loss_mlp": 0.0105236, + "balance_loss_clip": 1.01459444, + "balance_loss_mlp": 1.02553821, + "epoch": 0.2970689914324365, + "flos": 23658054554880.0, + "grad_norm": 1.5239158411174356, + "language_loss": 0.76627851, + "learning_rate": 3.297288763918435e-06, + "loss": 0.78755552, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.49804688, + "step": 4941, + "time_per_iteration": 2.4202511310577393 + }, + { + "auxiliary_loss_clip": 0.01076294, + "auxiliary_loss_mlp": 0.01054551, + "balance_loss_clip": 1.01900256, + "balance_loss_mlp": 1.02520919, + "epoch": 0.29712911468510445, + "flos": 39668909328000.0, + "grad_norm": 2.467371886788019, + "language_loss": 0.76374435, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.78505284, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.51171875, + "step": 4942, + "time_per_iteration": 2.568516492843628 + }, + { + "auxiliary_loss_clip": 0.01077161, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.02258706, + "balance_loss_mlp": 1.0264678, + "epoch": 0.2971892379377724, + "flos": 26394127983360.0, + "grad_norm": 1.8201815378262778, + "language_loss": 0.72306985, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.74442589, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5078125, + "step": 4943, + "time_per_iteration": 2.4508590698242188 + }, + { + "auxiliary_loss_clip": 0.01079083, + "auxiliary_loss_mlp": 0.0104923, + "balance_loss_clip": 1.01268005, + "balance_loss_mlp": 1.02719212, + "epoch": 0.2972493611904404, + "flos": 17602229721600.0, + "grad_norm": 1.932238403533561, + "language_loss": 0.81977844, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.84106153, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.51953125, + "step": 4944, + "time_per_iteration": 2.413893222808838 + }, + { + "auxiliary_loss_clip": 0.01071874, + "auxiliary_loss_mlp": 0.01055656, + "balance_loss_clip": 1.02413738, + "balance_loss_mlp": 1.02408659, + "epoch": 0.2973094844431084, + "flos": 20411725472640.0, + "grad_norm": 2.066699572052358, + "language_loss": 0.84824955, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.86952484, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.47851562, + "step": 4945, + "time_per_iteration": 2.4090192317962646 + }, + { + "auxiliary_loss_clip": 0.01074413, + "auxiliary_loss_mlp": 0.01054625, + "balance_loss_clip": 1.02298677, + "balance_loss_mlp": 1.02712643, + "epoch": 0.29736960769577636, + "flos": 17492532629760.0, + "grad_norm": 2.284242911992892, + "language_loss": 0.68799412, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.70928448, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.47265625, + "step": 4946, + "time_per_iteration": 2.438011407852173 + }, + { + "auxiliary_loss_clip": 0.01075999, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_clip": 1.01692104, + "balance_loss_mlp": 1.02659726, + "epoch": 0.2974297309484443, + "flos": 26102777754240.0, + "grad_norm": 1.7237954190274103, + "language_loss": 0.75457019, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.77582675, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.49414062, + "step": 4947, + "time_per_iteration": 2.4563536643981934 + }, + { + "auxiliary_loss_clip": 0.01076242, + "auxiliary_loss_mlp": 0.01058754, + "balance_loss_clip": 1.02523255, + "balance_loss_mlp": 1.02560616, + "epoch": 0.2974898542011123, + "flos": 25665246195840.0, + "grad_norm": 2.8018723218448933, + "language_loss": 0.74262404, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.76397395, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.5078125, + "step": 4948, + "time_per_iteration": 2.487300157546997 + }, + { + "auxiliary_loss_clip": 0.01070975, + "auxiliary_loss_mlp": 0.0104876, + "balance_loss_clip": 1.0172168, + "balance_loss_mlp": 1.02363586, + "epoch": 0.29754997745378026, + "flos": 18660342430080.0, + "grad_norm": 2.6897353909676216, + "language_loss": 0.84877837, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86997569, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.47265625, + "step": 4949, + "time_per_iteration": 2.41959285736084 + }, + { + "auxiliary_loss_clip": 0.01071619, + "auxiliary_loss_mlp": 0.0105519, + "balance_loss_clip": 1.02222812, + "balance_loss_mlp": 1.02431333, + "epoch": 0.2976101007064482, + "flos": 22273468922880.0, + "grad_norm": 2.5214183998609663, + "language_loss": 0.7251277, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.74639583, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47265625, + "step": 4950, + "time_per_iteration": 2.4777534008026123 + }, + { + "auxiliary_loss_clip": 0.01070759, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.03028536, + "balance_loss_mlp": 1.02473581, + "epoch": 0.2976702239591162, + "flos": 21944552204160.0, + "grad_norm": 1.9246962046195801, + "language_loss": 0.83728421, + "learning_rate": 3.294322145875789e-06, + "loss": 0.85858142, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4609375, + "step": 4951, + "time_per_iteration": 2.4023971557617188 + }, + { + "auxiliary_loss_clip": 0.01070931, + "auxiliary_loss_mlp": 0.01053651, + "balance_loss_clip": 1.02234674, + "balance_loss_mlp": 1.02250302, + "epoch": 0.29773034721178415, + "flos": 24636251427840.0, + "grad_norm": 2.8126589290533865, + "language_loss": 0.76597071, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.78721642, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.484375, + "step": 4952, + "time_per_iteration": 2.4573354721069336 + }, + { + "auxiliary_loss_clip": 0.01071916, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02249324, + "balance_loss_mlp": 1.02409458, + "epoch": 0.2977904704644521, + "flos": 20556545258880.0, + "grad_norm": 2.124694617560849, + "language_loss": 0.85115862, + "learning_rate": 3.293728232937228e-06, + "loss": 0.87243414, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.47851562, + "step": 4953, + "time_per_iteration": 2.4074764251708984 + }, + { + "auxiliary_loss_clip": 0.01071434, + "auxiliary_loss_mlp": 0.01056046, + "balance_loss_clip": 1.0238595, + "balance_loss_mlp": 1.02293324, + "epoch": 0.2978505937171201, + "flos": 18915452801280.0, + "grad_norm": 2.1397316537029987, + "language_loss": 0.75664568, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.77792048, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.484375, + "step": 4954, + "time_per_iteration": 2.4374098777770996 + }, + { + "auxiliary_loss_clip": 0.0106787, + "auxiliary_loss_mlp": 0.01051478, + "balance_loss_clip": 1.02186668, + "balance_loss_mlp": 1.02131677, + "epoch": 0.29791071696978805, + "flos": 19316744501760.0, + "grad_norm": 1.7503918987079452, + "language_loss": 0.77055359, + "learning_rate": 3.293134123765452e-06, + "loss": 0.79174709, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.46484375, + "step": 4955, + "time_per_iteration": 2.3637735843658447 + }, + { + "auxiliary_loss_clip": 0.01070725, + "auxiliary_loss_mlp": 0.01050584, + "balance_loss_clip": 1.01782525, + "balance_loss_mlp": 1.02195513, + "epoch": 0.297970840222456, + "flos": 18805825532160.0, + "grad_norm": 1.8067624729262326, + "language_loss": 0.73442382, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.75563693, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48828125, + "step": 4956, + "time_per_iteration": 2.428098678588867 + }, + { + "auxiliary_loss_clip": 0.01072456, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_clip": 1.02250969, + "balance_loss_mlp": 1.021088, + "epoch": 0.298030963475124, + "flos": 22851770549760.0, + "grad_norm": 1.9053165860700283, + "language_loss": 0.8054024, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.82669312, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.51171875, + "step": 4957, + "time_per_iteration": 2.4103972911834717 + }, + { + "auxiliary_loss_clip": 0.0107012, + "auxiliary_loss_mlp": 0.01053212, + "balance_loss_clip": 1.02031076, + "balance_loss_mlp": 1.02158082, + "epoch": 0.298091086727792, + "flos": 21867499100160.0, + "grad_norm": 1.5783504801481614, + "language_loss": 0.71423602, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.73546934, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48632812, + "step": 4958, + "time_per_iteration": 3.8554458618164062 + }, + { + "auxiliary_loss_clip": 0.01069486, + "auxiliary_loss_mlp": 0.01055142, + "balance_loss_clip": 1.02398074, + "balance_loss_mlp": 1.02231836, + "epoch": 0.29815120998045996, + "flos": 21174054209280.0, + "grad_norm": 1.5973760184141246, + "language_loss": 0.79588437, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81713068, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.47070312, + "step": 4959, + "time_per_iteration": 2.397167444229126 + }, + { + "auxiliary_loss_clip": 0.01067438, + "auxiliary_loss_mlp": 0.01050118, + "balance_loss_clip": 1.01752615, + "balance_loss_mlp": 1.02023959, + "epoch": 0.29821133323312793, + "flos": 19895395242240.0, + "grad_norm": 1.702436726490042, + "language_loss": 0.80673039, + "learning_rate": 3.291647992907147e-06, + "loss": 0.82790595, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47265625, + "step": 4960, + "time_per_iteration": 2.4434754848480225 + }, + { + "auxiliary_loss_clip": 0.01075742, + "auxiliary_loss_mlp": 0.01053229, + "balance_loss_clip": 1.01913464, + "balance_loss_mlp": 1.02518713, + "epoch": 0.2982714564857959, + "flos": 12749930876160.0, + "grad_norm": 2.2320374554604627, + "language_loss": 0.75417262, + "learning_rate": 3.291350619752129e-06, + "loss": 0.77546239, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.50390625, + "step": 4961, + "time_per_iteration": 2.4707937240600586 + }, + { + "auxiliary_loss_clip": 0.01070778, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_clip": 1.01656294, + "balance_loss_mlp": 1.02334881, + "epoch": 0.29833157973846386, + "flos": 22270850570880.0, + "grad_norm": 1.847720614251277, + "language_loss": 0.63334548, + "learning_rate": 3.291053197628967e-06, + "loss": 0.65451264, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.47265625, + "step": 4962, + "time_per_iteration": 3.836911916732788 + }, + { + "auxiliary_loss_clip": 0.01072426, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.0245893, + "balance_loss_mlp": 1.02454448, + "epoch": 0.2983917029911318, + "flos": 15372222583680.0, + "grad_norm": 1.6545135719105266, + "language_loss": 0.84297562, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.86428595, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.47851562, + "step": 4963, + "time_per_iteration": 3.881481409072876 + }, + { + "auxiliary_loss_clip": 0.0107235, + "auxiliary_loss_mlp": 0.01053413, + "balance_loss_clip": 1.02058232, + "balance_loss_mlp": 1.02494872, + "epoch": 0.2984518262437998, + "flos": 15376726149120.0, + "grad_norm": 2.314063471996047, + "language_loss": 0.67699575, + "learning_rate": 3.290458206523322e-06, + "loss": 0.69825339, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.47265625, + "step": 4964, + "time_per_iteration": 2.399296760559082 + }, + { + "auxiliary_loss_clip": 0.01070137, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.01547289, + "balance_loss_mlp": 1.02375841, + "epoch": 0.29851194949646775, + "flos": 18107632696320.0, + "grad_norm": 1.8861872387344412, + "language_loss": 0.73020732, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.75136769, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.46484375, + "step": 4965, + "time_per_iteration": 3.8145718574523926 + }, + { + "auxiliary_loss_clip": 0.01074708, + "auxiliary_loss_mlp": 0.01053322, + "balance_loss_clip": 1.01996779, + "balance_loss_mlp": 1.02543378, + "epoch": 0.2985720727491357, + "flos": 22017136654080.0, + "grad_norm": 1.8354160542603977, + "language_loss": 0.68304247, + "learning_rate": 3.289863019680461e-06, + "loss": 0.70432276, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.49414062, + "step": 4966, + "time_per_iteration": 2.4712448120117188 + }, + { + "auxiliary_loss_clip": 0.01073345, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.0182054, + "balance_loss_mlp": 1.02518749, + "epoch": 0.2986321960018037, + "flos": 13040547966720.0, + "grad_norm": 2.5614621596665446, + "language_loss": 0.76271701, + "learning_rate": 3.289565352885785e-06, + "loss": 0.78394818, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.48046875, + "step": 4967, + "time_per_iteration": 2.384584426879883 + }, + { + "auxiliary_loss_clip": 0.01070709, + "auxiliary_loss_mlp": 0.0105061, + "balance_loss_clip": 1.02009249, + "balance_loss_mlp": 1.02178764, + "epoch": 0.29869231925447165, + "flos": 14464166365440.0, + "grad_norm": 2.676097966765139, + "language_loss": 0.73000598, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.75121915, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.48828125, + "step": 4968, + "time_per_iteration": 2.407240867614746 + }, + { + "auxiliary_loss_clip": 0.01072407, + "auxiliary_loss_mlp": 0.01050544, + "balance_loss_clip": 1.01723707, + "balance_loss_mlp": 1.02302289, + "epoch": 0.2987524425071396, + "flos": 31648870604160.0, + "grad_norm": 1.5827339973804933, + "language_loss": 0.77694046, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79816997, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49414062, + "step": 4969, + "time_per_iteration": 2.4957313537597656 + }, + { + "auxiliary_loss_clip": 0.01070591, + "auxiliary_loss_mlp": 0.01048574, + "balance_loss_clip": 1.01772285, + "balance_loss_mlp": 1.02348757, + "epoch": 0.2988125657598076, + "flos": 21432376425600.0, + "grad_norm": 1.9583038821929744, + "language_loss": 0.71621495, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.73740661, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47070312, + "step": 4970, + "time_per_iteration": 2.4234254360198975 + }, + { + "auxiliary_loss_clip": 0.01072487, + "auxiliary_loss_mlp": 0.01058459, + "balance_loss_clip": 1.02152777, + "balance_loss_mlp": 1.02125371, + "epoch": 0.2988726890124756, + "flos": 18076001137920.0, + "grad_norm": 2.2627687261326423, + "language_loss": 0.86775655, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.88906598, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.51171875, + "step": 4971, + "time_per_iteration": 2.3889262676239014 + }, + { + "auxiliary_loss_clip": 0.01069964, + "auxiliary_loss_mlp": 0.01053548, + "balance_loss_clip": 1.02105165, + "balance_loss_mlp": 1.02301002, + "epoch": 0.29893281226514357, + "flos": 21754764720000.0, + "grad_norm": 1.8828492890845896, + "language_loss": 0.80582368, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.82705879, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46875, + "step": 4972, + "time_per_iteration": 2.4405760765075684 + }, + { + "auxiliary_loss_clip": 0.01069854, + "auxiliary_loss_mlp": 0.01061361, + "balance_loss_clip": 1.02683806, + "balance_loss_mlp": 1.02099657, + "epoch": 0.29899293551781153, + "flos": 16835781444480.0, + "grad_norm": 1.926791096935694, + "language_loss": 0.87145305, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.89276516, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.48828125, + "step": 4973, + "time_per_iteration": 2.3882272243499756 + }, + { + "auxiliary_loss_clip": 0.01065729, + "auxiliary_loss_mlp": 0.01052008, + "balance_loss_clip": 1.02115655, + "balance_loss_mlp": 1.02044177, + "epoch": 0.2990530587704795, + "flos": 11728407139200.0, + "grad_norm": 1.727083796313815, + "language_loss": 0.78885752, + "learning_rate": 3.287480316742863e-06, + "loss": 0.81003493, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.453125, + "step": 4974, + "time_per_iteration": 2.359952211380005 + }, + { + "auxiliary_loss_clip": 0.01070999, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.02200532, + "balance_loss_mlp": 1.02192521, + "epoch": 0.29911318202314746, + "flos": 28038571931520.0, + "grad_norm": 1.9250078280834568, + "language_loss": 0.73424935, + "learning_rate": 3.287182259060815e-06, + "loss": 0.75552416, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.4921875, + "step": 4975, + "time_per_iteration": 2.5221619606018066 + }, + { + "auxiliary_loss_clip": 0.01070492, + "auxiliary_loss_mlp": 0.0105347, + "balance_loss_clip": 1.01787388, + "balance_loss_mlp": 1.02192378, + "epoch": 0.2991733052758154, + "flos": 18732577766400.0, + "grad_norm": 4.896063580601903, + "language_loss": 0.77738786, + "learning_rate": 3.286884152568687e-06, + "loss": 0.7986275, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.484375, + "step": 4976, + "time_per_iteration": 2.3710060119628906 + }, + { + "auxiliary_loss_clip": 0.01070177, + "auxiliary_loss_mlp": 0.01057715, + "balance_loss_clip": 1.0227387, + "balance_loss_mlp": 1.02189255, + "epoch": 0.2992334285284834, + "flos": 15558274552320.0, + "grad_norm": 2.1438299890149204, + "language_loss": 0.88398516, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.90526414, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.48242188, + "step": 4977, + "time_per_iteration": 2.3986966609954834 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01054887, + "balance_loss_clip": 1.01960087, + "balance_loss_mlp": 1.02297258, + "epoch": 0.29929355178115136, + "flos": 21796520572800.0, + "grad_norm": 1.7746852726364324, + "language_loss": 0.70047677, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.72174764, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.4921875, + "step": 4978, + "time_per_iteration": 2.4016833305358887 + }, + { + "auxiliary_loss_clip": 0.01070915, + "auxiliary_loss_mlp": 0.01053662, + "balance_loss_clip": 1.01916242, + "balance_loss_mlp": 1.02274847, + "epoch": 0.2993536750338193, + "flos": 21177475522560.0, + "grad_norm": 2.797051485670118, + "language_loss": 0.7871955, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.80844128, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48242188, + "step": 4979, + "time_per_iteration": 2.45940899848938 + }, + { + "auxiliary_loss_clip": 0.01070951, + "auxiliary_loss_mlp": 0.01059142, + "balance_loss_clip": 1.02161455, + "balance_loss_mlp": 1.02116251, + "epoch": 0.2994137982864873, + "flos": 32120826629760.0, + "grad_norm": 1.5988488402270855, + "language_loss": 0.69468606, + "learning_rate": 3.285691238725484e-06, + "loss": 0.71598697, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.49609375, + "step": 4980, + "time_per_iteration": 2.483534336090088 + }, + { + "auxiliary_loss_clip": 0.01070375, + "auxiliary_loss_mlp": 0.01053733, + "balance_loss_clip": 1.02240443, + "balance_loss_mlp": 1.02307498, + "epoch": 0.29947392153915525, + "flos": 21104367402240.0, + "grad_norm": 1.8704588607237216, + "language_loss": 0.75134361, + "learning_rate": 3.285392888352555e-06, + "loss": 0.77258468, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.47265625, + "step": 4981, + "time_per_iteration": 2.4821488857269287 + }, + { + "auxiliary_loss_clip": 0.01072147, + "auxiliary_loss_mlp": 0.01056131, + "balance_loss_clip": 1.02010632, + "balance_loss_mlp": 1.02120996, + "epoch": 0.2995340447918232, + "flos": 21541584758400.0, + "grad_norm": 1.9686395990041081, + "language_loss": 0.87548542, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.89676821, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.51171875, + "step": 4982, + "time_per_iteration": 2.4308412075042725 + }, + { + "auxiliary_loss_clip": 0.01074508, + "auxiliary_loss_mlp": 0.01055561, + "balance_loss_clip": 1.01796293, + "balance_loss_mlp": 1.02260244, + "epoch": 0.2995941680444912, + "flos": 16724268961920.0, + "grad_norm": 2.3877687241719197, + "language_loss": 0.8806144, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.90191519, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.51953125, + "step": 4983, + "time_per_iteration": 2.398627519607544 + }, + { + "auxiliary_loss_clip": 0.01073647, + "auxiliary_loss_mlp": 0.01057009, + "balance_loss_clip": 1.02117491, + "balance_loss_mlp": 1.02402949, + "epoch": 0.2996542912971592, + "flos": 20922434974080.0, + "grad_norm": 1.9631790288429682, + "language_loss": 0.80006826, + "learning_rate": 3.284497544825668e-06, + "loss": 0.82137477, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.49609375, + "step": 4984, + "time_per_iteration": 2.4025232791900635 + }, + { + "auxiliary_loss_clip": 0.01073086, + "auxiliary_loss_mlp": 0.01056819, + "balance_loss_clip": 1.02115202, + "balance_loss_mlp": 1.02326334, + "epoch": 0.29971441454982717, + "flos": 25078775310720.0, + "grad_norm": 1.6385017071426284, + "language_loss": 0.79836261, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.81966168, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.49804688, + "step": 4985, + "time_per_iteration": 2.473898410797119 + }, + { + "auxiliary_loss_clip": 0.01075786, + "auxiliary_loss_mlp": 0.01059359, + "balance_loss_clip": 1.01930475, + "balance_loss_mlp": 1.0233469, + "epoch": 0.29977453780249513, + "flos": 52553989543680.0, + "grad_norm": 2.358454945663201, + "language_loss": 0.72462744, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74597889, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.5234375, + "step": 4986, + "time_per_iteration": 2.685598134994507 + }, + { + "auxiliary_loss_clip": 0.01075076, + "auxiliary_loss_mlp": 0.01061746, + "balance_loss_clip": 1.02085686, + "balance_loss_mlp": 1.02254725, + "epoch": 0.2998346610551631, + "flos": 22236042078720.0, + "grad_norm": 2.8651023241825784, + "language_loss": 0.75672078, + "learning_rate": 3.283601762924312e-06, + "loss": 0.77808905, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.5234375, + "step": 4987, + "time_per_iteration": 2.4541521072387695 + }, + { + "auxiliary_loss_clip": 0.0106996, + "auxiliary_loss_mlp": 0.0105109, + "balance_loss_clip": 1.01768756, + "balance_loss_mlp": 1.02114773, + "epoch": 0.29989478430783106, + "flos": 16872265681920.0, + "grad_norm": 1.7554611970159923, + "language_loss": 0.81561637, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.83682692, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.48828125, + "step": 4988, + "time_per_iteration": 2.3813319206237793 + }, + { + "auxiliary_loss_clip": 0.01071441, + "auxiliary_loss_mlp": 0.01057068, + "balance_loss_clip": 1.01934981, + "balance_loss_mlp": 1.0215286, + "epoch": 0.29995490756049903, + "flos": 23767751646720.0, + "grad_norm": 1.5836308066415319, + "language_loss": 0.71791768, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.73920274, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.5, + "step": 4989, + "time_per_iteration": 2.4964566230773926 + }, + { + "auxiliary_loss_clip": 0.01076071, + "auxiliary_loss_mlp": 0.01062099, + "balance_loss_clip": 1.022331, + "balance_loss_mlp": 1.02337372, + "epoch": 0.300015030813167, + "flos": 14464445656320.0, + "grad_norm": 1.7400781640343912, + "language_loss": 0.86931324, + "learning_rate": 3.282705542954199e-06, + "loss": 0.89069492, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.52734375, + "step": 4990, + "time_per_iteration": 2.3797109127044678 + }, + { + "auxiliary_loss_clip": 0.01074468, + "auxiliary_loss_mlp": 0.01060809, + "balance_loss_clip": 1.02123189, + "balance_loss_mlp": 1.02227974, + "epoch": 0.30007515406583496, + "flos": 25190811463680.0, + "grad_norm": 1.621467583866376, + "language_loss": 0.67898327, + "learning_rate": 3.28240670566841e-06, + "loss": 0.70033598, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.5234375, + "step": 4991, + "time_per_iteration": 2.4474854469299316 + }, + { + "auxiliary_loss_clip": 0.01073735, + "auxiliary_loss_mlp": 0.01061384, + "balance_loss_clip": 1.01939893, + "balance_loss_mlp": 1.02141237, + "epoch": 0.3001352773185029, + "flos": 19390166824320.0, + "grad_norm": 3.0283873455692163, + "language_loss": 0.80183244, + "learning_rate": 3.28210781975363e-06, + "loss": 0.82318366, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 0.5234375, + "step": 4992, + "time_per_iteration": 2.3969945907592773 + }, + { + "auxiliary_loss_clip": 0.01068551, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_clip": 1.02037144, + "balance_loss_mlp": 1.01982594, + "epoch": 0.3001954005711709, + "flos": 21542771744640.0, + "grad_norm": 4.024402022120727, + "language_loss": 0.83832443, + "learning_rate": 3.281808885221193e-06, + "loss": 0.85957605, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.48828125, + "step": 4993, + "time_per_iteration": 2.4585187435150146 + }, + { + "auxiliary_loss_clip": 0.0107408, + "auxiliary_loss_mlp": 0.01058799, + "balance_loss_clip": 1.01822042, + "balance_loss_mlp": 1.02091491, + "epoch": 0.30025552382383885, + "flos": 17383359208320.0, + "grad_norm": 2.155100777074444, + "language_loss": 0.88815671, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.90948552, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.53125, + "step": 4994, + "time_per_iteration": 2.3689606189727783 + }, + { + "auxiliary_loss_clip": 0.01072694, + "auxiliary_loss_mlp": 0.01057084, + "balance_loss_clip": 1.01960468, + "balance_loss_mlp": 1.02266192, + "epoch": 0.3003156470765068, + "flos": 29532051694080.0, + "grad_norm": 1.6351317122098659, + "language_loss": 0.8264932, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.84779096, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5, + "step": 4995, + "time_per_iteration": 2.521209478378296 + }, + { + "auxiliary_loss_clip": 0.01069759, + "auxiliary_loss_mlp": 0.0105258, + "balance_loss_clip": 1.01858115, + "balance_loss_mlp": 1.02160144, + "epoch": 0.3003757703291748, + "flos": 43644923159040.0, + "grad_norm": 2.131256717702012, + "language_loss": 0.68424606, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.70546949, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.48242188, + "step": 4996, + "time_per_iteration": 2.5656886100769043 + }, + { + "auxiliary_loss_clip": 0.01071241, + "auxiliary_loss_mlp": 0.01058361, + "balance_loss_clip": 1.02324247, + "balance_loss_mlp": 1.02123153, + "epoch": 0.30043589358184275, + "flos": 22527287573760.0, + "grad_norm": 1.9247347990659436, + "language_loss": 0.77205473, + "learning_rate": 3.280612661141615e-06, + "loss": 0.79335082, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.5, + "step": 4997, + "time_per_iteration": 2.465752124786377 + }, + { + "auxiliary_loss_clip": 0.01067637, + "auxiliary_loss_mlp": 0.01055995, + "balance_loss_clip": 1.01942182, + "balance_loss_mlp": 1.01969564, + "epoch": 0.30049601683451077, + "flos": 20994844867200.0, + "grad_norm": 1.9032760289841577, + "language_loss": 0.79370332, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.81493962, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.48046875, + "step": 4998, + "time_per_iteration": 3.8388283252716064 + }, + { + "auxiliary_loss_clip": 0.01068422, + "auxiliary_loss_mlp": 0.01049412, + "balance_loss_clip": 1.01400685, + "balance_loss_mlp": 1.02086139, + "epoch": 0.30055614008717874, + "flos": 23914840671360.0, + "grad_norm": 1.9403560870740555, + "language_loss": 0.74785626, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.76903462, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.4765625, + "step": 4999, + "time_per_iteration": 2.4379467964172363 + }, + { + "auxiliary_loss_clip": 0.0107218, + "auxiliary_loss_mlp": 0.01062678, + "balance_loss_clip": 1.02639103, + "balance_loss_mlp": 1.02259958, + "epoch": 0.3006162633398467, + "flos": 19168852515840.0, + "grad_norm": 1.9315622015446479, + "language_loss": 0.76540279, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78675139, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.49609375, + "step": 5000, + "time_per_iteration": 2.4575068950653076 + }, + { + "auxiliary_loss_clip": 0.01069791, + "auxiliary_loss_mlp": 0.01061497, + "balance_loss_clip": 1.02410078, + "balance_loss_mlp": 1.02254653, + "epoch": 0.30067638659251467, + "flos": 14678498401920.0, + "grad_norm": 1.8987753717566525, + "language_loss": 0.83056575, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.85187864, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.47265625, + "step": 5001, + "time_per_iteration": 3.830817461013794 + }, + { + "auxiliary_loss_clip": 0.01071699, + "auxiliary_loss_mlp": 0.01058326, + "balance_loss_clip": 1.02318335, + "balance_loss_mlp": 1.02284443, + "epoch": 0.30073650984518263, + "flos": 23366878882560.0, + "grad_norm": 1.7198324248108177, + "language_loss": 0.82245636, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.84375656, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.48828125, + "step": 5002, + "time_per_iteration": 2.4497947692871094 + }, + { + "auxiliary_loss_clip": 0.01074089, + "auxiliary_loss_mlp": 0.0105154, + "balance_loss_clip": 1.01646924, + "balance_loss_mlp": 1.02275467, + "epoch": 0.3007966330978506, + "flos": 22965517359360.0, + "grad_norm": 1.8434757817325793, + "language_loss": 0.72634661, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.74760294, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.51171875, + "step": 5003, + "time_per_iteration": 3.924666166305542 + }, + { + "auxiliary_loss_clip": 0.01076269, + "auxiliary_loss_mlp": 0.01059511, + "balance_loss_clip": 1.02277112, + "balance_loss_mlp": 1.02420783, + "epoch": 0.30085675635051856, + "flos": 27817222711680.0, + "grad_norm": 1.7948488417906276, + "language_loss": 0.71229869, + "learning_rate": 3.27851739984233e-06, + "loss": 0.73365653, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.51953125, + "step": 5004, + "time_per_iteration": 2.4612302780151367 + }, + { + "auxiliary_loss_clip": 0.01075571, + "auxiliary_loss_mlp": 0.01052794, + "balance_loss_clip": 1.01540971, + "balance_loss_mlp": 1.02344823, + "epoch": 0.3009168796031865, + "flos": 10882147760640.0, + "grad_norm": 3.2391286332799862, + "language_loss": 0.84085768, + "learning_rate": 3.278217882782715e-06, + "loss": 0.86214137, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.51953125, + "step": 5005, + "time_per_iteration": 3.7985782623291016 + }, + { + "auxiliary_loss_clip": 0.01073985, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.01502085, + "balance_loss_mlp": 1.02460074, + "epoch": 0.3009770028558545, + "flos": 23804270795520.0, + "grad_norm": 2.4471818945790846, + "language_loss": 0.76760125, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.78883815, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.49414062, + "step": 5006, + "time_per_iteration": 2.424163341522217 + }, + { + "auxiliary_loss_clip": 0.01070841, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.01848495, + "balance_loss_mlp": 1.02253604, + "epoch": 0.30103712610852246, + "flos": 26467026635520.0, + "grad_norm": 1.891280718473957, + "language_loss": 0.72389209, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.745121, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.48242188, + "step": 5007, + "time_per_iteration": 2.4873249530792236 + }, + { + "auxiliary_loss_clip": 0.01072324, + "auxiliary_loss_mlp": 0.01058209, + "balance_loss_clip": 1.01872706, + "balance_loss_mlp": 1.02177703, + "epoch": 0.3010972493611904, + "flos": 22855366419840.0, + "grad_norm": 2.2192495146268216, + "language_loss": 0.77896988, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.80027521, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.50390625, + "step": 5008, + "time_per_iteration": 2.398043155670166 + }, + { + "auxiliary_loss_clip": 0.01070718, + "auxiliary_loss_mlp": 0.01050957, + "balance_loss_clip": 1.01531386, + "balance_loss_mlp": 1.02096844, + "epoch": 0.3011573726138584, + "flos": 24052748008320.0, + "grad_norm": 1.8551053554044001, + "language_loss": 0.85963774, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.88085449, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.49609375, + "step": 5009, + "time_per_iteration": 2.4412825107574463 + }, + { + "auxiliary_loss_clip": 0.01074611, + "auxiliary_loss_mlp": 0.01059544, + "balance_loss_clip": 1.01796412, + "balance_loss_mlp": 1.02205706, + "epoch": 0.30121749586652635, + "flos": 20258841162240.0, + "grad_norm": 1.7982037868409169, + "language_loss": 0.84772271, + "learning_rate": 3.276719570659604e-06, + "loss": 0.86906421, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.52734375, + "step": 5010, + "time_per_iteration": 2.3783535957336426 + }, + { + "auxiliary_loss_clip": 0.01070797, + "auxiliary_loss_mlp": 0.01046258, + "balance_loss_clip": 1.01292694, + "balance_loss_mlp": 1.0211823, + "epoch": 0.3012776191191944, + "flos": 26941845392640.0, + "grad_norm": 2.3060467536658984, + "language_loss": 0.86656737, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.88773787, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49609375, + "step": 5011, + "time_per_iteration": 2.4486145973205566 + }, + { + "auxiliary_loss_clip": 0.0107105, + "auxiliary_loss_mlp": 0.01059318, + "balance_loss_clip": 1.02107537, + "balance_loss_mlp": 1.02003503, + "epoch": 0.30133774237186234, + "flos": 20411271624960.0, + "grad_norm": 13.498548741618645, + "language_loss": 0.74503267, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.76633632, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5078125, + "step": 5012, + "time_per_iteration": 2.388587236404419 + }, + { + "auxiliary_loss_clip": 0.01070918, + "auxiliary_loss_mlp": 0.01052187, + "balance_loss_clip": 1.01687765, + "balance_loss_mlp": 1.02098131, + "epoch": 0.3013978656245303, + "flos": 19791423613440.0, + "grad_norm": 2.2749833932253796, + "language_loss": 0.88796914, + "learning_rate": 3.275820002334819e-06, + "loss": 0.90920019, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5, + "step": 5013, + "time_per_iteration": 2.409979820251465 + }, + { + "auxiliary_loss_clip": 0.01072762, + "auxiliary_loss_mlp": 0.01055079, + "balance_loss_clip": 1.01533484, + "balance_loss_mlp": 1.02091074, + "epoch": 0.30145798887719827, + "flos": 16248821800320.0, + "grad_norm": 2.2644869403148187, + "language_loss": 0.84417903, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.86545742, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.51953125, + "step": 5014, + "time_per_iteration": 2.3569135665893555 + }, + { + "auxiliary_loss_clip": 0.01067606, + "auxiliary_loss_mlp": 0.01048252, + "balance_loss_clip": 1.01513577, + "balance_loss_mlp": 1.02007318, + "epoch": 0.30151811212986623, + "flos": 24570579427200.0, + "grad_norm": 2.080932272673734, + "language_loss": 0.69374627, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.7149049, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4765625, + "step": 5015, + "time_per_iteration": 2.4217259883880615 + }, + { + "auxiliary_loss_clip": 0.01071265, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_clip": 1.01916003, + "balance_loss_mlp": 1.02148819, + "epoch": 0.3015782353825342, + "flos": 21870990236160.0, + "grad_norm": 3.2721365748174454, + "language_loss": 0.76518071, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.78644609, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.49804688, + "step": 5016, + "time_per_iteration": 2.3795278072357178 + }, + { + "auxiliary_loss_clip": 0.01074172, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_clip": 1.01614654, + "balance_loss_mlp": 1.021909, + "epoch": 0.30163835863520216, + "flos": 28768012300800.0, + "grad_norm": 1.4824433005868674, + "language_loss": 0.66731477, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.68859088, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.5234375, + "step": 5017, + "time_per_iteration": 2.4685685634613037 + }, + { + "auxiliary_loss_clip": 0.01072748, + "auxiliary_loss_mlp": 0.01054745, + "balance_loss_clip": 1.01851702, + "balance_loss_mlp": 1.02261543, + "epoch": 0.30169848188787013, + "flos": 22965098423040.0, + "grad_norm": 1.9672749448913653, + "language_loss": 0.69854927, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71982419, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5, + "step": 5018, + "time_per_iteration": 2.3909614086151123 + }, + { + "auxiliary_loss_clip": 0.01065947, + "auxiliary_loss_mlp": 0.01049574, + "balance_loss_clip": 1.01629031, + "balance_loss_mlp": 1.0193696, + "epoch": 0.3017586051405381, + "flos": 21834191796480.0, + "grad_norm": 2.3782958519125157, + "language_loss": 0.80409706, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.82525229, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46484375, + "step": 5019, + "time_per_iteration": 2.4048335552215576 + }, + { + "auxiliary_loss_clip": 0.0107328, + "auxiliary_loss_mlp": 0.01057728, + "balance_loss_clip": 1.02158403, + "balance_loss_mlp": 1.02238071, + "epoch": 0.30181872839320606, + "flos": 22159407911040.0, + "grad_norm": 2.3871032807140797, + "language_loss": 0.71315897, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.73446906, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5078125, + "step": 5020, + "time_per_iteration": 2.4019789695739746 + }, + { + "auxiliary_loss_clip": 0.01072526, + "auxiliary_loss_mlp": 0.01060014, + "balance_loss_clip": 1.02224827, + "balance_loss_mlp": 1.02111149, + "epoch": 0.301878851645874, + "flos": 18113183602560.0, + "grad_norm": 2.974096005308293, + "language_loss": 0.80062562, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.82195103, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.515625, + "step": 5021, + "time_per_iteration": 2.4390056133270264 + }, + { + "auxiliary_loss_clip": 0.01069315, + "auxiliary_loss_mlp": 0.01047679, + "balance_loss_clip": 1.01503992, + "balance_loss_mlp": 1.01974678, + "epoch": 0.301938974898542, + "flos": 17601287114880.0, + "grad_norm": 2.412480616839102, + "language_loss": 0.78158182, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.80275178, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.49609375, + "step": 5022, + "time_per_iteration": 2.384329080581665 + }, + { + "auxiliary_loss_clip": 0.01072447, + "auxiliary_loss_mlp": 0.0105802, + "balance_loss_clip": 1.02354503, + "balance_loss_mlp": 1.02162516, + "epoch": 0.30199909815120995, + "flos": 11180445350400.0, + "grad_norm": 2.0200771007926166, + "language_loss": 0.70740914, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72871375, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 5023, + "time_per_iteration": 2.4352095127105713 + }, + { + "auxiliary_loss_clip": 0.01073903, + "auxiliary_loss_mlp": 0.01048124, + "balance_loss_clip": 1.01343393, + "balance_loss_mlp": 1.02094996, + "epoch": 0.302059221403878, + "flos": 21906776246400.0, + "grad_norm": 1.9499243375798454, + "language_loss": 0.72858381, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.74980414, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.53125, + "step": 5024, + "time_per_iteration": 2.4216361045837402 + }, + { + "auxiliary_loss_clip": 0.01070588, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_clip": 1.0203048, + "balance_loss_mlp": 1.02161658, + "epoch": 0.30211934465654594, + "flos": 26395175324160.0, + "grad_norm": 2.430010377597016, + "language_loss": 0.75743711, + "learning_rate": 3.272217377978061e-06, + "loss": 0.77868509, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49023438, + "step": 5025, + "time_per_iteration": 2.428990602493286 + }, + { + "auxiliary_loss_clip": 0.01068341, + "auxiliary_loss_mlp": 0.01051153, + "balance_loss_clip": 1.02025366, + "balance_loss_mlp": 1.02086377, + "epoch": 0.3021794679092139, + "flos": 23399453047680.0, + "grad_norm": 1.777364022205478, + "language_loss": 0.69215989, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.71335483, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47460938, + "step": 5026, + "time_per_iteration": 2.40132737159729 + }, + { + "auxiliary_loss_clip": 0.01072544, + "auxiliary_loss_mlp": 0.01056645, + "balance_loss_clip": 1.02197945, + "balance_loss_mlp": 1.02269721, + "epoch": 0.30223959116188187, + "flos": 20260097971200.0, + "grad_norm": 1.7160036824085434, + "language_loss": 0.86365914, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.88495106, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.5, + "step": 5027, + "time_per_iteration": 2.38614821434021 + }, + { + "auxiliary_loss_clip": 0.01069316, + "auxiliary_loss_mlp": 0.01054328, + "balance_loss_clip": 1.0226903, + "balance_loss_mlp": 1.02046561, + "epoch": 0.30229971441454984, + "flos": 26686630287360.0, + "grad_norm": 1.5444462044727607, + "language_loss": 0.79634988, + "learning_rate": 3.271315635661351e-06, + "loss": 0.8175863, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.48828125, + "step": 5028, + "time_per_iteration": 2.458916425704956 + }, + { + "auxiliary_loss_clip": 0.01073438, + "auxiliary_loss_mlp": 0.01053293, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.02336597, + "epoch": 0.3023598376672178, + "flos": 34344026052480.0, + "grad_norm": 1.9291470387018181, + "language_loss": 0.78513908, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.80640638, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5, + "step": 5029, + "time_per_iteration": 2.485807418823242 + }, + { + "auxiliary_loss_clip": 0.01072876, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.01514554, + "balance_loss_mlp": 1.02154922, + "epoch": 0.30241996091988577, + "flos": 23111035372800.0, + "grad_norm": 2.9419182933677774, + "language_loss": 0.83583832, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.85708141, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.51171875, + "step": 5030, + "time_per_iteration": 2.4216108322143555 + }, + { + "auxiliary_loss_clip": 0.01073543, + "auxiliary_loss_mlp": 0.01062478, + "balance_loss_clip": 1.02471292, + "balance_loss_mlp": 1.0218451, + "epoch": 0.30248008417255373, + "flos": 19389014749440.0, + "grad_norm": 6.21071449082839, + "language_loss": 0.72012389, + "learning_rate": 3.270413459468905e-06, + "loss": 0.74148405, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.515625, + "step": 5031, + "time_per_iteration": 2.3858802318573 + }, + { + "auxiliary_loss_clip": 0.0107122, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_clip": 1.01771128, + "balance_loss_mlp": 1.02133274, + "epoch": 0.3025402074252217, + "flos": 23768554608000.0, + "grad_norm": 1.870359270374137, + "language_loss": 0.8336134, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.85484648, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49804688, + "step": 5032, + "time_per_iteration": 2.4135398864746094 + }, + { + "auxiliary_loss_clip": 0.01077199, + "auxiliary_loss_mlp": 0.01060034, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.0236572, + "epoch": 0.30260033067788966, + "flos": 25992941016960.0, + "grad_norm": 2.4473277198238597, + "language_loss": 0.75224626, + "learning_rate": 3.269811767783906e-06, + "loss": 0.77361858, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.53515625, + "step": 5033, + "time_per_iteration": 2.410855770111084 + }, + { + "auxiliary_loss_clip": 0.01069713, + "auxiliary_loss_mlp": 0.01053028, + "balance_loss_clip": 1.01790953, + "balance_loss_mlp": 1.02055264, + "epoch": 0.3026604539305576, + "flos": 25373372296320.0, + "grad_norm": 1.7506073963392839, + "language_loss": 0.75641882, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.77764618, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.4921875, + "step": 5034, + "time_per_iteration": 2.4397478103637695 + }, + { + "auxiliary_loss_clip": 0.01069312, + "auxiliary_loss_mlp": 0.01050791, + "balance_loss_clip": 1.01490927, + "balance_loss_mlp": 1.02006042, + "epoch": 0.3027205771832256, + "flos": 25811532259200.0, + "grad_norm": 14.16051426589058, + "language_loss": 0.7401821, + "learning_rate": 3.269209883493352e-06, + "loss": 0.76138318, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.4921875, + "step": 5035, + "time_per_iteration": 2.4162325859069824 + }, + { + "auxiliary_loss_clip": 0.01069397, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.01379704, + "balance_loss_mlp": 1.02069831, + "epoch": 0.30278070043589356, + "flos": 27343311649920.0, + "grad_norm": 1.927460170060167, + "language_loss": 0.879713, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.90088487, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.48632812, + "step": 5036, + "time_per_iteration": 2.4485418796539307 + }, + { + "auxiliary_loss_clip": 0.01069933, + "auxiliary_loss_mlp": 0.0105247, + "balance_loss_clip": 1.01701689, + "balance_loss_mlp": 1.02089405, + "epoch": 0.3028408236885616, + "flos": 24785190754560.0, + "grad_norm": 1.873872701698567, + "language_loss": 0.78310674, + "learning_rate": 3.268607806688536e-06, + "loss": 0.80433083, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.49023438, + "step": 5037, + "time_per_iteration": 2.4137418270111084 + }, + { + "auxiliary_loss_clip": 0.01073302, + "auxiliary_loss_mlp": 0.01064434, + "balance_loss_clip": 1.02683568, + "balance_loss_mlp": 1.02241147, + "epoch": 0.30290094694122954, + "flos": 12931653836160.0, + "grad_norm": 7.323304345455554, + "language_loss": 0.79966164, + "learning_rate": 3.268306696121816e-06, + "loss": 0.82103896, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5078125, + "step": 5038, + "time_per_iteration": 3.8015122413635254 + }, + { + "auxiliary_loss_clip": 0.01071337, + "auxiliary_loss_mlp": 0.01054664, + "balance_loss_clip": 1.02023649, + "balance_loss_mlp": 1.02237332, + "epoch": 0.3029610701938975, + "flos": 25915399153920.0, + "grad_norm": 1.9404411583229821, + "language_loss": 0.75652719, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.77778721, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48828125, + "step": 5039, + "time_per_iteration": 2.463085174560547 + }, + { + "auxiliary_loss_clip": 0.01067544, + "auxiliary_loss_mlp": 0.01049952, + "balance_loss_clip": 1.01752722, + "balance_loss_mlp": 1.02026272, + "epoch": 0.3030211934465655, + "flos": 21979919278080.0, + "grad_norm": 1.866779395861628, + "language_loss": 0.80775464, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82892954, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47265625, + "step": 5040, + "time_per_iteration": 2.3880395889282227 + }, + { + "auxiliary_loss_clip": 0.01070453, + "auxiliary_loss_mlp": 0.01050589, + "balance_loss_clip": 1.01685286, + "balance_loss_mlp": 1.02263594, + "epoch": 0.30308131669923344, + "flos": 20991039528960.0, + "grad_norm": 1.6698683009402098, + "language_loss": 0.83183724, + "learning_rate": 3.267403075901438e-06, + "loss": 0.85304767, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.47851562, + "step": 5041, + "time_per_iteration": 3.798086643218994 + }, + { + "auxiliary_loss_clip": 0.01019193, + "auxiliary_loss_mlp": 0.01012316, + "balance_loss_clip": 1.00814414, + "balance_loss_mlp": 1.00836277, + "epoch": 0.3031414399519014, + "flos": 60545641599360.0, + "grad_norm": 0.7757072673499196, + "language_loss": 0.59618336, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61649847, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.10839844, + "step": 5042, + "time_per_iteration": 3.138378381729126 + }, + { + "auxiliary_loss_clip": 0.01073603, + "auxiliary_loss_mlp": 0.01056485, + "balance_loss_clip": 1.02079344, + "balance_loss_mlp": 1.02334857, + "epoch": 0.30320156320456937, + "flos": 21906601689600.0, + "grad_norm": 1.6505620178998337, + "language_loss": 0.72876102, + "learning_rate": 3.266800422101892e-06, + "loss": 0.75006193, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.50390625, + "step": 5043, + "time_per_iteration": 3.905677556991577 + }, + { + "auxiliary_loss_clip": 0.0107003, + "auxiliary_loss_mlp": 0.01058202, + "balance_loss_clip": 1.02179515, + "balance_loss_mlp": 1.02061582, + "epoch": 0.30326168645723733, + "flos": 21651700786560.0, + "grad_norm": 1.7837753615987932, + "language_loss": 0.70992553, + "learning_rate": 3.266499023140606e-06, + "loss": 0.73120779, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.49414062, + "step": 5044, + "time_per_iteration": 3.7758328914642334 + }, + { + "auxiliary_loss_clip": 0.01067901, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.01981473, + "epoch": 0.3033218097099053, + "flos": 21870222186240.0, + "grad_norm": 1.5072027017112, + "language_loss": 0.78114605, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.80237514, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.48046875, + "step": 5045, + "time_per_iteration": 2.3931403160095215 + }, + { + "auxiliary_loss_clip": 0.01073477, + "auxiliary_loss_mlp": 0.01060991, + "balance_loss_clip": 1.02081764, + "balance_loss_mlp": 1.02247357, + "epoch": 0.30338193296257326, + "flos": 27088480569600.0, + "grad_norm": 2.9502297478864588, + "language_loss": 0.73201883, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.75336349, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.51171875, + "step": 5046, + "time_per_iteration": 2.4481136798858643 + }, + { + "auxiliary_loss_clip": 0.01074912, + "auxiliary_loss_mlp": 0.01060741, + "balance_loss_clip": 1.02061546, + "balance_loss_mlp": 1.02336752, + "epoch": 0.30344205621524123, + "flos": 19533415599360.0, + "grad_norm": 1.8196103573055495, + "language_loss": 0.82506216, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.84641868, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.515625, + "step": 5047, + "time_per_iteration": 2.4067912101745605 + }, + { + "auxiliary_loss_clip": 0.01073042, + "auxiliary_loss_mlp": 0.01051959, + "balance_loss_clip": 1.01555276, + "balance_loss_mlp": 1.02270246, + "epoch": 0.3035021794679092, + "flos": 23909953080960.0, + "grad_norm": 2.087499742101998, + "language_loss": 0.72591633, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7471664, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.50390625, + "step": 5048, + "time_per_iteration": 2.417738914489746 + }, + { + "auxiliary_loss_clip": 0.01073492, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.01705086, + "balance_loss_mlp": 1.02315593, + "epoch": 0.30356230272057716, + "flos": 16142685667200.0, + "grad_norm": 2.083109842007804, + "language_loss": 0.77523988, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.79649222, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.50390625, + "step": 5049, + "time_per_iteration": 2.3631772994995117 + }, + { + "auxiliary_loss_clip": 0.01075656, + "auxiliary_loss_mlp": 0.01059051, + "balance_loss_clip": 1.02297854, + "balance_loss_mlp": 1.02329624, + "epoch": 0.3036224259732452, + "flos": 28913390668800.0, + "grad_norm": 2.3461280589509266, + "language_loss": 0.82964134, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.85098839, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5234375, + "step": 5050, + "time_per_iteration": 2.5016918182373047 + }, + { + "auxiliary_loss_clip": 0.01074447, + "auxiliary_loss_mlp": 0.01053949, + "balance_loss_clip": 1.01778066, + "balance_loss_mlp": 1.02308464, + "epoch": 0.30368254922591315, + "flos": 21104541959040.0, + "grad_norm": 2.1943997928107986, + "language_loss": 0.75533092, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7766149, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.515625, + "step": 5051, + "time_per_iteration": 2.389305353164673 + }, + { + "auxiliary_loss_clip": 0.01074581, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.02120078, + "balance_loss_mlp": 1.02318501, + "epoch": 0.3037426724785811, + "flos": 23001198635520.0, + "grad_norm": 1.6235288969938566, + "language_loss": 0.78011203, + "learning_rate": 3.264086103483033e-06, + "loss": 0.80143678, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.515625, + "step": 5052, + "time_per_iteration": 2.420043706893921 + }, + { + "auxiliary_loss_clip": 0.01077124, + "auxiliary_loss_mlp": 0.01054879, + "balance_loss_clip": 1.01635087, + "balance_loss_mlp": 1.02484727, + "epoch": 0.3038027957312491, + "flos": 15631801608960.0, + "grad_norm": 2.1500495499973726, + "language_loss": 0.84408176, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.86540174, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5234375, + "step": 5053, + "time_per_iteration": 2.35660719871521 + }, + { + "auxiliary_loss_clip": 0.01074308, + "auxiliary_loss_mlp": 0.01060324, + "balance_loss_clip": 1.02124751, + "balance_loss_mlp": 1.02298796, + "epoch": 0.30386291898391704, + "flos": 12713167347840.0, + "grad_norm": 1.751470347299621, + "language_loss": 0.72530311, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.74664938, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.51171875, + "step": 5054, + "time_per_iteration": 2.5394792556762695 + }, + { + "auxiliary_loss_clip": 0.0107453, + "auxiliary_loss_mlp": 0.01057052, + "balance_loss_clip": 1.01997745, + "balance_loss_mlp": 1.02432489, + "epoch": 0.303923042236585, + "flos": 26358237239040.0, + "grad_norm": 6.544393925936348, + "language_loss": 0.7105245, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.73184031, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.5, + "step": 5055, + "time_per_iteration": 2.4267289638519287 + }, + { + "auxiliary_loss_clip": 0.01075574, + "auxiliary_loss_mlp": 0.01057154, + "balance_loss_clip": 1.01912642, + "balance_loss_mlp": 1.02384543, + "epoch": 0.30398316548925297, + "flos": 19718210759040.0, + "grad_norm": 5.404389060484848, + "language_loss": 0.69350421, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.71483147, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.515625, + "step": 5056, + "time_per_iteration": 2.434068441390991 + }, + { + "auxiliary_loss_clip": 0.010728, + "auxiliary_loss_mlp": 0.01059564, + "balance_loss_clip": 1.02110708, + "balance_loss_mlp": 1.02235556, + "epoch": 0.30404328874192094, + "flos": 24238799976960.0, + "grad_norm": 1.7008474934378772, + "language_loss": 0.83647764, + "learning_rate": 3.262576470461507e-06, + "loss": 0.85780126, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.50390625, + "step": 5057, + "time_per_iteration": 2.4107775688171387 + }, + { + "auxiliary_loss_clip": 0.01071107, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.01676524, + "balance_loss_mlp": 1.02073741, + "epoch": 0.3041034119945889, + "flos": 24497785509120.0, + "grad_norm": 1.940119460312281, + "language_loss": 0.90798199, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.92923343, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 0.50390625, + "step": 5058, + "time_per_iteration": 2.5077600479125977 + }, + { + "auxiliary_loss_clip": 0.01076342, + "auxiliary_loss_mlp": 0.01067586, + "balance_loss_clip": 1.02738881, + "balance_loss_mlp": 1.0243932, + "epoch": 0.30416353524725687, + "flos": 28287747371520.0, + "grad_norm": 2.016789585624093, + "language_loss": 0.73350763, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.75494695, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.51953125, + "step": 5059, + "time_per_iteration": 2.444354772567749 + }, + { + "auxiliary_loss_clip": 0.01072779, + "auxiliary_loss_mlp": 0.01059369, + "balance_loss_clip": 1.02136517, + "balance_loss_mlp": 1.02203226, + "epoch": 0.30422365849992483, + "flos": 23659241541120.0, + "grad_norm": 2.3185117840740204, + "language_loss": 0.7407105, + "learning_rate": 3.26167011603268e-06, + "loss": 0.76203197, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5078125, + "step": 5060, + "time_per_iteration": 2.458911180496216 + }, + { + "auxiliary_loss_clip": 0.01074718, + "auxiliary_loss_mlp": 0.01056951, + "balance_loss_clip": 1.01951921, + "balance_loss_mlp": 1.02226746, + "epoch": 0.3042837817525928, + "flos": 22997777322240.0, + "grad_norm": 2.1861668184477954, + "language_loss": 0.78622031, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.80753696, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5234375, + "step": 5061, + "time_per_iteration": 2.3843657970428467 + }, + { + "auxiliary_loss_clip": 0.01072892, + "auxiliary_loss_mlp": 0.01057194, + "balance_loss_clip": 1.01811731, + "balance_loss_mlp": 1.02214551, + "epoch": 0.30434390500526076, + "flos": 22081482023040.0, + "grad_norm": 6.2913879853090675, + "language_loss": 0.83649349, + "learning_rate": 3.261065640514415e-06, + "loss": 0.8577944, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.5078125, + "step": 5062, + "time_per_iteration": 2.4116013050079346 + }, + { + "auxiliary_loss_clip": 0.01069473, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_clip": 1.01813293, + "balance_loss_mlp": 1.01993513, + "epoch": 0.3044040282579287, + "flos": 25482336249600.0, + "grad_norm": 1.8085129263372726, + "language_loss": 0.75770926, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.77894795, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.49609375, + "step": 5063, + "time_per_iteration": 2.41461443901062 + }, + { + "auxiliary_loss_clip": 0.01069418, + "auxiliary_loss_mlp": 0.0105453, + "balance_loss_clip": 1.01464224, + "balance_loss_mlp": 1.0207144, + "epoch": 0.30446415151059675, + "flos": 21944377647360.0, + "grad_norm": 1.643909431649782, + "language_loss": 0.85350156, + "learning_rate": 3.26046097371721e-06, + "loss": 0.87474102, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.48632812, + "step": 5064, + "time_per_iteration": 2.424729824066162 + }, + { + "auxiliary_loss_clip": 0.01071993, + "auxiliary_loss_mlp": 0.01058809, + "balance_loss_clip": 1.01913667, + "balance_loss_mlp": 1.02076554, + "epoch": 0.3045242747632647, + "flos": 16434489744000.0, + "grad_norm": 1.9620466328308048, + "language_loss": 0.76939148, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.79069948, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.51171875, + "step": 5065, + "time_per_iteration": 2.3697073459625244 + }, + { + "auxiliary_loss_clip": 0.01074686, + "auxiliary_loss_mlp": 0.01057077, + "balance_loss_clip": 1.01907325, + "balance_loss_mlp": 1.02179527, + "epoch": 0.3045843980159327, + "flos": 31538998955520.0, + "grad_norm": 1.9314077890442634, + "language_loss": 0.63796985, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.65928751, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.53125, + "step": 5066, + "time_per_iteration": 2.5084526538848877 + }, + { + "auxiliary_loss_clip": 0.01077795, + "auxiliary_loss_mlp": 0.01058359, + "balance_loss_clip": 1.01806664, + "balance_loss_mlp": 1.02377832, + "epoch": 0.30464452126860064, + "flos": 17852801616000.0, + "grad_norm": 2.43416767792794, + "language_loss": 0.84196019, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.86332178, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.5390625, + "step": 5067, + "time_per_iteration": 2.3800179958343506 + }, + { + "auxiliary_loss_clip": 0.01069603, + "auxiliary_loss_mlp": 0.01055508, + "balance_loss_clip": 1.01960266, + "balance_loss_mlp": 1.0209111, + "epoch": 0.3047046445212686, + "flos": 20630351606400.0, + "grad_norm": 2.1147086634504246, + "language_loss": 0.64794302, + "learning_rate": 3.259251066652873e-06, + "loss": 0.66919416, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48828125, + "step": 5068, + "time_per_iteration": 2.4038405418395996 + }, + { + "auxiliary_loss_clip": 0.01070728, + "auxiliary_loss_mlp": 0.01046991, + "balance_loss_clip": 1.0137794, + "balance_loss_mlp": 1.02118981, + "epoch": 0.3047647677739366, + "flos": 21286544209920.0, + "grad_norm": 1.8674556472093402, + "language_loss": 0.76492393, + "learning_rate": 3.258948470480793e-06, + "loss": 0.7861011, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49609375, + "step": 5069, + "time_per_iteration": 2.3890998363494873 + }, + { + "auxiliary_loss_clip": 0.0106794, + "auxiliary_loss_mlp": 0.01054119, + "balance_loss_clip": 1.02109838, + "balance_loss_mlp": 1.02136803, + "epoch": 0.30482489102660454, + "flos": 20994879778560.0, + "grad_norm": 2.3109462775161265, + "language_loss": 0.77229071, + "learning_rate": 3.258645826569261e-06, + "loss": 0.79351127, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.46679688, + "step": 5070, + "time_per_iteration": 2.404808759689331 + }, + { + "auxiliary_loss_clip": 0.01075204, + "auxiliary_loss_mlp": 0.01055801, + "balance_loss_clip": 1.017416, + "balance_loss_mlp": 1.02359009, + "epoch": 0.3048850142792725, + "flos": 26289493038720.0, + "grad_norm": 2.0943075367219146, + "language_loss": 0.8311432, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.85245323, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.51953125, + "step": 5071, + "time_per_iteration": 2.4573731422424316 + }, + { + "auxiliary_loss_clip": 0.01074061, + "auxiliary_loss_mlp": 0.01057234, + "balance_loss_clip": 1.01896739, + "balance_loss_mlp": 1.02189803, + "epoch": 0.30494513753194047, + "flos": 22345145677440.0, + "grad_norm": 1.8257980131169882, + "language_loss": 0.76912189, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.79043484, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5234375, + "step": 5072, + "time_per_iteration": 2.4377450942993164 + }, + { + "auxiliary_loss_clip": 0.01070713, + "auxiliary_loss_mlp": 0.01061527, + "balance_loss_clip": 1.02574086, + "balance_loss_mlp": 1.02189088, + "epoch": 0.30500526078460843, + "flos": 19536627444480.0, + "grad_norm": 2.2120361172486005, + "language_loss": 0.73716968, + "learning_rate": 3.257737608512723e-06, + "loss": 0.75849211, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.48828125, + "step": 5073, + "time_per_iteration": 2.382911443710327 + }, + { + "auxiliary_loss_clip": 0.01077694, + "auxiliary_loss_mlp": 0.0106406, + "balance_loss_clip": 1.02493572, + "balance_loss_mlp": 1.02424109, + "epoch": 0.3050653840372764, + "flos": 14464445656320.0, + "grad_norm": 2.2791191539617612, + "language_loss": 0.78526831, + "learning_rate": 3.257434773758163e-06, + "loss": 0.80668586, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.53515625, + "step": 5074, + "time_per_iteration": 2.3593873977661133 + }, + { + "auxiliary_loss_clip": 0.01071255, + "auxiliary_loss_mlp": 0.01053342, + "balance_loss_clip": 1.01881862, + "balance_loss_mlp": 1.02273953, + "epoch": 0.30512550728994436, + "flos": 24242640226560.0, + "grad_norm": 2.0732910532876283, + "language_loss": 0.75352424, + "learning_rate": 3.25713189132155e-06, + "loss": 0.77477026, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.484375, + "step": 5075, + "time_per_iteration": 2.5213305950164795 + }, + { + "auxiliary_loss_clip": 0.01074799, + "auxiliary_loss_mlp": 0.01060936, + "balance_loss_clip": 1.02188325, + "balance_loss_mlp": 1.02275085, + "epoch": 0.30518563054261233, + "flos": 16359670967040.0, + "grad_norm": 1.8785706085053306, + "language_loss": 0.7660538, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.78741109, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.51953125, + "step": 5076, + "time_per_iteration": 2.350162982940674 + }, + { + "auxiliary_loss_clip": 0.01072016, + "auxiliary_loss_mlp": 0.01056173, + "balance_loss_clip": 1.02100658, + "balance_loss_mlp": 1.02230453, + "epoch": 0.30524575379528035, + "flos": 21578522843520.0, + "grad_norm": 2.474348726590459, + "language_loss": 0.80544156, + "learning_rate": 3.25652598344811e-06, + "loss": 0.82672346, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.49609375, + "step": 5077, + "time_per_iteration": 3.8848776817321777 + }, + { + "auxiliary_loss_clip": 0.01068255, + "auxiliary_loss_mlp": 0.01049145, + "balance_loss_clip": 1.01632726, + "balance_loss_mlp": 1.02172041, + "epoch": 0.3053058770479483, + "flos": 16544291569920.0, + "grad_norm": 1.709074816197999, + "language_loss": 0.7642765, + "learning_rate": 3.256222958034259e-06, + "loss": 0.78545046, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46484375, + "step": 5078, + "time_per_iteration": 2.394915819168091 + }, + { + "auxiliary_loss_clip": 0.01070279, + "auxiliary_loss_mlp": 0.01065772, + "balance_loss_clip": 1.03046238, + "balance_loss_mlp": 1.02170062, + "epoch": 0.3053660003006163, + "flos": 12312085115520.0, + "grad_norm": 2.1051780160616684, + "language_loss": 0.68505454, + "learning_rate": 3.255919884984307e-06, + "loss": 0.70641506, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.484375, + "step": 5079, + "time_per_iteration": 2.3724892139434814 + }, + { + "auxiliary_loss_clip": 0.01069386, + "auxiliary_loss_mlp": 0.0106075, + "balance_loss_clip": 1.02575016, + "balance_loss_mlp": 1.02117777, + "epoch": 0.30542612355328425, + "flos": 23111175018240.0, + "grad_norm": 2.237505647100242, + "language_loss": 0.81399369, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.83529508, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.48242188, + "step": 5080, + "time_per_iteration": 3.84330415725708 + }, + { + "auxiliary_loss_clip": 0.01070526, + "auxiliary_loss_mlp": 0.01054244, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.02165651, + "epoch": 0.3054862468059522, + "flos": 24388297885440.0, + "grad_norm": 2.8982308064933404, + "language_loss": 0.825095, + "learning_rate": 3.255313596022074e-06, + "loss": 0.84634268, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.48828125, + "step": 5081, + "time_per_iteration": 2.4548110961914062 + }, + { + "auxiliary_loss_clip": 0.010701, + "auxiliary_loss_mlp": 0.01052634, + "balance_loss_clip": 1.02035189, + "balance_loss_mlp": 1.02161944, + "epoch": 0.3055463700586202, + "flos": 29384857935360.0, + "grad_norm": 1.6597751107066752, + "language_loss": 0.72529352, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74652088, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.484375, + "step": 5082, + "time_per_iteration": 2.4842379093170166 + }, + { + "auxiliary_loss_clip": 0.01072779, + "auxiliary_loss_mlp": 0.01056699, + "balance_loss_clip": 1.01836181, + "balance_loss_mlp": 1.02170897, + "epoch": 0.30560649331128814, + "flos": 25590636887040.0, + "grad_norm": 1.8493963364440114, + "language_loss": 0.74149477, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.76278949, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.51171875, + "step": 5083, + "time_per_iteration": 5.257324457168579 + }, + { + "auxiliary_loss_clip": 0.01071331, + "auxiliary_loss_mlp": 0.01049217, + "balance_loss_clip": 1.01464653, + "balance_loss_mlp": 1.0221734, + "epoch": 0.3056666165639561, + "flos": 19127515599360.0, + "grad_norm": 1.7645036763832327, + "language_loss": 0.71960485, + "learning_rate": 3.254403805595344e-06, + "loss": 0.74081033, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.4921875, + "step": 5084, + "time_per_iteration": 2.380970001220703 + }, + { + "auxiliary_loss_clip": 0.01073299, + "auxiliary_loss_mlp": 0.01049277, + "balance_loss_clip": 1.01577914, + "balance_loss_mlp": 1.02257752, + "epoch": 0.30572673981662407, + "flos": 15522942389760.0, + "grad_norm": 2.072618882303092, + "language_loss": 0.80779374, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.82901949, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5078125, + "step": 5085, + "time_per_iteration": 2.4142045974731445 + }, + { + "auxiliary_loss_clip": 0.01065951, + "auxiliary_loss_mlp": 0.01044643, + "balance_loss_clip": 1.01285017, + "balance_loss_mlp": 1.01950753, + "epoch": 0.30578686306929204, + "flos": 21505484545920.0, + "grad_norm": 1.7264796856460227, + "language_loss": 0.79084915, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.81195509, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46484375, + "step": 5086, + "time_per_iteration": 2.396934986114502 + }, + { + "auxiliary_loss_clip": 0.01069689, + "auxiliary_loss_mlp": 0.01052273, + "balance_loss_clip": 1.01867986, + "balance_loss_mlp": 1.02096426, + "epoch": 0.30584698632196, + "flos": 20953368305280.0, + "grad_norm": 1.843983980112431, + "language_loss": 0.77353632, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79475594, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.48828125, + "step": 5087, + "time_per_iteration": 2.403778076171875 + }, + { + "auxiliary_loss_clip": 0.01073403, + "auxiliary_loss_mlp": 0.01050708, + "balance_loss_clip": 1.01690078, + "balance_loss_mlp": 1.02235365, + "epoch": 0.30590710957462797, + "flos": 24679962316800.0, + "grad_norm": 2.1140051699510236, + "language_loss": 0.74698031, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.76822138, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.51171875, + "step": 5088, + "time_per_iteration": 2.4165072441101074 + }, + { + "auxiliary_loss_clip": 0.01075339, + "auxiliary_loss_mlp": 0.01050549, + "balance_loss_clip": 1.01566827, + "balance_loss_mlp": 1.02293801, + "epoch": 0.30596723282729593, + "flos": 17086108959360.0, + "grad_norm": 5.6045978409837165, + "language_loss": 0.81893152, + "learning_rate": 3.252886537028521e-06, + "loss": 0.84019041, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5234375, + "step": 5089, + "time_per_iteration": 2.404055118560791 + }, + { + "auxiliary_loss_clip": 0.01071166, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.01808858, + "balance_loss_mlp": 1.02147269, + "epoch": 0.30602735607996395, + "flos": 22855994824320.0, + "grad_norm": 1.9275203758650057, + "language_loss": 0.78536457, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.80658162, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.49609375, + "step": 5090, + "time_per_iteration": 2.405430316925049 + }, + { + "auxiliary_loss_clip": 0.01077427, + "auxiliary_loss_mlp": 0.01060733, + "balance_loss_clip": 1.02456522, + "balance_loss_mlp": 1.02496302, + "epoch": 0.3060874793326319, + "flos": 29860200362880.0, + "grad_norm": 1.9930822413564173, + "language_loss": 0.78151226, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.80289388, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5234375, + "step": 5091, + "time_per_iteration": 2.459881544113159 + }, + { + "auxiliary_loss_clip": 0.01073567, + "auxiliary_loss_mlp": 0.01060388, + "balance_loss_clip": 1.02367127, + "balance_loss_mlp": 1.02266383, + "epoch": 0.3061476025852999, + "flos": 20447546394240.0, + "grad_norm": 1.7311628444979967, + "language_loss": 0.73208487, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.75342441, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.5078125, + "step": 5092, + "time_per_iteration": 2.3901138305664062 + }, + { + "auxiliary_loss_clip": 0.01070795, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_clip": 1.01618099, + "balance_loss_mlp": 1.02308965, + "epoch": 0.30620772583796785, + "flos": 19390446115200.0, + "grad_norm": 3.044908019468082, + "language_loss": 0.84100223, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.86220455, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4765625, + "step": 5093, + "time_per_iteration": 2.421408176422119 + }, + { + "auxiliary_loss_clip": 0.0107106, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.01815999, + "balance_loss_mlp": 1.02282286, + "epoch": 0.3062678490906358, + "flos": 24023420599680.0, + "grad_norm": 1.8841030791124576, + "language_loss": 0.76325655, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.7844677, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.48242188, + "step": 5094, + "time_per_iteration": 2.409820079803467 + }, + { + "auxiliary_loss_clip": 0.01071402, + "auxiliary_loss_mlp": 0.01051104, + "balance_loss_clip": 1.01841688, + "balance_loss_mlp": 1.02274799, + "epoch": 0.3063279723433038, + "flos": 19753647655680.0, + "grad_norm": 2.799021347419914, + "language_loss": 0.7707051, + "learning_rate": 3.251064247058868e-06, + "loss": 0.7919302, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.48828125, + "step": 5095, + "time_per_iteration": 2.435837745666504 + }, + { + "auxiliary_loss_clip": 0.01069714, + "auxiliary_loss_mlp": 0.01053382, + "balance_loss_clip": 1.01947904, + "balance_loss_mlp": 1.02248693, + "epoch": 0.30638809559597174, + "flos": 22449082394880.0, + "grad_norm": 1.7696949478451667, + "language_loss": 0.81516457, + "learning_rate": 3.250760365955042e-06, + "loss": 0.8363955, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.47265625, + "step": 5096, + "time_per_iteration": 2.388474702835083 + }, + { + "auxiliary_loss_clip": 0.0107025, + "auxiliary_loss_mlp": 0.01051935, + "balance_loss_clip": 1.01655352, + "balance_loss_mlp": 1.02128613, + "epoch": 0.3064482188486397, + "flos": 17164209404160.0, + "grad_norm": 12.773784040828941, + "language_loss": 0.83446789, + "learning_rate": 3.250456437422258e-06, + "loss": 0.85568976, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48828125, + "step": 5097, + "time_per_iteration": 2.3898282051086426 + }, + { + "auxiliary_loss_clip": 0.01071244, + "auxiliary_loss_mlp": 0.01057033, + "balance_loss_clip": 1.0220567, + "balance_loss_mlp": 1.02256751, + "epoch": 0.3065083421013077, + "flos": 23767367621760.0, + "grad_norm": 2.0541065209400626, + "language_loss": 0.78719682, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80847961, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.48632812, + "step": 5098, + "time_per_iteration": 2.4099769592285156 + }, + { + "auxiliary_loss_clip": 0.01069837, + "auxiliary_loss_mlp": 0.01056112, + "balance_loss_clip": 1.02390122, + "balance_loss_mlp": 1.02239275, + "epoch": 0.30656846535397564, + "flos": 26430647132160.0, + "grad_norm": 1.9589480011886165, + "language_loss": 0.85348004, + "learning_rate": 3.249848438115917e-06, + "loss": 0.87473953, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.47460938, + "step": 5099, + "time_per_iteration": 2.461533546447754 + }, + { + "auxiliary_loss_clip": 0.01070654, + "auxiliary_loss_mlp": 0.01064653, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.02108455, + "epoch": 0.3066285886066436, + "flos": 26650564986240.0, + "grad_norm": 3.0460231258942607, + "language_loss": 0.86098075, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.88233376, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.49609375, + "step": 5100, + "time_per_iteration": 2.4253268241882324 + }, + { + "auxiliary_loss_clip": 0.01069276, + "auxiliary_loss_mlp": 0.01053759, + "balance_loss_clip": 1.01663697, + "balance_loss_mlp": 1.01954019, + "epoch": 0.30668871185931157, + "flos": 15049031328000.0, + "grad_norm": 1.8777134575389212, + "language_loss": 0.80407083, + "learning_rate": 3.249240249232065e-06, + "loss": 0.82530117, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.49804688, + "step": 5101, + "time_per_iteration": 2.4237868785858154 + }, + { + "auxiliary_loss_clip": 0.01072588, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.02169323, + "balance_loss_mlp": 1.02206755, + "epoch": 0.30674883511197953, + "flos": 20081133008640.0, + "grad_norm": 1.6912147011330148, + "language_loss": 0.8122707, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.83360356, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.50390625, + "step": 5102, + "time_per_iteration": 2.392975330352783 + }, + { + "auxiliary_loss_clip": 0.01073404, + "auxiliary_loss_mlp": 0.01055228, + "balance_loss_clip": 1.01865447, + "balance_loss_mlp": 1.02296972, + "epoch": 0.30680895836464755, + "flos": 22892688529920.0, + "grad_norm": 3.4190337351277953, + "language_loss": 0.90513766, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.92642397, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.50390625, + "step": 5103, + "time_per_iteration": 2.4399325847625732 + }, + { + "auxiliary_loss_clip": 0.01071053, + "auxiliary_loss_mlp": 0.01057152, + "balance_loss_clip": 1.02186584, + "balance_loss_mlp": 1.02132845, + "epoch": 0.3068690816173155, + "flos": 23695027551360.0, + "grad_norm": 1.7984118032353333, + "language_loss": 0.75549507, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.77677715, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.49609375, + "step": 5104, + "time_per_iteration": 2.404832601547241 + }, + { + "auxiliary_loss_clip": 0.01074074, + "auxiliary_loss_mlp": 0.01058332, + "balance_loss_clip": 1.02383316, + "balance_loss_mlp": 1.02271652, + "epoch": 0.3069292048699835, + "flos": 23549893562880.0, + "grad_norm": 2.0163609215055245, + "language_loss": 0.74245441, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.76377851, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 5105, + "time_per_iteration": 2.4386074542999268 + }, + { + "auxiliary_loss_clip": 0.0107264, + "auxiliary_loss_mlp": 0.01053785, + "balance_loss_clip": 1.01685381, + "balance_loss_mlp": 1.02163982, + "epoch": 0.30698932812265145, + "flos": 24530604053760.0, + "grad_norm": 1.917561820958087, + "language_loss": 0.88094896, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.90221322, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5078125, + "step": 5106, + "time_per_iteration": 2.424337863922119 + }, + { + "auxiliary_loss_clip": 0.0107638, + "auxiliary_loss_mlp": 0.01069652, + "balance_loss_clip": 1.02957392, + "balance_loss_mlp": 1.02295649, + "epoch": 0.3070494513753194, + "flos": 20995368537600.0, + "grad_norm": 2.4992034250415216, + "language_loss": 0.73744541, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.75890571, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.53125, + "step": 5107, + "time_per_iteration": 2.3932113647460938 + }, + { + "auxiliary_loss_clip": 0.01071908, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.02489972, + "balance_loss_mlp": 1.02182615, + "epoch": 0.3071095746279874, + "flos": 19024940424960.0, + "grad_norm": 3.1394699043648195, + "language_loss": 0.73471355, + "learning_rate": 3.247110096547814e-06, + "loss": 0.75602281, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.5, + "step": 5108, + "time_per_iteration": 2.387723207473755 + }, + { + "auxiliary_loss_clip": 0.01071328, + "auxiliary_loss_mlp": 0.01054329, + "balance_loss_clip": 1.0177561, + "balance_loss_mlp": 1.02197421, + "epoch": 0.30716969788065535, + "flos": 21214448519040.0, + "grad_norm": 1.6564544204465266, + "language_loss": 0.87507427, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.89633083, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.4921875, + "step": 5109, + "time_per_iteration": 2.455106019973755 + }, + { + "auxiliary_loss_clip": 0.01071505, + "auxiliary_loss_mlp": 0.01052261, + "balance_loss_clip": 1.01718938, + "balance_loss_mlp": 1.02105212, + "epoch": 0.3072298211333233, + "flos": 25771661619840.0, + "grad_norm": 1.6402262828633027, + "language_loss": 0.68449187, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.70572954, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.50390625, + "step": 5110, + "time_per_iteration": 2.415372133255005 + }, + { + "auxiliary_loss_clip": 0.01070186, + "auxiliary_loss_mlp": 0.01049501, + "balance_loss_clip": 1.01571751, + "balance_loss_mlp": 1.02219009, + "epoch": 0.3072899443859913, + "flos": 25847737205760.0, + "grad_norm": 1.8549822490896526, + "language_loss": 0.77528942, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79648638, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.48046875, + "step": 5111, + "time_per_iteration": 2.437851905822754 + }, + { + "auxiliary_loss_clip": 0.01072254, + "auxiliary_loss_mlp": 0.01058473, + "balance_loss_clip": 1.02216148, + "balance_loss_mlp": 1.02114236, + "epoch": 0.30735006763865924, + "flos": 25921578464640.0, + "grad_norm": 1.7280769296340903, + "language_loss": 0.69136262, + "learning_rate": 3.245891825796765e-06, + "loss": 0.71266985, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5078125, + "step": 5112, + "time_per_iteration": 2.4202606678009033 + }, + { + "auxiliary_loss_clip": 0.01075963, + "auxiliary_loss_mlp": 0.01070501, + "balance_loss_clip": 1.03073263, + "balance_loss_mlp": 1.02263546, + "epoch": 0.3074101908913272, + "flos": 30915764542080.0, + "grad_norm": 2.0524838202256266, + "language_loss": 0.80877024, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.83023489, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.53125, + "step": 5113, + "time_per_iteration": 2.4735631942749023 + }, + { + "auxiliary_loss_clip": 0.01074797, + "auxiliary_loss_mlp": 0.01063042, + "balance_loss_clip": 1.02520525, + "balance_loss_mlp": 1.0229547, + "epoch": 0.30747031414399517, + "flos": 18400204823040.0, + "grad_norm": 1.975581193867569, + "language_loss": 0.78863084, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.81000924, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.51953125, + "step": 5114, + "time_per_iteration": 2.3644065856933594 + }, + { + "auxiliary_loss_clip": 0.01074065, + "auxiliary_loss_mlp": 0.01050881, + "balance_loss_clip": 1.01476049, + "balance_loss_mlp": 1.02334118, + "epoch": 0.30753043739666314, + "flos": 22632201809280.0, + "grad_norm": 1.8341787668972858, + "language_loss": 0.63276172, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.65401119, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5078125, + "step": 5115, + "time_per_iteration": 2.408470869064331 + }, + { + "auxiliary_loss_clip": 0.01073069, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.018677, + "balance_loss_mlp": 1.02237582, + "epoch": 0.3075905606493311, + "flos": 27342857802240.0, + "grad_norm": 1.7557693363000624, + "language_loss": 0.83292222, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.8541832, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 5116, + "time_per_iteration": 3.9663705825805664 + }, + { + "auxiliary_loss_clip": 0.01070894, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.02092028, + "balance_loss_mlp": 1.02181041, + "epoch": 0.3076506839019991, + "flos": 22089721104000.0, + "grad_norm": 1.9637438441135333, + "language_loss": 0.77014351, + "learning_rate": 3.244367924446952e-06, + "loss": 0.79141974, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.49023438, + "step": 5117, + "time_per_iteration": 2.4196743965148926 + }, + { + "auxiliary_loss_clip": 0.01073729, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.01855397, + "balance_loss_mlp": 1.02193701, + "epoch": 0.3077108071546671, + "flos": 21288429423360.0, + "grad_norm": 2.1355947994428566, + "language_loss": 0.73208177, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.75335747, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.515625, + "step": 5118, + "time_per_iteration": 2.3682048320770264 + }, + { + "auxiliary_loss_clip": 0.01070619, + "auxiliary_loss_mlp": 0.01049174, + "balance_loss_clip": 1.01481807, + "balance_loss_mlp": 1.02193594, + "epoch": 0.30777093040733505, + "flos": 21430002453120.0, + "grad_norm": 1.692579113266664, + "language_loss": 0.75368643, + "learning_rate": 3.243758033520219e-06, + "loss": 0.77488434, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48828125, + "step": 5119, + "time_per_iteration": 2.416313886642456 + }, + { + "auxiliary_loss_clip": 0.01073292, + "auxiliary_loss_mlp": 0.01058437, + "balance_loss_clip": 1.02107692, + "balance_loss_mlp": 1.02215159, + "epoch": 0.307831053660003, + "flos": 23148148014720.0, + "grad_norm": 1.7687099637013077, + "language_loss": 0.81672311, + "learning_rate": 3.243453017305926e-06, + "loss": 0.83804047, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.51171875, + "step": 5120, + "time_per_iteration": 3.7767655849456787 + }, + { + "auxiliary_loss_clip": 0.01068265, + "auxiliary_loss_mlp": 0.01057199, + "balance_loss_clip": 1.02412999, + "balance_loss_mlp": 1.01970601, + "epoch": 0.307891176912671, + "flos": 17018796124800.0, + "grad_norm": 3.4130728231104692, + "language_loss": 0.81701326, + "learning_rate": 3.24314795393977e-06, + "loss": 0.83826798, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48632812, + "step": 5121, + "time_per_iteration": 2.389071464538574 + }, + { + "auxiliary_loss_clip": 0.01070284, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.0129844, + "balance_loss_mlp": 1.02178788, + "epoch": 0.30795130016533895, + "flos": 27703929749760.0, + "grad_norm": 1.5933910230576407, + "language_loss": 0.83659053, + "learning_rate": 3.242842843433319e-06, + "loss": 0.85775483, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.484375, + "step": 5122, + "time_per_iteration": 3.875358819961548 + }, + { + "auxiliary_loss_clip": 0.01020475, + "auxiliary_loss_mlp": 0.01005536, + "balance_loss_clip": 1.00219834, + "balance_loss_mlp": 1.00863051, + "epoch": 0.3080114234180069, + "flos": 69054987294720.0, + "grad_norm": 0.7449626435355371, + "language_loss": 0.58689028, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60715038, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.03344727, + "router_z_loss_mlp": 0.11816406, + "step": 5123, + "time_per_iteration": 4.671058893203735 + }, + { + "auxiliary_loss_clip": 0.01072968, + "auxiliary_loss_mlp": 0.01053676, + "balance_loss_clip": 1.0162915, + "balance_loss_mlp": 1.02206922, + "epoch": 0.3080715466706749, + "flos": 24059101875840.0, + "grad_norm": 1.6571578251871193, + "language_loss": 0.84449691, + "learning_rate": 3.242232481045813e-06, + "loss": 0.86576331, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5078125, + "step": 5124, + "time_per_iteration": 2.4056413173675537 + }, + { + "auxiliary_loss_clip": 0.01073926, + "auxiliary_loss_mlp": 0.01051614, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.02237928, + "epoch": 0.30813166992334284, + "flos": 25847492826240.0, + "grad_norm": 2.3927459047350017, + "language_loss": 0.81188536, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.83314073, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.515625, + "step": 5125, + "time_per_iteration": 2.4360854625701904 + }, + { + "auxiliary_loss_clip": 0.01075497, + "auxiliary_loss_mlp": 0.01052795, + "balance_loss_clip": 1.01519585, + "balance_loss_mlp": 1.02297163, + "epoch": 0.3081917931760108, + "flos": 20448558823680.0, + "grad_norm": 2.049789417834457, + "language_loss": 0.66047478, + "learning_rate": 3.241621930235989e-06, + "loss": 0.68175769, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.52734375, + "step": 5126, + "time_per_iteration": 2.3739819526672363 + }, + { + "auxiliary_loss_clip": 0.01068805, + "auxiliary_loss_mlp": 0.01052961, + "balance_loss_clip": 1.02199078, + "balance_loss_mlp": 1.02159858, + "epoch": 0.3082519164286788, + "flos": 22165098462720.0, + "grad_norm": 1.6946554675286505, + "language_loss": 0.87967771, + "learning_rate": 3.241316584201646e-06, + "loss": 0.90089536, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.47265625, + "step": 5127, + "time_per_iteration": 2.4258644580841064 + }, + { + "auxiliary_loss_clip": 0.01070685, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.02473295, + "balance_loss_mlp": 1.0217967, + "epoch": 0.30831203968134674, + "flos": 28912133859840.0, + "grad_norm": 2.2447140051668057, + "language_loss": 0.69969273, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.72102189, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.48828125, + "step": 5128, + "time_per_iteration": 2.4938104152679443 + }, + { + "auxiliary_loss_clip": 0.01071814, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_clip": 1.02510285, + "balance_loss_mlp": 1.02182305, + "epoch": 0.3083721629340147, + "flos": 25666503004800.0, + "grad_norm": 1.9658916150293373, + "language_loss": 0.73451251, + "learning_rate": 3.240705750931993e-06, + "loss": 0.75581384, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.5, + "step": 5129, + "time_per_iteration": 2.4641618728637695 + }, + { + "auxiliary_loss_clip": 0.01018701, + "auxiliary_loss_mlp": 0.01007454, + "balance_loss_clip": 1.00380599, + "balance_loss_mlp": 1.00696802, + "epoch": 0.3084322861866827, + "flos": 68209181763840.0, + "grad_norm": 0.8516039853672059, + "language_loss": 0.5937503, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61401188, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.03637695, + "router_z_loss_mlp": 0.1171875, + "step": 5130, + "time_per_iteration": 2.9810140132904053 + }, + { + "auxiliary_loss_clip": 0.01073298, + "auxiliary_loss_mlp": 0.01059563, + "balance_loss_clip": 1.02391982, + "balance_loss_mlp": 1.02233434, + "epoch": 0.3084924094393507, + "flos": 20295639601920.0, + "grad_norm": 2.4772547117305095, + "language_loss": 0.75050163, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.77183026, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.51171875, + "step": 5131, + "time_per_iteration": 2.401658773422241 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01052481, + "balance_loss_clip": 1.01996088, + "balance_loss_mlp": 1.02167547, + "epoch": 0.30855253269201866, + "flos": 23948741468160.0, + "grad_norm": 1.5160392369184326, + "language_loss": 0.72419083, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.74541593, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.484375, + "step": 5132, + "time_per_iteration": 2.39300537109375 + }, + { + "auxiliary_loss_clip": 0.01066037, + "auxiliary_loss_mlp": 0.010552, + "balance_loss_clip": 1.02513528, + "balance_loss_mlp": 1.02089572, + "epoch": 0.3086126559446866, + "flos": 19280853757440.0, + "grad_norm": 1.7102182831522172, + "language_loss": 0.91429913, + "learning_rate": 3.239483519913136e-06, + "loss": 0.93551153, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45117188, + "step": 5133, + "time_per_iteration": 2.396352529525757 + }, + { + "auxiliary_loss_clip": 0.01072075, + "auxiliary_loss_mlp": 0.01055621, + "balance_loss_clip": 1.02081203, + "balance_loss_mlp": 1.02227795, + "epoch": 0.3086727791973546, + "flos": 33759510203520.0, + "grad_norm": 2.3087133604267396, + "language_loss": 0.6917603, + "learning_rate": 3.239177844626102e-06, + "loss": 0.71303725, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.49804688, + "step": 5134, + "time_per_iteration": 2.481449604034424 + }, + { + "auxiliary_loss_clip": 0.01072189, + "auxiliary_loss_mlp": 0.01051454, + "balance_loss_clip": 1.01757526, + "balance_loss_mlp": 1.02207804, + "epoch": 0.30873290245002255, + "flos": 16033232954880.0, + "grad_norm": 1.9314705828625423, + "language_loss": 0.84629381, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.86753023, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5, + "step": 5135, + "time_per_iteration": 2.3870115280151367 + }, + { + "auxiliary_loss_clip": 0.01015505, + "auxiliary_loss_mlp": 0.01023855, + "balance_loss_clip": 1.01965928, + "balance_loss_mlp": 1.00410891, + "epoch": 0.3087930257026905, + "flos": 65044618819200.0, + "grad_norm": 0.7118924682739293, + "language_loss": 0.55389786, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57429141, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.04199219, + "router_z_loss_mlp": 0.11376953, + "step": 5136, + "time_per_iteration": 3.0375308990478516 + }, + { + "auxiliary_loss_clip": 0.01070233, + "auxiliary_loss_mlp": 0.01053123, + "balance_loss_clip": 1.02124608, + "balance_loss_mlp": 1.02128077, + "epoch": 0.3088531489553585, + "flos": 74736301173120.0, + "grad_norm": 1.9085728896238858, + "language_loss": 0.77449036, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.79572392, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.49023438, + "step": 5137, + "time_per_iteration": 2.880054235458374 + }, + { + "auxiliary_loss_clip": 0.01068389, + "auxiliary_loss_mlp": 0.01048948, + "balance_loss_clip": 1.02056444, + "balance_loss_mlp": 1.02134895, + "epoch": 0.30891327220802645, + "flos": 21141235664640.0, + "grad_norm": 2.2764489233321425, + "language_loss": 0.80530363, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82647699, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.46875, + "step": 5138, + "time_per_iteration": 2.3701059818267822 + }, + { + "auxiliary_loss_clip": 0.01070075, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.01545072, + "balance_loss_mlp": 1.02146792, + "epoch": 0.3089733954606944, + "flos": 25663360982400.0, + "grad_norm": 1.7518389281510307, + "language_loss": 0.82679707, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.84797263, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.484375, + "step": 5139, + "time_per_iteration": 2.458500385284424 + }, + { + "auxiliary_loss_clip": 0.01075578, + "auxiliary_loss_mlp": 0.01055916, + "balance_loss_clip": 1.01798403, + "balance_loss_mlp": 1.02370858, + "epoch": 0.3090335187133624, + "flos": 19426336859520.0, + "grad_norm": 1.8879372384968347, + "language_loss": 0.79357803, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.81489289, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.515625, + "step": 5140, + "time_per_iteration": 2.364794969558716 + }, + { + "auxiliary_loss_clip": 0.01066305, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.01977372, + "balance_loss_mlp": 1.0214467, + "epoch": 0.30909364196603034, + "flos": 20010294126720.0, + "grad_norm": 1.780706563398438, + "language_loss": 0.8050015, + "learning_rate": 3.237036802553252e-06, + "loss": 0.82613695, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.44921875, + "step": 5141, + "time_per_iteration": 2.442444086074829 + }, + { + "auxiliary_loss_clip": 0.01073107, + "auxiliary_loss_mlp": 0.0105638, + "balance_loss_clip": 1.02300119, + "balance_loss_mlp": 1.02266121, + "epoch": 0.3091537652186983, + "flos": 19676699285760.0, + "grad_norm": 2.2631928294318064, + "language_loss": 0.89642549, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.91772032, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.50390625, + "step": 5142, + "time_per_iteration": 2.375620126724243 + }, + { + "auxiliary_loss_clip": 0.01070304, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_clip": 1.01875257, + "balance_loss_mlp": 1.02100313, + "epoch": 0.3092138884713663, + "flos": 17019075415680.0, + "grad_norm": 1.9233390305670097, + "language_loss": 0.81350285, + "learning_rate": 3.23642465389567e-06, + "loss": 0.83472055, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 5143, + "time_per_iteration": 2.4567012786865234 + }, + { + "auxiliary_loss_clip": 0.01068455, + "auxiliary_loss_mlp": 0.01052312, + "balance_loss_clip": 1.01888585, + "balance_loss_mlp": 1.02046013, + "epoch": 0.3092740117240343, + "flos": 25008809212800.0, + "grad_norm": 1.7074340801698307, + "language_loss": 0.73157942, + "learning_rate": 3.236118509233055e-06, + "loss": 0.75278699, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.48046875, + "step": 5144, + "time_per_iteration": 2.433255672454834 + }, + { + "auxiliary_loss_clip": 0.01071722, + "auxiliary_loss_mlp": 0.01054278, + "balance_loss_clip": 1.02097106, + "balance_loss_mlp": 1.02165556, + "epoch": 0.30933413497670226, + "flos": 25589310255360.0, + "grad_norm": 5.9764391078815295, + "language_loss": 0.75549734, + "learning_rate": 3.235812317696702e-06, + "loss": 0.77675736, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5, + "step": 5145, + "time_per_iteration": 2.436875104904175 + }, + { + "auxiliary_loss_clip": 0.01070195, + "auxiliary_loss_mlp": 0.01061127, + "balance_loss_clip": 1.027951, + "balance_loss_mlp": 1.02236211, + "epoch": 0.3093942582293702, + "flos": 24388507353600.0, + "grad_norm": 1.5895592941385783, + "language_loss": 0.77002192, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.79133511, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4765625, + "step": 5146, + "time_per_iteration": 2.4170501232147217 + }, + { + "auxiliary_loss_clip": 0.01070771, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_clip": 1.01764953, + "balance_loss_mlp": 1.02209139, + "epoch": 0.3094543814820382, + "flos": 19645416840960.0, + "grad_norm": 2.3295970472663674, + "language_loss": 0.67968035, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.70087421, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48632812, + "step": 5147, + "time_per_iteration": 2.4015259742736816 + }, + { + "auxiliary_loss_clip": 0.010719, + "auxiliary_loss_mlp": 0.01055918, + "balance_loss_clip": 1.02430415, + "balance_loss_mlp": 1.02245235, + "epoch": 0.30951450473470615, + "flos": 25662697666560.0, + "grad_norm": 1.921095999329307, + "language_loss": 0.76709485, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.78837305, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.49609375, + "step": 5148, + "time_per_iteration": 2.4641566276550293 + }, + { + "auxiliary_loss_clip": 0.01076082, + "auxiliary_loss_mlp": 0.01060275, + "balance_loss_clip": 1.02439356, + "balance_loss_mlp": 1.02373981, + "epoch": 0.3095746279873741, + "flos": 12019617722880.0, + "grad_norm": 2.485356254775871, + "language_loss": 0.74787021, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.7692337, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 5149, + "time_per_iteration": 2.3894944190979004 + }, + { + "auxiliary_loss_clip": 0.01073572, + "auxiliary_loss_mlp": 0.01063899, + "balance_loss_clip": 1.02706313, + "balance_loss_mlp": 1.02168274, + "epoch": 0.3096347512400421, + "flos": 23621919431040.0, + "grad_norm": 1.9249934748612914, + "language_loss": 0.87117267, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.89254737, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.51953125, + "step": 5150, + "time_per_iteration": 2.401935338973999 + }, + { + "auxiliary_loss_clip": 0.01071148, + "auxiliary_loss_mlp": 0.01058125, + "balance_loss_clip": 1.02181411, + "balance_loss_mlp": 1.02167988, + "epoch": 0.30969487449271005, + "flos": 22528195269120.0, + "grad_norm": 1.766138025565247, + "language_loss": 0.79789162, + "learning_rate": 3.233974184780424e-06, + "loss": 0.8191843, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.49414062, + "step": 5151, + "time_per_iteration": 2.4270591735839844 + }, + { + "auxiliary_loss_clip": 0.01072217, + "auxiliary_loss_mlp": 0.01048712, + "balance_loss_clip": 1.01497579, + "balance_loss_mlp": 1.02199495, + "epoch": 0.309754997745378, + "flos": 15267029057280.0, + "grad_norm": 2.154399046636942, + "language_loss": 0.68448311, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.70569241, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5, + "step": 5152, + "time_per_iteration": 2.4004569053649902 + }, + { + "auxiliary_loss_clip": 0.0107141, + "auxiliary_loss_mlp": 0.01051924, + "balance_loss_clip": 1.01778293, + "balance_loss_mlp": 1.02199745, + "epoch": 0.309815120998046, + "flos": 26978085250560.0, + "grad_norm": 2.1061627107101106, + "language_loss": 0.84798014, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.86921346, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.49609375, + "step": 5153, + "time_per_iteration": 2.4476535320281982 + }, + { + "auxiliary_loss_clip": 0.01071324, + "auxiliary_loss_mlp": 0.01054671, + "balance_loss_clip": 1.02019572, + "balance_loss_mlp": 1.02237797, + "epoch": 0.30987524425071394, + "flos": 21142073537280.0, + "grad_norm": 1.8269964929786695, + "language_loss": 0.74849284, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76975274, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49023438, + "step": 5154, + "time_per_iteration": 2.4161250591278076 + }, + { + "auxiliary_loss_clip": 0.010685, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.01979852, + "balance_loss_mlp": 1.02043998, + "epoch": 0.3099353675033819, + "flos": 15267378170880.0, + "grad_norm": 1.7969514304878627, + "language_loss": 0.77030909, + "learning_rate": 3.232747826832858e-06, + "loss": 0.79151917, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.48046875, + "step": 5155, + "time_per_iteration": 2.368520975112915 + }, + { + "auxiliary_loss_clip": 0.01073791, + "auxiliary_loss_mlp": 0.010535, + "balance_loss_clip": 1.0178566, + "balance_loss_mlp": 1.02264357, + "epoch": 0.30999549075604993, + "flos": 15412896184320.0, + "grad_norm": 1.8895514638631774, + "language_loss": 0.80037403, + "learning_rate": 3.232441120452094e-06, + "loss": 0.82164693, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.51171875, + "step": 5156, + "time_per_iteration": 3.9244837760925293 + }, + { + "auxiliary_loss_clip": 0.01072847, + "auxiliary_loss_mlp": 0.01057355, + "balance_loss_clip": 1.02202153, + "balance_loss_mlp": 1.02093959, + "epoch": 0.3100556140087179, + "flos": 23183445265920.0, + "grad_norm": 2.3248984201419063, + "language_loss": 0.7638759, + "learning_rate": 3.23213436733704e-06, + "loss": 0.78517795, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.51953125, + "step": 5157, + "time_per_iteration": 2.4296038150787354 + }, + { + "auxiliary_loss_clip": 0.01068419, + "auxiliary_loss_mlp": 0.01049068, + "balance_loss_clip": 1.01757276, + "balance_loss_mlp": 1.02121985, + "epoch": 0.31011573726138586, + "flos": 25740902845440.0, + "grad_norm": 1.6625011598891408, + "language_loss": 0.7118687, + "learning_rate": 3.231827567499327e-06, + "loss": 0.73304355, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.47265625, + "step": 5158, + "time_per_iteration": 2.432555913925171 + }, + { + "auxiliary_loss_clip": 0.01067793, + "auxiliary_loss_mlp": 0.01050715, + "balance_loss_clip": 1.0179565, + "balance_loss_mlp": 1.01933956, + "epoch": 0.3101758605140538, + "flos": 20010294126720.0, + "grad_norm": 3.4978919448837513, + "language_loss": 0.85947597, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.88066113, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.484375, + "step": 5159, + "time_per_iteration": 2.398587703704834 + }, + { + "auxiliary_loss_clip": 0.01069594, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_clip": 1.02178764, + "balance_loss_mlp": 1.02063966, + "epoch": 0.3102359837667218, + "flos": 19134672428160.0, + "grad_norm": 2.0647536007220593, + "language_loss": 0.86994302, + "learning_rate": 3.231213827702462e-06, + "loss": 0.89119703, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49023438, + "step": 5160, + "time_per_iteration": 3.788621425628662 + }, + { + "auxiliary_loss_clip": 0.01068077, + "auxiliary_loss_mlp": 0.01049375, + "balance_loss_clip": 1.01863146, + "balance_loss_mlp": 1.02092481, + "epoch": 0.31029610701938976, + "flos": 22264531614720.0, + "grad_norm": 2.1531131361922036, + "language_loss": 0.77240151, + "learning_rate": 3.230906887766584e-06, + "loss": 0.79357606, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.47265625, + "step": 5161, + "time_per_iteration": 3.8507816791534424 + }, + { + "auxiliary_loss_clip": 0.01069035, + "auxiliary_loss_mlp": 0.01061368, + "balance_loss_clip": 1.02655911, + "balance_loss_mlp": 1.02022946, + "epoch": 0.3103562302720577, + "flos": 20804533712640.0, + "grad_norm": 2.052059089141047, + "language_loss": 0.83167791, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.85298193, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48828125, + "step": 5162, + "time_per_iteration": 2.3795039653778076 + }, + { + "auxiliary_loss_clip": 0.01065536, + "auxiliary_loss_mlp": 0.01047138, + "balance_loss_clip": 1.01851606, + "balance_loss_mlp": 1.01965165, + "epoch": 0.3104163535247257, + "flos": 22343120818560.0, + "grad_norm": 1.6027752743147308, + "language_loss": 0.83714461, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85827136, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.45898438, + "step": 5163, + "time_per_iteration": 3.883543014526367 + }, + { + "auxiliary_loss_clip": 0.01073893, + "auxiliary_loss_mlp": 0.01054117, + "balance_loss_clip": 1.02059495, + "balance_loss_mlp": 1.02289534, + "epoch": 0.31047647677739365, + "flos": 21688289758080.0, + "grad_norm": 1.8925122686477232, + "language_loss": 0.77696192, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.79824197, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.51171875, + "step": 5164, + "time_per_iteration": 2.4127323627471924 + }, + { + "auxiliary_loss_clip": 0.01074417, + "auxiliary_loss_mlp": 0.01058208, + "balance_loss_clip": 1.02308917, + "balance_loss_mlp": 1.02408981, + "epoch": 0.3105366000300616, + "flos": 18916255762560.0, + "grad_norm": 2.7791989305015012, + "language_loss": 0.76344168, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.78476799, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.50390625, + "step": 5165, + "time_per_iteration": 2.3649725914001465 + }, + { + "auxiliary_loss_clip": 0.01069011, + "auxiliary_loss_mlp": 0.01053492, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.02131319, + "epoch": 0.3105967232827296, + "flos": 18259399843200.0, + "grad_norm": 1.526490629755684, + "language_loss": 0.77188253, + "learning_rate": 3.229371488178348e-06, + "loss": 0.79310757, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4765625, + "step": 5166, + "time_per_iteration": 2.4062132835388184 + }, + { + "auxiliary_loss_clip": 0.01070073, + "auxiliary_loss_mlp": 0.0106519, + "balance_loss_clip": 1.03128648, + "balance_loss_mlp": 1.02234244, + "epoch": 0.31065684653539755, + "flos": 17671288124160.0, + "grad_norm": 2.4469016985662715, + "language_loss": 0.75432926, + "learning_rate": 3.229064268360444e-06, + "loss": 0.77568185, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4765625, + "step": 5167, + "time_per_iteration": 2.3398211002349854 + }, + { + "auxiliary_loss_clip": 0.01017564, + "auxiliary_loss_mlp": 0.01011205, + "balance_loss_clip": 1.00739062, + "balance_loss_mlp": 1.00555813, + "epoch": 0.3107169697880655, + "flos": 68528742238080.0, + "grad_norm": 0.7313053793068099, + "language_loss": 0.52985996, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55014765, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.03808594, + "router_z_loss_mlp": 0.12011719, + "step": 5168, + "time_per_iteration": 3.085465431213379 + }, + { + "auxiliary_loss_clip": 0.01071998, + "auxiliary_loss_mlp": 0.01050317, + "balance_loss_clip": 1.016891, + "balance_loss_mlp": 1.02235556, + "epoch": 0.3107770930407335, + "flos": 13187881370880.0, + "grad_norm": 1.8453792944328133, + "language_loss": 0.80373275, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.82495588, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.49609375, + "step": 5169, + "time_per_iteration": 2.377397298812866 + }, + { + "auxiliary_loss_clip": 0.01071969, + "auxiliary_loss_mlp": 0.01053432, + "balance_loss_clip": 1.02036309, + "balance_loss_mlp": 1.0224613, + "epoch": 0.3108372162934015, + "flos": 31579393265280.0, + "grad_norm": 1.674792384646792, + "language_loss": 0.64768779, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66894174, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.49609375, + "step": 5170, + "time_per_iteration": 2.5138497352600098 + }, + { + "auxiliary_loss_clip": 0.01070783, + "auxiliary_loss_mlp": 0.01053655, + "balance_loss_clip": 1.01782036, + "balance_loss_mlp": 1.02193773, + "epoch": 0.31089733954606946, + "flos": 28728595509120.0, + "grad_norm": 2.3043042382350123, + "language_loss": 0.78863955, + "learning_rate": 3.22783492314295e-06, + "loss": 0.80988389, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.48828125, + "step": 5171, + "time_per_iteration": 2.479414701461792 + }, + { + "auxiliary_loss_clip": 0.01071313, + "auxiliary_loss_mlp": 0.0105794, + "balance_loss_clip": 1.02351248, + "balance_loss_mlp": 1.02224159, + "epoch": 0.3109574627987374, + "flos": 19682215280640.0, + "grad_norm": 1.8404824304962253, + "language_loss": 0.84676588, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86805832, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49023438, + "step": 5172, + "time_per_iteration": 2.3911263942718506 + }, + { + "auxiliary_loss_clip": 0.01070572, + "auxiliary_loss_mlp": 0.010587, + "balance_loss_clip": 1.02420068, + "balance_loss_mlp": 1.02136397, + "epoch": 0.3110175860514054, + "flos": 14683106701440.0, + "grad_norm": 2.477828263303185, + "language_loss": 0.86355531, + "learning_rate": 3.227219971129842e-06, + "loss": 0.884848, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.4921875, + "step": 5173, + "time_per_iteration": 2.3650271892547607 + }, + { + "auxiliary_loss_clip": 0.01068531, + "auxiliary_loss_mlp": 0.01050699, + "balance_loss_clip": 1.01999092, + "balance_loss_mlp": 1.02173376, + "epoch": 0.31107770930407336, + "flos": 25738459050240.0, + "grad_norm": 1.9878834802374086, + "language_loss": 0.84640324, + "learning_rate": 3.226912425313001e-06, + "loss": 0.86759549, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.46875, + "step": 5174, + "time_per_iteration": 2.4469003677368164 + }, + { + "auxiliary_loss_clip": 0.01072255, + "auxiliary_loss_mlp": 0.01057735, + "balance_loss_clip": 1.02278268, + "balance_loss_mlp": 1.02227283, + "epoch": 0.3111378325567413, + "flos": 19207256878080.0, + "grad_norm": 2.0306135138874195, + "language_loss": 0.86342078, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.88472062, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49804688, + "step": 5175, + "time_per_iteration": 2.367511510848999 + }, + { + "auxiliary_loss_clip": 0.01072313, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_clip": 1.02117658, + "balance_loss_mlp": 1.02314067, + "epoch": 0.3111979558094093, + "flos": 23695237019520.0, + "grad_norm": 1.8494906022999613, + "language_loss": 0.85519576, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.87643838, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4921875, + "step": 5176, + "time_per_iteration": 2.429464817047119 + }, + { + "auxiliary_loss_clip": 0.01068422, + "auxiliary_loss_mlp": 0.01053784, + "balance_loss_clip": 1.01897466, + "balance_loss_mlp": 1.02022004, + "epoch": 0.31125807906207725, + "flos": 21031957509120.0, + "grad_norm": 1.8558402953411044, + "language_loss": 0.82265091, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.84387296, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48046875, + "step": 5177, + "time_per_iteration": 2.3679139614105225 + }, + { + "auxiliary_loss_clip": 0.01069708, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.02083802, + "balance_loss_mlp": 1.02116954, + "epoch": 0.3113182023147452, + "flos": 23075493742080.0, + "grad_norm": 1.5944093429577266, + "language_loss": 0.82537478, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.84662396, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48632812, + "step": 5178, + "time_per_iteration": 2.4036431312561035 + }, + { + "auxiliary_loss_clip": 0.01073101, + "auxiliary_loss_mlp": 0.01058026, + "balance_loss_clip": 1.02347875, + "balance_loss_mlp": 1.02107978, + "epoch": 0.3113783255674132, + "flos": 11838174053760.0, + "grad_norm": 2.0467490666592303, + "language_loss": 0.83373213, + "learning_rate": 3.225373998592471e-06, + "loss": 0.85504341, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.51953125, + "step": 5179, + "time_per_iteration": 2.3373751640319824 + }, + { + "auxiliary_loss_clip": 0.01073295, + "auxiliary_loss_mlp": 0.01057846, + "balance_loss_clip": 1.02441931, + "balance_loss_mlp": 1.02401721, + "epoch": 0.31143844882008115, + "flos": 16288622616960.0, + "grad_norm": 1.6511112735748263, + "language_loss": 0.79678512, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.81809658, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4921875, + "step": 5180, + "time_per_iteration": 2.377622365951538 + }, + { + "auxiliary_loss_clip": 0.0107242, + "auxiliary_loss_mlp": 0.01051837, + "balance_loss_clip": 1.01744485, + "balance_loss_mlp": 1.02165592, + "epoch": 0.3114985720727491, + "flos": 23216787480960.0, + "grad_norm": 1.6620241999140897, + "language_loss": 0.85305786, + "learning_rate": 3.22475830255844e-06, + "loss": 0.87430048, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 5181, + "time_per_iteration": 2.382403612136841 + }, + { + "auxiliary_loss_clip": 0.01069755, + "auxiliary_loss_mlp": 0.010592, + "balance_loss_clip": 1.02787197, + "balance_loss_mlp": 1.02114058, + "epoch": 0.3115586953254171, + "flos": 30043319777280.0, + "grad_norm": 1.720482424728682, + "language_loss": 0.75751722, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.77880681, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.484375, + "step": 5182, + "time_per_iteration": 2.4983091354370117 + }, + { + "auxiliary_loss_clip": 0.01073111, + "auxiliary_loss_mlp": 0.01066273, + "balance_loss_clip": 1.02886474, + "balance_loss_mlp": 1.02227163, + "epoch": 0.3116188185780851, + "flos": 25665141461760.0, + "grad_norm": 2.5718207409272926, + "language_loss": 0.71819961, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.73959351, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.5078125, + "step": 5183, + "time_per_iteration": 2.4236068725585938 + }, + { + "auxiliary_loss_clip": 0.01017202, + "auxiliary_loss_mlp": 0.01003695, + "balance_loss_clip": 0.9999277, + "balance_loss_mlp": 1.00604808, + "epoch": 0.31167894183075306, + "flos": 69506974022400.0, + "grad_norm": 0.9546983867770389, + "language_loss": 0.59849524, + "learning_rate": 3.223834410214408e-06, + "loss": 0.6187042, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.03759766, + "router_z_loss_mlp": 0.11132812, + "step": 5184, + "time_per_iteration": 3.056886911392212 + }, + { + "auxiliary_loss_clip": 0.01070924, + "auxiliary_loss_mlp": 0.01053476, + "balance_loss_clip": 1.02038383, + "balance_loss_mlp": 1.02206254, + "epoch": 0.31173906508342103, + "flos": 14938950211200.0, + "grad_norm": 2.093033739684117, + "language_loss": 0.71596563, + "learning_rate": 3.223526353268311e-06, + "loss": 0.73720956, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48828125, + "step": 5185, + "time_per_iteration": 2.3713455200195312 + }, + { + "auxiliary_loss_clip": 0.01073267, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.02196777, + "balance_loss_mlp": 1.02223134, + "epoch": 0.311799188336089, + "flos": 16175224920960.0, + "grad_norm": 2.2982094215745903, + "language_loss": 0.65841675, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.67971742, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.51171875, + "step": 5186, + "time_per_iteration": 2.3850576877593994 + }, + { + "auxiliary_loss_clip": 0.01075341, + "auxiliary_loss_mlp": 0.01058451, + "balance_loss_clip": 1.02094746, + "balance_loss_mlp": 1.02284062, + "epoch": 0.31185931158875696, + "flos": 25008460099200.0, + "grad_norm": 2.256261223420343, + "language_loss": 0.88292789, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.90426588, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.52734375, + "step": 5187, + "time_per_iteration": 2.4243381023406982 + }, + { + "auxiliary_loss_clip": 0.01069099, + "auxiliary_loss_mlp": 0.01059357, + "balance_loss_clip": 1.02755237, + "balance_loss_mlp": 1.02058697, + "epoch": 0.3119194348414249, + "flos": 37231377868800.0, + "grad_norm": 1.4431052697488067, + "language_loss": 0.63955593, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.66084051, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.484375, + "step": 5188, + "time_per_iteration": 2.552957773208618 + }, + { + "auxiliary_loss_clip": 0.0107321, + "auxiliary_loss_mlp": 0.01057579, + "balance_loss_clip": 1.0226984, + "balance_loss_mlp": 1.02394593, + "epoch": 0.3119795580940929, + "flos": 15011883774720.0, + "grad_norm": 2.4179121015235623, + "language_loss": 0.84975147, + "learning_rate": 3.222293661638346e-06, + "loss": 0.8710593, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.4921875, + "step": 5189, + "time_per_iteration": 2.3514082431793213 + }, + { + "auxiliary_loss_clip": 0.01070178, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_clip": 1.01687777, + "balance_loss_mlp": 1.02098298, + "epoch": 0.31203968134676086, + "flos": 15997237476480.0, + "grad_norm": 1.8281733639808793, + "language_loss": 0.80836272, + "learning_rate": 3.22198537282789e-06, + "loss": 0.82958233, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.4921875, + "step": 5190, + "time_per_iteration": 2.413468599319458 + }, + { + "auxiliary_loss_clip": 0.01071203, + "auxiliary_loss_mlp": 0.0104809, + "balance_loss_clip": 1.01674974, + "balance_loss_mlp": 1.0223546, + "epoch": 0.3120998045994288, + "flos": 23836356201600.0, + "grad_norm": 1.4948799295925403, + "language_loss": 0.76419735, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.7853902, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.48828125, + "step": 5191, + "time_per_iteration": 2.4620039463043213 + }, + { + "auxiliary_loss_clip": 0.01014131, + "auxiliary_loss_mlp": 0.01008473, + "balance_loss_clip": 1.004897, + "balance_loss_mlp": 1.00306869, + "epoch": 0.3121599278520968, + "flos": 69181059680640.0, + "grad_norm": 0.8543118982853731, + "language_loss": 0.63968867, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65991479, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.11035156, + "step": 5192, + "time_per_iteration": 3.1050448417663574 + }, + { + "auxiliary_loss_clip": 0.01071262, + "auxiliary_loss_mlp": 0.01051128, + "balance_loss_clip": 1.01646233, + "balance_loss_mlp": 1.02089429, + "epoch": 0.31222005110476475, + "flos": 23805213402240.0, + "grad_norm": 1.8317926943818843, + "language_loss": 0.81799304, + "learning_rate": 3.221060228416446e-06, + "loss": 0.83921695, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.50390625, + "step": 5193, + "time_per_iteration": 2.3983662128448486 + }, + { + "auxiliary_loss_clip": 0.01070381, + "auxiliary_loss_mlp": 0.01048663, + "balance_loss_clip": 1.01480722, + "balance_loss_mlp": 1.02068388, + "epoch": 0.3122801743574327, + "flos": 25225026462720.0, + "grad_norm": 2.0039074490327637, + "language_loss": 0.74035192, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.76154232, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49609375, + "step": 5194, + "time_per_iteration": 2.4376049041748047 + }, + { + "auxiliary_loss_clip": 0.01070712, + "auxiliary_loss_mlp": 0.01049675, + "balance_loss_clip": 1.01829958, + "balance_loss_mlp": 1.02250111, + "epoch": 0.3123402976101007, + "flos": 22965377713920.0, + "grad_norm": 1.45831254579897, + "language_loss": 0.77248347, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.79368734, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.484375, + "step": 5195, + "time_per_iteration": 2.4168224334716797 + }, + { + "auxiliary_loss_clip": 0.01071347, + "auxiliary_loss_mlp": 0.01057431, + "balance_loss_clip": 1.02398038, + "balance_loss_mlp": 1.02129197, + "epoch": 0.3124004208627687, + "flos": 25190916197760.0, + "grad_norm": 1.3767205403481748, + "language_loss": 0.79075599, + "learning_rate": 3.220134667280476e-06, + "loss": 0.81204379, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.5, + "step": 5196, + "time_per_iteration": 3.878204107284546 + }, + { + "auxiliary_loss_clip": 0.01015562, + "auxiliary_loss_mlp": 0.01010079, + "balance_loss_clip": 1.00590658, + "balance_loss_mlp": 1.00425708, + "epoch": 0.31246054411543667, + "flos": 67481626608000.0, + "grad_norm": 0.7756839994348759, + "language_loss": 0.54920948, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56946588, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.11328125, + "step": 5197, + "time_per_iteration": 3.0562310218811035 + }, + { + "auxiliary_loss_clip": 0.01068732, + "auxiliary_loss_mlp": 0.01048203, + "balance_loss_clip": 1.01735198, + "balance_loss_mlp": 1.02224123, + "epoch": 0.31252066736810463, + "flos": 17857549560960.0, + "grad_norm": 1.6532230801798935, + "language_loss": 0.68239284, + "learning_rate": 3.21951739516552e-06, + "loss": 0.70356214, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46484375, + "step": 5198, + "time_per_iteration": 2.372730016708374 + }, + { + "auxiliary_loss_clip": 0.01072384, + "auxiliary_loss_mlp": 0.0104696, + "balance_loss_clip": 1.01386738, + "balance_loss_mlp": 1.02244329, + "epoch": 0.3125807906207726, + "flos": 18474150816000.0, + "grad_norm": 2.2424357622413615, + "language_loss": 0.71584618, + "learning_rate": 3.219208689735857e-06, + "loss": 0.73703963, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.5, + "step": 5199, + "time_per_iteration": 2.415602445602417 + }, + { + "auxiliary_loss_clip": 0.01070962, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_clip": 1.01953006, + "balance_loss_mlp": 1.02281141, + "epoch": 0.31264091387344056, + "flos": 18945722816640.0, + "grad_norm": 1.7232472157588758, + "language_loss": 0.80145156, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.82267475, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.48046875, + "step": 5200, + "time_per_iteration": 3.7842605113983154 + }, + { + "auxiliary_loss_clip": 0.01067784, + "auxiliary_loss_mlp": 0.01048611, + "balance_loss_clip": 1.01635313, + "balance_loss_mlp": 1.02186179, + "epoch": 0.3127010371261085, + "flos": 21467499120000.0, + "grad_norm": 1.927265265356138, + "language_loss": 0.83817327, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85933727, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45898438, + "step": 5201, + "time_per_iteration": 3.8097481727600098 + }, + { + "auxiliary_loss_clip": 0.01073978, + "auxiliary_loss_mlp": 0.01057974, + "balance_loss_clip": 1.02466679, + "balance_loss_mlp": 1.02459717, + "epoch": 0.3127611603787765, + "flos": 15335284498560.0, + "grad_norm": 2.205348501182241, + "language_loss": 0.71398854, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.73530805, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4921875, + "step": 5202, + "time_per_iteration": 3.7968175411224365 + }, + { + "auxiliary_loss_clip": 0.01071042, + "auxiliary_loss_mlp": 0.01051898, + "balance_loss_clip": 1.02247739, + "balance_loss_mlp": 1.02249014, + "epoch": 0.31282128363144446, + "flos": 17602020253440.0, + "grad_norm": 1.9016282309650459, + "language_loss": 0.86019588, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.88142526, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.48632812, + "step": 5203, + "time_per_iteration": 2.374354362487793 + }, + { + "auxiliary_loss_clip": 0.01071509, + "auxiliary_loss_mlp": 0.01052762, + "balance_loss_clip": 1.02195823, + "balance_loss_mlp": 1.02397895, + "epoch": 0.3128814068841124, + "flos": 26755653778560.0, + "grad_norm": 2.2299667211218672, + "language_loss": 0.62939286, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.6506356, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47460938, + "step": 5204, + "time_per_iteration": 2.4553256034851074 + }, + { + "auxiliary_loss_clip": 0.01069373, + "auxiliary_loss_mlp": 0.0104928, + "balance_loss_clip": 1.01871514, + "balance_loss_mlp": 1.02230287, + "epoch": 0.3129415301367804, + "flos": 22271304418560.0, + "grad_norm": 1.7491852071254264, + "language_loss": 0.67582119, + "learning_rate": 3.217355486684887e-06, + "loss": 0.69700778, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47070312, + "step": 5205, + "time_per_iteration": 2.4038052558898926 + }, + { + "auxiliary_loss_clip": 0.0107176, + "auxiliary_loss_mlp": 0.01052098, + "balance_loss_clip": 1.02103186, + "balance_loss_mlp": 1.02178526, + "epoch": 0.31300165338944835, + "flos": 26463814790400.0, + "grad_norm": 1.804577263627564, + "language_loss": 0.77489406, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79613256, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.5, + "step": 5206, + "time_per_iteration": 2.4570634365081787 + }, + { + "auxiliary_loss_clip": 0.01068624, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02443099, + "balance_loss_mlp": 1.0207665, + "epoch": 0.3130617766421163, + "flos": 21943574686080.0, + "grad_norm": 1.9279142064930386, + "language_loss": 0.84919417, + "learning_rate": 3.216737382911672e-06, + "loss": 0.87043393, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47851562, + "step": 5207, + "time_per_iteration": 2.382864475250244 + }, + { + "auxiliary_loss_clip": 0.01069752, + "auxiliary_loss_mlp": 0.01054856, + "balance_loss_clip": 1.02529263, + "balance_loss_mlp": 1.02339363, + "epoch": 0.3131218998947843, + "flos": 23291710992000.0, + "grad_norm": 1.8164120712581893, + "language_loss": 0.7296446, + "learning_rate": 3.216428261810999e-06, + "loss": 0.75089073, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.46484375, + "step": 5208, + "time_per_iteration": 2.439727783203125 + }, + { + "auxiliary_loss_clip": 0.01070382, + "auxiliary_loss_mlp": 0.01062593, + "balance_loss_clip": 1.02933407, + "balance_loss_mlp": 1.0218811, + "epoch": 0.3131820231474523, + "flos": 21138652224000.0, + "grad_norm": 2.0840254482686995, + "language_loss": 0.75391412, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.77524388, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48632812, + "step": 5209, + "time_per_iteration": 2.393115997314453 + }, + { + "auxiliary_loss_clip": 0.01070354, + "auxiliary_loss_mlp": 0.01058597, + "balance_loss_clip": 1.0259819, + "balance_loss_mlp": 1.0203135, + "epoch": 0.31324214640012027, + "flos": 23908870828800.0, + "grad_norm": 2.445655947184509, + "language_loss": 0.78262985, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.80391937, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.5, + "step": 5210, + "time_per_iteration": 2.4371728897094727 + }, + { + "auxiliary_loss_clip": 0.01065652, + "auxiliary_loss_mlp": 0.01053774, + "balance_loss_clip": 1.02250564, + "balance_loss_mlp": 1.01953793, + "epoch": 0.31330226965278823, + "flos": 22235832610560.0, + "grad_norm": 1.7577599569913829, + "language_loss": 0.80717778, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.82837206, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4609375, + "step": 5211, + "time_per_iteration": 2.39263653755188 + }, + { + "auxiliary_loss_clip": 0.01068505, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02070594, + "balance_loss_mlp": 1.02130592, + "epoch": 0.3133623929054562, + "flos": 19753019251200.0, + "grad_norm": 2.0914961147355937, + "language_loss": 0.8025614, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82376158, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47265625, + "step": 5212, + "time_per_iteration": 2.4355316162109375 + }, + { + "auxiliary_loss_clip": 0.01071788, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.02130675, + "epoch": 0.31342251615812416, + "flos": 27161030108160.0, + "grad_norm": 1.8487610218477404, + "language_loss": 0.73007202, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.75136304, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.50390625, + "step": 5213, + "time_per_iteration": 2.444756031036377 + }, + { + "auxiliary_loss_clip": 0.0107359, + "auxiliary_loss_mlp": 0.01050688, + "balance_loss_clip": 1.01645136, + "balance_loss_mlp": 1.02409983, + "epoch": 0.31348263941079213, + "flos": 20228780615040.0, + "grad_norm": 2.199763849710305, + "language_loss": 0.78746653, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.80870932, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49414062, + "step": 5214, + "time_per_iteration": 2.454724073410034 + }, + { + "auxiliary_loss_clip": 0.01066942, + "auxiliary_loss_mlp": 0.0104743, + "balance_loss_clip": 1.01833129, + "balance_loss_mlp": 1.02131057, + "epoch": 0.3135427626634601, + "flos": 24606505082880.0, + "grad_norm": 1.760291973419956, + "language_loss": 0.83633059, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.85747427, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.45703125, + "step": 5215, + "time_per_iteration": 2.4192135334014893 + }, + { + "auxiliary_loss_clip": 0.01074184, + "auxiliary_loss_mlp": 0.01047671, + "balance_loss_clip": 1.01355314, + "balance_loss_mlp": 1.02384043, + "epoch": 0.31360288591612806, + "flos": 20958814477440.0, + "grad_norm": 1.8626322683080796, + "language_loss": 0.80165428, + "learning_rate": 3.213953633415686e-06, + "loss": 0.82287282, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.50390625, + "step": 5216, + "time_per_iteration": 2.4442548751831055 + }, + { + "auxiliary_loss_clip": 0.01072399, + "auxiliary_loss_mlp": 0.01053958, + "balance_loss_clip": 1.02036464, + "balance_loss_mlp": 1.02276802, + "epoch": 0.313663009168796, + "flos": 26979272236800.0, + "grad_norm": 1.6352702738941254, + "language_loss": 0.69741678, + "learning_rate": 3.213644097593477e-06, + "loss": 0.71868038, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.49609375, + "step": 5217, + "time_per_iteration": 2.457782745361328 + }, + { + "auxiliary_loss_clip": 0.01072437, + "auxiliary_loss_mlp": 0.01051547, + "balance_loss_clip": 1.018538, + "balance_loss_mlp": 1.02316451, + "epoch": 0.313723132421464, + "flos": 18039935836800.0, + "grad_norm": 1.6935756772149768, + "language_loss": 0.81814843, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83938825, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.4921875, + "step": 5218, + "time_per_iteration": 2.4063880443573 + }, + { + "auxiliary_loss_clip": 0.01072899, + "auxiliary_loss_mlp": 0.01055729, + "balance_loss_clip": 1.02041924, + "balance_loss_mlp": 1.0238266, + "epoch": 0.31378325567413196, + "flos": 22487905693440.0, + "grad_norm": 2.318848433835022, + "language_loss": 0.70850694, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.72979319, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.4921875, + "step": 5219, + "time_per_iteration": 2.38535475730896 + }, + { + "auxiliary_loss_clip": 0.01071439, + "auxiliary_loss_mlp": 0.01051888, + "balance_loss_clip": 1.02112031, + "balance_loss_mlp": 1.02265692, + "epoch": 0.3138433789267999, + "flos": 22418149063680.0, + "grad_norm": 1.8821233299526994, + "language_loss": 0.81344795, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.83468121, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.48828125, + "step": 5220, + "time_per_iteration": 2.4430110454559326 + }, + { + "auxiliary_loss_clip": 0.01071242, + "auxiliary_loss_mlp": 0.01053281, + "balance_loss_clip": 1.02180982, + "balance_loss_mlp": 1.02231085, + "epoch": 0.3139035021794679, + "flos": 13005076158720.0, + "grad_norm": 1.8941626302371748, + "language_loss": 0.75200433, + "learning_rate": 3.212405494206986e-06, + "loss": 0.77324957, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.48828125, + "step": 5221, + "time_per_iteration": 2.3628551959991455 + }, + { + "auxiliary_loss_clip": 0.0106775, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.01683009, + "balance_loss_mlp": 1.02090764, + "epoch": 0.31396362543213585, + "flos": 16945059600000.0, + "grad_norm": 1.670124918867196, + "language_loss": 0.83362567, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.85478079, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46875, + "step": 5222, + "time_per_iteration": 2.399254083633423 + }, + { + "auxiliary_loss_clip": 0.01070247, + "auxiliary_loss_mlp": 0.01063356, + "balance_loss_clip": 1.02549505, + "balance_loss_mlp": 1.02042913, + "epoch": 0.31402374868480387, + "flos": 20155707406080.0, + "grad_norm": 2.083163638828032, + "language_loss": 0.7261641, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.74750012, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5, + "step": 5223, + "time_per_iteration": 2.39150333404541 + }, + { + "auxiliary_loss_clip": 0.0106842, + "auxiliary_loss_mlp": 0.01050806, + "balance_loss_clip": 1.02026498, + "balance_loss_mlp": 1.02054203, + "epoch": 0.31408387193747184, + "flos": 21250025061120.0, + "grad_norm": 1.9053527719262828, + "language_loss": 0.81848609, + "learning_rate": 3.211476058893379e-06, + "loss": 0.83967841, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47851562, + "step": 5224, + "time_per_iteration": 2.4357762336730957 + }, + { + "auxiliary_loss_clip": 0.01072871, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_clip": 1.02887392, + "balance_loss_mlp": 1.02176905, + "epoch": 0.3141439951901398, + "flos": 27483208934400.0, + "grad_norm": 1.95242971105236, + "language_loss": 0.59111428, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.6124922, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.51171875, + "step": 5225, + "time_per_iteration": 2.4326047897338867 + }, + { + "auxiliary_loss_clip": 0.01066427, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.01560092, + "balance_loss_mlp": 1.02176571, + "epoch": 0.31420411844280777, + "flos": 17851440072960.0, + "grad_norm": 1.8809420896722737, + "language_loss": 0.83780313, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.8589077, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.44726562, + "step": 5226, + "time_per_iteration": 2.3958053588867188 + }, + { + "auxiliary_loss_clip": 0.01069532, + "auxiliary_loss_mlp": 0.01053108, + "balance_loss_clip": 1.02106428, + "balance_loss_mlp": 1.02131128, + "epoch": 0.31426424169547573, + "flos": 21615879864960.0, + "grad_norm": 1.767596340028278, + "language_loss": 0.76043135, + "learning_rate": 3.210546210126141e-06, + "loss": 0.78165776, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.48046875, + "step": 5227, + "time_per_iteration": 2.407646417617798 + }, + { + "auxiliary_loss_clip": 0.01070162, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_clip": 1.01758218, + "balance_loss_mlp": 1.02111697, + "epoch": 0.3143243649481437, + "flos": 30919290589440.0, + "grad_norm": 1.8346805571407, + "language_loss": 0.69877088, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.71999133, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49023438, + "step": 5228, + "time_per_iteration": 2.5092973709106445 + }, + { + "auxiliary_loss_clip": 0.01068533, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_clip": 1.0193882, + "balance_loss_mlp": 1.02061296, + "epoch": 0.31438448820081166, + "flos": 22820278636800.0, + "grad_norm": 1.7395101257556493, + "language_loss": 0.8110733, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.83225656, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47851562, + "step": 5229, + "time_per_iteration": 2.4077649116516113 + }, + { + "auxiliary_loss_clip": 0.0106876, + "auxiliary_loss_mlp": 0.0104631, + "balance_loss_clip": 1.01488662, + "balance_loss_mlp": 1.02184606, + "epoch": 0.3144446114534796, + "flos": 23291082587520.0, + "grad_norm": 2.246271709480059, + "language_loss": 0.71340632, + "learning_rate": 3.209615948222611e-06, + "loss": 0.73455697, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46875, + "step": 5230, + "time_per_iteration": 2.477158546447754 + }, + { + "auxiliary_loss_clip": 0.01069277, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_clip": 1.01291358, + "balance_loss_mlp": 1.0205431, + "epoch": 0.3145047347061476, + "flos": 31354692554880.0, + "grad_norm": 1.6306315721820377, + "language_loss": 0.81126571, + "learning_rate": 3.209305769168239e-06, + "loss": 0.83241063, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.48828125, + "step": 5231, + "time_per_iteration": 2.4799187183380127 + }, + { + "auxiliary_loss_clip": 0.01068971, + "auxiliary_loss_mlp": 0.01051076, + "balance_loss_clip": 1.01867533, + "balance_loss_mlp": 1.02221942, + "epoch": 0.31456485795881556, + "flos": 10888780919040.0, + "grad_norm": 2.0811146321771026, + "language_loss": 0.87015748, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.8913579, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46875, + "step": 5232, + "time_per_iteration": 2.398193836212158 + }, + { + "auxiliary_loss_clip": 0.01069661, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.02009737, + "balance_loss_mlp": 1.02307439, + "epoch": 0.3146249812114835, + "flos": 17091485308800.0, + "grad_norm": 1.6137559948303661, + "language_loss": 0.81712723, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.83832824, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.46679688, + "step": 5233, + "time_per_iteration": 2.387022018432617 + }, + { + "auxiliary_loss_clip": 0.01073691, + "auxiliary_loss_mlp": 0.01051242, + "balance_loss_clip": 1.01965153, + "balance_loss_mlp": 1.02474451, + "epoch": 0.3146851044641515, + "flos": 55289469479040.0, + "grad_norm": 3.312536206334712, + "language_loss": 0.72098821, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.74223757, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.49023438, + "step": 5234, + "time_per_iteration": 2.7314324378967285 + }, + { + "auxiliary_loss_clip": 0.01071037, + "auxiliary_loss_mlp": 0.01046917, + "balance_loss_clip": 1.01420569, + "balance_loss_mlp": 1.02304292, + "epoch": 0.31474522771681945, + "flos": 27014674222080.0, + "grad_norm": 1.8132133769517973, + "language_loss": 0.74403185, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.76521134, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5235, + "time_per_iteration": 2.4337353706359863 + }, + { + "auxiliary_loss_clip": 0.01070018, + "auxiliary_loss_mlp": 0.01051635, + "balance_loss_clip": 1.02066469, + "balance_loss_mlp": 1.02218592, + "epoch": 0.3148053509694875, + "flos": 21250862933760.0, + "grad_norm": 2.1440482320343195, + "language_loss": 0.80217016, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.82338667, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.47851562, + "step": 5236, + "time_per_iteration": 3.822981595993042 + }, + { + "auxiliary_loss_clip": 0.01073298, + "auxiliary_loss_mlp": 0.01051203, + "balance_loss_clip": 1.01888537, + "balance_loss_mlp": 1.02301085, + "epoch": 0.31486547422215544, + "flos": 31247334524160.0, + "grad_norm": 1.6036461674562097, + "language_loss": 0.77764118, + "learning_rate": 3.207443732256881e-06, + "loss": 0.79888618, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.50390625, + "step": 5237, + "time_per_iteration": 2.4718377590179443 + }, + { + "auxiliary_loss_clip": 0.01067823, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_clip": 1.0214088, + "balance_loss_mlp": 1.02219653, + "epoch": 0.3149255974748234, + "flos": 19827593648640.0, + "grad_norm": 1.6924257704912489, + "language_loss": 0.80578446, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82695895, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.45703125, + "step": 5238, + "time_per_iteration": 2.425583600997925 + }, + { + "auxiliary_loss_clip": 0.01024325, + "auxiliary_loss_mlp": 0.01012078, + "balance_loss_clip": 1.00797677, + "balance_loss_mlp": 1.01283383, + "epoch": 0.31498572072749137, + "flos": 67680981671040.0, + "grad_norm": 0.8394430830183599, + "language_loss": 0.67995822, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70032221, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.11523438, + "step": 5239, + "time_per_iteration": 4.453044652938843 + }, + { + "auxiliary_loss_clip": 0.01072161, + "auxiliary_loss_mlp": 0.01055722, + "balance_loss_clip": 1.02234328, + "balance_loss_mlp": 1.02191567, + "epoch": 0.31504584398015933, + "flos": 19792086929280.0, + "grad_norm": 2.2192597522232185, + "language_loss": 0.84296697, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.86424577, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.50390625, + "step": 5240, + "time_per_iteration": 2.3748996257781982 + }, + { + "auxiliary_loss_clip": 0.01071374, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_clip": 1.01930583, + "balance_loss_mlp": 1.02398157, + "epoch": 0.3151059672328273, + "flos": 26614185482880.0, + "grad_norm": 1.7960838303538833, + "language_loss": 0.82376146, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.84497237, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47265625, + "step": 5241, + "time_per_iteration": 3.897196054458618 + }, + { + "auxiliary_loss_clip": 0.01066698, + "auxiliary_loss_mlp": 0.01047129, + "balance_loss_clip": 1.01757705, + "balance_loss_mlp": 1.0215764, + "epoch": 0.31516609048549526, + "flos": 24203363080320.0, + "grad_norm": 1.7329527624694296, + "language_loss": 0.757236, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.77837425, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.45117188, + "step": 5242, + "time_per_iteration": 3.86982798576355 + }, + { + "auxiliary_loss_clip": 0.01068942, + "auxiliary_loss_mlp": 0.01052236, + "balance_loss_clip": 1.0203352, + "balance_loss_mlp": 1.02181268, + "epoch": 0.31522621373816323, + "flos": 25957504120320.0, + "grad_norm": 2.9761900714500036, + "language_loss": 0.75114924, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.77236104, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.47265625, + "step": 5243, + "time_per_iteration": 2.4343652725219727 + }, + { + "auxiliary_loss_clip": 0.01068552, + "auxiliary_loss_mlp": 0.01047885, + "balance_loss_clip": 1.0170579, + "balance_loss_mlp": 1.02121615, + "epoch": 0.3152863369908312, + "flos": 21907718853120.0, + "grad_norm": 1.8462600918309915, + "language_loss": 0.66357994, + "learning_rate": 3.205269272758513e-06, + "loss": 0.68474436, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47265625, + "step": 5244, + "time_per_iteration": 2.4286348819732666 + }, + { + "auxiliary_loss_clip": 0.01070185, + "auxiliary_loss_mlp": 0.01051101, + "balance_loss_clip": 1.01960588, + "balance_loss_mlp": 1.02142859, + "epoch": 0.31534646024349916, + "flos": 16280383536000.0, + "grad_norm": 2.2098782574068228, + "language_loss": 0.92865026, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.9498632, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.48632812, + "step": 5245, + "time_per_iteration": 2.3441379070281982 + }, + { + "auxiliary_loss_clip": 0.01069258, + "auxiliary_loss_mlp": 0.01050242, + "balance_loss_clip": 1.01939106, + "balance_loss_mlp": 1.02136993, + "epoch": 0.3154065834961671, + "flos": 24716097440640.0, + "grad_norm": 1.72139007021704, + "language_loss": 0.76803029, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.78922534, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48046875, + "step": 5246, + "time_per_iteration": 2.4571502208709717 + }, + { + "auxiliary_loss_clip": 0.01068677, + "auxiliary_loss_mlp": 0.0105076, + "balance_loss_clip": 1.01855016, + "balance_loss_mlp": 1.02088571, + "epoch": 0.3154667067488351, + "flos": 35369704241280.0, + "grad_norm": 1.5190253110627694, + "language_loss": 0.6208325, + "learning_rate": 3.204336675750321e-06, + "loss": 0.64202684, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.47851562, + "step": 5247, + "time_per_iteration": 2.512319564819336 + }, + { + "auxiliary_loss_clip": 0.01070628, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.01817238, + "balance_loss_mlp": 1.02130365, + "epoch": 0.31552683000150306, + "flos": 17455524721920.0, + "grad_norm": 2.3255917622268445, + "language_loss": 0.84753036, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.86875147, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.49414062, + "step": 5248, + "time_per_iteration": 2.469712734222412 + }, + { + "auxiliary_loss_clip": 0.01068358, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_clip": 1.01956415, + "balance_loss_mlp": 1.02134681, + "epoch": 0.3155869532541711, + "flos": 18404778211200.0, + "grad_norm": 1.9934699823643216, + "language_loss": 0.8708899, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.89209557, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47070312, + "step": 5249, + "time_per_iteration": 2.410456895828247 + }, + { + "auxiliary_loss_clip": 0.01070674, + "auxiliary_loss_mlp": 0.01055399, + "balance_loss_clip": 1.02187753, + "balance_loss_mlp": 1.02207613, + "epoch": 0.31564707650683904, + "flos": 21578697400320.0, + "grad_norm": 1.9296998845635263, + "language_loss": 0.87784648, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.89910728, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.484375, + "step": 5250, + "time_per_iteration": 2.446606397628784 + }, + { + "auxiliary_loss_clip": 0.01069997, + "auxiliary_loss_mlp": 0.01050545, + "balance_loss_clip": 1.0177505, + "balance_loss_mlp": 1.02213836, + "epoch": 0.315707199759507, + "flos": 21029967561600.0, + "grad_norm": 2.710296723764754, + "language_loss": 0.7099719, + "learning_rate": 3.203092573767835e-06, + "loss": 0.73117733, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5251, + "time_per_iteration": 2.3985767364501953 + }, + { + "auxiliary_loss_clip": 0.01070667, + "auxiliary_loss_mlp": 0.01055449, + "balance_loss_clip": 1.02128327, + "balance_loss_mlp": 1.02278113, + "epoch": 0.31576732301217497, + "flos": 26827784380800.0, + "grad_norm": 1.7197688231866093, + "language_loss": 0.79083216, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81209338, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.47851562, + "step": 5252, + "time_per_iteration": 2.493068218231201 + }, + { + "auxiliary_loss_clip": 0.01071519, + "auxiliary_loss_mlp": 0.0105044, + "balance_loss_clip": 1.01966047, + "balance_loss_mlp": 1.02465093, + "epoch": 0.31582744626484294, + "flos": 22710057874560.0, + "grad_norm": 1.970590278792238, + "language_loss": 0.75033164, + "learning_rate": 3.202470249001066e-06, + "loss": 0.77155125, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46875, + "step": 5253, + "time_per_iteration": 2.4654693603515625 + }, + { + "auxiliary_loss_clip": 0.01072753, + "auxiliary_loss_mlp": 0.01057657, + "balance_loss_clip": 1.02406418, + "balance_loss_mlp": 1.02361798, + "epoch": 0.3158875695175109, + "flos": 23950766327040.0, + "grad_norm": 2.1234512084127037, + "language_loss": 0.75584072, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.77714479, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 5254, + "time_per_iteration": 2.454055070877075 + }, + { + "auxiliary_loss_clip": 0.0107498, + "auxiliary_loss_mlp": 0.01050053, + "balance_loss_clip": 1.01831937, + "balance_loss_mlp": 1.02448297, + "epoch": 0.31594769277017887, + "flos": 13261024402560.0, + "grad_norm": 2.1552508075454977, + "language_loss": 0.7905091, + "learning_rate": 3.201847741843128e-06, + "loss": 0.81175947, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.50390625, + "step": 5255, + "time_per_iteration": 2.4090025424957275 + }, + { + "auxiliary_loss_clip": 0.01075315, + "auxiliary_loss_mlp": 0.01049684, + "balance_loss_clip": 1.01691365, + "balance_loss_mlp": 1.02617776, + "epoch": 0.31600781602284683, + "flos": 23367123262080.0, + "grad_norm": 1.7390802481407164, + "language_loss": 0.79582548, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.81707543, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 5256, + "time_per_iteration": 2.4355077743530273 + }, + { + "auxiliary_loss_clip": 0.01068754, + "auxiliary_loss_mlp": 0.01050353, + "balance_loss_clip": 1.01928675, + "balance_loss_mlp": 1.02273667, + "epoch": 0.3160679392755148, + "flos": 19827558737280.0, + "grad_norm": 1.5768911733651956, + "language_loss": 0.72140604, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.7425971, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4609375, + "step": 5257, + "time_per_iteration": 2.387295961380005 + }, + { + "auxiliary_loss_clip": 0.01073865, + "auxiliary_loss_mlp": 0.01054736, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.02383232, + "epoch": 0.31612806252818276, + "flos": 20192191643520.0, + "grad_norm": 6.306783600694465, + "language_loss": 0.7848593, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.80614531, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5, + "step": 5258, + "time_per_iteration": 2.408771514892578 + }, + { + "auxiliary_loss_clip": 0.01074496, + "auxiliary_loss_mlp": 0.01058635, + "balance_loss_clip": 1.02432621, + "balance_loss_mlp": 1.02512383, + "epoch": 0.31618818578085073, + "flos": 24235029550080.0, + "grad_norm": 2.5753877980233004, + "language_loss": 0.74357021, + "learning_rate": 3.200602180731467e-06, + "loss": 0.76490152, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49414062, + "step": 5259, + "time_per_iteration": 2.437163829803467 + }, + { + "auxiliary_loss_clip": 0.01075676, + "auxiliary_loss_mlp": 0.0105796, + "balance_loss_clip": 1.02553475, + "balance_loss_mlp": 1.02428055, + "epoch": 0.3162483090335187, + "flos": 25080695435520.0, + "grad_norm": 2.044464727386963, + "language_loss": 0.67675292, + "learning_rate": 3.20029067660664e-06, + "loss": 0.69808924, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.515625, + "step": 5260, + "time_per_iteration": 2.4226109981536865 + }, + { + "auxiliary_loss_clip": 0.01072007, + "auxiliary_loss_mlp": 0.01047656, + "balance_loss_clip": 1.01389575, + "balance_loss_mlp": 1.02192974, + "epoch": 0.31630843228618666, + "flos": 26322171937920.0, + "grad_norm": 1.638706927547112, + "language_loss": 0.73197103, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.75316763, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.5, + "step": 5261, + "time_per_iteration": 2.4610073566436768 + }, + { + "auxiliary_loss_clip": 0.01022059, + "auxiliary_loss_mlp": 0.01009432, + "balance_loss_clip": 1.00559306, + "balance_loss_mlp": 1.01103616, + "epoch": 0.3163685555388547, + "flos": 66754839502080.0, + "grad_norm": 0.7625295196193397, + "language_loss": 0.50722063, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52753556, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.03833008, + "router_z_loss_mlp": 0.11035156, + "step": 5262, + "time_per_iteration": 3.0610902309417725 + }, + { + "auxiliary_loss_clip": 0.01074981, + "auxiliary_loss_mlp": 0.0106131, + "balance_loss_clip": 1.02852726, + "balance_loss_mlp": 1.02388692, + "epoch": 0.31642867879152264, + "flos": 25994442205440.0, + "grad_norm": 1.4728549589476287, + "language_loss": 0.86351657, + "learning_rate": 3.19935589118856e-06, + "loss": 0.88487947, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.51171875, + "step": 5263, + "time_per_iteration": 2.4337849617004395 + }, + { + "auxiliary_loss_clip": 0.01068987, + "auxiliary_loss_mlp": 0.01057335, + "balance_loss_clip": 1.0253396, + "balance_loss_mlp": 1.02184272, + "epoch": 0.3164888020441906, + "flos": 25773791212800.0, + "grad_norm": 1.5763325679328357, + "language_loss": 0.82868755, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84995079, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.47265625, + "step": 5264, + "time_per_iteration": 2.4524686336517334 + }, + { + "auxiliary_loss_clip": 0.01073586, + "auxiliary_loss_mlp": 0.01056095, + "balance_loss_clip": 1.02169156, + "balance_loss_mlp": 1.02223945, + "epoch": 0.3165489252968586, + "flos": 19755183755520.0, + "grad_norm": 1.8930375384932434, + "language_loss": 0.81388956, + "learning_rate": 3.19873247349167e-06, + "loss": 0.83518636, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 5265, + "time_per_iteration": 2.4012763500213623 + }, + { + "auxiliary_loss_clip": 0.01071697, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.02078414, + "balance_loss_mlp": 1.02177775, + "epoch": 0.31660904854952654, + "flos": 23182851772800.0, + "grad_norm": 1.9229585886148168, + "language_loss": 0.76246703, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.78377354, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.5, + "step": 5266, + "time_per_iteration": 2.436870813369751 + }, + { + "auxiliary_loss_clip": 0.01071027, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.01861906, + "balance_loss_mlp": 1.02046442, + "epoch": 0.3166691718021945, + "flos": 20407571020800.0, + "grad_norm": 2.280663168675178, + "language_loss": 0.79843605, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81967223, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.5078125, + "step": 5267, + "time_per_iteration": 2.412386417388916 + }, + { + "auxiliary_loss_clip": 0.01015678, + "auxiliary_loss_mlp": 0.01009982, + "balance_loss_clip": 1.00635767, + "balance_loss_mlp": 1.00515652, + "epoch": 0.31672929505486247, + "flos": 70141275336960.0, + "grad_norm": 0.7408640447911711, + "language_loss": 0.57927114, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59952772, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.03613281, + "router_z_loss_mlp": 0.10546875, + "step": 5268, + "time_per_iteration": 3.026930809020996 + }, + { + "auxiliary_loss_clip": 0.01070165, + "auxiliary_loss_mlp": 0.01047971, + "balance_loss_clip": 1.01442599, + "balance_loss_mlp": 1.02050269, + "epoch": 0.31678941830753043, + "flos": 14354888209920.0, + "grad_norm": 2.3712846312568474, + "language_loss": 0.75493264, + "learning_rate": 3.197485092719815e-06, + "loss": 0.77611399, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.49609375, + "step": 5269, + "time_per_iteration": 2.360624074935913 + }, + { + "auxiliary_loss_clip": 0.01070197, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_clip": 1.03413177, + "balance_loss_mlp": 1.02131486, + "epoch": 0.3168495415601984, + "flos": 22746611934720.0, + "grad_norm": 1.7384246295876193, + "language_loss": 0.80942857, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.83082879, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.48828125, + "step": 5270, + "time_per_iteration": 2.4235222339630127 + }, + { + "auxiliary_loss_clip": 0.01073866, + "auxiliary_loss_mlp": 0.01069361, + "balance_loss_clip": 1.03247738, + "balance_loss_mlp": 1.02196133, + "epoch": 0.31690966481286637, + "flos": 20114370489600.0, + "grad_norm": 2.0868920888820037, + "language_loss": 0.80817556, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.82960784, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.51953125, + "step": 5271, + "time_per_iteration": 2.379120111465454 + }, + { + "auxiliary_loss_clip": 0.01072509, + "auxiliary_loss_mlp": 0.01063021, + "balance_loss_clip": 1.026829, + "balance_loss_mlp": 1.02233291, + "epoch": 0.31696978806553433, + "flos": 21177859547520.0, + "grad_norm": 1.8181744179150368, + "language_loss": 0.74974525, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.77110058, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5, + "step": 5272, + "time_per_iteration": 2.4362761974334717 + }, + { + "auxiliary_loss_clip": 0.01073373, + "auxiliary_loss_mlp": 0.0105485, + "balance_loss_clip": 1.01861012, + "balance_loss_mlp": 1.02208471, + "epoch": 0.3170299113182023, + "flos": 42995363713920.0, + "grad_norm": 2.011412551998453, + "language_loss": 0.70186627, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.72314847, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.51171875, + "step": 5273, + "time_per_iteration": 2.628800630569458 + }, + { + "auxiliary_loss_clip": 0.01072598, + "auxiliary_loss_mlp": 0.01060162, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.0233686, + "epoch": 0.31709003457087026, + "flos": 24459066944640.0, + "grad_norm": 1.7913170699868086, + "language_loss": 0.69304222, + "learning_rate": 3.195924845146795e-06, + "loss": 0.71436983, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 5274, + "time_per_iteration": 2.4788818359375 + }, + { + "auxiliary_loss_clip": 0.01073509, + "auxiliary_loss_mlp": 0.01060306, + "balance_loss_clip": 1.02816677, + "balance_loss_mlp": 1.02680719, + "epoch": 0.3171501578235382, + "flos": 24134130120960.0, + "grad_norm": 1.6739342046005805, + "language_loss": 0.81407428, + "learning_rate": 3.195612659536081e-06, + "loss": 0.8354125, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46679688, + "step": 5275, + "time_per_iteration": 3.846859931945801 + }, + { + "auxiliary_loss_clip": 0.01075251, + "auxiliary_loss_mlp": 0.01061134, + "balance_loss_clip": 1.02658713, + "balance_loss_mlp": 1.02552605, + "epoch": 0.31721028107620625, + "flos": 18878724184320.0, + "grad_norm": 2.650343740689562, + "language_loss": 0.74914122, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.77050507, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.49609375, + "step": 5276, + "time_per_iteration": 2.417881965637207 + }, + { + "auxiliary_loss_clip": 0.01076278, + "auxiliary_loss_mlp": 0.0104749, + "balance_loss_clip": 1.01634061, + "balance_loss_mlp": 1.02888978, + "epoch": 0.3172704043288742, + "flos": 23146786471680.0, + "grad_norm": 5.47447096409296, + "language_loss": 0.79065847, + "learning_rate": 3.194988152313236e-06, + "loss": 0.81189609, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.47265625, + "step": 5277, + "time_per_iteration": 2.4378886222839355 + }, + { + "auxiliary_loss_clip": 0.01080307, + "auxiliary_loss_mlp": 0.01054868, + "balance_loss_clip": 1.02055907, + "balance_loss_mlp": 1.02998543, + "epoch": 0.3173305275815422, + "flos": 17857549560960.0, + "grad_norm": 1.6813324270739345, + "language_loss": 0.80767864, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.82903039, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.50390625, + "step": 5278, + "time_per_iteration": 3.8530702590942383 + }, + { + "auxiliary_loss_clip": 0.01040007, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.03133547, + "balance_loss_mlp": 1.02890253, + "epoch": 0.31739065083421014, + "flos": 59971042442880.0, + "grad_norm": 0.8956242460750529, + "language_loss": 0.62866634, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64942026, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.04052734, + "router_z_loss_mlp": 0.11132812, + "step": 5279, + "time_per_iteration": 2.8170394897460938 + }, + { + "auxiliary_loss_clip": 0.01082289, + "auxiliary_loss_mlp": 0.01059741, + "balance_loss_clip": 1.02445519, + "balance_loss_mlp": 1.03040886, + "epoch": 0.3174507740868781, + "flos": 23799976698240.0, + "grad_norm": 1.8541126050614032, + "language_loss": 0.82302749, + "learning_rate": 3.194051051653053e-06, + "loss": 0.84444785, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.51953125, + "step": 5280, + "time_per_iteration": 3.9273319244384766 + }, + { + "auxiliary_loss_clip": 0.01080356, + "auxiliary_loss_mlp": 0.01069687, + "balance_loss_clip": 1.03404343, + "balance_loss_mlp": 1.03177142, + "epoch": 0.31751089733954607, + "flos": 27637594433280.0, + "grad_norm": 1.5320587096582174, + "language_loss": 0.79704827, + "learning_rate": 3.19373859419346e-06, + "loss": 0.81854868, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48632812, + "step": 5281, + "time_per_iteration": 3.9269840717315674 + }, + { + "auxiliary_loss_clip": 0.01079069, + "auxiliary_loss_mlp": 0.01054765, + "balance_loss_clip": 1.02057624, + "balance_loss_mlp": 1.03009272, + "epoch": 0.31757102059221404, + "flos": 23768135671680.0, + "grad_norm": 1.6108709873997022, + "language_loss": 0.7994746, + "learning_rate": 3.193426091467179e-06, + "loss": 0.82081294, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.49023438, + "step": 5282, + "time_per_iteration": 2.467024087905884 + }, + { + "auxiliary_loss_clip": 0.01082038, + "auxiliary_loss_mlp": 0.01079624, + "balance_loss_clip": 1.04319346, + "balance_loss_mlp": 1.03053927, + "epoch": 0.317631143844882, + "flos": 25263361002240.0, + "grad_norm": 2.120713385321906, + "language_loss": 0.69387424, + "learning_rate": 3.193113543486061e-06, + "loss": 0.71549082, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.515625, + "step": 5283, + "time_per_iteration": 2.4508302211761475 + }, + { + "auxiliary_loss_clip": 0.01034238, + "auxiliary_loss_mlp": 0.01010851, + "balance_loss_clip": 1.00751317, + "balance_loss_mlp": 1.02221727, + "epoch": 0.31769126709754997, + "flos": 55823290300800.0, + "grad_norm": 0.7342500911289369, + "language_loss": 0.52929211, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54974294, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.03344727, + "router_z_loss_mlp": 0.12011719, + "step": 5284, + "time_per_iteration": 3.0201048851013184 + }, + { + "auxiliary_loss_clip": 0.01078478, + "auxiliary_loss_mlp": 0.0109157, + "balance_loss_clip": 1.05661774, + "balance_loss_mlp": 1.02829397, + "epoch": 0.31775139035021793, + "flos": 16689635026560.0, + "grad_norm": 1.7868761006537501, + "language_loss": 0.7209605, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.74266094, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5, + "step": 5285, + "time_per_iteration": 2.3896243572235107 + }, + { + "auxiliary_loss_clip": 0.01024316, + "auxiliary_loss_mlp": 0.01040542, + "balance_loss_clip": 1.03646517, + "balance_loss_mlp": 1.01171994, + "epoch": 0.3178115136028859, + "flos": 64224090979200.0, + "grad_norm": 0.8497457265122933, + "language_loss": 0.60613078, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62677938, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.12597656, + "step": 5286, + "time_per_iteration": 3.098567008972168 + }, + { + "auxiliary_loss_clip": 0.0107285, + "auxiliary_loss_mlp": 0.0109999, + "balance_loss_clip": 1.06680202, + "balance_loss_mlp": 1.02351356, + "epoch": 0.31787163685555386, + "flos": 18696477553920.0, + "grad_norm": 2.0064110287313324, + "language_loss": 0.73250329, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.75423169, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4921875, + "step": 5287, + "time_per_iteration": 2.394767999649048 + }, + { + "auxiliary_loss_clip": 0.01074051, + "auxiliary_loss_mlp": 0.01101593, + "balance_loss_clip": 1.06614041, + "balance_loss_mlp": 1.02404046, + "epoch": 0.31793176010822183, + "flos": 21323691763200.0, + "grad_norm": 1.7471313968381195, + "language_loss": 0.77178717, + "learning_rate": 3.191550125172792e-06, + "loss": 0.79354358, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5, + "step": 5288, + "time_per_iteration": 2.4358932971954346 + }, + { + "auxiliary_loss_clip": 0.01069275, + "auxiliary_loss_mlp": 0.01084294, + "balance_loss_clip": 1.05544531, + "balance_loss_mlp": 1.02318287, + "epoch": 0.31799188336088985, + "flos": 20957662402560.0, + "grad_norm": 1.6415278801456097, + "language_loss": 0.89464515, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.91618085, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4609375, + "step": 5289, + "time_per_iteration": 2.4164464473724365 + }, + { + "auxiliary_loss_clip": 0.01074263, + "auxiliary_loss_mlp": 0.01094781, + "balance_loss_clip": 1.06273723, + "balance_loss_mlp": 1.02794003, + "epoch": 0.3180520066135578, + "flos": 22490838247680.0, + "grad_norm": 1.5507489212056371, + "language_loss": 0.69412875, + "learning_rate": 3.190924441478572e-06, + "loss": 0.71581924, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46289062, + "step": 5290, + "time_per_iteration": 2.4639391899108887 + }, + { + "auxiliary_loss_clip": 0.01084965, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_clip": 1.03907359, + "balance_loss_mlp": 1.0330205, + "epoch": 0.3181121298662258, + "flos": 27234103317120.0, + "grad_norm": 1.7114450717499257, + "language_loss": 0.80361569, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82522684, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.51953125, + "step": 5291, + "time_per_iteration": 2.4788317680358887 + }, + { + "auxiliary_loss_clip": 0.01085228, + "auxiliary_loss_mlp": 0.0106393, + "balance_loss_clip": 1.0275712, + "balance_loss_mlp": 1.03501868, + "epoch": 0.31817225311889374, + "flos": 23179186080000.0, + "grad_norm": 1.7881264918587272, + "language_loss": 0.81282347, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.83431506, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5, + "step": 5292, + "time_per_iteration": 2.487516164779663 + }, + { + "auxiliary_loss_clip": 0.01085817, + "auxiliary_loss_mlp": 0.01045557, + "balance_loss_clip": 1.01381111, + "balance_loss_mlp": 1.03916824, + "epoch": 0.3182323763715617, + "flos": 23257670549760.0, + "grad_norm": 1.533348268992777, + "language_loss": 0.76398575, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.78529948, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46679688, + "step": 5293, + "time_per_iteration": 2.443641424179077 + }, + { + "auxiliary_loss_clip": 0.01094514, + "auxiliary_loss_mlp": 0.01047447, + "balance_loss_clip": 1.01597607, + "balance_loss_mlp": 1.045084, + "epoch": 0.3182924996242297, + "flos": 29015581818240.0, + "grad_norm": 1.9636175822456505, + "language_loss": 0.76262188, + "learning_rate": 3.189672532265379e-06, + "loss": 0.78404152, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.49414062, + "step": 5294, + "time_per_iteration": 2.505371570587158 + }, + { + "auxiliary_loss_clip": 0.01098672, + "auxiliary_loss_mlp": 0.01058051, + "balance_loss_clip": 1.01926041, + "balance_loss_mlp": 1.04595006, + "epoch": 0.31835262287689764, + "flos": 20448139887360.0, + "grad_norm": 1.9088344499108978, + "language_loss": 0.77599728, + "learning_rate": 3.189359442151152e-06, + "loss": 0.79756457, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.52734375, + "step": 5295, + "time_per_iteration": 2.415400266647339 + }, + { + "auxiliary_loss_clip": 0.01100138, + "auxiliary_loss_mlp": 0.01065009, + "balance_loss_clip": 1.03031945, + "balance_loss_mlp": 1.04638052, + "epoch": 0.3184127461295656, + "flos": 25118296836480.0, + "grad_norm": 1.6424314819835317, + "language_loss": 0.70547593, + "learning_rate": 3.189046306936296e-06, + "loss": 0.72712743, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5390625, + "step": 5296, + "time_per_iteration": 2.5075509548187256 + }, + { + "auxiliary_loss_clip": 0.01100821, + "auxiliary_loss_mlp": 0.01070356, + "balance_loss_clip": 1.03745425, + "balance_loss_mlp": 1.04839766, + "epoch": 0.31847286938223357, + "flos": 25550207665920.0, + "grad_norm": 1.8891895755792623, + "language_loss": 0.7905463, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.81225812, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.5234375, + "step": 5297, + "time_per_iteration": 2.477365016937256 + }, + { + "auxiliary_loss_clip": 0.01099653, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_clip": 1.03609395, + "balance_loss_mlp": 1.04707313, + "epoch": 0.31853299263490154, + "flos": 27781227233280.0, + "grad_norm": 1.9791655771332406, + "language_loss": 0.79975033, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.82147157, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.52734375, + "step": 5298, + "time_per_iteration": 2.559987783432007 + }, + { + "auxiliary_loss_clip": 0.01101713, + "auxiliary_loss_mlp": 0.01076001, + "balance_loss_clip": 1.04111993, + "balance_loss_mlp": 1.04622579, + "epoch": 0.3185931158875695, + "flos": 22705763777280.0, + "grad_norm": 2.4145989260791136, + "language_loss": 0.75689077, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.77866781, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5546875, + "step": 5299, + "time_per_iteration": 2.4582860469818115 + }, + { + "auxiliary_loss_clip": 0.01101526, + "auxiliary_loss_mlp": 0.0108208, + "balance_loss_clip": 1.04748595, + "balance_loss_mlp": 1.04648376, + "epoch": 0.31865323914023747, + "flos": 24570369959040.0, + "grad_norm": 1.9904173455086722, + "language_loss": 0.79698408, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.81882012, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.55078125, + "step": 5300, + "time_per_iteration": 2.4962432384490967 + }, + { + "auxiliary_loss_clip": 0.01095643, + "auxiliary_loss_mlp": 0.01083145, + "balance_loss_clip": 1.04402018, + "balance_loss_mlp": 1.04237199, + "epoch": 0.31871336239290543, + "flos": 18185593495680.0, + "grad_norm": 2.1407910914103168, + "language_loss": 0.8577677, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.87955558, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.53125, + "step": 5301, + "time_per_iteration": 2.393170118331909 + }, + { + "auxiliary_loss_clip": 0.01093402, + "auxiliary_loss_mlp": 0.01087174, + "balance_loss_clip": 1.04688096, + "balance_loss_mlp": 1.04271543, + "epoch": 0.31877348564557345, + "flos": 21825917804160.0, + "grad_norm": 2.783000342552586, + "language_loss": 0.78815091, + "learning_rate": 3.187166549199015e-06, + "loss": 0.80995661, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.50390625, + "step": 5302, + "time_per_iteration": 2.470381498336792 + }, + { + "auxiliary_loss_clip": 0.01088578, + "auxiliary_loss_mlp": 0.01073046, + "balance_loss_clip": 1.03668737, + "balance_loss_mlp": 1.03935909, + "epoch": 0.3188336088982414, + "flos": 22014239011200.0, + "grad_norm": 1.880056496055087, + "language_loss": 0.81230628, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.83392251, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.4921875, + "step": 5303, + "time_per_iteration": 2.4180784225463867 + }, + { + "auxiliary_loss_clip": 0.01094333, + "auxiliary_loss_mlp": 0.01082376, + "balance_loss_clip": 1.04384804, + "balance_loss_mlp": 1.03911197, + "epoch": 0.3188937321509094, + "flos": 20046848186880.0, + "grad_norm": 3.802462332914547, + "language_loss": 0.75231647, + "learning_rate": 3.186539603020047e-06, + "loss": 0.77408355, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.55078125, + "step": 5304, + "time_per_iteration": 2.4787485599517822 + }, + { + "auxiliary_loss_clip": 0.01085053, + "auxiliary_loss_mlp": 0.01060688, + "balance_loss_clip": 1.02884746, + "balance_loss_mlp": 1.03637123, + "epoch": 0.31895385540357735, + "flos": 25846934244480.0, + "grad_norm": 1.7887476071716195, + "language_loss": 0.73590744, + "learning_rate": 3.186226062434068e-06, + "loss": 0.75736481, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.48632812, + "step": 5305, + "time_per_iteration": 2.448594331741333 + }, + { + "auxiliary_loss_clip": 0.0108642, + "auxiliary_loss_mlp": 0.01071605, + "balance_loss_clip": 1.03722465, + "balance_loss_mlp": 1.03543019, + "epoch": 0.3190139786562453, + "flos": 23476575974400.0, + "grad_norm": 2.155738980986795, + "language_loss": 0.65139675, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.67297703, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 5306, + "time_per_iteration": 2.499844551086426 + }, + { + "auxiliary_loss_clip": 0.01082657, + "auxiliary_loss_mlp": 0.01056749, + "balance_loss_clip": 1.02060509, + "balance_loss_mlp": 1.03228605, + "epoch": 0.3190741019089133, + "flos": 29094275756160.0, + "grad_norm": 2.144802732078294, + "language_loss": 0.81708348, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.83847749, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.50390625, + "step": 5307, + "time_per_iteration": 2.4775967597961426 + }, + { + "auxiliary_loss_clip": 0.0107849, + "auxiliary_loss_mlp": 0.0106261, + "balance_loss_clip": 1.02851605, + "balance_loss_mlp": 1.03045678, + "epoch": 0.31913422516158124, + "flos": 17128563039360.0, + "grad_norm": 1.7086527069488036, + "language_loss": 0.79128689, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.81269783, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.48046875, + "step": 5308, + "time_per_iteration": 2.42376971244812 + }, + { + "auxiliary_loss_clip": 0.01087035, + "auxiliary_loss_mlp": 0.01069122, + "balance_loss_clip": 1.02634966, + "balance_loss_mlp": 1.030756, + "epoch": 0.3191943484142492, + "flos": 16068949142400.0, + "grad_norm": 2.360625776253197, + "language_loss": 0.76489568, + "learning_rate": 3.184971450390961e-06, + "loss": 0.7864573, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 0.5625, + "step": 5309, + "time_per_iteration": 2.387672185897827 + }, + { + "auxiliary_loss_clip": 0.01079498, + "auxiliary_loss_mlp": 0.01052683, + "balance_loss_clip": 1.01787376, + "balance_loss_mlp": 1.02912009, + "epoch": 0.3192544716669172, + "flos": 22965063511680.0, + "grad_norm": 3.0057776187950913, + "language_loss": 0.84225726, + "learning_rate": 3.184657685014856e-06, + "loss": 0.86357903, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.50390625, + "step": 5310, + "time_per_iteration": 2.4562137126922607 + }, + { + "auxiliary_loss_clip": 0.01076832, + "auxiliary_loss_mlp": 0.01050083, + "balance_loss_clip": 1.01687098, + "balance_loss_mlp": 1.02764034, + "epoch": 0.31931459491958514, + "flos": 26869121297280.0, + "grad_norm": 1.513019590978647, + "language_loss": 0.79699457, + "learning_rate": 3.184343874716412e-06, + "loss": 0.81826377, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4921875, + "step": 5311, + "time_per_iteration": 2.4525325298309326 + }, + { + "auxiliary_loss_clip": 0.01078126, + "auxiliary_loss_mlp": 0.01057684, + "balance_loss_clip": 1.02323258, + "balance_loss_mlp": 1.02821243, + "epoch": 0.3193747181722531, + "flos": 21835413694080.0, + "grad_norm": 1.869547838031828, + "language_loss": 0.8585124, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.87987041, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5, + "step": 5312, + "time_per_iteration": 2.458223581314087 + }, + { + "auxiliary_loss_clip": 0.01083656, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.02501643, + "balance_loss_mlp": 1.0298574, + "epoch": 0.31943484142492107, + "flos": 18324233971200.0, + "grad_norm": 4.535159417530678, + "language_loss": 0.814973, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.83644831, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.5390625, + "step": 5313, + "time_per_iteration": 2.3895158767700195 + }, + { + "auxiliary_loss_clip": 0.01078826, + "auxiliary_loss_mlp": 0.01052771, + "balance_loss_clip": 1.01777077, + "balance_loss_mlp": 1.02925158, + "epoch": 0.31949496467758903, + "flos": 21614762701440.0, + "grad_norm": 2.0690697916635923, + "language_loss": 0.88905668, + "learning_rate": 3.183402174406057e-06, + "loss": 0.91037261, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49609375, + "step": 5314, + "time_per_iteration": 3.9060401916503906 + }, + { + "auxiliary_loss_clip": 0.0107897, + "auxiliary_loss_mlp": 0.01056263, + "balance_loss_clip": 1.01887941, + "balance_loss_mlp": 1.02858031, + "epoch": 0.31955508793025705, + "flos": 21759198462720.0, + "grad_norm": 1.6991661193942498, + "language_loss": 0.80710447, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82845676, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.50390625, + "step": 5315, + "time_per_iteration": 2.4090631008148193 + }, + { + "auxiliary_loss_clip": 0.01080732, + "auxiliary_loss_mlp": 0.01062122, + "balance_loss_clip": 1.02566755, + "balance_loss_mlp": 1.02867055, + "epoch": 0.319615211182925, + "flos": 17163406442880.0, + "grad_norm": 1.8470864472336228, + "language_loss": 0.69077098, + "learning_rate": 3.18277414980567e-06, + "loss": 0.71219951, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.51953125, + "step": 5316, + "time_per_iteration": 2.412233829498291 + }, + { + "auxiliary_loss_clip": 0.01082549, + "auxiliary_loss_mlp": 0.01054574, + "balance_loss_clip": 1.0205754, + "balance_loss_mlp": 1.03147411, + "epoch": 0.319675334435593, + "flos": 28111505495040.0, + "grad_norm": 1.4085707627975654, + "language_loss": 0.69867092, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.72004217, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.51171875, + "step": 5317, + "time_per_iteration": 2.493027925491333 + }, + { + "auxiliary_loss_clip": 0.010249, + "auxiliary_loss_mlp": 0.01011737, + "balance_loss_clip": 1.00756443, + "balance_loss_mlp": 1.01457906, + "epoch": 0.31973545768826095, + "flos": 69497266798080.0, + "grad_norm": 0.7565392190354854, + "language_loss": 0.53227985, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55264616, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.10302734, + "step": 5318, + "time_per_iteration": 4.599034309387207 + }, + { + "auxiliary_loss_clip": 0.01076058, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.02424705, + "balance_loss_mlp": 1.02731836, + "epoch": 0.3197955809409289, + "flos": 13698346492800.0, + "grad_norm": 1.8533392514338791, + "language_loss": 0.855883, + "learning_rate": 3.181831776553012e-06, + "loss": 0.87721884, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48828125, + "step": 5319, + "time_per_iteration": 3.892874240875244 + }, + { + "auxiliary_loss_clip": 0.0107319, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.01814365, + "balance_loss_mlp": 1.02460241, + "epoch": 0.3198557041935969, + "flos": 33216750207360.0, + "grad_norm": 1.7171385411795295, + "language_loss": 0.65361011, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.67488134, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.484375, + "step": 5320, + "time_per_iteration": 2.5267019271850586 + }, + { + "auxiliary_loss_clip": 0.01081953, + "auxiliary_loss_mlp": 0.01055164, + "balance_loss_clip": 1.01954436, + "balance_loss_mlp": 1.02816904, + "epoch": 0.31991582744626484, + "flos": 23730918295680.0, + "grad_norm": 1.779136174045927, + "language_loss": 0.7171756, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.73854673, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5390625, + "step": 5321, + "time_per_iteration": 3.8277320861816406 + }, + { + "auxiliary_loss_clip": 0.01082474, + "auxiliary_loss_mlp": 0.01066653, + "balance_loss_clip": 1.02664661, + "balance_loss_mlp": 1.02797019, + "epoch": 0.3199759506989328, + "flos": 18549877288320.0, + "grad_norm": 2.7141396308160814, + "language_loss": 0.87779176, + "learning_rate": 3.180888999963749e-06, + "loss": 0.89928299, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 0.546875, + "step": 5322, + "time_per_iteration": 2.396885633468628 + }, + { + "auxiliary_loss_clip": 0.01072919, + "auxiliary_loss_mlp": 0.01055887, + "balance_loss_clip": 1.02198362, + "balance_loss_mlp": 1.02375412, + "epoch": 0.3200360739516008, + "flos": 22417799950080.0, + "grad_norm": 1.6916859156196709, + "language_loss": 0.84275699, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.86404508, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4921875, + "step": 5323, + "time_per_iteration": 2.4066548347473145 + }, + { + "auxiliary_loss_clip": 0.01073192, + "auxiliary_loss_mlp": 0.01058507, + "balance_loss_clip": 1.02150416, + "balance_loss_mlp": 1.02442443, + "epoch": 0.32009619720426874, + "flos": 20594181571200.0, + "grad_norm": 1.8289421235018428, + "language_loss": 0.80105436, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.82237136, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.48632812, + "step": 5324, + "time_per_iteration": 2.4336817264556885 + }, + { + "auxiliary_loss_clip": 0.01073205, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.01862192, + "balance_loss_mlp": 1.02450752, + "epoch": 0.3201563204569367, + "flos": 18146735285760.0, + "grad_norm": 1.782018379905865, + "language_loss": 0.81830835, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.83956683, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.48828125, + "step": 5325, + "time_per_iteration": 2.377291440963745 + }, + { + "auxiliary_loss_clip": 0.01074552, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_clip": 1.01521063, + "balance_loss_mlp": 1.02382767, + "epoch": 0.32021644370960467, + "flos": 31682945957760.0, + "grad_norm": 3.1994053538647735, + "language_loss": 0.76575285, + "learning_rate": 3.179631337655037e-06, + "loss": 0.78699791, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.5078125, + "step": 5326, + "time_per_iteration": 2.522315263748169 + }, + { + "auxiliary_loss_clip": 0.01072262, + "auxiliary_loss_mlp": 0.01054188, + "balance_loss_clip": 1.02009392, + "balance_loss_mlp": 1.02498507, + "epoch": 0.32027656696227264, + "flos": 26864827200000.0, + "grad_norm": 1.5652294964671287, + "language_loss": 0.82204431, + "learning_rate": 3.179316810218701e-06, + "loss": 0.84330887, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.47265625, + "step": 5327, + "time_per_iteration": 2.429962396621704 + }, + { + "auxiliary_loss_clip": 0.01077018, + "auxiliary_loss_mlp": 0.01053216, + "balance_loss_clip": 1.01669025, + "balance_loss_mlp": 1.02465582, + "epoch": 0.32033669021494066, + "flos": 24168798967680.0, + "grad_norm": 1.4228225780300756, + "language_loss": 0.7867617, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80806398, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.5234375, + "step": 5328, + "time_per_iteration": 2.440349817276001 + }, + { + "auxiliary_loss_clip": 0.01076572, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_clip": 1.02033877, + "balance_loss_mlp": 1.0261457, + "epoch": 0.3203968134676086, + "flos": 24459660437760.0, + "grad_norm": 1.5720753985115923, + "language_loss": 0.75034404, + "learning_rate": 3.178687621198524e-06, + "loss": 0.77169102, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.50390625, + "step": 5329, + "time_per_iteration": 2.441746711730957 + }, + { + "auxiliary_loss_clip": 0.01070568, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_clip": 1.01401162, + "balance_loss_mlp": 1.02313757, + "epoch": 0.3204569367202766, + "flos": 18003730890240.0, + "grad_norm": 1.5193149345362091, + "language_loss": 0.72675335, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.74791098, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.47460938, + "step": 5330, + "time_per_iteration": 2.4335949420928955 + }, + { + "auxiliary_loss_clip": 0.01078654, + "auxiliary_loss_mlp": 0.0106059, + "balance_loss_clip": 1.02144158, + "balance_loss_mlp": 1.02545941, + "epoch": 0.32051705997294455, + "flos": 30588418834560.0, + "grad_norm": 1.900630617744238, + "language_loss": 0.80979061, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.83118308, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.53125, + "step": 5331, + "time_per_iteration": 2.4651646614074707 + }, + { + "auxiliary_loss_clip": 0.01019584, + "auxiliary_loss_mlp": 0.01012094, + "balance_loss_clip": 1.00801671, + "balance_loss_mlp": 1.00893307, + "epoch": 0.3205771832256125, + "flos": 68414855783040.0, + "grad_norm": 0.843834935414847, + "language_loss": 0.57832181, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59863853, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.10644531, + "step": 5332, + "time_per_iteration": 2.996425151824951 + }, + { + "auxiliary_loss_clip": 0.01078934, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.01512146, + "balance_loss_mlp": 1.02671742, + "epoch": 0.3206373064782805, + "flos": 30442691352960.0, + "grad_norm": 1.715237850432866, + "language_loss": 0.74779356, + "learning_rate": 3.177428706902205e-06, + "loss": 0.76909363, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.5234375, + "step": 5333, + "time_per_iteration": 2.494990587234497 + }, + { + "auxiliary_loss_clip": 0.01075803, + "auxiliary_loss_mlp": 0.01068103, + "balance_loss_clip": 1.02962279, + "balance_loss_mlp": 1.02496874, + "epoch": 0.32069742973094845, + "flos": 22053411423360.0, + "grad_norm": 1.688153371058265, + "language_loss": 0.72589695, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.74733603, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5078125, + "step": 5334, + "time_per_iteration": 2.4250235557556152 + }, + { + "auxiliary_loss_clip": 0.0107688, + "auxiliary_loss_mlp": 0.01056193, + "balance_loss_clip": 1.0218842, + "balance_loss_mlp": 1.02628374, + "epoch": 0.3207575529836164, + "flos": 22052922664320.0, + "grad_norm": 1.8455297319702417, + "language_loss": 0.78437495, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.80570567, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 5335, + "time_per_iteration": 2.3910887241363525 + }, + { + "auxiliary_loss_clip": 0.01077406, + "auxiliary_loss_mlp": 0.010546, + "balance_loss_clip": 1.0199101, + "balance_loss_mlp": 1.02692723, + "epoch": 0.3208176762362844, + "flos": 34056132048000.0, + "grad_norm": 1.5272764861102277, + "language_loss": 0.69813651, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.71945655, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.50390625, + "step": 5336, + "time_per_iteration": 2.5134389400482178 + }, + { + "auxiliary_loss_clip": 0.01074094, + "auxiliary_loss_mlp": 0.01055696, + "balance_loss_clip": 1.0216496, + "balance_loss_mlp": 1.02470577, + "epoch": 0.32087779948895234, + "flos": 21797637736320.0, + "grad_norm": 2.239572123714747, + "language_loss": 0.805601, + "learning_rate": 3.176169078234487e-06, + "loss": 0.82689893, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49414062, + "step": 5337, + "time_per_iteration": 2.391011953353882 + }, + { + "auxiliary_loss_clip": 0.01072211, + "auxiliary_loss_mlp": 0.01051786, + "balance_loss_clip": 1.02019596, + "balance_loss_mlp": 1.02443957, + "epoch": 0.3209379227416203, + "flos": 21433039741440.0, + "grad_norm": 1.8597491806404594, + "language_loss": 0.75949812, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.78073806, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.47851562, + "step": 5338, + "time_per_iteration": 2.44710636138916 + }, + { + "auxiliary_loss_clip": 0.01077209, + "auxiliary_loss_mlp": 0.01054789, + "balance_loss_clip": 1.016904, + "balance_loss_mlp": 1.02492702, + "epoch": 0.3209980459942883, + "flos": 25847876851200.0, + "grad_norm": 2.566792869620281, + "language_loss": 0.63884574, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.66016567, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5234375, + "step": 5339, + "time_per_iteration": 2.4235384464263916 + }, + { + "auxiliary_loss_clip": 0.01075391, + "auxiliary_loss_mlp": 0.01052442, + "balance_loss_clip": 1.0163213, + "balance_loss_mlp": 1.02535319, + "epoch": 0.32105816924695624, + "flos": 19098153279360.0, + "grad_norm": 2.559830153304163, + "language_loss": 0.83516055, + "learning_rate": 3.175223888387192e-06, + "loss": 0.85643888, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.5, + "step": 5340, + "time_per_iteration": 2.4333345890045166 + }, + { + "auxiliary_loss_clip": 0.01074834, + "auxiliary_loss_mlp": 0.01054456, + "balance_loss_clip": 1.01919365, + "balance_loss_mlp": 1.02579057, + "epoch": 0.3211182924996242, + "flos": 16580915452800.0, + "grad_norm": 1.994193063387936, + "language_loss": 0.77932751, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.80062044, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.48828125, + "step": 5341, + "time_per_iteration": 2.3701252937316895 + }, + { + "auxiliary_loss_clip": 0.010741, + "auxiliary_loss_mlp": 0.01061562, + "balance_loss_clip": 1.0271821, + "balance_loss_mlp": 1.02481294, + "epoch": 0.3211784157522922, + "flos": 22671164753280.0, + "grad_norm": 1.6289575995991281, + "language_loss": 0.80186397, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.82322061, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.4921875, + "step": 5342, + "time_per_iteration": 2.454230546951294 + }, + { + "auxiliary_loss_clip": 0.01076247, + "auxiliary_loss_mlp": 0.0106076, + "balance_loss_clip": 1.02390075, + "balance_loss_mlp": 1.02565682, + "epoch": 0.3212385390049602, + "flos": 20557732245120.0, + "grad_norm": 3.717515285155883, + "language_loss": 0.76654184, + "learning_rate": 3.174278297458438e-06, + "loss": 0.78791189, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.50390625, + "step": 5343, + "time_per_iteration": 2.388232946395874 + }, + { + "auxiliary_loss_clip": 0.01071533, + "auxiliary_loss_mlp": 0.01058018, + "balance_loss_clip": 1.02330446, + "balance_loss_mlp": 1.02356303, + "epoch": 0.32129866225762815, + "flos": 24789973610880.0, + "grad_norm": 1.5918031018582057, + "language_loss": 0.83301401, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85430956, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48046875, + "step": 5344, + "time_per_iteration": 2.503647804260254 + }, + { + "auxiliary_loss_clip": 0.01074161, + "auxiliary_loss_mlp": 0.01058665, + "balance_loss_clip": 1.02347469, + "balance_loss_mlp": 1.02274394, + "epoch": 0.3213587855102961, + "flos": 18365954912640.0, + "grad_norm": 2.0070524252350515, + "language_loss": 0.80985272, + "learning_rate": 3.173647680842262e-06, + "loss": 0.83118105, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.515625, + "step": 5345, + "time_per_iteration": 2.3665239810943604 + }, + { + "auxiliary_loss_clip": 0.01072934, + "auxiliary_loss_mlp": 0.010601, + "balance_loss_clip": 1.02481461, + "balance_loss_mlp": 1.02339399, + "epoch": 0.3214189087629641, + "flos": 27014778956160.0, + "grad_norm": 2.3492583308115353, + "language_loss": 0.85454512, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.87587547, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.49609375, + "step": 5346, + "time_per_iteration": 2.495651960372925 + }, + { + "auxiliary_loss_clip": 0.01072741, + "auxiliary_loss_mlp": 0.01065584, + "balance_loss_clip": 1.02872396, + "balance_loss_mlp": 1.02257276, + "epoch": 0.32147903201563205, + "flos": 23147170496640.0, + "grad_norm": 1.3979298526122703, + "language_loss": 0.82226199, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.84364522, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.5, + "step": 5347, + "time_per_iteration": 2.4102063179016113 + }, + { + "auxiliary_loss_clip": 0.0107293, + "auxiliary_loss_mlp": 0.01057598, + "balance_loss_clip": 1.02092886, + "balance_loss_mlp": 1.02309024, + "epoch": 0.3215391552683, + "flos": 16579833200640.0, + "grad_norm": 1.8850878616764366, + "language_loss": 0.81717306, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.83847833, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.49804688, + "step": 5348, + "time_per_iteration": 2.401184320449829 + }, + { + "auxiliary_loss_clip": 0.01072115, + "auxiliary_loss_mlp": 0.01065006, + "balance_loss_clip": 1.02960038, + "balance_loss_mlp": 1.02290881, + "epoch": 0.321599278520968, + "flos": 17820855855360.0, + "grad_norm": 2.229003139877603, + "language_loss": 0.87149775, + "learning_rate": 3.172385913647542e-06, + "loss": 0.892869, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.4921875, + "step": 5349, + "time_per_iteration": 2.3969485759735107 + }, + { + "auxiliary_loss_clip": 0.01071559, + "auxiliary_loss_mlp": 0.01057313, + "balance_loss_clip": 1.02388728, + "balance_loss_mlp": 1.02273154, + "epoch": 0.32165940177363594, + "flos": 16250881570560.0, + "grad_norm": 4.677403323415619, + "language_loss": 0.82135165, + "learning_rate": 3.172070360676475e-06, + "loss": 0.8426404, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.48828125, + "step": 5350, + "time_per_iteration": 2.393501043319702 + }, + { + "auxiliary_loss_clip": 0.01071605, + "auxiliary_loss_mlp": 0.01055711, + "balance_loss_clip": 1.02280879, + "balance_loss_mlp": 1.0227195, + "epoch": 0.3217195250263039, + "flos": 27598666400640.0, + "grad_norm": 1.6716829715246035, + "language_loss": 0.80686098, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82813418, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48828125, + "step": 5351, + "time_per_iteration": 2.451160192489624 + }, + { + "auxiliary_loss_clip": 0.01073855, + "auxiliary_loss_mlp": 0.01055752, + "balance_loss_clip": 1.02168179, + "balance_loss_mlp": 1.02471137, + "epoch": 0.3217796482789719, + "flos": 21469523978880.0, + "grad_norm": 2.1282186637394886, + "language_loss": 0.77355957, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.79485571, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49023438, + "step": 5352, + "time_per_iteration": 2.4305784702301025 + }, + { + "auxiliary_loss_clip": 0.01072119, + "auxiliary_loss_mlp": 0.01054641, + "balance_loss_clip": 1.01875889, + "balance_loss_mlp": 1.02321172, + "epoch": 0.32183977153163984, + "flos": 21214518341760.0, + "grad_norm": 3.9397898159138607, + "language_loss": 0.83299512, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.85426277, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48828125, + "step": 5353, + "time_per_iteration": 2.4186646938323975 + }, + { + "auxiliary_loss_clip": 0.01071784, + "auxiliary_loss_mlp": 0.01053087, + "balance_loss_clip": 1.01863575, + "balance_loss_mlp": 1.02418971, + "epoch": 0.3218998947843078, + "flos": 24607028753280.0, + "grad_norm": 1.927550456289071, + "language_loss": 0.73878002, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.76002866, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47460938, + "step": 5354, + "time_per_iteration": 3.8893332481384277 + }, + { + "auxiliary_loss_clip": 0.01072125, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.01530588, + "balance_loss_mlp": 1.02234936, + "epoch": 0.3219600180369758, + "flos": 22269558850560.0, + "grad_norm": 1.5962943843952742, + "language_loss": 0.84732413, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.86854827, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49804688, + "step": 5355, + "time_per_iteration": 2.400207042694092 + }, + { + "auxiliary_loss_clip": 0.01077387, + "auxiliary_loss_mlp": 0.01051133, + "balance_loss_clip": 1.01811171, + "balance_loss_mlp": 1.02704418, + "epoch": 0.3220201412896438, + "flos": 14938251984000.0, + "grad_norm": 2.1566029435096934, + "language_loss": 0.72698617, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.74827135, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.50390625, + "step": 5356, + "time_per_iteration": 2.4160313606262207 + }, + { + "auxiliary_loss_clip": 0.01080242, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.02417386, + "balance_loss_mlp": 1.02656484, + "epoch": 0.32208026454231176, + "flos": 22666486631040.0, + "grad_norm": 4.915512416054033, + "language_loss": 0.70382702, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.72526383, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.5390625, + "step": 5357, + "time_per_iteration": 2.4091241359710693 + }, + { + "auxiliary_loss_clip": 0.01036248, + "auxiliary_loss_mlp": 0.01004391, + "balance_loss_clip": 1.00029004, + "balance_loss_mlp": 1.02505684, + "epoch": 0.3221403877949797, + "flos": 64601606177280.0, + "grad_norm": 0.7045714542121865, + "language_loss": 0.58356035, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60396677, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.11181641, + "step": 5358, + "time_per_iteration": 4.601735591888428 + }, + { + "auxiliary_loss_clip": 0.01072745, + "auxiliary_loss_mlp": 0.01055479, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.02334261, + "epoch": 0.3222005110476477, + "flos": 20155986696960.0, + "grad_norm": 1.6028304190900455, + "language_loss": 0.84817374, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.86945599, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49414062, + "step": 5359, + "time_per_iteration": 3.886730670928955 + }, + { + "auxiliary_loss_clip": 0.01073645, + "auxiliary_loss_mlp": 0.01052166, + "balance_loss_clip": 1.0180006, + "balance_loss_mlp": 1.0244112, + "epoch": 0.32226063430031565, + "flos": 22673084878080.0, + "grad_norm": 1.6316932453079498, + "language_loss": 0.81251717, + "learning_rate": 3.168912388464595e-06, + "loss": 0.83377528, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.4921875, + "step": 5360, + "time_per_iteration": 2.39911150932312 + }, + { + "auxiliary_loss_clip": 0.0103236, + "auxiliary_loss_mlp": 0.01005384, + "balance_loss_clip": 1.00114059, + "balance_loss_mlp": 1.02097952, + "epoch": 0.3223207575529836, + "flos": 63825312896640.0, + "grad_norm": 0.6620936757752361, + "language_loss": 0.57169688, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59207439, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.04248047, + "router_z_loss_mlp": 0.11376953, + "step": 5361, + "time_per_iteration": 4.293464660644531 + }, + { + "auxiliary_loss_clip": 0.01072167, + "auxiliary_loss_mlp": 0.010551, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.02327645, + "epoch": 0.3223808808056516, + "flos": 26868911829120.0, + "grad_norm": 2.084438072676227, + "language_loss": 0.73501831, + "learning_rate": 3.168280261735588e-06, + "loss": 0.75629097, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.49023438, + "step": 5362, + "time_per_iteration": 2.4395878314971924 + }, + { + "auxiliary_loss_clip": 0.01074714, + "auxiliary_loss_mlp": 0.01059426, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.02528656, + "epoch": 0.32244100405831955, + "flos": 26760122432640.0, + "grad_norm": 1.879965940489933, + "language_loss": 0.74037546, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76171684, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.49609375, + "step": 5363, + "time_per_iteration": 2.4255177974700928 + }, + { + "auxiliary_loss_clip": 0.01075305, + "auxiliary_loss_mlp": 0.0106571, + "balance_loss_clip": 1.02777767, + "balance_loss_mlp": 1.02240944, + "epoch": 0.3225011273109875, + "flos": 23801966645760.0, + "grad_norm": 2.3652508863327313, + "language_loss": 0.77210021, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79351032, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.52734375, + "step": 5364, + "time_per_iteration": 2.438962697982788 + }, + { + "auxiliary_loss_clip": 0.01071199, + "auxiliary_loss_mlp": 0.01058465, + "balance_loss_clip": 1.02217793, + "balance_loss_mlp": 1.02212012, + "epoch": 0.3225612505636555, + "flos": 17273557382400.0, + "grad_norm": 2.201892041572041, + "language_loss": 0.78386867, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.4921875, + "step": 5365, + "time_per_iteration": 2.359048843383789 + }, + { + "auxiliary_loss_clip": 0.01078948, + "auxiliary_loss_mlp": 0.01061201, + "balance_loss_clip": 1.02465129, + "balance_loss_mlp": 1.02752852, + "epoch": 0.32262137381632344, + "flos": 23365168225920.0, + "grad_norm": 1.6457702571563129, + "language_loss": 0.77858675, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79998815, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.515625, + "step": 5366, + "time_per_iteration": 2.454446315765381 + }, + { + "auxiliary_loss_clip": 0.01073496, + "auxiliary_loss_mlp": 0.01063451, + "balance_loss_clip": 1.02711606, + "balance_loss_mlp": 1.02327287, + "epoch": 0.3226814970689914, + "flos": 23257670549760.0, + "grad_norm": 1.7877354926387343, + "language_loss": 0.73417664, + "learning_rate": 3.166699169850055e-06, + "loss": 0.75554609, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.50390625, + "step": 5367, + "time_per_iteration": 2.4168241024017334 + }, + { + "auxiliary_loss_clip": 0.01073335, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.01975846, + "balance_loss_mlp": 1.02487493, + "epoch": 0.32274162032165943, + "flos": 16394374725120.0, + "grad_norm": 3.372138627812276, + "language_loss": 0.75685072, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.77811027, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.484375, + "step": 5368, + "time_per_iteration": 2.4387261867523193 + }, + { + "auxiliary_loss_clip": 0.01072251, + "auxiliary_loss_mlp": 0.01048916, + "balance_loss_clip": 1.01594222, + "balance_loss_mlp": 1.02392721, + "epoch": 0.3228017435743274, + "flos": 27853846594560.0, + "grad_norm": 1.6716565853287766, + "language_loss": 0.79722649, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81843817, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.484375, + "step": 5369, + "time_per_iteration": 2.4463508129119873 + }, + { + "auxiliary_loss_clip": 0.01073142, + "auxiliary_loss_mlp": 0.01048363, + "balance_loss_clip": 1.01691556, + "balance_loss_mlp": 1.02617073, + "epoch": 0.32286186682699536, + "flos": 19607780528640.0, + "grad_norm": 4.083048216431409, + "language_loss": 0.84502208, + "learning_rate": 3.16574998372661e-06, + "loss": 0.86623704, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.47070312, + "step": 5370, + "time_per_iteration": 2.4403798580169678 + }, + { + "auxiliary_loss_clip": 0.01075417, + "auxiliary_loss_mlp": 0.01052671, + "balance_loss_clip": 1.01903081, + "balance_loss_mlp": 1.02634525, + "epoch": 0.3229219900796633, + "flos": 24132873312000.0, + "grad_norm": 1.8899369378546755, + "language_loss": 0.8403371, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.86161804, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 5371, + "time_per_iteration": 2.412097692489624 + }, + { + "auxiliary_loss_clip": 0.01079241, + "auxiliary_loss_mlp": 0.01060882, + "balance_loss_clip": 1.02027893, + "balance_loss_mlp": 1.0259856, + "epoch": 0.3229821133323313, + "flos": 17747747735040.0, + "grad_norm": 2.1036766974130265, + "language_loss": 0.89894164, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.92034292, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 0.53125, + "step": 5372, + "time_per_iteration": 2.431300401687622 + }, + { + "auxiliary_loss_clip": 0.01078004, + "auxiliary_loss_mlp": 0.01057795, + "balance_loss_clip": 1.0222466, + "balance_loss_mlp": 1.0275476, + "epoch": 0.32304223658499925, + "flos": 22344936209280.0, + "grad_norm": 1.9634578772324833, + "language_loss": 0.74283206, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.76419002, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.50390625, + "step": 5373, + "time_per_iteration": 2.4360761642456055 + }, + { + "auxiliary_loss_clip": 0.01077397, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_clip": 1.01869369, + "balance_loss_mlp": 1.02881074, + "epoch": 0.3231023598376672, + "flos": 18477327749760.0, + "grad_norm": 3.045696752843061, + "language_loss": 0.817644, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83894533, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.484375, + "step": 5374, + "time_per_iteration": 2.4189534187316895 + }, + { + "auxiliary_loss_clip": 0.01073474, + "auxiliary_loss_mlp": 0.01050202, + "balance_loss_clip": 1.01553595, + "balance_loss_mlp": 1.02510417, + "epoch": 0.3231624830903352, + "flos": 27635080815360.0, + "grad_norm": 2.8782843399253637, + "language_loss": 0.89801115, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.91924787, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.484375, + "step": 5375, + "time_per_iteration": 2.479678153991699 + }, + { + "auxiliary_loss_clip": 0.01079471, + "auxiliary_loss_mlp": 0.01060245, + "balance_loss_clip": 1.02154946, + "balance_loss_mlp": 1.02694654, + "epoch": 0.32322260634300315, + "flos": 21725332577280.0, + "grad_norm": 1.8891747951166262, + "language_loss": 0.76981825, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.79121542, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 0.52734375, + "step": 5376, + "time_per_iteration": 2.459602117538452 + }, + { + "auxiliary_loss_clip": 0.01077125, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.01373422, + "balance_loss_mlp": 1.02900934, + "epoch": 0.3232827295956711, + "flos": 22636565729280.0, + "grad_norm": 4.506894607255021, + "language_loss": 0.68424475, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.70548213, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48242188, + "step": 5377, + "time_per_iteration": 2.4227726459503174 + }, + { + "auxiliary_loss_clip": 0.01077219, + "auxiliary_loss_mlp": 0.01059996, + "balance_loss_clip": 1.02087188, + "balance_loss_mlp": 1.02735305, + "epoch": 0.3233428528483391, + "flos": 26321403888000.0, + "grad_norm": 1.373386005949879, + "language_loss": 0.73503339, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.75640547, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.49804688, + "step": 5378, + "time_per_iteration": 2.4730067253112793 + }, + { + "auxiliary_loss_clip": 0.01074396, + "auxiliary_loss_mlp": 0.01057093, + "balance_loss_clip": 1.02237976, + "balance_loss_mlp": 1.02504814, + "epoch": 0.32340297610100704, + "flos": 28583950279680.0, + "grad_norm": 2.0684930625456146, + "language_loss": 0.84002566, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.86134058, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.49414062, + "step": 5379, + "time_per_iteration": 2.4681522846221924 + }, + { + "auxiliary_loss_clip": 0.01079831, + "auxiliary_loss_mlp": 0.01052698, + "balance_loss_clip": 1.01691151, + "balance_loss_mlp": 1.02760744, + "epoch": 0.323463099353675, + "flos": 30772480855680.0, + "grad_norm": 1.790506556291751, + "language_loss": 0.79808342, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81940871, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5234375, + "step": 5380, + "time_per_iteration": 2.543022871017456 + }, + { + "auxiliary_loss_clip": 0.01080687, + "auxiliary_loss_mlp": 0.01054087, + "balance_loss_clip": 1.0196594, + "balance_loss_mlp": 1.02890849, + "epoch": 0.32352322260634303, + "flos": 25227435346560.0, + "grad_norm": 1.7152187089911302, + "language_loss": 0.78077674, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.8021245, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51953125, + "step": 5381, + "time_per_iteration": 2.453763008117676 + }, + { + "auxiliary_loss_clip": 0.01072321, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.01736593, + "balance_loss_mlp": 1.02430904, + "epoch": 0.323583345859011, + "flos": 23329382215680.0, + "grad_norm": 1.7683284405171216, + "language_loss": 0.72971821, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.750929, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.48046875, + "step": 5382, + "time_per_iteration": 2.4316177368164062 + }, + { + "auxiliary_loss_clip": 0.01076468, + "auxiliary_loss_mlp": 0.01062689, + "balance_loss_clip": 1.02690232, + "balance_loss_mlp": 1.02522779, + "epoch": 0.32364346911167896, + "flos": 26206470092160.0, + "grad_norm": 2.1683943437674134, + "language_loss": 0.72957492, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.75096643, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.51171875, + "step": 5383, + "time_per_iteration": 2.479782819747925 + }, + { + "auxiliary_loss_clip": 0.01068754, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.01634693, + "balance_loss_mlp": 1.02261925, + "epoch": 0.3237035923643469, + "flos": 23694643526400.0, + "grad_norm": 2.106078595233845, + "language_loss": 0.79978096, + "learning_rate": 3.161315193285283e-06, + "loss": 0.82093966, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4609375, + "step": 5384, + "time_per_iteration": 2.449404001235962 + }, + { + "auxiliary_loss_clip": 0.01074341, + "auxiliary_loss_mlp": 0.01051799, + "balance_loss_clip": 1.01732337, + "balance_loss_mlp": 1.02310228, + "epoch": 0.3237637156170149, + "flos": 14427856684800.0, + "grad_norm": 2.129877652288771, + "language_loss": 0.76424098, + "learning_rate": 3.16099809186998e-06, + "loss": 0.78550237, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.51171875, + "step": 5385, + "time_per_iteration": 2.3916494846343994 + }, + { + "auxiliary_loss_clip": 0.01070995, + "auxiliary_loss_mlp": 0.01055094, + "balance_loss_clip": 1.02057099, + "balance_loss_mlp": 1.02263343, + "epoch": 0.32382383886968286, + "flos": 31061736403200.0, + "grad_norm": 1.9378952731384336, + "language_loss": 0.72863191, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74989283, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.484375, + "step": 5386, + "time_per_iteration": 2.5048959255218506 + }, + { + "auxiliary_loss_clip": 0.01073094, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.0172776, + "balance_loss_mlp": 1.02225494, + "epoch": 0.3238839621223508, + "flos": 23255855159040.0, + "grad_norm": 1.8226092625650352, + "language_loss": 0.94995379, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.97123635, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 5387, + "time_per_iteration": 2.4220590591430664 + }, + { + "auxiliary_loss_clip": 0.01073552, + "auxiliary_loss_mlp": 0.01061621, + "balance_loss_clip": 1.02559638, + "balance_loss_mlp": 1.02306688, + "epoch": 0.3239440853750188, + "flos": 22963597234560.0, + "grad_norm": 3.962550969210986, + "language_loss": 0.79342973, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.81478155, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.50390625, + "step": 5388, + "time_per_iteration": 2.419628620147705 + }, + { + "auxiliary_loss_clip": 0.01071318, + "auxiliary_loss_mlp": 0.0105979, + "balance_loss_clip": 1.02359843, + "balance_loss_mlp": 1.02134371, + "epoch": 0.32400420862768675, + "flos": 36245151383040.0, + "grad_norm": 72.60882506206835, + "language_loss": 0.73355424, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.75486535, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.5, + "step": 5389, + "time_per_iteration": 2.5340301990509033 + }, + { + "auxiliary_loss_clip": 0.01070569, + "auxiliary_loss_mlp": 0.01055029, + "balance_loss_clip": 1.02267563, + "balance_loss_mlp": 1.0223875, + "epoch": 0.3240643318803547, + "flos": 21615426017280.0, + "grad_norm": 1.8492626619526433, + "language_loss": 0.82169491, + "learning_rate": 3.159411924656557e-06, + "loss": 0.84295082, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.48242188, + "step": 5390, + "time_per_iteration": 2.430562734603882 + }, + { + "auxiliary_loss_clip": 0.01071528, + "auxiliary_loss_mlp": 0.01062718, + "balance_loss_clip": 1.02726507, + "balance_loss_mlp": 1.02308106, + "epoch": 0.3241244551330227, + "flos": 23294468989440.0, + "grad_norm": 1.8671097769638614, + "language_loss": 0.74012989, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.76147234, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.484375, + "step": 5391, + "time_per_iteration": 2.4453272819519043 + }, + { + "auxiliary_loss_clip": 0.01067724, + "auxiliary_loss_mlp": 0.01054354, + "balance_loss_clip": 1.02028441, + "balance_loss_mlp": 1.02156544, + "epoch": 0.32418457838569065, + "flos": 14096461259520.0, + "grad_norm": 1.5912701377115213, + "language_loss": 0.78533107, + "learning_rate": 3.158777149931855e-06, + "loss": 0.80655181, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4609375, + "step": 5392, + "time_per_iteration": 2.412327766418457 + }, + { + "auxiliary_loss_clip": 0.01073912, + "auxiliary_loss_mlp": 0.01059773, + "balance_loss_clip": 1.02410555, + "balance_loss_mlp": 1.02343357, + "epoch": 0.3242447016383586, + "flos": 29751376055040.0, + "grad_norm": 2.1384618425371698, + "language_loss": 0.6484791, + "learning_rate": 3.158459696652067e-06, + "loss": 0.6698159, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.50390625, + "step": 5393, + "time_per_iteration": 2.530757427215576 + }, + { + "auxiliary_loss_clip": 0.01071863, + "auxiliary_loss_mlp": 0.01049909, + "balance_loss_clip": 1.01669729, + "balance_loss_mlp": 1.02263808, + "epoch": 0.3243048248910266, + "flos": 24350102991360.0, + "grad_norm": 1.6529343115322592, + "language_loss": 0.84259325, + "learning_rate": 3.158142199443371e-06, + "loss": 0.86381096, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4921875, + "step": 5394, + "time_per_iteration": 3.8516106605529785 + }, + { + "auxiliary_loss_clip": 0.01067986, + "auxiliary_loss_mlp": 0.01056083, + "balance_loss_clip": 1.0280931, + "balance_loss_mlp": 1.02290463, + "epoch": 0.3243649481436946, + "flos": 24351883470720.0, + "grad_norm": 2.295931697609266, + "language_loss": 0.83406454, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.85530519, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.45117188, + "step": 5395, + "time_per_iteration": 2.4713172912597656 + }, + { + "auxiliary_loss_clip": 0.01069434, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.01813889, + "balance_loss_mlp": 1.02368665, + "epoch": 0.32442507139636256, + "flos": 22924250265600.0, + "grad_norm": 1.7416627620095333, + "language_loss": 0.84676731, + "learning_rate": 3.157507073287417e-06, + "loss": 0.86794889, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45703125, + "step": 5396, + "time_per_iteration": 2.478872776031494 + }, + { + "auxiliary_loss_clip": 0.0107559, + "auxiliary_loss_mlp": 0.01059133, + "balance_loss_clip": 1.02227378, + "balance_loss_mlp": 1.02404928, + "epoch": 0.32448519464903053, + "flos": 22199103993600.0, + "grad_norm": 2.2675136963688263, + "language_loss": 0.78957427, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.81092155, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.515625, + "step": 5397, + "time_per_iteration": 3.873203754425049 + }, + { + "auxiliary_loss_clip": 0.01070743, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_clip": 1.01585662, + "balance_loss_mlp": 1.02331638, + "epoch": 0.3245453179016985, + "flos": 18837596736000.0, + "grad_norm": 2.3500274704199926, + "language_loss": 0.68839014, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.70957243, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.47460938, + "step": 5398, + "time_per_iteration": 2.4005684852600098 + }, + { + "auxiliary_loss_clip": 0.01073245, + "auxiliary_loss_mlp": 0.01047448, + "balance_loss_clip": 1.01375985, + "balance_loss_mlp": 1.02432895, + "epoch": 0.32460544115436646, + "flos": 21177335877120.0, + "grad_norm": 1.6917340587054654, + "language_loss": 0.74200213, + "learning_rate": 3.156554054887718e-06, + "loss": 0.76320904, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.49023438, + "step": 5399, + "time_per_iteration": 3.9258251190185547 + }, + { + "auxiliary_loss_clip": 0.01071067, + "auxiliary_loss_mlp": 0.01054956, + "balance_loss_clip": 1.02048039, + "balance_loss_mlp": 1.02225947, + "epoch": 0.3246655644070344, + "flos": 21980058923520.0, + "grad_norm": 2.270077222946056, + "language_loss": 0.72705936, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.74831963, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48632812, + "step": 5400, + "time_per_iteration": 2.416074752807617 + }, + { + "auxiliary_loss_clip": 0.010748, + "auxiliary_loss_mlp": 0.01051292, + "balance_loss_clip": 1.01798475, + "balance_loss_mlp": 1.024369, + "epoch": 0.3247256876597024, + "flos": 32158393119360.0, + "grad_norm": 1.962007132366792, + "language_loss": 0.81225431, + "learning_rate": 3.155918489984614e-06, + "loss": 0.83351517, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.50390625, + "step": 5401, + "time_per_iteration": 3.919761896133423 + }, + { + "auxiliary_loss_clip": 0.0107304, + "auxiliary_loss_mlp": 0.01052009, + "balance_loss_clip": 1.01693738, + "balance_loss_mlp": 1.0236727, + "epoch": 0.32478581091237035, + "flos": 20996450789760.0, + "grad_norm": 1.406501091278154, + "language_loss": 0.88214248, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.90339297, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49414062, + "step": 5402, + "time_per_iteration": 2.4712460041046143 + }, + { + "auxiliary_loss_clip": 0.01069806, + "auxiliary_loss_mlp": 0.01051812, + "balance_loss_clip": 1.01874375, + "balance_loss_mlp": 1.02264214, + "epoch": 0.3248459341650383, + "flos": 17924199079680.0, + "grad_norm": 2.192572470956526, + "language_loss": 0.86398447, + "learning_rate": 3.155282749751332e-06, + "loss": 0.88520062, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.47265625, + "step": 5403, + "time_per_iteration": 2.4334774017333984 + }, + { + "auxiliary_loss_clip": 0.01070808, + "auxiliary_loss_mlp": 0.01050221, + "balance_loss_clip": 1.02051449, + "balance_loss_mlp": 1.0245024, + "epoch": 0.3249060574177063, + "flos": 24534444303360.0, + "grad_norm": 2.101241750218129, + "language_loss": 0.88642085, + "learning_rate": 3.154964813916007e-06, + "loss": 0.90763116, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.46289062, + "step": 5404, + "time_per_iteration": 2.501919984817505 + }, + { + "auxiliary_loss_clip": 0.01073035, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.01326203, + "balance_loss_mlp": 1.02480054, + "epoch": 0.32496618067037425, + "flos": 25993569421440.0, + "grad_norm": 2.0640148729944383, + "language_loss": 0.7448808, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.76605296, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48242188, + "step": 5405, + "time_per_iteration": 2.473304033279419 + }, + { + "auxiliary_loss_clip": 0.0106976, + "auxiliary_loss_mlp": 0.01051096, + "balance_loss_clip": 1.02079272, + "balance_loss_mlp": 1.02291512, + "epoch": 0.3250263039230422, + "flos": 19572727656960.0, + "grad_norm": 1.6726731866535738, + "language_loss": 0.84487426, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.86608285, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.46875, + "step": 5406, + "time_per_iteration": 2.4587175846099854 + }, + { + "auxiliary_loss_clip": 0.01070448, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_clip": 1.0148232, + "balance_loss_mlp": 1.02317178, + "epoch": 0.3250864271757102, + "flos": 16762708235520.0, + "grad_norm": 2.2467438769757644, + "language_loss": 0.8891651, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.91032809, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.47265625, + "step": 5407, + "time_per_iteration": 2.420661449432373 + }, + { + "auxiliary_loss_clip": 0.0107079, + "auxiliary_loss_mlp": 0.01051912, + "balance_loss_clip": 1.01862836, + "balance_loss_mlp": 1.02192295, + "epoch": 0.3251465504283782, + "flos": 27818200229760.0, + "grad_norm": 2.4797960334983826, + "language_loss": 0.70449662, + "learning_rate": 3.153692632731479e-06, + "loss": 0.72572362, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48828125, + "step": 5408, + "time_per_iteration": 2.4985427856445312 + }, + { + "auxiliary_loss_clip": 0.01077546, + "auxiliary_loss_mlp": 0.01053325, + "balance_loss_clip": 1.01782477, + "balance_loss_mlp": 1.02375257, + "epoch": 0.32520667368104617, + "flos": 19062122889600.0, + "grad_norm": 1.894853243482386, + "language_loss": 0.79191172, + "learning_rate": 3.153374478034841e-06, + "loss": 0.81322038, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.5390625, + "step": 5409, + "time_per_iteration": 2.427690267562866 + }, + { + "auxiliary_loss_clip": 0.01073307, + "auxiliary_loss_mlp": 0.010628, + "balance_loss_clip": 1.02703726, + "balance_loss_mlp": 1.02191305, + "epoch": 0.32526679693371413, + "flos": 29381017685760.0, + "grad_norm": 1.9855431623151774, + "language_loss": 0.84235168, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.86371279, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.515625, + "step": 5410, + "time_per_iteration": 2.4572863578796387 + }, + { + "auxiliary_loss_clip": 0.01068278, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.01801062, + "balance_loss_mlp": 1.02198017, + "epoch": 0.3253269201863821, + "flos": 20703459726720.0, + "grad_norm": 2.1213510556811475, + "language_loss": 0.72358406, + "learning_rate": 3.152738037445405e-06, + "loss": 0.74475092, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.46289062, + "step": 5411, + "time_per_iteration": 2.4290783405303955 + }, + { + "auxiliary_loss_clip": 0.01072015, + "auxiliary_loss_mlp": 0.01054861, + "balance_loss_clip": 1.02124381, + "balance_loss_mlp": 1.02283847, + "epoch": 0.32538704343905006, + "flos": 29092914213120.0, + "grad_norm": 1.4700535619837132, + "language_loss": 0.83272803, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85399675, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4921875, + "step": 5412, + "time_per_iteration": 2.4644923210144043 + }, + { + "auxiliary_loss_clip": 0.01072034, + "auxiliary_loss_mlp": 0.01053339, + "balance_loss_clip": 1.0165031, + "balance_loss_mlp": 1.02170682, + "epoch": 0.325447166691718, + "flos": 24675109637760.0, + "grad_norm": 2.813748543754715, + "language_loss": 0.821527, + "learning_rate": 3.152101422008203e-06, + "loss": 0.84278071, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.50390625, + "step": 5413, + "time_per_iteration": 2.43977952003479 + }, + { + "auxiliary_loss_clip": 0.01070623, + "auxiliary_loss_mlp": 0.01057489, + "balance_loss_clip": 1.02382469, + "balance_loss_mlp": 1.02209306, + "epoch": 0.325507289944386, + "flos": 21542073517440.0, + "grad_norm": 9.813382738388531, + "language_loss": 0.776443, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79772413, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.484375, + "step": 5414, + "time_per_iteration": 2.4251210689544678 + }, + { + "auxiliary_loss_clip": 0.01021482, + "auxiliary_loss_mlp": 0.01016883, + "balance_loss_clip": 1.0133549, + "balance_loss_mlp": 1.00996482, + "epoch": 0.32556741319705396, + "flos": 71515527206400.0, + "grad_norm": 0.916110389372844, + "language_loss": 0.64105648, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66144013, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.11523438, + "step": 5415, + "time_per_iteration": 2.980988025665283 + }, + { + "auxiliary_loss_clip": 0.01071123, + "auxiliary_loss_mlp": 0.01051984, + "balance_loss_clip": 1.01672196, + "balance_loss_mlp": 1.02178097, + "epoch": 0.3256275364497219, + "flos": 23731302320640.0, + "grad_norm": 1.4956860872341247, + "language_loss": 0.75759935, + "learning_rate": 3.151146171224075e-06, + "loss": 0.77883041, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.49414062, + "step": 5416, + "time_per_iteration": 2.5002965927124023 + }, + { + "auxiliary_loss_clip": 0.0101889, + "auxiliary_loss_mlp": 0.01005746, + "balance_loss_clip": 1.00216997, + "balance_loss_mlp": 1.00739551, + "epoch": 0.3256876597023899, + "flos": 67286043838080.0, + "grad_norm": 0.7839268279134989, + "language_loss": 0.58087015, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60111654, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.11523438, + "step": 5417, + "time_per_iteration": 3.1322898864746094 + }, + { + "auxiliary_loss_clip": 0.01017011, + "auxiliary_loss_mlp": 0.01002974, + "balance_loss_clip": 0.99939764, + "balance_loss_mlp": 1.00593174, + "epoch": 0.32574778295505785, + "flos": 71278605653760.0, + "grad_norm": 0.8264227216248546, + "language_loss": 0.63551438, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65571415, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.11083984, + "step": 5418, + "time_per_iteration": 3.182905912399292 + }, + { + "auxiliary_loss_clip": 0.01069817, + "auxiliary_loss_mlp": 0.0105398, + "balance_loss_clip": 1.01969528, + "balance_loss_mlp": 1.02194524, + "epoch": 0.3258079062077258, + "flos": 20775345949440.0, + "grad_norm": 1.907013141048205, + "language_loss": 0.70861197, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.72984993, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47851562, + "step": 5419, + "time_per_iteration": 2.3993401527404785 + }, + { + "auxiliary_loss_clip": 0.010733, + "auxiliary_loss_mlp": 0.01061461, + "balance_loss_clip": 1.02469683, + "balance_loss_mlp": 1.02365243, + "epoch": 0.3258680294603938, + "flos": 22234401244800.0, + "grad_norm": 1.841016469714559, + "language_loss": 0.78120553, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.80255312, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.49804688, + "step": 5420, + "time_per_iteration": 2.4388620853424072 + }, + { + "auxiliary_loss_clip": 0.01074154, + "auxiliary_loss_mlp": 0.01063399, + "balance_loss_clip": 1.02887583, + "balance_loss_mlp": 1.02342415, + "epoch": 0.3259281527130618, + "flos": 26978748566400.0, + "grad_norm": 1.6217717993817948, + "language_loss": 0.81671405, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.83808964, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.5078125, + "step": 5421, + "time_per_iteration": 2.442211151123047 + }, + { + "auxiliary_loss_clip": 0.01070446, + "auxiliary_loss_mlp": 0.01062748, + "balance_loss_clip": 1.03153872, + "balance_loss_mlp": 1.02371478, + "epoch": 0.32598827596572977, + "flos": 26213033427840.0, + "grad_norm": 1.6773035130411653, + "language_loss": 0.76736867, + "learning_rate": 3.149234491389381e-06, + "loss": 0.78870058, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46875, + "step": 5422, + "time_per_iteration": 2.4773449897766113 + }, + { + "auxiliary_loss_clip": 0.01076307, + "auxiliary_loss_mlp": 0.0105687, + "balance_loss_clip": 1.02141666, + "balance_loss_mlp": 1.02581787, + "epoch": 0.32604839921839773, + "flos": 17638783781760.0, + "grad_norm": 1.9322188569213734, + "language_loss": 0.65143359, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.67276537, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.50390625, + "step": 5423, + "time_per_iteration": 2.373544692993164 + }, + { + "auxiliary_loss_clip": 0.01069683, + "auxiliary_loss_mlp": 0.01049099, + "balance_loss_clip": 1.01796138, + "balance_loss_mlp": 1.02442181, + "epoch": 0.3261085224710657, + "flos": 23621605228800.0, + "grad_norm": 2.986424433040947, + "language_loss": 0.75760174, + "learning_rate": 3.148596916016224e-06, + "loss": 0.77878964, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.453125, + "step": 5424, + "time_per_iteration": 2.4547507762908936 + }, + { + "auxiliary_loss_clip": 0.0107287, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.02077365, + "balance_loss_mlp": 1.02569342, + "epoch": 0.32616864572373366, + "flos": 23259276472320.0, + "grad_norm": 1.7356835849425494, + "language_loss": 0.77858114, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79983842, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.47265625, + "step": 5425, + "time_per_iteration": 2.4368221759796143 + }, + { + "auxiliary_loss_clip": 0.01078023, + "auxiliary_loss_mlp": 0.01055006, + "balance_loss_clip": 1.01833773, + "balance_loss_mlp": 1.0260911, + "epoch": 0.32622876897640163, + "flos": 25592242809600.0, + "grad_norm": 2.372130252363461, + "language_loss": 0.8035115, + "learning_rate": 3.147959166423428e-06, + "loss": 0.8248418, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.51953125, + "step": 5426, + "time_per_iteration": 2.475670099258423 + }, + { + "auxiliary_loss_clip": 0.01075611, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.01699674, + "balance_loss_mlp": 1.02714205, + "epoch": 0.3262888922290696, + "flos": 22417904684160.0, + "grad_norm": 1.7686895509048688, + "language_loss": 0.75634062, + "learning_rate": 3.147640226324893e-06, + "loss": 0.77761197, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.484375, + "step": 5427, + "time_per_iteration": 2.418316602706909 + }, + { + "auxiliary_loss_clip": 0.01076223, + "auxiliary_loss_mlp": 0.0105951, + "balance_loss_clip": 1.02379465, + "balance_loss_mlp": 1.02585506, + "epoch": 0.32634901548173756, + "flos": 19717896556800.0, + "grad_norm": 1.6525174805102785, + "language_loss": 0.8073107, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.82866812, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.50390625, + "step": 5428, + "time_per_iteration": 2.4138104915618896 + }, + { + "auxiliary_loss_clip": 0.01076023, + "auxiliary_loss_mlp": 0.01050097, + "balance_loss_clip": 1.01783955, + "balance_loss_mlp": 1.02744412, + "epoch": 0.3264091387344055, + "flos": 16142022351360.0, + "grad_norm": 1.7184462075443965, + "language_loss": 0.72449541, + "learning_rate": 3.147002215584023e-06, + "loss": 0.74575663, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.48632812, + "step": 5429, + "time_per_iteration": 2.375328540802002 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.01051213, + "balance_loss_clip": 1.01928902, + "balance_loss_mlp": 1.02685153, + "epoch": 0.3264692619870735, + "flos": 16398145152000.0, + "grad_norm": 1.623737806390898, + "language_loss": 0.79884219, + "learning_rate": 3.146683144965881e-06, + "loss": 0.82009184, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46875, + "step": 5430, + "time_per_iteration": 2.4042210578918457 + }, + { + "auxiliary_loss_clip": 0.01078615, + "auxiliary_loss_mlp": 0.01058397, + "balance_loss_clip": 1.02001154, + "balance_loss_mlp": 1.02876234, + "epoch": 0.32652938523974145, + "flos": 22381245889920.0, + "grad_norm": 2.0523064420053783, + "language_loss": 0.85554427, + "learning_rate": 3.146364030865399e-06, + "loss": 0.87691438, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.49804688, + "step": 5431, + "time_per_iteration": 2.425642967224121 + }, + { + "auxiliary_loss_clip": 0.01072381, + "auxiliary_loss_mlp": 0.01053857, + "balance_loss_clip": 1.02252889, + "balance_loss_mlp": 1.0256691, + "epoch": 0.3265895084924094, + "flos": 21906985714560.0, + "grad_norm": 2.244245676502082, + "language_loss": 0.7185365, + "learning_rate": 3.146044873294678e-06, + "loss": 0.7397989, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46679688, + "step": 5432, + "time_per_iteration": 2.4290618896484375 + }, + { + "auxiliary_loss_clip": 0.01071912, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_clip": 1.0183624, + "balance_loss_mlp": 1.02413869, + "epoch": 0.3266496317450774, + "flos": 16066330790400.0, + "grad_norm": 1.5161923125829169, + "language_loss": 0.85051274, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.87173259, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4765625, + "step": 5433, + "time_per_iteration": 2.3777236938476562 + }, + { + "auxiliary_loss_clip": 0.01071675, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.01959825, + "balance_loss_mlp": 1.02553105, + "epoch": 0.3267097549977454, + "flos": 22527147928320.0, + "grad_norm": 1.4175278873650927, + "language_loss": 0.86593187, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88716656, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4609375, + "step": 5434, + "time_per_iteration": 3.8467822074890137 + }, + { + "auxiliary_loss_clip": 0.01073816, + "auxiliary_loss_mlp": 0.01057656, + "balance_loss_clip": 1.02320504, + "balance_loss_mlp": 1.02446198, + "epoch": 0.32676987825041337, + "flos": 27269226011520.0, + "grad_norm": 1.9315651965383005, + "language_loss": 0.88976389, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.91107857, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49414062, + "step": 5435, + "time_per_iteration": 2.4756805896759033 + }, + { + "auxiliary_loss_clip": 0.01070153, + "auxiliary_loss_mlp": 0.01060163, + "balance_loss_clip": 1.0254972, + "balance_loss_mlp": 1.02280569, + "epoch": 0.32683000150308134, + "flos": 11507511767040.0, + "grad_norm": 2.8922459361520643, + "language_loss": 0.77456701, + "learning_rate": 3.144767808551479e-06, + "loss": 0.79587007, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.47265625, + "step": 5436, + "time_per_iteration": 3.7797019481658936 + }, + { + "auxiliary_loss_clip": 0.01068792, + "auxiliary_loss_mlp": 0.0106173, + "balance_loss_clip": 1.0299015, + "balance_loss_mlp": 1.02175474, + "epoch": 0.3268901247557493, + "flos": 25629006337920.0, + "grad_norm": 1.5883987611686912, + "language_loss": 0.73507679, + "learning_rate": 3.144448433811134e-06, + "loss": 0.75638199, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.47070312, + "step": 5437, + "time_per_iteration": 2.489107370376587 + }, + { + "auxiliary_loss_clip": 0.0107268, + "auxiliary_loss_mlp": 0.01071883, + "balance_loss_clip": 1.03519022, + "balance_loss_mlp": 1.02306485, + "epoch": 0.32695024800841727, + "flos": 24859765152000.0, + "grad_norm": 1.6515809280844398, + "language_loss": 0.65437382, + "learning_rate": 3.144129015673189e-06, + "loss": 0.6758194, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.49609375, + "step": 5438, + "time_per_iteration": 2.4375667572021484 + }, + { + "auxiliary_loss_clip": 0.01069178, + "auxiliary_loss_mlp": 0.01059226, + "balance_loss_clip": 1.02491784, + "balance_loss_mlp": 1.02211332, + "epoch": 0.32701037126108523, + "flos": 28838013310080.0, + "grad_norm": 1.5956351011431187, + "language_loss": 0.75156707, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.77285105, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47070312, + "step": 5439, + "time_per_iteration": 3.8575401306152344 + }, + { + "auxiliary_loss_clip": 0.01071806, + "auxiliary_loss_mlp": 0.01061373, + "balance_loss_clip": 1.0258013, + "balance_loss_mlp": 1.02383471, + "epoch": 0.3270704945137532, + "flos": 27963822977280.0, + "grad_norm": 1.9100087715132943, + "language_loss": 0.75615025, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77748209, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48046875, + "step": 5440, + "time_per_iteration": 3.9729113578796387 + }, + { + "auxiliary_loss_clip": 0.0106876, + "auxiliary_loss_mlp": 0.01068905, + "balance_loss_clip": 1.03638506, + "balance_loss_mlp": 1.02208066, + "epoch": 0.32713061776642116, + "flos": 23689721024640.0, + "grad_norm": 2.5122424468144366, + "language_loss": 0.8578831, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.87925971, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46679688, + "step": 5441, + "time_per_iteration": 2.4053778648376465 + }, + { + "auxiliary_loss_clip": 0.01071364, + "auxiliary_loss_mlp": 0.01058184, + "balance_loss_clip": 1.0233748, + "balance_loss_mlp": 1.0225265, + "epoch": 0.3271907410190891, + "flos": 22454528567040.0, + "grad_norm": 3.656126749402145, + "language_loss": 0.87226444, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89355993, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48828125, + "step": 5442, + "time_per_iteration": 2.4195220470428467 + }, + { + "auxiliary_loss_clip": 0.01073343, + "auxiliary_loss_mlp": 0.01058192, + "balance_loss_clip": 1.02159512, + "balance_loss_mlp": 1.02376413, + "epoch": 0.3272508642717571, + "flos": 22819021827840.0, + "grad_norm": 3.5725335227434942, + "language_loss": 0.78860748, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.80992281, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.49609375, + "step": 5443, + "time_per_iteration": 2.4414238929748535 + }, + { + "auxiliary_loss_clip": 0.01072325, + "auxiliary_loss_mlp": 0.01063636, + "balance_loss_clip": 1.0291127, + "balance_loss_mlp": 1.02279401, + "epoch": 0.32731098752442506, + "flos": 11800572652800.0, + "grad_norm": 2.2289621349534, + "language_loss": 0.82967389, + "learning_rate": 3.142211596174343e-06, + "loss": 0.85103345, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.49609375, + "step": 5444, + "time_per_iteration": 2.428816318511963 + }, + { + "auxiliary_loss_clip": 0.01072905, + "auxiliary_loss_mlp": 0.01051386, + "balance_loss_clip": 1.01857936, + "balance_loss_mlp": 1.02367544, + "epoch": 0.327371110777093, + "flos": 21026860450560.0, + "grad_norm": 2.2900670856512995, + "language_loss": 0.60307884, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.6243217, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 5445, + "time_per_iteration": 2.39406418800354 + }, + { + "auxiliary_loss_clip": 0.01073795, + "auxiliary_loss_mlp": 0.01050318, + "balance_loss_clip": 1.01617622, + "balance_loss_mlp": 1.02543032, + "epoch": 0.327431234029761, + "flos": 19061110460160.0, + "grad_norm": 3.5649947630195182, + "language_loss": 0.89480209, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.91604316, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.484375, + "step": 5446, + "time_per_iteration": 2.4272255897521973 + }, + { + "auxiliary_loss_clip": 0.01080223, + "auxiliary_loss_mlp": 0.01056565, + "balance_loss_clip": 1.01815546, + "balance_loss_mlp": 1.02795064, + "epoch": 0.32749135728242895, + "flos": 25848016496640.0, + "grad_norm": 1.7806944635142687, + "language_loss": 0.81356847, + "learning_rate": 3.141252301538802e-06, + "loss": 0.83493638, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5234375, + "step": 5447, + "time_per_iteration": 2.4461405277252197 + }, + { + "auxiliary_loss_clip": 0.01075336, + "auxiliary_loss_mlp": 0.01042455, + "balance_loss_clip": 1.01200867, + "balance_loss_mlp": 1.02683651, + "epoch": 0.327551480535097, + "flos": 20119502459520.0, + "grad_norm": 3.0478611482951345, + "language_loss": 0.74475169, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.76592964, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.484375, + "step": 5448, + "time_per_iteration": 2.473785161972046 + }, + { + "auxiliary_loss_clip": 0.01075064, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_clip": 1.02005744, + "balance_loss_mlp": 1.02754307, + "epoch": 0.32761160378776494, + "flos": 28802297122560.0, + "grad_norm": 1.6342473213826885, + "language_loss": 0.68416876, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.70544446, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47460938, + "step": 5449, + "time_per_iteration": 2.471534013748169 + }, + { + "auxiliary_loss_clip": 0.01073158, + "auxiliary_loss_mlp": 0.01053384, + "balance_loss_clip": 1.02153158, + "balance_loss_mlp": 1.02693212, + "epoch": 0.3276717270404329, + "flos": 26936713422720.0, + "grad_norm": 2.064709985085134, + "language_loss": 0.66588527, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.68715072, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46289062, + "step": 5450, + "time_per_iteration": 2.5470423698425293 + }, + { + "auxiliary_loss_clip": 0.01077097, + "auxiliary_loss_mlp": 0.0105631, + "balance_loss_clip": 1.02104807, + "balance_loss_mlp": 1.02733135, + "epoch": 0.32773185029310087, + "flos": 25337237172480.0, + "grad_norm": 7.533722740191079, + "language_loss": 0.79008245, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.81141651, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.49609375, + "step": 5451, + "time_per_iteration": 2.465240716934204 + }, + { + "auxiliary_loss_clip": 0.0107707, + "auxiliary_loss_mlp": 0.01051516, + "balance_loss_clip": 1.01873326, + "balance_loss_mlp": 1.02829516, + "epoch": 0.32779197354576883, + "flos": 26390636847360.0, + "grad_norm": 2.2762253566263864, + "language_loss": 0.71898574, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.74027169, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48828125, + "step": 5452, + "time_per_iteration": 2.500575065612793 + }, + { + "auxiliary_loss_clip": 0.01073571, + "auxiliary_loss_mlp": 0.01054084, + "balance_loss_clip": 1.02080107, + "balance_loss_mlp": 1.02752995, + "epoch": 0.3278520967984368, + "flos": 24898239336960.0, + "grad_norm": 2.021175622655834, + "language_loss": 0.80025762, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.82153416, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4609375, + "step": 5453, + "time_per_iteration": 2.443418264389038 + }, + { + "auxiliary_loss_clip": 0.01076042, + "auxiliary_loss_mlp": 0.01050225, + "balance_loss_clip": 1.01937389, + "balance_loss_mlp": 1.02849853, + "epoch": 0.32791222005110476, + "flos": 29751690257280.0, + "grad_norm": 2.2253679777236686, + "language_loss": 0.77778327, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.79904604, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4765625, + "step": 5454, + "time_per_iteration": 2.506333827972412 + }, + { + "auxiliary_loss_clip": 0.01072915, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_clip": 1.01915264, + "balance_loss_mlp": 1.0275079, + "epoch": 0.32797234330377273, + "flos": 16507144016640.0, + "grad_norm": 1.9272359003889732, + "language_loss": 0.78227305, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.80347764, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.453125, + "step": 5455, + "time_per_iteration": 2.385925054550171 + }, + { + "auxiliary_loss_clip": 0.01074256, + "auxiliary_loss_mlp": 0.01058226, + "balance_loss_clip": 1.02328563, + "balance_loss_mlp": 1.02472138, + "epoch": 0.3280324665564407, + "flos": 26576723727360.0, + "grad_norm": 1.7615411674805983, + "language_loss": 0.75151205, + "learning_rate": 3.138372082016768e-06, + "loss": 0.77283686, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49609375, + "step": 5456, + "time_per_iteration": 2.4691927433013916 + }, + { + "auxiliary_loss_clip": 0.01071246, + "auxiliary_loss_mlp": 0.01056299, + "balance_loss_clip": 1.02327764, + "balance_loss_mlp": 1.02376473, + "epoch": 0.32809258980910866, + "flos": 22928858565120.0, + "grad_norm": 4.515423897120848, + "language_loss": 0.79429221, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.81556761, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47460938, + "step": 5457, + "time_per_iteration": 2.4406580924987793 + }, + { + "auxiliary_loss_clip": 0.01074035, + "auxiliary_loss_mlp": 0.01053582, + "balance_loss_clip": 1.02120471, + "balance_loss_mlp": 1.02468073, + "epoch": 0.3281527130617766, + "flos": 22782747058560.0, + "grad_norm": 2.8050815148248485, + "language_loss": 0.82064992, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.8419261, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.49414062, + "step": 5458, + "time_per_iteration": 2.4208099842071533 + }, + { + "auxiliary_loss_clip": 0.01070146, + "auxiliary_loss_mlp": 0.01049152, + "balance_loss_clip": 1.01853871, + "balance_loss_mlp": 1.02330208, + "epoch": 0.3282128363144446, + "flos": 21249641036160.0, + "grad_norm": 1.5804921887537156, + "language_loss": 0.74598271, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.76717567, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.46875, + "step": 5459, + "time_per_iteration": 2.453692674636841 + }, + { + "auxiliary_loss_clip": 0.01072789, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.01734769, + "balance_loss_mlp": 1.02361631, + "epoch": 0.32827295956711255, + "flos": 30841853460480.0, + "grad_norm": 1.7971312118093028, + "language_loss": 0.84910744, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.87034082, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4921875, + "step": 5460, + "time_per_iteration": 2.4760990142822266 + }, + { + "auxiliary_loss_clip": 0.01068278, + "auxiliary_loss_mlp": 0.01051182, + "balance_loss_clip": 1.01892376, + "balance_loss_mlp": 1.02145934, + "epoch": 0.3283330828197806, + "flos": 25914002699520.0, + "grad_norm": 2.2742410700853584, + "language_loss": 0.78478205, + "learning_rate": 3.136770448642288e-06, + "loss": 0.80597669, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.46875, + "step": 5461, + "time_per_iteration": 2.4587838649749756 + }, + { + "auxiliary_loss_clip": 0.0106753, + "auxiliary_loss_mlp": 0.01054307, + "balance_loss_clip": 1.02095222, + "balance_loss_mlp": 1.02127337, + "epoch": 0.32839320607244854, + "flos": 38580526604160.0, + "grad_norm": 2.0386841497406, + "language_loss": 0.63943118, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.66064954, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.46289062, + "step": 5462, + "time_per_iteration": 2.5685999393463135 + }, + { + "auxiliary_loss_clip": 0.01068174, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_clip": 1.0180881, + "balance_loss_mlp": 1.02236867, + "epoch": 0.3284533293251165, + "flos": 26649692202240.0, + "grad_norm": 1.6519164436443026, + "language_loss": 0.7902385, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.81140721, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45703125, + "step": 5463, + "time_per_iteration": 2.5087454319000244 + }, + { + "auxiliary_loss_clip": 0.01070634, + "auxiliary_loss_mlp": 0.01058797, + "balance_loss_clip": 1.02389216, + "balance_loss_mlp": 1.02248919, + "epoch": 0.32851345257778447, + "flos": 15303268915200.0, + "grad_norm": 2.422501592683041, + "language_loss": 0.71335477, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.73464906, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.48242188, + "step": 5464, + "time_per_iteration": 2.3807013034820557 + }, + { + "auxiliary_loss_clip": 0.01066969, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.01636767, + "balance_loss_mlp": 1.02246165, + "epoch": 0.32857357583045244, + "flos": 23512606364160.0, + "grad_norm": 1.815561213613867, + "language_loss": 0.72772062, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.7488749, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 5465, + "time_per_iteration": 2.488621711730957 + }, + { + "auxiliary_loss_clip": 0.01069689, + "auxiliary_loss_mlp": 0.01049612, + "balance_loss_clip": 1.01851022, + "balance_loss_mlp": 1.02268434, + "epoch": 0.3286336990831204, + "flos": 20994181551360.0, + "grad_norm": 1.647091917263592, + "language_loss": 0.83805096, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85924393, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.47070312, + "step": 5466, + "time_per_iteration": 2.425475597381592 + }, + { + "auxiliary_loss_clip": 0.01069323, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.01581478, + "balance_loss_mlp": 1.02225649, + "epoch": 0.32869382233578837, + "flos": 23657705441280.0, + "grad_norm": 1.9541633133664364, + "language_loss": 0.80722165, + "learning_rate": 3.134847066213879e-06, + "loss": 0.82839721, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47070312, + "step": 5467, + "time_per_iteration": 2.4333090782165527 + }, + { + "auxiliary_loss_clip": 0.01070179, + "auxiliary_loss_mlp": 0.0104958, + "balance_loss_clip": 1.01608181, + "balance_loss_mlp": 1.02253199, + "epoch": 0.32875394558845633, + "flos": 25335386870400.0, + "grad_norm": 1.6498360893539201, + "language_loss": 0.75585294, + "learning_rate": 3.134526351787587e-06, + "loss": 0.7770505, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4765625, + "step": 5468, + "time_per_iteration": 2.44427752494812 + }, + { + "auxiliary_loss_clip": 0.01076054, + "auxiliary_loss_mlp": 0.01052415, + "balance_loss_clip": 1.01743913, + "balance_loss_mlp": 1.02614713, + "epoch": 0.3288140688411243, + "flos": 14902221594240.0, + "grad_norm": 1.700049437406739, + "language_loss": 0.80479324, + "learning_rate": 3.134205594339942e-06, + "loss": 0.82607794, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5, + "step": 5469, + "time_per_iteration": 2.425823926925659 + }, + { + "auxiliary_loss_clip": 0.01069638, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_clip": 1.01553297, + "balance_loss_mlp": 1.0220536, + "epoch": 0.32887419209379226, + "flos": 18550366047360.0, + "grad_norm": 1.7361436100796737, + "language_loss": 0.82999158, + "learning_rate": 3.133884793883107e-06, + "loss": 0.85117114, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4765625, + "step": 5470, + "time_per_iteration": 2.3766744136810303 + }, + { + "auxiliary_loss_clip": 0.01070519, + "auxiliary_loss_mlp": 0.01048725, + "balance_loss_clip": 1.01469111, + "balance_loss_mlp": 1.02240157, + "epoch": 0.3289343153464602, + "flos": 48103785360000.0, + "grad_norm": 1.979029946281867, + "language_loss": 0.69589424, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.71708667, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.48046875, + "step": 5471, + "time_per_iteration": 2.6472537517547607 + }, + { + "auxiliary_loss_clip": 0.01073236, + "auxiliary_loss_mlp": 0.01060639, + "balance_loss_clip": 1.02592516, + "balance_loss_mlp": 1.02372217, + "epoch": 0.3289944385991282, + "flos": 27599050425600.0, + "grad_norm": 2.090357218063405, + "language_loss": 0.66654038, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.6878792, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.49609375, + "step": 5472, + "time_per_iteration": 2.4556658267974854 + }, + { + "auxiliary_loss_clip": 0.01070522, + "auxiliary_loss_mlp": 0.0105651, + "balance_loss_clip": 1.02108133, + "balance_loss_mlp": 1.02276826, + "epoch": 0.32905456185179616, + "flos": 20119292991360.0, + "grad_norm": 1.657580946351629, + "language_loss": 0.89811158, + "learning_rate": 3.13292213457912e-06, + "loss": 0.91938186, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.4765625, + "step": 5473, + "time_per_iteration": 3.8442516326904297 + }, + { + "auxiliary_loss_clip": 0.01070671, + "auxiliary_loss_mlp": 0.0105083, + "balance_loss_clip": 1.01482916, + "balance_loss_mlp": 1.02272439, + "epoch": 0.3291146851044642, + "flos": 23179255902720.0, + "grad_norm": 1.7489138242311906, + "language_loss": 0.8018288, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.82304382, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48046875, + "step": 5474, + "time_per_iteration": 2.4720981121063232 + }, + { + "auxiliary_loss_clip": 0.01018412, + "auxiliary_loss_mlp": 0.01011898, + "balance_loss_clip": 1.00794005, + "balance_loss_mlp": 1.00735259, + "epoch": 0.32917480835713214, + "flos": 67618626249600.0, + "grad_norm": 0.8151204912893346, + "language_loss": 0.60261691, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62292004, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.11035156, + "step": 5475, + "time_per_iteration": 3.0370781421661377 + }, + { + "auxiliary_loss_clip": 0.01070796, + "auxiliary_loss_mlp": 0.01053714, + "balance_loss_clip": 1.01833272, + "balance_loss_mlp": 1.02154732, + "epoch": 0.3292349316098001, + "flos": 27963299306880.0, + "grad_norm": 2.6025908525083623, + "language_loss": 0.78246182, + "learning_rate": 3.131959088630455e-06, + "loss": 0.80370694, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.4921875, + "step": 5476, + "time_per_iteration": 3.8866868019104004 + }, + { + "auxiliary_loss_clip": 0.01068185, + "auxiliary_loss_mlp": 0.01056084, + "balance_loss_clip": 1.02426696, + "balance_loss_mlp": 1.0216763, + "epoch": 0.3292950548624681, + "flos": 20262716323200.0, + "grad_norm": 1.9755285835353642, + "language_loss": 0.76634508, + "learning_rate": 3.131637987449997e-06, + "loss": 0.78758776, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46484375, + "step": 5477, + "time_per_iteration": 2.458207130432129 + }, + { + "auxiliary_loss_clip": 0.01066479, + "auxiliary_loss_mlp": 0.01048941, + "balance_loss_clip": 1.01915038, + "balance_loss_mlp": 1.02221942, + "epoch": 0.32935517811513604, + "flos": 20811969832320.0, + "grad_norm": 2.332572780205217, + "language_loss": 0.77103531, + "learning_rate": 3.131316843357713e-06, + "loss": 0.79218954, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.44335938, + "step": 5478, + "time_per_iteration": 3.719945192337036 + }, + { + "auxiliary_loss_clip": 0.0106732, + "auxiliary_loss_mlp": 0.01049459, + "balance_loss_clip": 1.02084851, + "balance_loss_mlp": 1.02237797, + "epoch": 0.329415301367804, + "flos": 18440878423680.0, + "grad_norm": 1.979483805708507, + "language_loss": 0.81874299, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.83991075, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.44921875, + "step": 5479, + "time_per_iteration": 2.415287494659424 + }, + { + "auxiliary_loss_clip": 0.01014871, + "auxiliary_loss_mlp": 0.01006304, + "balance_loss_clip": 1.00227451, + "balance_loss_mlp": 1.00381243, + "epoch": 0.32947542462047197, + "flos": 66319367713920.0, + "grad_norm": 0.7574141346833446, + "language_loss": 0.56652832, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58674002, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.11035156, + "step": 5480, + "time_per_iteration": 4.502478837966919 + }, + { + "auxiliary_loss_clip": 0.01067432, + "auxiliary_loss_mlp": 0.01051628, + "balance_loss_clip": 1.02115798, + "balance_loss_mlp": 1.0213089, + "epoch": 0.32953554787313993, + "flos": 23220488085120.0, + "grad_norm": 1.7147040560600784, + "language_loss": 0.79020059, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.81139117, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4609375, + "step": 5481, + "time_per_iteration": 2.4088613986968994 + }, + { + "auxiliary_loss_clip": 0.01069121, + "auxiliary_loss_mlp": 0.01050198, + "balance_loss_clip": 1.01970458, + "balance_loss_mlp": 1.02226782, + "epoch": 0.3295956711258079, + "flos": 27008460000000.0, + "grad_norm": 1.6021857296811188, + "language_loss": 0.7944243, + "learning_rate": 3.130031838113899e-06, + "loss": 0.8156175, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.46875, + "step": 5482, + "time_per_iteration": 2.5287725925445557 + }, + { + "auxiliary_loss_clip": 0.01067923, + "auxiliary_loss_mlp": 0.01053743, + "balance_loss_clip": 1.02112746, + "balance_loss_mlp": 1.02166462, + "epoch": 0.32965579437847586, + "flos": 19170702817920.0, + "grad_norm": 1.7349894740063465, + "language_loss": 0.75354445, + "learning_rate": 3.129710479645185e-06, + "loss": 0.77476108, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.46289062, + "step": 5483, + "time_per_iteration": 2.3935012817382812 + }, + { + "auxiliary_loss_clip": 0.01068569, + "auxiliary_loss_mlp": 0.0104648, + "balance_loss_clip": 1.01467514, + "balance_loss_mlp": 1.02223825, + "epoch": 0.32971591763114383, + "flos": 30481200449280.0, + "grad_norm": 1.510621522648374, + "language_loss": 0.76468033, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.78583086, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46289062, + "step": 5484, + "time_per_iteration": 2.4957189559936523 + }, + { + "auxiliary_loss_clip": 0.01066773, + "auxiliary_loss_mlp": 0.01053798, + "balance_loss_clip": 1.02403176, + "balance_loss_mlp": 1.0221231, + "epoch": 0.3297760408838118, + "flos": 16288657528320.0, + "grad_norm": 1.8867738920876655, + "language_loss": 0.73081923, + "learning_rate": 3.129067634203742e-06, + "loss": 0.75202501, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44726562, + "step": 5485, + "time_per_iteration": 2.381308078765869 + }, + { + "auxiliary_loss_clip": 0.01068353, + "auxiliary_loss_mlp": 0.0104807, + "balance_loss_clip": 1.01742101, + "balance_loss_mlp": 1.02324891, + "epoch": 0.32983616413647976, + "flos": 29529712632960.0, + "grad_norm": 1.6071068054535178, + "language_loss": 0.81287748, + "learning_rate": 3.128746147255388e-06, + "loss": 0.83404171, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45117188, + "step": 5486, + "time_per_iteration": 2.492466449737549 + }, + { + "auxiliary_loss_clip": 0.01065595, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_clip": 1.01674342, + "balance_loss_mlp": 1.02061772, + "epoch": 0.3298962873891478, + "flos": 20630351606400.0, + "grad_norm": 2.0795644484903235, + "language_loss": 0.85784519, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.87896466, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 5487, + "time_per_iteration": 2.4108831882476807 + }, + { + "auxiliary_loss_clip": 0.0107001, + "auxiliary_loss_mlp": 0.01051858, + "balance_loss_clip": 1.01743019, + "balance_loss_mlp": 1.02168489, + "epoch": 0.32995641064181574, + "flos": 14975120246400.0, + "grad_norm": 2.2842888403109223, + "language_loss": 0.76377618, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.78499484, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48242188, + "step": 5488, + "time_per_iteration": 2.3967974185943604 + }, + { + "auxiliary_loss_clip": 0.01068799, + "auxiliary_loss_mlp": 0.01048399, + "balance_loss_clip": 1.01502037, + "balance_loss_mlp": 1.02126646, + "epoch": 0.3300165338944837, + "flos": 18660447164160.0, + "grad_norm": 2.265223701919076, + "language_loss": 0.74390244, + "learning_rate": 3.127781429646098e-06, + "loss": 0.76507437, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4765625, + "step": 5489, + "time_per_iteration": 2.367581605911255 + }, + { + "auxiliary_loss_clip": 0.01065031, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_clip": 1.01904035, + "balance_loss_mlp": 1.02081585, + "epoch": 0.3300766571471517, + "flos": 25582816742400.0, + "grad_norm": 3.6457118803645074, + "language_loss": 0.90861881, + "learning_rate": 3.127459771562238e-06, + "loss": 0.92974907, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44140625, + "step": 5490, + "time_per_iteration": 2.4556074142456055 + }, + { + "auxiliary_loss_clip": 0.01065667, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.01813054, + "balance_loss_mlp": 1.02036381, + "epoch": 0.33013678039981964, + "flos": 11362726892160.0, + "grad_norm": 2.120566837696673, + "language_loss": 0.84614813, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.86727339, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.453125, + "step": 5491, + "time_per_iteration": 2.400254249572754 + }, + { + "auxiliary_loss_clip": 0.01067154, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_clip": 1.01475954, + "balance_loss_mlp": 1.02242565, + "epoch": 0.3301969036524876, + "flos": 24820208714880.0, + "grad_norm": 1.7802664977785094, + "language_loss": 0.7887345, + "learning_rate": 3.126816327146554e-06, + "loss": 0.80985993, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44921875, + "step": 5492, + "time_per_iteration": 2.4701054096221924 + }, + { + "auxiliary_loss_clip": 0.0107213, + "auxiliary_loss_mlp": 0.01052329, + "balance_loss_clip": 1.01954651, + "balance_loss_mlp": 1.02508175, + "epoch": 0.33025702690515557, + "flos": 15960229568640.0, + "grad_norm": 2.3051591869456094, + "language_loss": 0.79062033, + "learning_rate": 3.12649454083913e-06, + "loss": 0.81186491, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.47070312, + "step": 5493, + "time_per_iteration": 2.380720376968384 + }, + { + "auxiliary_loss_clip": 0.01018753, + "auxiliary_loss_mlp": 0.01013495, + "balance_loss_clip": 1.00963306, + "balance_loss_mlp": 1.00810361, + "epoch": 0.33031715015782354, + "flos": 59413582897920.0, + "grad_norm": 0.7924774017818957, + "language_loss": 0.54032993, + "learning_rate": 3.12617271181492e-06, + "loss": 0.56065243, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.03857422, + "router_z_loss_mlp": 0.10644531, + "step": 5494, + "time_per_iteration": 3.0080111026763916 + }, + { + "auxiliary_loss_clip": 0.01066421, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.0096432, + "balance_loss_mlp": 1.02180552, + "epoch": 0.3303772734104915, + "flos": 23183270709120.0, + "grad_norm": 1.6078702539089902, + "language_loss": 0.88068652, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.90174246, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44726562, + "step": 5495, + "time_per_iteration": 2.42714524269104 + }, + { + "auxiliary_loss_clip": 0.01068712, + "auxiliary_loss_mlp": 0.0105223, + "balance_loss_clip": 1.01854157, + "balance_loss_mlp": 1.02165258, + "epoch": 0.33043739666315947, + "flos": 33070533966720.0, + "grad_norm": 2.1179678722908046, + "language_loss": 0.7483533, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.76956272, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.47070312, + "step": 5496, + "time_per_iteration": 2.5375235080718994 + }, + { + "auxiliary_loss_clip": 0.01064277, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.01180291, + "balance_loss_mlp": 1.0195998, + "epoch": 0.33049751991582743, + "flos": 24894399087360.0, + "grad_norm": 1.88307215298916, + "language_loss": 0.73423445, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.75527513, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.44726562, + "step": 5497, + "time_per_iteration": 2.4137399196624756 + }, + { + "auxiliary_loss_clip": 0.01066817, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_clip": 1.0203855, + "balance_loss_mlp": 1.0219686, + "epoch": 0.3305576431684954, + "flos": 29459292687360.0, + "grad_norm": 10.21787146031512, + "language_loss": 0.82659531, + "learning_rate": 3.124884968794321e-06, + "loss": 0.84776092, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44921875, + "step": 5498, + "time_per_iteration": 2.5626227855682373 + }, + { + "auxiliary_loss_clip": 0.01064704, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_clip": 1.01708603, + "balance_loss_mlp": 1.01883054, + "epoch": 0.33061776642116336, + "flos": 22631363936640.0, + "grad_norm": 1.8232880853808997, + "language_loss": 0.77206975, + "learning_rate": 3.12456292636927e-06, + "loss": 0.7932086, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45898438, + "step": 5499, + "time_per_iteration": 2.452155113220215 + }, + { + "auxiliary_loss_clip": 0.01065824, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.01584184, + "balance_loss_mlp": 1.0213263, + "epoch": 0.3306778896738313, + "flos": 25775117844480.0, + "grad_norm": 1.5327099319429727, + "language_loss": 0.80021322, + "learning_rate": 3.124240841300681e-06, + "loss": 0.82130742, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4453125, + "step": 5500, + "time_per_iteration": 2.467546224594116 + }, + { + "auxiliary_loss_clip": 0.01068533, + "auxiliary_loss_mlp": 0.01052879, + "balance_loss_clip": 1.02101469, + "balance_loss_mlp": 1.02160621, + "epoch": 0.33073801292649935, + "flos": 36939050121600.0, + "grad_norm": 3.9929329613767677, + "language_loss": 0.68335807, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.7045722, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46875, + "step": 5501, + "time_per_iteration": 2.5195114612579346 + }, + { + "auxiliary_loss_clip": 0.0106715, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.02014756, + "epoch": 0.3307981361791673, + "flos": 12966951087360.0, + "grad_norm": 2.0132504324614344, + "language_loss": 0.7947762, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.8160094, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46875, + "step": 5502, + "time_per_iteration": 2.4046525955200195 + }, + { + "auxiliary_loss_clip": 0.01068722, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.02225184, + "balance_loss_mlp": 1.02211761, + "epoch": 0.3308582594318353, + "flos": 25373197739520.0, + "grad_norm": 3.6641027725551827, + "language_loss": 0.7379241, + "learning_rate": 3.123274330355824e-06, + "loss": 0.75913733, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.46484375, + "step": 5503, + "time_per_iteration": 2.4225854873657227 + }, + { + "auxiliary_loss_clip": 0.01066458, + "auxiliary_loss_mlp": 0.01047582, + "balance_loss_clip": 1.0182445, + "balance_loss_mlp": 1.02013576, + "epoch": 0.33091838268450324, + "flos": 26467375749120.0, + "grad_norm": 1.5122997215626903, + "language_loss": 0.76359332, + "learning_rate": 3.12295207483523e-06, + "loss": 0.78473377, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.46289062, + "step": 5504, + "time_per_iteration": 2.4794552326202393 + }, + { + "auxiliary_loss_clip": 0.01067863, + "auxiliary_loss_mlp": 0.01054725, + "balance_loss_clip": 1.02468407, + "balance_loss_mlp": 1.02207375, + "epoch": 0.3309785059371712, + "flos": 24970055736960.0, + "grad_norm": 1.7107289746303507, + "language_loss": 0.71400166, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.73522747, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45703125, + "step": 5505, + "time_per_iteration": 2.4640347957611084 + }, + { + "auxiliary_loss_clip": 0.0106794, + "auxiliary_loss_mlp": 0.01058361, + "balance_loss_clip": 1.02841604, + "balance_loss_mlp": 1.02239799, + "epoch": 0.3310386291898392, + "flos": 20445731003520.0, + "grad_norm": 1.7265188124845996, + "language_loss": 0.83167517, + "learning_rate": 3.122307436058899e-06, + "loss": 0.85293818, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.45507812, + "step": 5506, + "time_per_iteration": 2.4464664459228516 + }, + { + "auxiliary_loss_clip": 0.01068137, + "auxiliary_loss_mlp": 0.01050148, + "balance_loss_clip": 1.01862884, + "balance_loss_mlp": 1.02224028, + "epoch": 0.33109875244250714, + "flos": 23181629875200.0, + "grad_norm": 1.822852472188488, + "language_loss": 0.80865204, + "learning_rate": 3.121985052827606e-06, + "loss": 0.82983482, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45898438, + "step": 5507, + "time_per_iteration": 2.405184507369995 + }, + { + "auxiliary_loss_clip": 0.01067925, + "auxiliary_loss_mlp": 0.01052236, + "balance_loss_clip": 1.02140844, + "balance_loss_mlp": 1.02103972, + "epoch": 0.3311588756951751, + "flos": 24167297779200.0, + "grad_norm": 1.5609399838841558, + "language_loss": 0.73063791, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.75183958, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46875, + "step": 5508, + "time_per_iteration": 2.4126219749450684 + }, + { + "auxiliary_loss_clip": 0.01066651, + "auxiliary_loss_mlp": 0.01045151, + "balance_loss_clip": 1.01799536, + "balance_loss_mlp": 1.02215159, + "epoch": 0.33121899894784307, + "flos": 28144533507840.0, + "grad_norm": 1.9528328791651648, + "language_loss": 0.72910577, + "learning_rate": 3.12134015873989e-06, + "loss": 0.75022376, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4453125, + "step": 5509, + "time_per_iteration": 2.4431326389312744 + }, + { + "auxiliary_loss_clip": 0.01067881, + "auxiliary_loss_mlp": 0.01055806, + "balance_loss_clip": 1.02323818, + "balance_loss_mlp": 1.02149141, + "epoch": 0.33127912220051103, + "flos": 29566441249920.0, + "grad_norm": 1.5838832746367593, + "language_loss": 0.75300443, + "learning_rate": 3.121017647907921e-06, + "loss": 0.77424133, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.46289062, + "step": 5510, + "time_per_iteration": 2.460125684738159 + }, + { + "auxiliary_loss_clip": 0.01066979, + "auxiliary_loss_mlp": 0.01044518, + "balance_loss_clip": 1.01597941, + "balance_loss_mlp": 1.0215615, + "epoch": 0.331339245453179, + "flos": 14427961418880.0, + "grad_norm": 2.151046158574119, + "language_loss": 0.88810003, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90921497, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.45507812, + "step": 5511, + "time_per_iteration": 2.4014720916748047 + }, + { + "auxiliary_loss_clip": 0.01061953, + "auxiliary_loss_mlp": 0.01045342, + "balance_loss_clip": 1.01737547, + "balance_loss_mlp": 1.02015519, + "epoch": 0.33139936870584696, + "flos": 20886055470720.0, + "grad_norm": 1.6469094366882815, + "language_loss": 0.75107086, + "learning_rate": 3.12037249872891e-06, + "loss": 0.77214372, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 5512, + "time_per_iteration": 2.413630485534668 + }, + { + "auxiliary_loss_clip": 0.01065175, + "auxiliary_loss_mlp": 0.01047974, + "balance_loss_clip": 1.01845741, + "balance_loss_mlp": 1.02096868, + "epoch": 0.33145949195851493, + "flos": 36282857518080.0, + "grad_norm": 1.7402102366933752, + "language_loss": 0.7423619, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.76349342, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44140625, + "step": 5513, + "time_per_iteration": 3.9858715534210205 + }, + { + "auxiliary_loss_clip": 0.01070656, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.014961, + "balance_loss_mlp": 1.02202606, + "epoch": 0.33151961521118295, + "flos": 14278952269440.0, + "grad_norm": 1.9445672528085716, + "language_loss": 0.70136344, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.72256196, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.48632812, + "step": 5514, + "time_per_iteration": 2.4123313426971436 + }, + { + "auxiliary_loss_clip": 0.01068751, + "auxiliary_loss_mlp": 0.01055762, + "balance_loss_clip": 1.01921237, + "balance_loss_mlp": 1.02175939, + "epoch": 0.3315797384638509, + "flos": 20773356001920.0, + "grad_norm": 1.9624782120564408, + "language_loss": 0.67865241, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.69989753, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.46875, + "step": 5515, + "time_per_iteration": 2.4035611152648926 + }, + { + "auxiliary_loss_clip": 0.01068248, + "auxiliary_loss_mlp": 0.01050317, + "balance_loss_clip": 1.01815462, + "balance_loss_mlp": 1.02127504, + "epoch": 0.3316398617165189, + "flos": 24678356394240.0, + "grad_norm": 1.503308295948925, + "language_loss": 0.7042461, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.72543174, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46875, + "step": 5516, + "time_per_iteration": 3.963435411453247 + }, + { + "auxiliary_loss_clip": 0.01068019, + "auxiliary_loss_mlp": 0.0104742, + "balance_loss_clip": 1.01530528, + "balance_loss_mlp": 1.02098012, + "epoch": 0.33169998496918685, + "flos": 18586989930240.0, + "grad_norm": 2.3380752536182934, + "language_loss": 0.81801927, + "learning_rate": 3.118758882514359e-06, + "loss": 0.83917367, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.46875, + "step": 5517, + "time_per_iteration": 2.3873612880706787 + }, + { + "auxiliary_loss_clip": 0.01063658, + "auxiliary_loss_mlp": 0.01051938, + "balance_loss_clip": 1.02257693, + "balance_loss_mlp": 1.0199821, + "epoch": 0.3317601082218548, + "flos": 20192610579840.0, + "grad_norm": 1.7445561686908913, + "language_loss": 0.75411689, + "learning_rate": 3.118436031952143e-06, + "loss": 0.77527279, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43554688, + "step": 5518, + "time_per_iteration": 3.9123880863189697 + }, + { + "auxiliary_loss_clip": 0.01013896, + "auxiliary_loss_mlp": 0.01003542, + "balance_loss_clip": 1.00018036, + "balance_loss_mlp": 1.00407183, + "epoch": 0.3318202314745228, + "flos": 68971301032320.0, + "grad_norm": 0.6134744398193794, + "language_loss": 0.5438391, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56401348, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.09863281, + "step": 5519, + "time_per_iteration": 3.159447431564331 + }, + { + "auxiliary_loss_clip": 0.01068602, + "auxiliary_loss_mlp": 0.0104616, + "balance_loss_clip": 1.01187575, + "balance_loss_mlp": 1.02180362, + "epoch": 0.33188035472719074, + "flos": 21499235412480.0, + "grad_norm": 2.3831432986703933, + "language_loss": 0.8002184, + "learning_rate": 3.117790203606336e-06, + "loss": 0.82136601, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46679688, + "step": 5520, + "time_per_iteration": 3.846764326095581 + }, + { + "auxiliary_loss_clip": 0.01065797, + "auxiliary_loss_mlp": 0.01044476, + "balance_loss_clip": 1.01505494, + "balance_loss_mlp": 1.0214653, + "epoch": 0.3319404779798587, + "flos": 28869400488960.0, + "grad_norm": 1.755299528246814, + "language_loss": 0.77069938, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.79180211, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44335938, + "step": 5521, + "time_per_iteration": 2.436143159866333 + }, + { + "auxiliary_loss_clip": 0.01067193, + "auxiliary_loss_mlp": 0.01052319, + "balance_loss_clip": 1.01921439, + "balance_loss_mlp": 1.02030063, + "epoch": 0.33200060123252667, + "flos": 23075773032960.0, + "grad_norm": 2.278920552109586, + "language_loss": 0.7170167, + "learning_rate": 3.117144205713664e-06, + "loss": 0.73821181, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46875, + "step": 5522, + "time_per_iteration": 2.4178173542022705 + }, + { + "auxiliary_loss_clip": 0.01066347, + "auxiliary_loss_mlp": 0.01045501, + "balance_loss_clip": 1.01521027, + "balance_loss_mlp": 1.02113008, + "epoch": 0.33206072448519464, + "flos": 21141410221440.0, + "grad_norm": 1.6408733300866396, + "language_loss": 0.74969375, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.77081221, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.453125, + "step": 5523, + "time_per_iteration": 2.3969779014587402 + }, + { + "auxiliary_loss_clip": 0.01063143, + "auxiliary_loss_mlp": 0.01052029, + "balance_loss_clip": 1.02067685, + "balance_loss_mlp": 1.01901472, + "epoch": 0.3321208477378626, + "flos": 13078254101760.0, + "grad_norm": 1.6908955603754228, + "language_loss": 0.82884973, + "learning_rate": 3.116498038372114e-06, + "loss": 0.85000145, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 5524, + "time_per_iteration": 2.405308246612549 + }, + { + "auxiliary_loss_clip": 0.01063999, + "auxiliary_loss_mlp": 0.01046272, + "balance_loss_clip": 1.01638639, + "balance_loss_mlp": 1.01989019, + "epoch": 0.33218097099053057, + "flos": 21214343784960.0, + "grad_norm": 1.6301800347981898, + "language_loss": 0.84321415, + "learning_rate": 3.116174891188636e-06, + "loss": 0.86431688, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.44140625, + "step": 5525, + "time_per_iteration": 2.382605791091919 + }, + { + "auxiliary_loss_clip": 0.0101468, + "auxiliary_loss_mlp": 0.01006241, + "balance_loss_clip": 1.00324929, + "balance_loss_mlp": 1.00471103, + "epoch": 0.33224109424319853, + "flos": 64345483376640.0, + "grad_norm": 0.7671220726632565, + "language_loss": 0.5283705, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54857969, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.02990723, + "router_z_loss_mlp": 0.09960938, + "step": 5526, + "time_per_iteration": 3.0100691318511963 + }, + { + "auxiliary_loss_clip": 0.01068227, + "auxiliary_loss_mlp": 0.0104961, + "balance_loss_clip": 1.01711345, + "balance_loss_mlp": 1.02146459, + "epoch": 0.33230121749586655, + "flos": 17345094491520.0, + "grad_norm": 2.028689111528651, + "language_loss": 0.79429865, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.81547707, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46875, + "step": 5527, + "time_per_iteration": 2.339938163757324 + }, + { + "auxiliary_loss_clip": 0.01067287, + "auxiliary_loss_mlp": 0.01050539, + "balance_loss_clip": 1.02108264, + "balance_loss_mlp": 1.02183199, + "epoch": 0.3323613407485345, + "flos": 20995962030720.0, + "grad_norm": 2.307783155249437, + "language_loss": 0.74100137, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.76217967, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.45507812, + "step": 5528, + "time_per_iteration": 2.4466662406921387 + }, + { + "auxiliary_loss_clip": 0.01065829, + "auxiliary_loss_mlp": 0.01042018, + "balance_loss_clip": 1.01130939, + "balance_loss_mlp": 1.02075148, + "epoch": 0.3324214640012025, + "flos": 13151676424320.0, + "grad_norm": 3.1808135996316844, + "language_loss": 0.85380387, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.8748824, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44921875, + "step": 5529, + "time_per_iteration": 2.359931707382202 + }, + { + "auxiliary_loss_clip": 0.01071208, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.01701844, + "balance_loss_mlp": 1.02240777, + "epoch": 0.33248158725387045, + "flos": 22272421582080.0, + "grad_norm": 1.8985316959994523, + "language_loss": 0.71406293, + "learning_rate": 3.114558520634423e-06, + "loss": 0.73527336, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48828125, + "step": 5530, + "time_per_iteration": 2.4435713291168213 + }, + { + "auxiliary_loss_clip": 0.01069353, + "auxiliary_loss_mlp": 0.01060694, + "balance_loss_clip": 1.02609909, + "balance_loss_mlp": 1.02106833, + "epoch": 0.3325417105065384, + "flos": 20739943964160.0, + "grad_norm": 2.6105215786288465, + "language_loss": 0.78959298, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.81089348, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.48242188, + "step": 5531, + "time_per_iteration": 2.3819162845611572 + }, + { + "auxiliary_loss_clip": 0.01068696, + "auxiliary_loss_mlp": 0.01056592, + "balance_loss_clip": 1.02225995, + "balance_loss_mlp": 1.02109945, + "epoch": 0.3326018337592064, + "flos": 24789380117760.0, + "grad_norm": 1.874893765084891, + "language_loss": 0.7516399, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.77289283, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47460938, + "step": 5532, + "time_per_iteration": 2.4730160236358643 + }, + { + "auxiliary_loss_clip": 0.01067484, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.01348662, + "balance_loss_mlp": 1.02159595, + "epoch": 0.33266195701187434, + "flos": 14500825159680.0, + "grad_norm": 1.9488794901658864, + "language_loss": 0.67895961, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.70008105, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45703125, + "step": 5533, + "time_per_iteration": 2.392252206802368 + }, + { + "auxiliary_loss_clip": 0.01066154, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.01526546, + "balance_loss_mlp": 1.01999915, + "epoch": 0.3327220802645423, + "flos": 15303513294720.0, + "grad_norm": 1.7105923164397687, + "language_loss": 0.72640055, + "learning_rate": 3.113264663362451e-06, + "loss": 0.74753875, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4609375, + "step": 5534, + "time_per_iteration": 2.3850626945495605 + }, + { + "auxiliary_loss_clip": 0.0106673, + "auxiliary_loss_mlp": 0.01049427, + "balance_loss_clip": 1.01881421, + "balance_loss_mlp": 1.02165818, + "epoch": 0.3327822035172103, + "flos": 23476401417600.0, + "grad_norm": 1.5040667491867912, + "language_loss": 0.68271017, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.70387179, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45117188, + "step": 5535, + "time_per_iteration": 2.42374849319458 + }, + { + "auxiliary_loss_clip": 0.01067112, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.0211513, + "balance_loss_mlp": 1.02041352, + "epoch": 0.33284232676987824, + "flos": 25373337384960.0, + "grad_norm": 2.414191757133029, + "language_loss": 0.74202591, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.76324612, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46679688, + "step": 5536, + "time_per_iteration": 2.4249255657196045 + }, + { + "auxiliary_loss_clip": 0.01067003, + "auxiliary_loss_mlp": 0.01053325, + "balance_loss_clip": 1.01997042, + "balance_loss_mlp": 1.01956081, + "epoch": 0.3329024500225462, + "flos": 23693281983360.0, + "grad_norm": 1.5487821894580853, + "language_loss": 0.8231827, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84438598, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.47265625, + "step": 5537, + "time_per_iteration": 2.4379494190216064 + }, + { + "auxiliary_loss_clip": 0.01071755, + "auxiliary_loss_mlp": 0.0106032, + "balance_loss_clip": 1.02424753, + "balance_loss_mlp": 1.02279043, + "epoch": 0.33296257327521417, + "flos": 31721804167680.0, + "grad_norm": 1.7360288970815656, + "language_loss": 0.72808194, + "learning_rate": 3.111970130648789e-06, + "loss": 0.7494027, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48828125, + "step": 5538, + "time_per_iteration": 2.4575870037078857 + }, + { + "auxiliary_loss_clip": 0.01066021, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_clip": 1.01536655, + "balance_loss_mlp": 1.02033854, + "epoch": 0.33302269652788213, + "flos": 22743679380480.0, + "grad_norm": 1.7502021943892294, + "language_loss": 0.76000631, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.78114104, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45703125, + "step": 5539, + "time_per_iteration": 2.4459168910980225 + }, + { + "auxiliary_loss_clip": 0.01069861, + "auxiliary_loss_mlp": 0.0106094, + "balance_loss_clip": 1.02417541, + "balance_loss_mlp": 1.02080703, + "epoch": 0.33308281978055015, + "flos": 11472947654400.0, + "grad_norm": 2.1327551749953875, + "language_loss": 0.7253201, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.74662811, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.4921875, + "step": 5540, + "time_per_iteration": 2.362872838973999 + }, + { + "auxiliary_loss_clip": 0.01065538, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_clip": 1.01383877, + "balance_loss_mlp": 1.01914048, + "epoch": 0.3331429430332181, + "flos": 38212262916480.0, + "grad_norm": 1.95670488799209, + "language_loss": 0.61848658, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.63959789, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46484375, + "step": 5541, + "time_per_iteration": 2.5486080646514893 + }, + { + "auxiliary_loss_clip": 0.01067634, + "auxiliary_loss_mlp": 0.01047786, + "balance_loss_clip": 1.01497912, + "balance_loss_mlp": 1.01954246, + "epoch": 0.3332030662858861, + "flos": 22527566864640.0, + "grad_norm": 1.6650601468472461, + "language_loss": 0.70365632, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.72481048, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5542, + "time_per_iteration": 2.3815624713897705 + }, + { + "auxiliary_loss_clip": 0.01068059, + "auxiliary_loss_mlp": 0.01052371, + "balance_loss_clip": 1.01989865, + "balance_loss_mlp": 1.02084637, + "epoch": 0.33326318953855405, + "flos": 15996853451520.0, + "grad_norm": 1.6876551651327367, + "language_loss": 0.76841801, + "learning_rate": 3.110351016113414e-06, + "loss": 0.78962231, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47265625, + "step": 5543, + "time_per_iteration": 2.383723735809326 + }, + { + "auxiliary_loss_clip": 0.01070056, + "auxiliary_loss_mlp": 0.01045169, + "balance_loss_clip": 1.01231503, + "balance_loss_mlp": 1.02211916, + "epoch": 0.333323312791222, + "flos": 25592347543680.0, + "grad_norm": 1.6894072031269822, + "language_loss": 0.76614809, + "learning_rate": 3.110027066843348e-06, + "loss": 0.78730023, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5544, + "time_per_iteration": 2.419053792953491 + }, + { + "auxiliary_loss_clip": 0.01064103, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.01490521, + "balance_loss_mlp": 1.01936388, + "epoch": 0.33338343604389, + "flos": 25118366659200.0, + "grad_norm": 1.7375543028142386, + "language_loss": 0.72078669, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.74190122, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44726562, + "step": 5545, + "time_per_iteration": 2.4689018726348877 + }, + { + "auxiliary_loss_clip": 0.0106316, + "auxiliary_loss_mlp": 0.01050276, + "balance_loss_clip": 1.01954389, + "balance_loss_mlp": 1.01910865, + "epoch": 0.33344355929655795, + "flos": 16946316408960.0, + "grad_norm": 1.6401444080112273, + "language_loss": 0.70850277, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.72963715, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44140625, + "step": 5546, + "time_per_iteration": 2.3480796813964844 + }, + { + "auxiliary_loss_clip": 0.01068035, + "auxiliary_loss_mlp": 0.01050286, + "balance_loss_clip": 1.01748013, + "balance_loss_mlp": 1.01914144, + "epoch": 0.3335036825492259, + "flos": 27888410707200.0, + "grad_norm": 1.8541341515863892, + "language_loss": 0.66176319, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.68294644, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.49023438, + "step": 5547, + "time_per_iteration": 2.4429309368133545 + }, + { + "auxiliary_loss_clip": 0.01064187, + "auxiliary_loss_mlp": 0.01044231, + "balance_loss_clip": 1.01527524, + "balance_loss_mlp": 1.01977515, + "epoch": 0.3335638058018939, + "flos": 16178646234240.0, + "grad_norm": 2.3799267170328116, + "language_loss": 0.86656547, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.88764966, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4453125, + "step": 5548, + "time_per_iteration": 2.3469579219818115 + }, + { + "auxiliary_loss_clip": 0.01066262, + "auxiliary_loss_mlp": 0.01048664, + "balance_loss_clip": 1.014642, + "balance_loss_mlp": 1.01813841, + "epoch": 0.33362392905456184, + "flos": 39894517733760.0, + "grad_norm": 3.0900609617376693, + "language_loss": 0.76219141, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.78334063, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.48046875, + "step": 5549, + "time_per_iteration": 2.597487449645996 + }, + { + "auxiliary_loss_clip": 0.01067226, + "auxiliary_loss_mlp": 0.01050627, + "balance_loss_clip": 1.01746321, + "balance_loss_mlp": 1.0211978, + "epoch": 0.3336840523072298, + "flos": 44269588938240.0, + "grad_norm": 1.9986789310884376, + "language_loss": 0.70765281, + "learning_rate": 3.108082487713921e-06, + "loss": 0.72883129, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4609375, + "step": 5550, + "time_per_iteration": 2.556265354156494 + }, + { + "auxiliary_loss_clip": 0.01066262, + "auxiliary_loss_mlp": 0.0104822, + "balance_loss_clip": 1.01527071, + "balance_loss_mlp": 1.01987147, + "epoch": 0.33374417555989777, + "flos": 15084782426880.0, + "grad_norm": 2.1305881467545884, + "language_loss": 0.62159312, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.64273798, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46484375, + "step": 5551, + "time_per_iteration": 2.370605707168579 + }, + { + "auxiliary_loss_clip": 0.01063264, + "auxiliary_loss_mlp": 0.01045457, + "balance_loss_clip": 1.01412892, + "balance_loss_mlp": 1.018139, + "epoch": 0.33380429881256574, + "flos": 15848333061120.0, + "grad_norm": 1.7851044608370739, + "language_loss": 0.71472001, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.73580718, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45117188, + "step": 5552, + "time_per_iteration": 2.346456527709961 + }, + { + "auxiliary_loss_clip": 0.01067, + "auxiliary_loss_mlp": 0.01049714, + "balance_loss_clip": 1.01793289, + "balance_loss_mlp": 1.02008259, + "epoch": 0.33386442206523376, + "flos": 13479475979520.0, + "grad_norm": 1.9740342489108995, + "language_loss": 0.85231888, + "learning_rate": 3.107109630732192e-06, + "loss": 0.87348604, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46875, + "step": 5553, + "time_per_iteration": 3.800049304962158 + }, + { + "auxiliary_loss_clip": 0.0106815, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.01955664, + "balance_loss_mlp": 1.02085578, + "epoch": 0.3339245453179017, + "flos": 16689739760640.0, + "grad_norm": 2.152275254503383, + "language_loss": 0.82703221, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.84823561, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47265625, + "step": 5554, + "time_per_iteration": 2.3561229705810547 + }, + { + "auxiliary_loss_clip": 0.0106671, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.01826715, + "balance_loss_mlp": 1.01989329, + "epoch": 0.3339846685705697, + "flos": 24609402725760.0, + "grad_norm": 1.5597031850965786, + "language_loss": 0.83123964, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.85243773, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.46875, + "step": 5555, + "time_per_iteration": 3.8779356479644775 + }, + { + "auxiliary_loss_clip": 0.01065097, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.02067995, + "balance_loss_mlp": 1.01899779, + "epoch": 0.33404479182323765, + "flos": 30952562981760.0, + "grad_norm": 1.844870268208161, + "language_loss": 0.75452471, + "learning_rate": 3.106136395915099e-06, + "loss": 0.77570069, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4609375, + "step": 5556, + "time_per_iteration": 2.508586883544922 + }, + { + "auxiliary_loss_clip": 0.01064384, + "auxiliary_loss_mlp": 0.01052706, + "balance_loss_clip": 1.01999497, + "balance_loss_mlp": 1.01947927, + "epoch": 0.3341049150759056, + "flos": 23512187427840.0, + "grad_norm": 1.4398053396018582, + "language_loss": 0.83800936, + "learning_rate": 3.105811900403391e-06, + "loss": 0.85918033, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44921875, + "step": 5557, + "time_per_iteration": 3.798022508621216 + }, + { + "auxiliary_loss_clip": 0.01067183, + "auxiliary_loss_mlp": 0.01055834, + "balance_loss_clip": 1.02309871, + "balance_loss_mlp": 1.02065444, + "epoch": 0.3341650383285736, + "flos": 24025620015360.0, + "grad_norm": 1.6513518840894525, + "language_loss": 0.81562614, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.83685625, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46484375, + "step": 5558, + "time_per_iteration": 2.4696714878082275 + }, + { + "auxiliary_loss_clip": 0.01067048, + "auxiliary_loss_mlp": 0.01047962, + "balance_loss_clip": 1.01508427, + "balance_loss_mlp": 1.02057874, + "epoch": 0.33422516158124155, + "flos": 24900752954880.0, + "grad_norm": 1.877649582523763, + "language_loss": 0.82343793, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84458804, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46484375, + "step": 5559, + "time_per_iteration": 3.8448479175567627 + }, + { + "auxiliary_loss_clip": 0.01065012, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.01452065, + "balance_loss_mlp": 1.01985836, + "epoch": 0.3342852848339095, + "flos": 18332403229440.0, + "grad_norm": 1.7383749147130878, + "language_loss": 0.72301978, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74412507, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45117188, + "step": 5560, + "time_per_iteration": 2.348094940185547 + }, + { + "auxiliary_loss_clip": 0.01069369, + "auxiliary_loss_mlp": 0.01059877, + "balance_loss_clip": 1.02549708, + "balance_loss_mlp": 1.02116024, + "epoch": 0.3343454080865775, + "flos": 30045170079360.0, + "grad_norm": 1.4026019237757872, + "language_loss": 0.76227593, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.78356838, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48242188, + "step": 5561, + "time_per_iteration": 2.4864394664764404 + }, + { + "auxiliary_loss_clip": 0.01065767, + "auxiliary_loss_mlp": 0.01053471, + "balance_loss_clip": 1.02064061, + "balance_loss_mlp": 1.01939368, + "epoch": 0.33440553133924544, + "flos": 16397900772480.0, + "grad_norm": 1.978325194487784, + "language_loss": 0.70770085, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.72889322, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46289062, + "step": 5562, + "time_per_iteration": 2.357445478439331 + }, + { + "auxiliary_loss_clip": 0.01063417, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.01887655, + "balance_loss_mlp": 1.01929855, + "epoch": 0.3344656545919134, + "flos": 24240964481280.0, + "grad_norm": 1.5967967096923024, + "language_loss": 0.66440094, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.68552661, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44140625, + "step": 5563, + "time_per_iteration": 2.4313278198242188 + }, + { + "auxiliary_loss_clip": 0.01070596, + "auxiliary_loss_mlp": 0.01062736, + "balance_loss_clip": 1.02244306, + "balance_loss_mlp": 1.02152395, + "epoch": 0.3345257778445814, + "flos": 52116911832960.0, + "grad_norm": 1.4314163691201478, + "language_loss": 0.7496143, + "learning_rate": 3.103539258400766e-06, + "loss": 0.77094758, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.49023438, + "step": 5564, + "time_per_iteration": 2.6400632858276367 + }, + { + "auxiliary_loss_clip": 0.01012921, + "auxiliary_loss_mlp": 0.01017718, + "balance_loss_clip": 1.01426136, + "balance_loss_mlp": 1.00325263, + "epoch": 0.33458590109724934, + "flos": 68045614577280.0, + "grad_norm": 0.7912708442023627, + "language_loss": 0.55605054, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57635689, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.09667969, + "step": 5565, + "time_per_iteration": 2.9798686504364014 + }, + { + "auxiliary_loss_clip": 0.01065345, + "auxiliary_loss_mlp": 0.01047019, + "balance_loss_clip": 1.01564276, + "balance_loss_mlp": 1.02047253, + "epoch": 0.3346460243499173, + "flos": 37413275385600.0, + "grad_norm": 1.720286915989064, + "language_loss": 0.66047359, + "learning_rate": 3.102889555312721e-06, + "loss": 0.68159723, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44921875, + "step": 5566, + "time_per_iteration": 2.4966275691986084 + }, + { + "auxiliary_loss_clip": 0.01067502, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_clip": 1.02055478, + "balance_loss_mlp": 1.02218676, + "epoch": 0.3347061476025853, + "flos": 18696372819840.0, + "grad_norm": 1.8465678724378116, + "language_loss": 0.79194248, + "learning_rate": 3.102564641030016e-06, + "loss": 0.81316686, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.453125, + "step": 5567, + "time_per_iteration": 2.379993438720703 + }, + { + "auxiliary_loss_clip": 0.01068202, + "auxiliary_loss_mlp": 0.01054158, + "balance_loss_clip": 1.0196588, + "balance_loss_mlp": 1.02079844, + "epoch": 0.3347662708552533, + "flos": 13916972626560.0, + "grad_norm": 1.6636252602609993, + "language_loss": 0.78767353, + "learning_rate": 3.102239684937949e-06, + "loss": 0.80889714, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47460938, + "step": 5568, + "time_per_iteration": 2.367408275604248 + }, + { + "auxiliary_loss_clip": 0.01069453, + "auxiliary_loss_mlp": 0.01054087, + "balance_loss_clip": 1.01975489, + "balance_loss_mlp": 1.02062917, + "epoch": 0.33482639410792125, + "flos": 19749528115200.0, + "grad_norm": 2.410536661939314, + "language_loss": 0.72736979, + "learning_rate": 3.101914687048842e-06, + "loss": 0.74860513, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48828125, + "step": 5569, + "time_per_iteration": 2.3954734802246094 + }, + { + "auxiliary_loss_clip": 0.01068192, + "auxiliary_loss_mlp": 0.01054216, + "balance_loss_clip": 1.01785684, + "balance_loss_mlp": 1.01997232, + "epoch": 0.3348865173605892, + "flos": 16102186623360.0, + "grad_norm": 2.0483152476769706, + "language_loss": 0.91174716, + "learning_rate": 3.10158964737502e-06, + "loss": 0.93297124, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.48242188, + "step": 5570, + "time_per_iteration": 2.350863456726074 + }, + { + "auxiliary_loss_clip": 0.01067499, + "auxiliary_loss_mlp": 0.0105263, + "balance_loss_clip": 1.01839328, + "balance_loss_mlp": 1.02037358, + "epoch": 0.3349466406132572, + "flos": 25007796783360.0, + "grad_norm": 1.5514983945285732, + "language_loss": 0.81122887, + "learning_rate": 3.101264565928808e-06, + "loss": 0.83243018, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47070312, + "step": 5571, + "time_per_iteration": 2.4733152389526367 + }, + { + "auxiliary_loss_clip": 0.01013143, + "auxiliary_loss_mlp": 0.0101139, + "balance_loss_clip": 1.00762308, + "balance_loss_mlp": 1.00299728, + "epoch": 0.33500676386592515, + "flos": 54316647089280.0, + "grad_norm": 0.9013404278223167, + "language_loss": 0.56114805, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.58139336, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.03759766, + "router_z_loss_mlp": 0.1015625, + "step": 5572, + "time_per_iteration": 2.9561569690704346 + }, + { + "auxiliary_loss_clip": 0.0106837, + "auxiliary_loss_mlp": 0.01059071, + "balance_loss_clip": 1.02526331, + "balance_loss_mlp": 1.02196646, + "epoch": 0.3350668871185931, + "flos": 26796117911040.0, + "grad_norm": 1.9449858012547798, + "language_loss": 0.79893881, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.8202132, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46289062, + "step": 5573, + "time_per_iteration": 2.4576141834259033 + }, + { + "auxiliary_loss_clip": 0.01071479, + "auxiliary_loss_mlp": 0.01066208, + "balance_loss_clip": 1.02784634, + "balance_loss_mlp": 1.02226162, + "epoch": 0.3351270103712611, + "flos": 33509112865920.0, + "grad_norm": 2.3134797752167184, + "language_loss": 0.74609244, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.76746935, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.4921875, + "step": 5574, + "time_per_iteration": 2.471407651901245 + }, + { + "auxiliary_loss_clip": 0.01065267, + "auxiliary_loss_mlp": 0.01056802, + "balance_loss_clip": 1.02516413, + "balance_loss_mlp": 1.02083969, + "epoch": 0.33518713362392905, + "flos": 26505012061440.0, + "grad_norm": 1.5849993303176617, + "language_loss": 0.89587009, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.91709083, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44335938, + "step": 5575, + "time_per_iteration": 2.4425432682037354 + }, + { + "auxiliary_loss_clip": 0.01072843, + "auxiliary_loss_mlp": 0.01062257, + "balance_loss_clip": 1.02325165, + "balance_loss_mlp": 1.02178812, + "epoch": 0.335247256876597, + "flos": 17231557150080.0, + "grad_norm": 2.1406495349784174, + "language_loss": 0.8409152, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.86226618, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.51171875, + "step": 5576, + "time_per_iteration": 2.3642022609710693 + }, + { + "auxiliary_loss_clip": 0.01072025, + "auxiliary_loss_mlp": 0.01054056, + "balance_loss_clip": 1.01621926, + "balance_loss_mlp": 1.0221715, + "epoch": 0.335307380129265, + "flos": 25628203376640.0, + "grad_norm": 2.2366104411912704, + "language_loss": 0.74545348, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.76671433, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5, + "step": 5577, + "time_per_iteration": 2.439854860305786 + }, + { + "auxiliary_loss_clip": 0.01071105, + "auxiliary_loss_mlp": 0.01061037, + "balance_loss_clip": 1.02515507, + "balance_loss_mlp": 1.0225606, + "epoch": 0.33536750338193294, + "flos": 19679143080960.0, + "grad_norm": 1.6990164250579898, + "language_loss": 0.82839006, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.84971148, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48632812, + "step": 5578, + "time_per_iteration": 2.382715940475464 + }, + { + "auxiliary_loss_clip": 0.01065315, + "auxiliary_loss_mlp": 0.01056542, + "balance_loss_clip": 1.02230477, + "balance_loss_mlp": 1.0210346, + "epoch": 0.3354276266346009, + "flos": 18331635179520.0, + "grad_norm": 1.8691328426394411, + "language_loss": 0.72553027, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.74674881, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.44335938, + "step": 5579, + "time_per_iteration": 2.393550157546997 + }, + { + "auxiliary_loss_clip": 0.01070876, + "auxiliary_loss_mlp": 0.01058872, + "balance_loss_clip": 1.02184534, + "balance_loss_mlp": 1.02159524, + "epoch": 0.3354877498872689, + "flos": 17857584472320.0, + "grad_norm": 1.945729865758927, + "language_loss": 0.82978308, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.85108054, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.4921875, + "step": 5580, + "time_per_iteration": 2.373728036880493 + }, + { + "auxiliary_loss_clip": 0.01070101, + "auxiliary_loss_mlp": 0.0106308, + "balance_loss_clip": 1.02479017, + "balance_loss_mlp": 1.02109623, + "epoch": 0.3355478731399369, + "flos": 24716586199680.0, + "grad_norm": 1.50146117954155, + "language_loss": 0.78759801, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.8089298, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.48828125, + "step": 5581, + "time_per_iteration": 2.4463448524475098 + }, + { + "auxiliary_loss_clip": 0.01072286, + "auxiliary_loss_mlp": 0.01068025, + "balance_loss_clip": 1.02763629, + "balance_loss_mlp": 1.02129281, + "epoch": 0.33560799639260486, + "flos": 16872928997760.0, + "grad_norm": 2.316754631207495, + "language_loss": 0.75798225, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.77938539, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 0.51171875, + "step": 5582, + "time_per_iteration": 2.3552191257476807 + }, + { + "auxiliary_loss_clip": 0.01069807, + "auxiliary_loss_mlp": 0.01068062, + "balance_loss_clip": 1.0304637, + "balance_loss_mlp": 1.02032948, + "epoch": 0.3356681196452728, + "flos": 18332507963520.0, + "grad_norm": 1.6817101311945357, + "language_loss": 0.84439367, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.86577237, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.49414062, + "step": 5583, + "time_per_iteration": 2.3987197875976562 + }, + { + "auxiliary_loss_clip": 0.01066963, + "auxiliary_loss_mlp": 0.01072437, + "balance_loss_clip": 1.03806901, + "balance_loss_mlp": 1.02014875, + "epoch": 0.3357282428979408, + "flos": 34749192913920.0, + "grad_norm": 1.6573257988027668, + "language_loss": 0.78526652, + "learning_rate": 3.097034711451581e-06, + "loss": 0.80666053, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46679688, + "step": 5584, + "time_per_iteration": 2.505084991455078 + }, + { + "auxiliary_loss_clip": 0.0106953, + "auxiliary_loss_mlp": 0.01059566, + "balance_loss_clip": 1.02563953, + "balance_loss_mlp": 1.02039647, + "epoch": 0.33578836615060875, + "flos": 21579011602560.0, + "grad_norm": 1.564355042238551, + "language_loss": 0.77799487, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.79928583, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4921875, + "step": 5585, + "time_per_iteration": 2.404996395111084 + }, + { + "auxiliary_loss_clip": 0.01064999, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_clip": 1.0217731, + "balance_loss_mlp": 1.02014709, + "epoch": 0.3358484894032767, + "flos": 24529277422080.0, + "grad_norm": 1.5912428176731959, + "language_loss": 0.78817058, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.80939686, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.44921875, + "step": 5586, + "time_per_iteration": 2.408074140548706 + }, + { + "auxiliary_loss_clip": 0.01075294, + "auxiliary_loss_mlp": 0.01063938, + "balance_loss_clip": 1.02447999, + "balance_loss_mlp": 1.02501559, + "epoch": 0.3359086126559447, + "flos": 22454493655680.0, + "grad_norm": 1.793191053025723, + "language_loss": 0.82842308, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.84981537, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.50390625, + "step": 5587, + "time_per_iteration": 2.423187494277954 + }, + { + "auxiliary_loss_clip": 0.01065544, + "auxiliary_loss_mlp": 0.01044502, + "balance_loss_clip": 1.01472366, + "balance_loss_mlp": 1.02197158, + "epoch": 0.33596873590861265, + "flos": 16542790381440.0, + "grad_norm": 1.8442974788881963, + "language_loss": 0.69347405, + "learning_rate": 3.095731802118677e-06, + "loss": 0.71457446, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4375, + "step": 5588, + "time_per_iteration": 2.354553461074829 + }, + { + "auxiliary_loss_clip": 0.01069308, + "auxiliary_loss_mlp": 0.01051025, + "balance_loss_clip": 1.0170505, + "balance_loss_mlp": 1.02263761, + "epoch": 0.3360288591612806, + "flos": 31174470783360.0, + "grad_norm": 1.8814749055754945, + "language_loss": 0.71817869, + "learning_rate": 3.095405970878919e-06, + "loss": 0.73938203, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.46679688, + "step": 5589, + "time_per_iteration": 2.5201454162597656 + }, + { + "auxiliary_loss_clip": 0.01069013, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_clip": 1.01462543, + "balance_loss_mlp": 1.02132547, + "epoch": 0.3360889824139486, + "flos": 23695760689920.0, + "grad_norm": 1.6048324799078835, + "language_loss": 0.68367332, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.70484805, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.4765625, + "step": 5590, + "time_per_iteration": 2.4083657264709473 + }, + { + "auxiliary_loss_clip": 0.0106836, + "auxiliary_loss_mlp": 0.01057016, + "balance_loss_clip": 1.02411413, + "balance_loss_mlp": 1.02254176, + "epoch": 0.33614910566661654, + "flos": 19317093615360.0, + "grad_norm": 1.9948221248738829, + "language_loss": 0.76425999, + "learning_rate": 3.094754183798047e-06, + "loss": 0.78551376, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45703125, + "step": 5591, + "time_per_iteration": 2.4222261905670166 + }, + { + "auxiliary_loss_clip": 0.01066895, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.01583624, + "balance_loss_mlp": 1.02092147, + "epoch": 0.3362092289192845, + "flos": 16471323095040.0, + "grad_norm": 2.137944518183399, + "language_loss": 0.71465892, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.73581517, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.45898438, + "step": 5592, + "time_per_iteration": 3.776217460632324 + }, + { + "auxiliary_loss_clip": 0.0106547, + "auxiliary_loss_mlp": 0.01046484, + "balance_loss_clip": 1.01634765, + "balance_loss_mlp": 1.02133238, + "epoch": 0.33626935217195253, + "flos": 24242430758400.0, + "grad_norm": 2.085771832003528, + "language_loss": 0.78196812, + "learning_rate": 3.094102230664423e-06, + "loss": 0.80308765, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44140625, + "step": 5593, + "time_per_iteration": 2.452629804611206 + }, + { + "auxiliary_loss_clip": 0.01068651, + "auxiliary_loss_mlp": 0.01056807, + "balance_loss_clip": 1.01911354, + "balance_loss_mlp": 1.02012658, + "epoch": 0.3363294754246205, + "flos": 19717756911360.0, + "grad_norm": 1.932849782080291, + "language_loss": 0.7406249, + "learning_rate": 3.093776191858731e-06, + "loss": 0.76187944, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.48632812, + "step": 5594, + "time_per_iteration": 2.367388963699341 + }, + { + "auxiliary_loss_clip": 0.01070265, + "auxiliary_loss_mlp": 0.01056695, + "balance_loss_clip": 1.02314973, + "balance_loss_mlp": 1.02195191, + "epoch": 0.33638959867728846, + "flos": 22595333546880.0, + "grad_norm": 1.7973973089832094, + "language_loss": 0.81084824, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.8321178, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.484375, + "step": 5595, + "time_per_iteration": 3.814136028289795 + }, + { + "auxiliary_loss_clip": 0.01067146, + "auxiliary_loss_mlp": 0.01049942, + "balance_loss_clip": 1.0208075, + "balance_loss_mlp": 1.02215672, + "epoch": 0.3364497219299564, + "flos": 20993727703680.0, + "grad_norm": 1.5575724463033636, + "language_loss": 0.82913584, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.85030675, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.45117188, + "step": 5596, + "time_per_iteration": 2.4350147247314453 + }, + { + "auxiliary_loss_clip": 0.01067292, + "auxiliary_loss_mlp": 0.01052038, + "balance_loss_clip": 1.02178288, + "balance_loss_mlp": 1.02142727, + "epoch": 0.3365098451826244, + "flos": 25227435346560.0, + "grad_norm": 1.6620991452001994, + "language_loss": 0.7710458, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.79223913, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.45703125, + "step": 5597, + "time_per_iteration": 3.790078639984131 + }, + { + "auxiliary_loss_clip": 0.01063078, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.01506233, + "balance_loss_mlp": 1.01916647, + "epoch": 0.33656996843529235, + "flos": 24570544515840.0, + "grad_norm": 1.7724394713429623, + "language_loss": 0.80304623, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.82410693, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.43945312, + "step": 5598, + "time_per_iteration": 2.442047357559204 + }, + { + "auxiliary_loss_clip": 0.01070528, + "auxiliary_loss_mlp": 0.01054346, + "balance_loss_clip": 1.01906013, + "balance_loss_mlp": 1.02067888, + "epoch": 0.3366300916879603, + "flos": 44089436989440.0, + "grad_norm": 1.8238151171857462, + "language_loss": 0.66568685, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.68693566, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.49609375, + "step": 5599, + "time_per_iteration": 4.015202283859253 + }, + { + "auxiliary_loss_clip": 0.01071841, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_clip": 1.02220416, + "balance_loss_mlp": 1.02139378, + "epoch": 0.3366902149406283, + "flos": 13879057023360.0, + "grad_norm": 2.649585378524128, + "language_loss": 0.83834624, + "learning_rate": 3.091819088459249e-06, + "loss": 0.85968101, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.50390625, + "step": 5600, + "time_per_iteration": 2.3532330989837646 + }, + { + "auxiliary_loss_clip": 0.01067335, + "auxiliary_loss_mlp": 0.0106036, + "balance_loss_clip": 1.02268982, + "balance_loss_mlp": 1.01973832, + "epoch": 0.33675033819329625, + "flos": 16252173290880.0, + "grad_norm": 2.2368321115076104, + "language_loss": 0.85485125, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.8761282, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.4765625, + "step": 5601, + "time_per_iteration": 2.361980438232422 + }, + { + "auxiliary_loss_clip": 0.01064328, + "auxiliary_loss_mlp": 0.01041418, + "balance_loss_clip": 1.01469076, + "balance_loss_mlp": 1.02177763, + "epoch": 0.3368104614459642, + "flos": 17054861425920.0, + "grad_norm": 1.5130828270610155, + "language_loss": 0.84979594, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.8708533, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.42578125, + "step": 5602, + "time_per_iteration": 2.3786814212799072 + }, + { + "auxiliary_loss_clip": 0.0106685, + "auxiliary_loss_mlp": 0.01047838, + "balance_loss_clip": 1.01656997, + "balance_loss_mlp": 1.02177846, + "epoch": 0.3368705846986322, + "flos": 17857654295040.0, + "grad_norm": 1.8340634698263256, + "language_loss": 0.71117705, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.73232388, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45117188, + "step": 5603, + "time_per_iteration": 2.359121322631836 + }, + { + "auxiliary_loss_clip": 0.01068584, + "auxiliary_loss_mlp": 0.01047422, + "balance_loss_clip": 1.01734543, + "balance_loss_mlp": 1.02123737, + "epoch": 0.33693070795130015, + "flos": 22928404717440.0, + "grad_norm": 1.666676119656041, + "language_loss": 0.84509242, + "learning_rate": 3.090513524656898e-06, + "loss": 0.86625254, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.47265625, + "step": 5604, + "time_per_iteration": 2.4440155029296875 + }, + { + "auxiliary_loss_clip": 0.0106661, + "auxiliary_loss_mlp": 0.01045017, + "balance_loss_clip": 1.01367652, + "balance_loss_mlp": 1.02001548, + "epoch": 0.3369908312039681, + "flos": 22016368604160.0, + "grad_norm": 1.4998229855372456, + "language_loss": 0.74902987, + "learning_rate": 3.090187030294409e-06, + "loss": 0.77014619, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46679688, + "step": 5605, + "time_per_iteration": 2.385272741317749 + }, + { + "auxiliary_loss_clip": 0.01068266, + "auxiliary_loss_mlp": 0.01056058, + "balance_loss_clip": 1.02242887, + "balance_loss_mlp": 1.02030778, + "epoch": 0.33705095445663613, + "flos": 11801166145920.0, + "grad_norm": 3.209363442309294, + "language_loss": 0.86183006, + "learning_rate": 3.089860494591919e-06, + "loss": 0.88307327, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.48046875, + "step": 5606, + "time_per_iteration": 2.375382423400879 + }, + { + "auxiliary_loss_clip": 0.01064294, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.01633382, + "balance_loss_mlp": 1.01952648, + "epoch": 0.3371110777093041, + "flos": 25045223627520.0, + "grad_norm": 1.4629039632044774, + "language_loss": 0.69309455, + "learning_rate": 3.089533917561809e-06, + "loss": 0.71420217, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44726562, + "step": 5607, + "time_per_iteration": 2.403574228286743 + }, + { + "auxiliary_loss_clip": 0.01068252, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.01223803, + "balance_loss_mlp": 1.02122915, + "epoch": 0.33717120096197206, + "flos": 26577805979520.0, + "grad_norm": 1.7680212763639795, + "language_loss": 0.72287518, + "learning_rate": 3.089207299216464e-06, + "loss": 0.74398655, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.47070312, + "step": 5608, + "time_per_iteration": 2.4223086833953857 + }, + { + "auxiliary_loss_clip": 0.01067747, + "auxiliary_loss_mlp": 0.01046577, + "balance_loss_clip": 1.01710856, + "balance_loss_mlp": 1.02257061, + "epoch": 0.33723132421464, + "flos": 15157646167680.0, + "grad_norm": 1.8903261776455096, + "language_loss": 0.80904281, + "learning_rate": 3.088880639568269e-06, + "loss": 0.83018601, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.45117188, + "step": 5609, + "time_per_iteration": 2.395296096801758 + }, + { + "auxiliary_loss_clip": 0.01066684, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.02086902, + "balance_loss_mlp": 1.02178288, + "epoch": 0.337291447467308, + "flos": 23435099412480.0, + "grad_norm": 1.61559855405847, + "language_loss": 0.83324283, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.8544246, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44921875, + "step": 5610, + "time_per_iteration": 2.4401659965515137 + }, + { + "auxiliary_loss_clip": 0.01064321, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.01770651, + "balance_loss_mlp": 1.0214057, + "epoch": 0.33735157071997596, + "flos": 17237212790400.0, + "grad_norm": 1.7756487139104977, + "language_loss": 0.83714342, + "learning_rate": 3.088227196412879e-06, + "loss": 0.8582654, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42773438, + "step": 5611, + "time_per_iteration": 2.3652403354644775 + }, + { + "auxiliary_loss_clip": 0.01067025, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.01826215, + "balance_loss_mlp": 1.02191639, + "epoch": 0.3374116939726439, + "flos": 28256115813120.0, + "grad_norm": 1.7052895778472068, + "language_loss": 0.81135404, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.83251745, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.45117188, + "step": 5612, + "time_per_iteration": 2.4766154289245605 + }, + { + "auxiliary_loss_clip": 0.01066627, + "auxiliary_loss_mlp": 0.01050076, + "balance_loss_clip": 1.0199635, + "balance_loss_mlp": 1.02038646, + "epoch": 0.3374718172253119, + "flos": 35917910409600.0, + "grad_norm": 2.4930242637816953, + "language_loss": 0.71199965, + "learning_rate": 3.087573588194753e-06, + "loss": 0.73316664, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.46289062, + "step": 5613, + "time_per_iteration": 2.4944190979003906 + }, + { + "auxiliary_loss_clip": 0.01067754, + "auxiliary_loss_mlp": 0.01048724, + "balance_loss_clip": 1.01527417, + "balance_loss_mlp": 1.02140534, + "epoch": 0.33753194047797985, + "flos": 18185698229760.0, + "grad_norm": 1.7273037168813863, + "language_loss": 0.8061527, + "learning_rate": 3.087246722218144e-06, + "loss": 0.82731748, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.46484375, + "step": 5614, + "time_per_iteration": 2.395639181137085 + }, + { + "auxiliary_loss_clip": 0.01068172, + "auxiliary_loss_mlp": 0.01054816, + "balance_loss_clip": 1.02055562, + "balance_loss_mlp": 1.02133751, + "epoch": 0.3375920637306478, + "flos": 23147798901120.0, + "grad_norm": 1.6763883247726639, + "language_loss": 0.92505437, + "learning_rate": 3.086919815013031e-06, + "loss": 0.94628423, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.46875, + "step": 5615, + "time_per_iteration": 2.4026145935058594 + }, + { + "auxiliary_loss_clip": 0.01062714, + "auxiliary_loss_mlp": 0.01051161, + "balance_loss_clip": 1.0212512, + "balance_loss_mlp": 1.01903152, + "epoch": 0.3376521869833158, + "flos": 23111105195520.0, + "grad_norm": 1.5814366093274828, + "language_loss": 0.82083392, + "learning_rate": 3.086592866591809e-06, + "loss": 0.84197259, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4375, + "step": 5616, + "time_per_iteration": 2.4318456649780273 + }, + { + "auxiliary_loss_clip": 0.01069082, + "auxiliary_loss_mlp": 0.01055931, + "balance_loss_clip": 1.01897621, + "balance_loss_mlp": 1.02055013, + "epoch": 0.33771231023598375, + "flos": 19273766751360.0, + "grad_norm": 4.239232846691606, + "language_loss": 0.84659141, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.8678416, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.484375, + "step": 5617, + "time_per_iteration": 2.3806350231170654 + }, + { + "auxiliary_loss_clip": 0.01066782, + "auxiliary_loss_mlp": 0.01056426, + "balance_loss_clip": 1.02475262, + "balance_loss_mlp": 1.02082992, + "epoch": 0.3377724334886517, + "flos": 18149213992320.0, + "grad_norm": 2.5899141139806807, + "language_loss": 0.81293839, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.83417046, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4609375, + "step": 5618, + "time_per_iteration": 2.424600601196289 + }, + { + "auxiliary_loss_clip": 0.01067773, + "auxiliary_loss_mlp": 0.01050949, + "balance_loss_clip": 1.01866674, + "balance_loss_mlp": 1.0205704, + "epoch": 0.3378325567413197, + "flos": 25774803642240.0, + "grad_norm": 2.130488424054777, + "language_loss": 0.72706228, + "learning_rate": 3.085611774155481e-06, + "loss": 0.74824953, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.47265625, + "step": 5619, + "time_per_iteration": 2.4351589679718018 + }, + { + "auxiliary_loss_clip": 0.01065439, + "auxiliary_loss_mlp": 0.01058509, + "balance_loss_clip": 1.02433121, + "balance_loss_mlp": 1.01956177, + "epoch": 0.3378926799939877, + "flos": 21316255643520.0, + "grad_norm": 2.5600653154588264, + "language_loss": 0.73385012, + "learning_rate": 3.085284660993821e-06, + "loss": 0.75508964, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.45898438, + "step": 5620, + "time_per_iteration": 2.4205617904663086 + }, + { + "auxiliary_loss_clip": 0.01068037, + "auxiliary_loss_mlp": 0.01049534, + "balance_loss_clip": 1.01751447, + "balance_loss_mlp": 1.02213919, + "epoch": 0.33795280324665566, + "flos": 24898867741440.0, + "grad_norm": 1.7986644355415815, + "language_loss": 0.69128573, + "learning_rate": 3.084957506678058e-06, + "loss": 0.71246147, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45898438, + "step": 5621, + "time_per_iteration": 2.419731378555298 + }, + { + "auxiliary_loss_clip": 0.01064212, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.01631784, + "balance_loss_mlp": 1.01956904, + "epoch": 0.33801292649932363, + "flos": 24752791146240.0, + "grad_norm": 1.6316057400667148, + "language_loss": 0.83597463, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85709715, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4453125, + "step": 5622, + "time_per_iteration": 2.461059331893921 + }, + { + "auxiliary_loss_clip": 0.01064603, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.01865292, + "balance_loss_mlp": 1.01957572, + "epoch": 0.3380730497519916, + "flos": 26722765411200.0, + "grad_norm": 1.439254976643121, + "language_loss": 0.74579406, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.76692116, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44921875, + "step": 5623, + "time_per_iteration": 2.418384552001953 + }, + { + "auxiliary_loss_clip": 0.01012283, + "auxiliary_loss_mlp": 0.0101313, + "balance_loss_clip": 1.00974464, + "balance_loss_mlp": 1.00300384, + "epoch": 0.33813317300465956, + "flos": 70032241560960.0, + "grad_norm": 0.754273690806335, + "language_loss": 0.55062795, + "learning_rate": 3.083975796930215e-06, + "loss": 0.57088208, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.09277344, + "step": 5624, + "time_per_iteration": 3.166423797607422 + }, + { + "auxiliary_loss_clip": 0.01069638, + "auxiliary_loss_mlp": 0.01054681, + "balance_loss_clip": 1.02193403, + "balance_loss_mlp": 1.02202594, + "epoch": 0.3381932962573275, + "flos": 24096179606400.0, + "grad_norm": 2.195090536368631, + "language_loss": 0.74645597, + "learning_rate": 3.083648478122111e-06, + "loss": 0.76769918, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4765625, + "step": 5625, + "time_per_iteration": 2.4176554679870605 + }, + { + "auxiliary_loss_clip": 0.01068571, + "auxiliary_loss_mlp": 0.01054101, + "balance_loss_clip": 1.01752734, + "balance_loss_mlp": 1.02091706, + "epoch": 0.3382534195099955, + "flos": 19277327710080.0, + "grad_norm": 4.200518995108873, + "language_loss": 0.72725612, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.74848282, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.4765625, + "step": 5626, + "time_per_iteration": 2.430955410003662 + }, + { + "auxiliary_loss_clip": 0.01066147, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.01250744, + "balance_loss_mlp": 1.02172852, + "epoch": 0.33831354276266346, + "flos": 25225131196800.0, + "grad_norm": 1.4524140757776258, + "language_loss": 0.81593943, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83703732, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4453125, + "step": 5627, + "time_per_iteration": 2.4581594467163086 + }, + { + "auxiliary_loss_clip": 0.01073324, + "auxiliary_loss_mlp": 0.01052834, + "balance_loss_clip": 1.0170238, + "balance_loss_mlp": 1.02345395, + "epoch": 0.3383736660153314, + "flos": 23110895727360.0, + "grad_norm": 2.1426097959603077, + "language_loss": 0.81893432, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.84019589, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.5, + "step": 5628, + "time_per_iteration": 2.4344191551208496 + }, + { + "auxiliary_loss_clip": 0.01069328, + "auxiliary_loss_mlp": 0.01053932, + "balance_loss_clip": 1.01690578, + "balance_loss_mlp": 1.02131104, + "epoch": 0.3384337892679994, + "flos": 23476017392640.0, + "grad_norm": 2.0113061686234697, + "language_loss": 0.80134434, + "learning_rate": 3.082338792093254e-06, + "loss": 0.82257688, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.47851562, + "step": 5629, + "time_per_iteration": 2.4255802631378174 + }, + { + "auxiliary_loss_clip": 0.01070134, + "auxiliary_loss_mlp": 0.01060229, + "balance_loss_clip": 1.02184343, + "balance_loss_mlp": 1.02171016, + "epoch": 0.33849391252066735, + "flos": 19424835671040.0, + "grad_norm": 2.207572320126079, + "language_loss": 0.86414099, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.88544464, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.484375, + "step": 5630, + "time_per_iteration": 2.449812173843384 + }, + { + "auxiliary_loss_clip": 0.01069622, + "auxiliary_loss_mlp": 0.01053561, + "balance_loss_clip": 1.02068329, + "balance_loss_mlp": 1.02207804, + "epoch": 0.3385540357733353, + "flos": 21063903269760.0, + "grad_norm": 2.3032729434987393, + "language_loss": 0.73639035, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.75762212, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4765625, + "step": 5631, + "time_per_iteration": 3.7972166538238525 + }, + { + "auxiliary_loss_clip": 0.01017539, + "auxiliary_loss_mlp": 0.01014062, + "balance_loss_clip": 1.0106523, + "balance_loss_mlp": 1.00779164, + "epoch": 0.3386141590260033, + "flos": 69205220208000.0, + "grad_norm": 0.8673833218218914, + "language_loss": 0.56141651, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58173251, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.03417969, + "router_z_loss_mlp": 0.09765625, + "step": 5632, + "time_per_iteration": 3.1092751026153564 + }, + { + "auxiliary_loss_clip": 0.010678, + "auxiliary_loss_mlp": 0.01051723, + "balance_loss_clip": 1.01963222, + "balance_loss_mlp": 1.02113211, + "epoch": 0.3386742822786713, + "flos": 25518331728000.0, + "grad_norm": 1.5801291992127109, + "language_loss": 0.8149032, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.83609843, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46679688, + "step": 5633, + "time_per_iteration": 2.4536216259002686 + }, + { + "auxiliary_loss_clip": 0.01069064, + "auxiliary_loss_mlp": 0.01059529, + "balance_loss_clip": 1.02511287, + "balance_loss_mlp": 1.02189565, + "epoch": 0.33873440553133927, + "flos": 23621989253760.0, + "grad_norm": 2.0767955283823034, + "language_loss": 0.61774224, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.63902819, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47265625, + "step": 5634, + "time_per_iteration": 2.397555112838745 + }, + { + "auxiliary_loss_clip": 0.01066134, + "auxiliary_loss_mlp": 0.01050971, + "balance_loss_clip": 1.01930952, + "balance_loss_mlp": 1.01997304, + "epoch": 0.33879452878400723, + "flos": 17088029084160.0, + "grad_norm": 1.7449384144245945, + "language_loss": 0.93973994, + "learning_rate": 3.080373032026589e-06, + "loss": 0.96091104, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4609375, + "step": 5635, + "time_per_iteration": 3.814988851547241 + }, + { + "auxiliary_loss_clip": 0.01066924, + "auxiliary_loss_mlp": 0.01050911, + "balance_loss_clip": 1.02070332, + "balance_loss_mlp": 1.02181721, + "epoch": 0.3388546520366752, + "flos": 15741149587200.0, + "grad_norm": 2.5639829651667614, + "language_loss": 0.76754224, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.78872055, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.45117188, + "step": 5636, + "time_per_iteration": 2.3698232173919678 + }, + { + "auxiliary_loss_clip": 0.01065879, + "auxiliary_loss_mlp": 0.0106257, + "balance_loss_clip": 1.02764153, + "balance_loss_mlp": 1.01992273, + "epoch": 0.33891477528934316, + "flos": 22417660304640.0, + "grad_norm": 1.5083877657105533, + "language_loss": 0.84587693, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.86716139, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.45898438, + "step": 5637, + "time_per_iteration": 3.897296905517578 + }, + { + "auxiliary_loss_clip": 0.0106821, + "auxiliary_loss_mlp": 0.01058224, + "balance_loss_clip": 1.02377272, + "balance_loss_mlp": 1.0213871, + "epoch": 0.3389748985420111, + "flos": 17273871584640.0, + "grad_norm": 1.6994407656914603, + "language_loss": 0.71319056, + "learning_rate": 3.079389598759495e-06, + "loss": 0.73445487, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46875, + "step": 5638, + "time_per_iteration": 3.8198401927948 + }, + { + "auxiliary_loss_clip": 0.01067422, + "auxiliary_loss_mlp": 0.01057321, + "balance_loss_clip": 1.02601635, + "balance_loss_mlp": 1.02179754, + "epoch": 0.3390350217946791, + "flos": 27743765477760.0, + "grad_norm": 2.3277485940039213, + "language_loss": 0.8196165, + "learning_rate": 3.079061705792765e-06, + "loss": 0.84086394, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45703125, + "step": 5639, + "time_per_iteration": 2.4499642848968506 + }, + { + "auxiliary_loss_clip": 0.01069732, + "auxiliary_loss_mlp": 0.01064092, + "balance_loss_clip": 1.03095198, + "balance_loss_mlp": 1.02131987, + "epoch": 0.33909514504734706, + "flos": 20338756997760.0, + "grad_norm": 2.2520731126705176, + "language_loss": 0.7039336, + "learning_rate": 3.078733771907907e-06, + "loss": 0.72527182, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.484375, + "step": 5640, + "time_per_iteration": 2.421077251434326 + }, + { + "auxiliary_loss_clip": 0.01069683, + "auxiliary_loss_mlp": 0.01062253, + "balance_loss_clip": 1.02736056, + "balance_loss_mlp": 1.02330172, + "epoch": 0.339155268300015, + "flos": 14829148385280.0, + "grad_norm": 1.8612771365746288, + "language_loss": 0.71280992, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.73412931, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.46484375, + "step": 5641, + "time_per_iteration": 2.380199670791626 + }, + { + "auxiliary_loss_clip": 0.01070265, + "auxiliary_loss_mlp": 0.0106036, + "balance_loss_clip": 1.02750587, + "balance_loss_mlp": 1.02320087, + "epoch": 0.339215391552683, + "flos": 26066747364480.0, + "grad_norm": 2.325122979416618, + "language_loss": 0.89288127, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.91418755, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.47070312, + "step": 5642, + "time_per_iteration": 2.45731258392334 + }, + { + "auxiliary_loss_clip": 0.01066293, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.0155865, + "balance_loss_mlp": 1.02445638, + "epoch": 0.33927551480535095, + "flos": 14573828545920.0, + "grad_norm": 1.8994795941211735, + "language_loss": 0.85544008, + "learning_rate": 3.077749724868924e-06, + "loss": 0.87652886, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 5643, + "time_per_iteration": 2.367480516433716 + }, + { + "auxiliary_loss_clip": 0.01073961, + "auxiliary_loss_mlp": 0.01061869, + "balance_loss_clip": 1.02705956, + "balance_loss_mlp": 1.02678633, + "epoch": 0.3393356380580189, + "flos": 23804445352320.0, + "grad_norm": 1.8958805459253072, + "language_loss": 0.78257638, + "learning_rate": 3.077421627435922e-06, + "loss": 0.80393469, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.47265625, + "step": 5644, + "time_per_iteration": 2.476710081100464 + }, + { + "auxiliary_loss_clip": 0.01070941, + "auxiliary_loss_mlp": 0.01055974, + "balance_loss_clip": 1.02261889, + "balance_loss_mlp": 1.0243212, + "epoch": 0.3393957613106869, + "flos": 17346909882240.0, + "grad_norm": 2.8005990176777504, + "language_loss": 0.65766782, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.67893696, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.46484375, + "step": 5645, + "time_per_iteration": 2.385627269744873 + }, + { + "auxiliary_loss_clip": 0.01069287, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_clip": 1.0192678, + "balance_loss_mlp": 1.02371526, + "epoch": 0.3394558845633549, + "flos": 28432846448640.0, + "grad_norm": 1.8165372734160639, + "language_loss": 0.78374553, + "learning_rate": 3.076765310014552e-06, + "loss": 0.80494988, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45507812, + "step": 5646, + "time_per_iteration": 2.5007402896881104 + }, + { + "auxiliary_loss_clip": 0.01074071, + "auxiliary_loss_mlp": 0.01058163, + "balance_loss_clip": 1.02232838, + "balance_loss_mlp": 1.02495623, + "epoch": 0.33951600781602287, + "flos": 22085950677120.0, + "grad_norm": 2.2243244255691095, + "language_loss": 0.80648029, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.8278026, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.4921875, + "step": 5647, + "time_per_iteration": 2.391749858856201 + }, + { + "auxiliary_loss_clip": 0.01073609, + "auxiliary_loss_mlp": 0.01048573, + "balance_loss_clip": 1.01641011, + "balance_loss_mlp": 1.02645135, + "epoch": 0.33957613106869083, + "flos": 23877134536320.0, + "grad_norm": 1.865436779866164, + "language_loss": 0.78433573, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.80555761, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.47265625, + "step": 5648, + "time_per_iteration": 2.447050094604492 + }, + { + "auxiliary_loss_clip": 0.01022441, + "auxiliary_loss_mlp": 0.01013518, + "balance_loss_clip": 1.01027572, + "balance_loss_mlp": 1.01255059, + "epoch": 0.3396362543213588, + "flos": 71239014305280.0, + "grad_norm": 0.7829316428918759, + "language_loss": 0.56472409, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5850836, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.09863281, + "step": 5649, + "time_per_iteration": 3.081279993057251 + }, + { + "auxiliary_loss_clip": 0.01071077, + "auxiliary_loss_mlp": 0.01070466, + "balance_loss_clip": 1.03726625, + "balance_loss_mlp": 1.02389264, + "epoch": 0.33969637757402676, + "flos": 25920426389760.0, + "grad_norm": 1.7252809558863162, + "language_loss": 0.86639726, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.88781273, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.47265625, + "step": 5650, + "time_per_iteration": 2.4688503742218018 + }, + { + "auxiliary_loss_clip": 0.01067434, + "auxiliary_loss_mlp": 0.01047888, + "balance_loss_clip": 1.01744235, + "balance_loss_mlp": 1.02256405, + "epoch": 0.33975650082669473, + "flos": 35260286440320.0, + "grad_norm": 2.2058824262148247, + "language_loss": 0.72950113, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.75065434, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44921875, + "step": 5651, + "time_per_iteration": 2.5045197010040283 + }, + { + "auxiliary_loss_clip": 0.01070161, + "auxiliary_loss_mlp": 0.01056861, + "balance_loss_clip": 1.02460337, + "balance_loss_mlp": 1.0234189, + "epoch": 0.3398166240793627, + "flos": 16646273251200.0, + "grad_norm": 1.8469861320367245, + "language_loss": 0.82596618, + "learning_rate": 3.074795378203616e-06, + "loss": 0.84723639, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.46679688, + "step": 5652, + "time_per_iteration": 2.3887338638305664 + }, + { + "auxiliary_loss_clip": 0.01072574, + "auxiliary_loss_mlp": 0.01067266, + "balance_loss_clip": 1.03121686, + "balance_loss_mlp": 1.0240922, + "epoch": 0.33987674733203066, + "flos": 24061022000640.0, + "grad_norm": 1.7994202073844106, + "language_loss": 0.78551811, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.80691648, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.484375, + "step": 5653, + "time_per_iteration": 2.427478790283203 + }, + { + "auxiliary_loss_clip": 0.01068922, + "auxiliary_loss_mlp": 0.01057257, + "balance_loss_clip": 1.0269779, + "balance_loss_mlp": 1.02255452, + "epoch": 0.3399368705846986, + "flos": 13250132058240.0, + "grad_norm": 2.6312825690648634, + "language_loss": 0.88596958, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.90723133, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.46289062, + "step": 5654, + "time_per_iteration": 2.393951654434204 + }, + { + "auxiliary_loss_clip": 0.01065915, + "auxiliary_loss_mlp": 0.01065719, + "balance_loss_clip": 1.03303134, + "balance_loss_mlp": 1.01994693, + "epoch": 0.3399969938373666, + "flos": 27011706756480.0, + "grad_norm": 2.1108059507937167, + "language_loss": 0.66965902, + "learning_rate": 3.073809861919351e-06, + "loss": 0.69097543, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.45898438, + "step": 5655, + "time_per_iteration": 2.417910099029541 + }, + { + "auxiliary_loss_clip": 0.01067532, + "auxiliary_loss_mlp": 0.01071993, + "balance_loss_clip": 1.03913879, + "balance_loss_mlp": 1.02195704, + "epoch": 0.34005711709003456, + "flos": 28548792673920.0, + "grad_norm": 1.3816214381408023, + "language_loss": 0.78117597, + "learning_rate": 3.073481275036697e-06, + "loss": 0.8025713, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.45507812, + "step": 5656, + "time_per_iteration": 2.5124590396881104 + }, + { + "auxiliary_loss_clip": 0.01070652, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.02520728, + "balance_loss_mlp": 1.0212419, + "epoch": 0.3401172403427025, + "flos": 21615914776320.0, + "grad_norm": 1.6427287200198972, + "language_loss": 0.84516001, + "learning_rate": 3.073152647447525e-06, + "loss": 0.86647463, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.49414062, + "step": 5657, + "time_per_iteration": 2.3979270458221436 + }, + { + "auxiliary_loss_clip": 0.01068475, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.03056073, + "balance_loss_mlp": 1.02290452, + "epoch": 0.3401773635953705, + "flos": 25884570556800.0, + "grad_norm": 1.7059449772311954, + "language_loss": 0.86982334, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.89113104, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45703125, + "step": 5658, + "time_per_iteration": 2.45029354095459 + }, + { + "auxiliary_loss_clip": 0.01016332, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.02861536, + "balance_loss_mlp": 1.0067929, + "epoch": 0.3402374868480385, + "flos": 65504704982400.0, + "grad_norm": 0.831706680058992, + "language_loss": 0.60245812, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62294769, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.09570312, + "step": 5659, + "time_per_iteration": 2.963164806365967 + }, + { + "auxiliary_loss_clip": 0.01067021, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.02217627, + "balance_loss_mlp": 1.02377653, + "epoch": 0.34029761010070647, + "flos": 24059450989440.0, + "grad_norm": 1.7271249737596355, + "language_loss": 0.69158816, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.7127738, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 5660, + "time_per_iteration": 2.4333102703094482 + }, + { + "auxiliary_loss_clip": 0.01071509, + "auxiliary_loss_mlp": 0.01061874, + "balance_loss_clip": 1.02806616, + "balance_loss_mlp": 1.02509403, + "epoch": 0.34035773335337444, + "flos": 27598491843840.0, + "grad_norm": 1.6713153413618167, + "language_loss": 0.68250728, + "learning_rate": 3.071837730274918e-06, + "loss": 0.70384109, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46484375, + "step": 5661, + "time_per_iteration": 2.431236982345581 + }, + { + "auxiliary_loss_clip": 0.0107096, + "auxiliary_loss_mlp": 0.01045404, + "balance_loss_clip": 1.01536345, + "balance_loss_mlp": 1.02653146, + "epoch": 0.3404178566060424, + "flos": 20811760364160.0, + "grad_norm": 1.8036056685809438, + "language_loss": 0.80335987, + "learning_rate": 3.071508899340113e-06, + "loss": 0.82452357, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4453125, + "step": 5662, + "time_per_iteration": 2.4718222618103027 + }, + { + "auxiliary_loss_clip": 0.01073863, + "auxiliary_loss_mlp": 0.01059443, + "balance_loss_clip": 1.02646971, + "balance_loss_mlp": 1.02678442, + "epoch": 0.34047797985871037, + "flos": 26832357768960.0, + "grad_norm": 2.1693058631748676, + "language_loss": 0.75115919, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.77249223, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47070312, + "step": 5663, + "time_per_iteration": 2.435824394226074 + }, + { + "auxiliary_loss_clip": 0.01071343, + "auxiliary_loss_mlp": 0.01049049, + "balance_loss_clip": 1.02082026, + "balance_loss_mlp": 1.02670598, + "epoch": 0.34053810311137833, + "flos": 19681621787520.0, + "grad_norm": 1.6851841233484264, + "language_loss": 0.87545007, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.89665401, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4453125, + "step": 5664, + "time_per_iteration": 2.4426486492156982 + }, + { + "auxiliary_loss_clip": 0.01076556, + "auxiliary_loss_mlp": 0.01053028, + "balance_loss_clip": 1.02112758, + "balance_loss_mlp": 1.02924585, + "epoch": 0.3405982263640463, + "flos": 21724669261440.0, + "grad_norm": 1.9516805675259707, + "language_loss": 0.71003103, + "learning_rate": 3.070522162795235e-06, + "loss": 0.73132682, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.47265625, + "step": 5665, + "time_per_iteration": 2.4141886234283447 + }, + { + "auxiliary_loss_clip": 0.01074553, + "auxiliary_loss_mlp": 0.01053789, + "balance_loss_clip": 1.01819277, + "balance_loss_mlp": 1.02785206, + "epoch": 0.34065834961671426, + "flos": 18040634064000.0, + "grad_norm": 2.2830985657942633, + "language_loss": 0.74162149, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.76290488, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.46875, + "step": 5666, + "time_per_iteration": 2.3984804153442383 + }, + { + "auxiliary_loss_clip": 0.01079469, + "auxiliary_loss_mlp": 0.01060069, + "balance_loss_clip": 1.02723908, + "balance_loss_mlp": 1.03018582, + "epoch": 0.3407184728693822, + "flos": 21396276213120.0, + "grad_norm": 1.4622782622882156, + "language_loss": 0.74294615, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.76434153, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4921875, + "step": 5667, + "time_per_iteration": 2.4133548736572266 + }, + { + "auxiliary_loss_clip": 0.01032171, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.02830684, + "balance_loss_mlp": 1.02114749, + "epoch": 0.3407785961220502, + "flos": 68684559102720.0, + "grad_norm": 0.8570098718066301, + "language_loss": 0.63295496, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65359354, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.11035156, + "step": 5668, + "time_per_iteration": 3.2349441051483154 + }, + { + "auxiliary_loss_clip": 0.01075616, + "auxiliary_loss_mlp": 0.010616, + "balance_loss_clip": 1.02857888, + "balance_loss_mlp": 1.02784371, + "epoch": 0.34083871937471816, + "flos": 14063503069440.0, + "grad_norm": 2.4082240318282517, + "language_loss": 0.74911129, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.77048349, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.4765625, + "step": 5669, + "time_per_iteration": 2.4131181240081787 + }, + { + "auxiliary_loss_clip": 0.01074809, + "auxiliary_loss_mlp": 0.0106156, + "balance_loss_clip": 1.02746582, + "balance_loss_mlp": 1.02590704, + "epoch": 0.3408988426273861, + "flos": 17084677593600.0, + "grad_norm": 3.1800991339803932, + "language_loss": 0.81805015, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.83941382, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.49023438, + "step": 5670, + "time_per_iteration": 2.411785364151001 + }, + { + "auxiliary_loss_clip": 0.0107399, + "auxiliary_loss_mlp": 0.0106258, + "balance_loss_clip": 1.02998769, + "balance_loss_mlp": 1.02552009, + "epoch": 0.3409589658800541, + "flos": 24023420599680.0, + "grad_norm": 1.6774302659132672, + "language_loss": 0.78817052, + "learning_rate": 3.068547593996078e-06, + "loss": 0.80953622, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.484375, + "step": 5671, + "time_per_iteration": 3.8471426963806152 + }, + { + "auxiliary_loss_clip": 0.01070905, + "auxiliary_loss_mlp": 0.01053152, + "balance_loss_clip": 1.01878381, + "balance_loss_mlp": 1.02437437, + "epoch": 0.34101908913272205, + "flos": 21140956373760.0, + "grad_norm": 1.916541119153878, + "language_loss": 0.76110888, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.78234941, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46484375, + "step": 5672, + "time_per_iteration": 2.4507548809051514 + }, + { + "auxiliary_loss_clip": 0.01073286, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.01587367, + "balance_loss_mlp": 1.02528203, + "epoch": 0.3410792123853901, + "flos": 15701209125120.0, + "grad_norm": 2.2857001510481694, + "language_loss": 0.7534306, + "learning_rate": 3.06788908010777e-06, + "loss": 0.77464956, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5673, + "time_per_iteration": 2.3886961936950684 + }, + { + "auxiliary_loss_clip": 0.0107036, + "auxiliary_loss_mlp": 0.01055648, + "balance_loss_clip": 1.022174, + "balance_loss_mlp": 1.02424264, + "epoch": 0.34113933563805804, + "flos": 23034994698240.0, + "grad_norm": 2.011370358154867, + "language_loss": 0.80454427, + "learning_rate": 3.067559762415682e-06, + "loss": 0.82580435, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4609375, + "step": 5674, + "time_per_iteration": 3.8167154788970947 + }, + { + "auxiliary_loss_clip": 0.01017504, + "auxiliary_loss_mlp": 0.01006454, + "balance_loss_clip": 1.00297332, + "balance_loss_mlp": 1.00820589, + "epoch": 0.341199458890726, + "flos": 69611294764800.0, + "grad_norm": 0.797325223601645, + "language_loss": 0.56101996, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58125955, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.03491211, + "router_z_loss_mlp": 0.09326172, + "step": 5675, + "time_per_iteration": 3.194430351257324 + }, + { + "auxiliary_loss_clip": 0.01068318, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_clip": 1.02777493, + "balance_loss_mlp": 1.0226016, + "epoch": 0.34125958214339397, + "flos": 22345250411520.0, + "grad_norm": 5.524830132421128, + "language_loss": 0.80056775, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.82184696, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45703125, + "step": 5676, + "time_per_iteration": 2.405494213104248 + }, + { + "auxiliary_loss_clip": 0.01068177, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_clip": 1.01733565, + "balance_loss_mlp": 1.02274084, + "epoch": 0.34131970539606193, + "flos": 21870850590720.0, + "grad_norm": 1.7354259748701053, + "language_loss": 0.87189418, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.89305925, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.453125, + "step": 5677, + "time_per_iteration": 3.8039214611053467 + }, + { + "auxiliary_loss_clip": 0.01069292, + "auxiliary_loss_mlp": 0.0104861, + "balance_loss_clip": 1.01561296, + "balance_loss_mlp": 1.02379203, + "epoch": 0.3413798286487299, + "flos": 24934583928960.0, + "grad_norm": 1.8113056525966331, + "language_loss": 0.81562585, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.83680487, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45507812, + "step": 5678, + "time_per_iteration": 3.8676421642303467 + }, + { + "auxiliary_loss_clip": 0.01070979, + "auxiliary_loss_mlp": 0.01049337, + "balance_loss_clip": 1.01827061, + "balance_loss_mlp": 1.02445126, + "epoch": 0.34143995190139786, + "flos": 25373197739520.0, + "grad_norm": 1.7756478912855396, + "language_loss": 0.76115233, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.78235555, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.46484375, + "step": 5679, + "time_per_iteration": 2.4393506050109863 + }, + { + "auxiliary_loss_clip": 0.0102782, + "auxiliary_loss_mlp": 0.01012724, + "balance_loss_clip": 1.00957739, + "balance_loss_mlp": 1.01853764, + "epoch": 0.34150007515406583, + "flos": 67778876856960.0, + "grad_norm": 0.729788562271646, + "language_loss": 0.59580165, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61620712, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.09277344, + "step": 5680, + "time_per_iteration": 3.1339969635009766 + }, + { + "auxiliary_loss_clip": 0.01071407, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.01093256, + "balance_loss_mlp": 1.02642381, + "epoch": 0.3415601984067338, + "flos": 20301399976320.0, + "grad_norm": 1.9160044382698322, + "language_loss": 0.73369193, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.75482309, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44921875, + "step": 5681, + "time_per_iteration": 2.4488778114318848 + }, + { + "auxiliary_loss_clip": 0.01072563, + "auxiliary_loss_mlp": 0.0105102, + "balance_loss_clip": 1.01869011, + "balance_loss_mlp": 1.02797818, + "epoch": 0.34162032165940176, + "flos": 26029983836160.0, + "grad_norm": 2.76735905707216, + "language_loss": 0.73649359, + "learning_rate": 3.064923764577233e-06, + "loss": 0.75772941, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4453125, + "step": 5682, + "time_per_iteration": 2.470813512802124 + }, + { + "auxiliary_loss_clip": 0.01073476, + "auxiliary_loss_mlp": 0.01052725, + "balance_loss_clip": 1.01941812, + "balance_loss_mlp": 1.02635622, + "epoch": 0.3416804449120697, + "flos": 28802087654400.0, + "grad_norm": 2.9247902728938744, + "language_loss": 0.85380483, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.87506682, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.47265625, + "step": 5683, + "time_per_iteration": 2.473719596862793 + }, + { + "auxiliary_loss_clip": 0.01077857, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_clip": 1.01639509, + "balance_loss_mlp": 1.02911544, + "epoch": 0.3417405681647377, + "flos": 22600500428160.0, + "grad_norm": 1.563380151474871, + "language_loss": 0.72129476, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.74258447, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48828125, + "step": 5684, + "time_per_iteration": 2.436532497406006 + }, + { + "auxiliary_loss_clip": 0.01073216, + "auxiliary_loss_mlp": 0.01042415, + "balance_loss_clip": 1.01394773, + "balance_loss_mlp": 1.02911651, + "epoch": 0.34180069141740566, + "flos": 24715119922560.0, + "grad_norm": 1.7446194262066692, + "language_loss": 0.76059675, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.78175306, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.44140625, + "step": 5685, + "time_per_iteration": 2.541807174682617 + }, + { + "auxiliary_loss_clip": 0.01075378, + "auxiliary_loss_mlp": 0.01047884, + "balance_loss_clip": 1.01920199, + "balance_loss_mlp": 1.03128076, + "epoch": 0.3418608146700737, + "flos": 30517440307200.0, + "grad_norm": 1.8372544284945331, + "language_loss": 0.72438145, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.74561405, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.44140625, + "step": 5686, + "time_per_iteration": 2.5034778118133545 + }, + { + "auxiliary_loss_clip": 0.01075006, + "auxiliary_loss_mlp": 0.01062817, + "balance_loss_clip": 1.02859199, + "balance_loss_mlp": 1.02733314, + "epoch": 0.34192093792274164, + "flos": 15121441221120.0, + "grad_norm": 3.9435679580528413, + "language_loss": 0.79914653, + "learning_rate": 3.06327495310661e-06, + "loss": 0.82052475, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.4765625, + "step": 5687, + "time_per_iteration": 2.423860788345337 + }, + { + "auxiliary_loss_clip": 0.01075471, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.0225873, + "balance_loss_mlp": 1.03092527, + "epoch": 0.3419810611754096, + "flos": 13186973675520.0, + "grad_norm": 2.081531358766658, + "language_loss": 0.88487405, + "learning_rate": 3.062945069803981e-06, + "loss": 0.90617508, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 5688, + "time_per_iteration": 2.380075216293335 + }, + { + "auxiliary_loss_clip": 0.01079016, + "auxiliary_loss_mlp": 0.01053839, + "balance_loss_clip": 1.01876736, + "balance_loss_mlp": 1.02955294, + "epoch": 0.34204118442807757, + "flos": 19535265901440.0, + "grad_norm": 1.9700002392103135, + "language_loss": 0.81622171, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.83755022, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.49414062, + "step": 5689, + "time_per_iteration": 2.4419405460357666 + }, + { + "auxiliary_loss_clip": 0.01073218, + "auxiliary_loss_mlp": 0.01058868, + "balance_loss_clip": 1.02320051, + "balance_loss_mlp": 1.02693951, + "epoch": 0.34210130768074554, + "flos": 15193955848320.0, + "grad_norm": 1.753873517267955, + "language_loss": 0.74820369, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.76952451, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.46289062, + "step": 5690, + "time_per_iteration": 2.387691020965576 + }, + { + "auxiliary_loss_clip": 0.01072095, + "auxiliary_loss_mlp": 0.0104857, + "balance_loss_clip": 1.01788509, + "balance_loss_mlp": 1.02629852, + "epoch": 0.3421614309334135, + "flos": 24935072688000.0, + "grad_norm": 1.882885931429003, + "language_loss": 0.77677071, + "learning_rate": 3.061955178104237e-06, + "loss": 0.79797745, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 5691, + "time_per_iteration": 2.424591302871704 + }, + { + "auxiliary_loss_clip": 0.01070661, + "auxiliary_loss_mlp": 0.010558, + "balance_loss_clip": 1.02628386, + "balance_loss_mlp": 1.02651513, + "epoch": 0.34222155418608147, + "flos": 21907544296320.0, + "grad_norm": 1.50400519121594, + "language_loss": 0.70109326, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.72235787, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44140625, + "step": 5692, + "time_per_iteration": 2.430199146270752 + }, + { + "auxiliary_loss_clip": 0.01072907, + "auxiliary_loss_mlp": 0.01064409, + "balance_loss_clip": 1.03203154, + "balance_loss_mlp": 1.02596974, + "epoch": 0.34228167743874943, + "flos": 18113078868480.0, + "grad_norm": 2.023127776983481, + "language_loss": 0.74997973, + "learning_rate": 3.06129504893632e-06, + "loss": 0.77135289, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46875, + "step": 5693, + "time_per_iteration": 2.37575626373291 + }, + { + "auxiliary_loss_clip": 0.01067926, + "auxiliary_loss_mlp": 0.01056191, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.02339828, + "epoch": 0.3423418006914174, + "flos": 21287521728000.0, + "grad_norm": 1.801020675033579, + "language_loss": 0.77356243, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.79480368, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4453125, + "step": 5694, + "time_per_iteration": 2.444194793701172 + }, + { + "auxiliary_loss_clip": 0.01067155, + "auxiliary_loss_mlp": 0.01049272, + "balance_loss_clip": 1.02138901, + "balance_loss_mlp": 1.02357984, + "epoch": 0.34240192394408536, + "flos": 19822601324160.0, + "grad_norm": 1.617828695510483, + "language_loss": 0.80668491, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82784915, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.43554688, + "step": 5695, + "time_per_iteration": 2.3903141021728516 + }, + { + "auxiliary_loss_clip": 0.01069399, + "auxiliary_loss_mlp": 0.01061295, + "balance_loss_clip": 1.02919149, + "balance_loss_mlp": 1.02337098, + "epoch": 0.3424620471967533, + "flos": 24534374480640.0, + "grad_norm": 1.9407192578553727, + "language_loss": 0.75209624, + "learning_rate": 3.060304553382635e-06, + "loss": 0.77340317, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4609375, + "step": 5696, + "time_per_iteration": 2.452543020248413 + }, + { + "auxiliary_loss_clip": 0.01068536, + "auxiliary_loss_mlp": 0.01054603, + "balance_loss_clip": 1.02388322, + "balance_loss_mlp": 1.0227567, + "epoch": 0.3425221704494213, + "flos": 25847702294400.0, + "grad_norm": 1.6339911130147682, + "language_loss": 0.72258162, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.74381304, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45703125, + "step": 5697, + "time_per_iteration": 2.4322593212127686 + }, + { + "auxiliary_loss_clip": 0.0106746, + "auxiliary_loss_mlp": 0.01050245, + "balance_loss_clip": 1.02113461, + "balance_loss_mlp": 1.02403378, + "epoch": 0.34258229370208926, + "flos": 21539524988160.0, + "grad_norm": 2.095406030491964, + "language_loss": 0.83368838, + "learning_rate": 3.05964402195837e-06, + "loss": 0.85486543, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 5698, + "time_per_iteration": 2.4209043979644775 + }, + { + "auxiliary_loss_clip": 0.01068482, + "auxiliary_loss_mlp": 0.01054536, + "balance_loss_clip": 1.02263522, + "balance_loss_mlp": 1.02297163, + "epoch": 0.3426424169547573, + "flos": 23651840332800.0, + "grad_norm": 2.844284802878053, + "language_loss": 0.70506597, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.72629613, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45507812, + "step": 5699, + "time_per_iteration": 2.458503246307373 + }, + { + "auxiliary_loss_clip": 0.01070266, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.02052093, + "balance_loss_mlp": 1.02520561, + "epoch": 0.34270254020742524, + "flos": 24643722458880.0, + "grad_norm": 3.2437080643096254, + "language_loss": 0.73953211, + "learning_rate": 3.058983329806877e-06, + "loss": 0.7607373, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 5700, + "time_per_iteration": 2.516770839691162 + }, + { + "auxiliary_loss_clip": 0.0107061, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_clip": 1.02261996, + "balance_loss_mlp": 1.02603209, + "epoch": 0.3427626634600932, + "flos": 20995682739840.0, + "grad_norm": 2.6073895956425246, + "language_loss": 0.8302384, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.85147434, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4453125, + "step": 5701, + "time_per_iteration": 2.3982882499694824 + }, + { + "auxiliary_loss_clip": 0.01074766, + "auxiliary_loss_mlp": 0.01047813, + "balance_loss_clip": 1.01812983, + "balance_loss_mlp": 1.02850592, + "epoch": 0.3428227867127612, + "flos": 21432725539200.0, + "grad_norm": 1.7224087955386915, + "language_loss": 0.73097426, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.75220001, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.46289062, + "step": 5702, + "time_per_iteration": 2.4410970211029053 + }, + { + "auxiliary_loss_clip": 0.01023833, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.03360093, + "balance_loss_mlp": 1.01420283, + "epoch": 0.34288290996542914, + "flos": 55728709827840.0, + "grad_norm": 0.7937176343867128, + "language_loss": 0.57630271, + "learning_rate": 3.057991990435309e-06, + "loss": 0.5969097, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.03271484, + "router_z_loss_mlp": 0.09667969, + "step": 5703, + "time_per_iteration": 2.9813315868377686 + }, + { + "auxiliary_loss_clip": 0.01076218, + "auxiliary_loss_mlp": 0.01051576, + "balance_loss_clip": 1.02009332, + "balance_loss_mlp": 1.02995694, + "epoch": 0.3429430332180971, + "flos": 20155777228800.0, + "grad_norm": 1.8007491647591978, + "language_loss": 0.76080173, + "learning_rate": 3.057661463723086e-06, + "loss": 0.7820797, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46289062, + "step": 5704, + "time_per_iteration": 2.427797555923462 + }, + { + "auxiliary_loss_clip": 0.01076174, + "auxiliary_loss_mlp": 0.01044375, + "balance_loss_clip": 1.01559758, + "balance_loss_mlp": 1.03126025, + "epoch": 0.34300315647076507, + "flos": 17964942503040.0, + "grad_norm": 1.698680485094175, + "language_loss": 0.74181914, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.76302463, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44921875, + "step": 5705, + "time_per_iteration": 2.416372060775757 + }, + { + "auxiliary_loss_clip": 0.01077133, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_clip": 1.01320565, + "balance_loss_mlp": 1.03102636, + "epoch": 0.34306327972343303, + "flos": 22085845943040.0, + "grad_norm": 1.7612681891900592, + "language_loss": 0.80691528, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82813025, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4609375, + "step": 5706, + "time_per_iteration": 2.4324662685394287 + }, + { + "auxiliary_loss_clip": 0.01080749, + "auxiliary_loss_mlp": 0.01054268, + "balance_loss_clip": 1.01943517, + "balance_loss_mlp": 1.0324111, + "epoch": 0.343123402976101, + "flos": 18441681384960.0, + "grad_norm": 2.9293442679527617, + "language_loss": 0.84360284, + "learning_rate": 3.056669642996787e-06, + "loss": 0.86495298, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.484375, + "step": 5707, + "time_per_iteration": 2.390139579772949 + }, + { + "auxiliary_loss_clip": 0.01079273, + "auxiliary_loss_mlp": 0.0105328, + "balance_loss_clip": 1.02307272, + "balance_loss_mlp": 1.03323352, + "epoch": 0.34318352622876896, + "flos": 17162778038400.0, + "grad_norm": 1.5513188976544074, + "language_loss": 0.76151133, + "learning_rate": 3.056338955933266e-06, + "loss": 0.78283679, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4609375, + "step": 5708, + "time_per_iteration": 2.4183433055877686 + }, + { + "auxiliary_loss_clip": 0.01074602, + "auxiliary_loss_mlp": 0.01059314, + "balance_loss_clip": 1.02996433, + "balance_loss_mlp": 1.02974343, + "epoch": 0.34324364948143693, + "flos": 26686944489600.0, + "grad_norm": 1.6775795919288319, + "language_loss": 0.82250297, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.84384203, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44921875, + "step": 5709, + "time_per_iteration": 2.4513614177703857 + }, + { + "auxiliary_loss_clip": 0.01078512, + "auxiliary_loss_mlp": 0.0106648, + "balance_loss_clip": 1.03086019, + "balance_loss_mlp": 1.03223443, + "epoch": 0.3433037727341049, + "flos": 21250513820160.0, + "grad_norm": 3.2113395377003258, + "language_loss": 0.79954803, + "learning_rate": 3.055677461649329e-06, + "loss": 0.82099795, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.4609375, + "step": 5710, + "time_per_iteration": 2.477811813354492 + }, + { + "auxiliary_loss_clip": 0.01079013, + "auxiliary_loss_mlp": 0.01065758, + "balance_loss_clip": 1.0324986, + "balance_loss_mlp": 1.03029358, + "epoch": 0.34336389598677286, + "flos": 20628431481600.0, + "grad_norm": 1.9849187756445519, + "language_loss": 0.72390556, + "learning_rate": 3.055346654453996e-06, + "loss": 0.74535328, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48828125, + "step": 5711, + "time_per_iteration": 3.8559608459472656 + }, + { + "auxiliary_loss_clip": 0.01074369, + "auxiliary_loss_mlp": 0.010637, + "balance_loss_clip": 1.03275323, + "balance_loss_mlp": 1.02920246, + "epoch": 0.3434240192394409, + "flos": 14537693422080.0, + "grad_norm": 1.6276711220056086, + "language_loss": 0.67746156, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69884229, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45117188, + "step": 5712, + "time_per_iteration": 2.378652811050415 + }, + { + "auxiliary_loss_clip": 0.01029077, + "auxiliary_loss_mlp": 0.01010886, + "balance_loss_clip": 1.00784576, + "balance_loss_mlp": 1.01965177, + "epoch": 0.34348414249210885, + "flos": 58048828784640.0, + "grad_norm": 0.8559329663354557, + "language_loss": 0.5811196, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60151923, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.03039551, + "router_z_loss_mlp": 0.09472656, + "step": 5713, + "time_per_iteration": 4.43656325340271 + }, + { + "auxiliary_loss_clip": 0.01072563, + "auxiliary_loss_mlp": 0.01065244, + "balance_loss_clip": 1.03471446, + "balance_loss_mlp": 1.02657461, + "epoch": 0.3435442657447768, + "flos": 20703389904000.0, + "grad_norm": 2.5304577626493634, + "language_loss": 0.82612526, + "learning_rate": 3.054353992805076e-06, + "loss": 0.8475033, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4609375, + "step": 5714, + "time_per_iteration": 2.4125444889068604 + }, + { + "auxiliary_loss_clip": 0.01070811, + "auxiliary_loss_mlp": 0.01058082, + "balance_loss_clip": 1.02603829, + "balance_loss_mlp": 1.0245235, + "epoch": 0.3436043889974448, + "flos": 22929137856000.0, + "grad_norm": 2.0735953514594745, + "language_loss": 0.73683095, + "learning_rate": 3.05402302560962e-06, + "loss": 0.75811982, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4609375, + "step": 5715, + "time_per_iteration": 2.4785869121551514 + }, + { + "auxiliary_loss_clip": 0.0102073, + "auxiliary_loss_mlp": 0.01006763, + "balance_loss_clip": 1.00402153, + "balance_loss_mlp": 1.01126695, + "epoch": 0.34366451225011274, + "flos": 58399914216960.0, + "grad_norm": 0.922837362982081, + "language_loss": 0.66045725, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68073219, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.09472656, + "step": 5716, + "time_per_iteration": 4.523667335510254 + }, + { + "auxiliary_loss_clip": 0.01069183, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_clip": 1.02504623, + "balance_loss_mlp": 1.02408946, + "epoch": 0.3437246355027807, + "flos": 15595387194240.0, + "grad_norm": 2.0030742370389962, + "language_loss": 0.7576679, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.7789135, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.45117188, + "step": 5717, + "time_per_iteration": 3.808115243911743 + }, + { + "auxiliary_loss_clip": 0.01069097, + "auxiliary_loss_mlp": 0.01057396, + "balance_loss_clip": 1.02582955, + "balance_loss_mlp": 1.0228976, + "epoch": 0.34378475875544867, + "flos": 27671041382400.0, + "grad_norm": 1.8088632674885232, + "language_loss": 0.76923823, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.79050314, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46289062, + "step": 5718, + "time_per_iteration": 2.453612804412842 + }, + { + "auxiliary_loss_clip": 0.01069587, + "auxiliary_loss_mlp": 0.01061462, + "balance_loss_clip": 1.02813113, + "balance_loss_mlp": 1.0231185, + "epoch": 0.34384488200811664, + "flos": 31430139736320.0, + "grad_norm": 9.302290551672355, + "language_loss": 0.65685511, + "learning_rate": 3.052698757266734e-06, + "loss": 0.67816561, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46484375, + "step": 5719, + "time_per_iteration": 2.5195231437683105 + }, + { + "auxiliary_loss_clip": 0.01072763, + "auxiliary_loss_mlp": 0.01054272, + "balance_loss_clip": 1.01974893, + "balance_loss_mlp": 1.02417529, + "epoch": 0.3439050052607846, + "flos": 24898763007360.0, + "grad_norm": 1.7430904047433307, + "language_loss": 0.7511543, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.7724247, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.484375, + "step": 5720, + "time_per_iteration": 2.4391825199127197 + }, + { + "auxiliary_loss_clip": 0.01070441, + "auxiliary_loss_mlp": 0.01057806, + "balance_loss_clip": 1.02404559, + "balance_loss_mlp": 1.02413976, + "epoch": 0.34396512851345257, + "flos": 18149109258240.0, + "grad_norm": 1.6832609888496959, + "language_loss": 0.74779928, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76908171, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46289062, + "step": 5721, + "time_per_iteration": 2.4479684829711914 + }, + { + "auxiliary_loss_clip": 0.01075736, + "auxiliary_loss_mlp": 0.01067164, + "balance_loss_clip": 1.03287947, + "balance_loss_mlp": 1.02701187, + "epoch": 0.34402525176612053, + "flos": 16033512245760.0, + "grad_norm": 3.0327519998443653, + "language_loss": 0.81278598, + "learning_rate": 3.051705136821992e-06, + "loss": 0.83421493, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48632812, + "step": 5722, + "time_per_iteration": 2.3867363929748535 + }, + { + "auxiliary_loss_clip": 0.0107583, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.01893604, + "balance_loss_mlp": 1.03023899, + "epoch": 0.3440853750187885, + "flos": 21177580256640.0, + "grad_norm": 1.6366154329359717, + "language_loss": 0.82716632, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84841943, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.45507812, + "step": 5723, + "time_per_iteration": 2.5139505863189697 + }, + { + "auxiliary_loss_clip": 0.01078332, + "auxiliary_loss_mlp": 0.01055039, + "balance_loss_clip": 1.02290034, + "balance_loss_mlp": 1.03017139, + "epoch": 0.34414549827145646, + "flos": 12677032224000.0, + "grad_norm": 2.083228577428286, + "language_loss": 0.82787812, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.84921181, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.48046875, + "step": 5724, + "time_per_iteration": 2.381873369216919 + }, + { + "auxiliary_loss_clip": 0.01078119, + "auxiliary_loss_mlp": 0.01057606, + "balance_loss_clip": 1.02310681, + "balance_loss_mlp": 1.03059959, + "epoch": 0.3442056215241244, + "flos": 31283190357120.0, + "grad_norm": 1.7440029003371, + "language_loss": 0.71366781, + "learning_rate": 3.05071115745038e-06, + "loss": 0.73502505, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.4765625, + "step": 5725, + "time_per_iteration": 2.4994454383850098 + }, + { + "auxiliary_loss_clip": 0.01082876, + "auxiliary_loss_mlp": 0.01058896, + "balance_loss_clip": 1.0211072, + "balance_loss_mlp": 1.03221977, + "epoch": 0.34426574477679245, + "flos": 23366180655360.0, + "grad_norm": 1.460834838701929, + "language_loss": 0.712883, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.73430073, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.5078125, + "step": 5726, + "time_per_iteration": 2.431016206741333 + }, + { + "auxiliary_loss_clip": 0.01076561, + "auxiliary_loss_mlp": 0.01048587, + "balance_loss_clip": 1.01756871, + "balance_loss_mlp": 1.03028727, + "epoch": 0.3443258680294604, + "flos": 24534269746560.0, + "grad_norm": 2.2547706066433606, + "language_loss": 0.74597371, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.76722515, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46289062, + "step": 5727, + "time_per_iteration": 2.532609224319458 + }, + { + "auxiliary_loss_clip": 0.01078869, + "auxiliary_loss_mlp": 0.0105453, + "balance_loss_clip": 1.02084136, + "balance_loss_mlp": 1.03089511, + "epoch": 0.3443859912821284, + "flos": 20229094817280.0, + "grad_norm": 2.1010649462035835, + "language_loss": 0.90179914, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.92313313, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.48046875, + "step": 5728, + "time_per_iteration": 2.411522150039673 + }, + { + "auxiliary_loss_clip": 0.0107991, + "auxiliary_loss_mlp": 0.01050189, + "balance_loss_clip": 1.01654816, + "balance_loss_mlp": 1.03278947, + "epoch": 0.34444611453479634, + "flos": 24315364321920.0, + "grad_norm": 2.0049914189739235, + "language_loss": 0.71876788, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.74006891, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.47070312, + "step": 5729, + "time_per_iteration": 2.4435479640960693 + }, + { + "auxiliary_loss_clip": 0.01078138, + "auxiliary_loss_mlp": 0.01050385, + "balance_loss_clip": 1.01702988, + "balance_loss_mlp": 1.03160262, + "epoch": 0.3445062377874643, + "flos": 16982451532800.0, + "grad_norm": 2.6200547134870136, + "language_loss": 0.75906301, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7803483, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.46679688, + "step": 5730, + "time_per_iteration": 2.3883657455444336 + }, + { + "auxiliary_loss_clip": 0.01079128, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_clip": 1.01591098, + "balance_loss_mlp": 1.03010893, + "epoch": 0.3445663610401323, + "flos": 20301679267200.0, + "grad_norm": 2.3128803750875604, + "language_loss": 0.81545889, + "learning_rate": 3.048722123283578e-06, + "loss": 0.83677024, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.49023438, + "step": 5731, + "time_per_iteration": 2.390544891357422 + }, + { + "auxiliary_loss_clip": 0.01078223, + "auxiliary_loss_mlp": 0.01058427, + "balance_loss_clip": 1.02266455, + "balance_loss_mlp": 1.03126907, + "epoch": 0.34462648429280024, + "flos": 15887191271040.0, + "grad_norm": 1.9655230609446368, + "language_loss": 0.79704016, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.8184067, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.46875, + "step": 5732, + "time_per_iteration": 2.4063196182250977 + }, + { + "auxiliary_loss_clip": 0.01052602, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.03360462, + "balance_loss_mlp": 1.04219079, + "epoch": 0.3446866075454682, + "flos": 59307760967040.0, + "grad_norm": 0.7667424997685337, + "language_loss": 0.53699392, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55788565, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.10400391, + "step": 5733, + "time_per_iteration": 3.0551936626434326 + }, + { + "auxiliary_loss_clip": 0.01079617, + "auxiliary_loss_mlp": 0.01056818, + "balance_loss_clip": 1.02081633, + "balance_loss_mlp": 1.0309391, + "epoch": 0.34474673079813617, + "flos": 22342771704960.0, + "grad_norm": 1.6793597951256611, + "language_loss": 0.84750199, + "learning_rate": 3.047727069167207e-06, + "loss": 0.86886632, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48632812, + "step": 5734, + "time_per_iteration": 2.4198031425476074 + }, + { + "auxiliary_loss_clip": 0.01076383, + "auxiliary_loss_mlp": 0.01054733, + "balance_loss_clip": 1.01913655, + "balance_loss_mlp": 1.02739084, + "epoch": 0.34480685405080413, + "flos": 27668981612160.0, + "grad_norm": 2.3465816006428497, + "language_loss": 0.94312125, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.9644323, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48828125, + "step": 5735, + "time_per_iteration": 2.4868457317352295 + }, + { + "auxiliary_loss_clip": 0.01077561, + "auxiliary_loss_mlp": 0.01068621, + "balance_loss_clip": 1.03238153, + "balance_loss_mlp": 1.02946627, + "epoch": 0.3448669773034721, + "flos": 22454912592000.0, + "grad_norm": 1.7959881318236572, + "language_loss": 0.78277695, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.8042388, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.48046875, + "step": 5736, + "time_per_iteration": 2.4215283393859863 + }, + { + "auxiliary_loss_clip": 0.01075911, + "auxiliary_loss_mlp": 0.01062152, + "balance_loss_clip": 1.025769, + "balance_loss_mlp": 1.02862477, + "epoch": 0.34492710055614006, + "flos": 24935037776640.0, + "grad_norm": 1.5975385550673242, + "language_loss": 0.80321562, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.82459617, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.47265625, + "step": 5737, + "time_per_iteration": 2.468062162399292 + }, + { + "auxiliary_loss_clip": 0.0107668, + "auxiliary_loss_mlp": 0.0107292, + "balance_loss_clip": 1.0340817, + "balance_loss_mlp": 1.02607155, + "epoch": 0.34498722380880803, + "flos": 20119781750400.0, + "grad_norm": 2.1946822040083647, + "language_loss": 0.73147941, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.75297546, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.50390625, + "step": 5738, + "time_per_iteration": 2.397984027862549 + }, + { + "auxiliary_loss_clip": 0.01074202, + "auxiliary_loss_mlp": 0.01078899, + "balance_loss_clip": 1.03927386, + "balance_loss_mlp": 1.02428329, + "epoch": 0.34504734706147605, + "flos": 28436896166400.0, + "grad_norm": 2.5218704687822426, + "language_loss": 0.83132577, + "learning_rate": 3.046067851209389e-06, + "loss": 0.85285676, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 0.5, + "step": 5739, + "time_per_iteration": 2.485677480697632 + }, + { + "auxiliary_loss_clip": 0.01074047, + "auxiliary_loss_mlp": 0.01076587, + "balance_loss_clip": 1.04060912, + "balance_loss_mlp": 1.02599335, + "epoch": 0.345107470314144, + "flos": 22673364168960.0, + "grad_norm": 1.9571167893742825, + "language_loss": 0.83833277, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85983908, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.48046875, + "step": 5740, + "time_per_iteration": 2.443359375 + }, + { + "auxiliary_loss_clip": 0.01072804, + "auxiliary_loss_mlp": 0.01075323, + "balance_loss_clip": 1.0393455, + "balance_loss_mlp": 1.02458811, + "epoch": 0.345167593566812, + "flos": 20629688290560.0, + "grad_norm": 2.308397724449736, + "language_loss": 0.78307104, + "learning_rate": 3.045403886269181e-06, + "loss": 0.80455232, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48242188, + "step": 5741, + "time_per_iteration": 2.394364356994629 + }, + { + "auxiliary_loss_clip": 0.01073642, + "auxiliary_loss_mlp": 0.01079198, + "balance_loss_clip": 1.04425764, + "balance_loss_mlp": 1.02366412, + "epoch": 0.34522771681947995, + "flos": 26213138161920.0, + "grad_norm": 1.772978624274225, + "language_loss": 0.77958041, + "learning_rate": 3.045071844330053e-06, + "loss": 0.80110878, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.5, + "step": 5742, + "time_per_iteration": 2.5062801837921143 + }, + { + "auxiliary_loss_clip": 0.01072817, + "auxiliary_loss_mlp": 0.01086373, + "balance_loss_clip": 1.05163503, + "balance_loss_mlp": 1.02536476, + "epoch": 0.3452878400721479, + "flos": 19061354839680.0, + "grad_norm": 2.305358306195952, + "language_loss": 0.78119999, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.80279189, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.47460938, + "step": 5743, + "time_per_iteration": 2.4018890857696533 + }, + { + "auxiliary_loss_clip": 0.01075258, + "auxiliary_loss_mlp": 0.01085101, + "balance_loss_clip": 1.05062604, + "balance_loss_mlp": 1.0273031, + "epoch": 0.3453479633248159, + "flos": 27928455903360.0, + "grad_norm": 1.6742534608801989, + "language_loss": 0.709185, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.73078859, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47851562, + "step": 5744, + "time_per_iteration": 2.5147030353546143 + }, + { + "auxiliary_loss_clip": 0.0107611, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_clip": 1.04698765, + "balance_loss_mlp": 1.02934241, + "epoch": 0.34540808657748384, + "flos": 19605197088000.0, + "grad_norm": 1.9752462198108303, + "language_loss": 0.81346893, + "learning_rate": 3.044075480787665e-06, + "loss": 0.83504701, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.46875, + "step": 5745, + "time_per_iteration": 2.421937942504883 + }, + { + "auxiliary_loss_clip": 0.0108057, + "auxiliary_loss_mlp": 0.01073966, + "balance_loss_clip": 1.03472209, + "balance_loss_mlp": 1.03019392, + "epoch": 0.3454682098301518, + "flos": 20410643220480.0, + "grad_norm": 2.439711401400575, + "language_loss": 0.90261161, + "learning_rate": 3.043743280407182e-06, + "loss": 0.92415696, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.50390625, + "step": 5746, + "time_per_iteration": 2.4659440517425537 + }, + { + "auxiliary_loss_clip": 0.01079751, + "auxiliary_loss_mlp": 0.01069853, + "balance_loss_clip": 1.03280246, + "balance_loss_mlp": 1.03012323, + "epoch": 0.34552833308281977, + "flos": 21324040876800.0, + "grad_norm": 1.8753451880101872, + "language_loss": 0.66763258, + "learning_rate": 3.043411040447849e-06, + "loss": 0.68912858, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.49609375, + "step": 5747, + "time_per_iteration": 2.4393560886383057 + }, + { + "auxiliary_loss_clip": 0.01077987, + "auxiliary_loss_mlp": 0.01056311, + "balance_loss_clip": 1.02369571, + "balance_loss_mlp": 1.03147793, + "epoch": 0.34558845633548774, + "flos": 36242253740160.0, + "grad_norm": 1.7358477885173567, + "language_loss": 0.74648219, + "learning_rate": 3.043078760922264e-06, + "loss": 0.76782519, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.46484375, + "step": 5748, + "time_per_iteration": 2.5488250255584717 + }, + { + "auxiliary_loss_clip": 0.01078609, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.01590705, + "balance_loss_mlp": 1.03302026, + "epoch": 0.3456485795881557, + "flos": 22449606065280.0, + "grad_norm": 3.126749824787166, + "language_loss": 0.7760011, + "learning_rate": 3.042746441843029e-06, + "loss": 0.797261, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45703125, + "step": 5749, + "time_per_iteration": 2.4612433910369873 + }, + { + "auxiliary_loss_clip": 0.01043318, + "auxiliary_loss_mlp": 0.01058894, + "balance_loss_clip": 1.05603337, + "balance_loss_mlp": 1.03312016, + "epoch": 0.34570870284082367, + "flos": 62001135936000.0, + "grad_norm": 0.9103711331807868, + "language_loss": 0.62736475, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64838684, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.10205078, + "step": 5750, + "time_per_iteration": 2.9870095252990723 + }, + { + "auxiliary_loss_clip": 0.0108357, + "auxiliary_loss_mlp": 0.01045768, + "balance_loss_clip": 1.01477313, + "balance_loss_mlp": 1.03786445, + "epoch": 0.34576882609349163, + "flos": 22781141136000.0, + "grad_norm": 1.5930095724477433, + "language_loss": 0.81987906, + "learning_rate": 3.042081685074012e-06, + "loss": 0.84117246, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 5751, + "time_per_iteration": 3.8663382530212402 + }, + { + "auxiliary_loss_clip": 0.01088125, + "auxiliary_loss_mlp": 0.01060104, + "balance_loss_clip": 1.02684498, + "balance_loss_mlp": 1.04108405, + "epoch": 0.34582894934615965, + "flos": 12348010771200.0, + "grad_norm": 2.111542051075509, + "language_loss": 0.85897046, + "learning_rate": 3.041749247409439e-06, + "loss": 0.88045275, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46875, + "step": 5752, + "time_per_iteration": 2.413193702697754 + }, + { + "auxiliary_loss_clip": 0.01057248, + "auxiliary_loss_mlp": 0.0100654, + "balance_loss_clip": 1.00348806, + "balance_loss_mlp": 1.04603124, + "epoch": 0.3458890725988276, + "flos": 70164563080320.0, + "grad_norm": 0.7438335388390868, + "language_loss": 0.63166505, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65230292, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.03051758, + "router_z_loss_mlp": 0.11230469, + "step": 5753, + "time_per_iteration": 4.324312925338745 + }, + { + "auxiliary_loss_clip": 0.0109198, + "auxiliary_loss_mlp": 0.01068534, + "balance_loss_clip": 1.03410602, + "balance_loss_mlp": 1.04394913, + "epoch": 0.3459491958514956, + "flos": 17091624954240.0, + "grad_norm": 1.8442619947680616, + "language_loss": 0.72211492, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.74372005, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48046875, + "step": 5754, + "time_per_iteration": 2.4320058822631836 + }, + { + "auxiliary_loss_clip": 0.01090691, + "auxiliary_loss_mlp": 0.01073566, + "balance_loss_clip": 1.03870964, + "balance_loss_mlp": 1.04110157, + "epoch": 0.34600931910416355, + "flos": 16650113500800.0, + "grad_norm": 1.6592739180145857, + "language_loss": 0.74655104, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.7681936, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.49609375, + "step": 5755, + "time_per_iteration": 2.4063165187835693 + }, + { + "auxiliary_loss_clip": 0.01083943, + "auxiliary_loss_mlp": 0.01076311, + "balance_loss_clip": 1.04414856, + "balance_loss_mlp": 1.03727865, + "epoch": 0.3460694423568315, + "flos": 38544635859840.0, + "grad_norm": 1.471286216084281, + "language_loss": 0.73397279, + "learning_rate": 3.040419101844869e-06, + "loss": 0.7555753, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.46679688, + "step": 5756, + "time_per_iteration": 4.073963403701782 + }, + { + "auxiliary_loss_clip": 0.01049918, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_clip": 1.04630411, + "balance_loss_mlp": 1.03916097, + "epoch": 0.3461295656094995, + "flos": 72077837564160.0, + "grad_norm": 0.7599659908252645, + "language_loss": 0.62765598, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64864659, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.02832031, + "router_z_loss_mlp": 0.10742188, + "step": 5757, + "time_per_iteration": 4.416916847229004 + }, + { + "auxiliary_loss_clip": 0.01044301, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.03380609, + "balance_loss_mlp": 1.03401721, + "epoch": 0.34618968886216744, + "flos": 65457118932480.0, + "grad_norm": 0.8976355954556915, + "language_loss": 0.59244919, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61325836, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.02807617, + "router_z_loss_mlp": 0.10253906, + "step": 5758, + "time_per_iteration": 3.0120491981506348 + }, + { + "auxiliary_loss_clip": 0.01074336, + "auxiliary_loss_mlp": 0.010656, + "balance_loss_clip": 1.03417647, + "balance_loss_mlp": 1.02886248, + "epoch": 0.3462498121148354, + "flos": 23471548738560.0, + "grad_norm": 1.6856442066214543, + "language_loss": 0.72538048, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.74677992, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45507812, + "step": 5759, + "time_per_iteration": 2.4543159008026123 + }, + { + "auxiliary_loss_clip": 0.01074283, + "auxiliary_loss_mlp": 0.01059362, + "balance_loss_clip": 1.02696061, + "balance_loss_mlp": 1.02851164, + "epoch": 0.3463099353675034, + "flos": 24169636840320.0, + "grad_norm": 2.119169691937801, + "language_loss": 0.84741378, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.86875021, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45703125, + "step": 5760, + "time_per_iteration": 2.444492816925049 + }, + { + "auxiliary_loss_clip": 0.01022241, + "auxiliary_loss_mlp": 0.01003536, + "balance_loss_clip": 1.00027001, + "balance_loss_mlp": 1.01296687, + "epoch": 0.34637005862017134, + "flos": 63697915745280.0, + "grad_norm": 0.8444779811170003, + "language_loss": 0.5655418, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58579957, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.03271484, + "router_z_loss_mlp": 0.09277344, + "step": 5761, + "time_per_iteration": 3.095231294631958 + }, + { + "auxiliary_loss_clip": 0.01070759, + "auxiliary_loss_mlp": 0.01052832, + "balance_loss_clip": 1.02118158, + "balance_loss_mlp": 1.02599728, + "epoch": 0.3464301818728393, + "flos": 13144868709120.0, + "grad_norm": 2.311630279129401, + "language_loss": 0.96298701, + "learning_rate": 3.038422700166474e-06, + "loss": 0.98422289, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44726562, + "step": 5762, + "time_per_iteration": 2.400195837020874 + }, + { + "auxiliary_loss_clip": 0.01075261, + "auxiliary_loss_mlp": 0.01058748, + "balance_loss_clip": 1.02639437, + "balance_loss_mlp": 1.02718556, + "epoch": 0.34649030512550727, + "flos": 29313879408000.0, + "grad_norm": 1.580712593362088, + "language_loss": 0.70794535, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.72928542, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.48046875, + "step": 5763, + "time_per_iteration": 2.517936944961548 + }, + { + "auxiliary_loss_clip": 0.01079221, + "auxiliary_loss_mlp": 0.01051539, + "balance_loss_clip": 1.01737392, + "balance_loss_mlp": 1.03104484, + "epoch": 0.34655042837817523, + "flos": 23729801132160.0, + "grad_norm": 1.6192029529189438, + "language_loss": 0.85072184, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.87202948, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.48046875, + "step": 5764, + "time_per_iteration": 2.428978443145752 + }, + { + "auxiliary_loss_clip": 0.01081456, + "auxiliary_loss_mlp": 0.01049992, + "balance_loss_clip": 1.01918805, + "balance_loss_mlp": 1.03518891, + "epoch": 0.34661055163084326, + "flos": 22053132132480.0, + "grad_norm": 2.348373604582191, + "language_loss": 0.70069182, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.72200632, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46289062, + "step": 5765, + "time_per_iteration": 2.4405624866485596 + }, + { + "auxiliary_loss_clip": 0.01084855, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.01869082, + "balance_loss_mlp": 1.03973961, + "epoch": 0.3466706748835112, + "flos": 21798126495360.0, + "grad_norm": 1.8973113223679967, + "language_loss": 0.78063786, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80196989, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 5766, + "time_per_iteration": 2.435934066772461 + }, + { + "auxiliary_loss_clip": 0.01090271, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.01679921, + "balance_loss_mlp": 1.04554296, + "epoch": 0.3467307981361792, + "flos": 19460726415360.0, + "grad_norm": 1.4759685346577198, + "language_loss": 0.74633801, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.76768667, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.44726562, + "step": 5767, + "time_per_iteration": 2.4651293754577637 + }, + { + "auxiliary_loss_clip": 0.01092903, + "auxiliary_loss_mlp": 0.01052664, + "balance_loss_clip": 1.02138352, + "balance_loss_mlp": 1.04622996, + "epoch": 0.34679092138884715, + "flos": 24826283291520.0, + "grad_norm": 1.901868111213925, + "language_loss": 0.79603958, + "learning_rate": 3.036424880912893e-06, + "loss": 0.81749523, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46679688, + "step": 5768, + "time_per_iteration": 2.4725663661956787 + }, + { + "auxiliary_loss_clip": 0.01110797, + "auxiliary_loss_mlp": 0.01026365, + "balance_loss_clip": 1.02305067, + "balance_loss_mlp": 1.09988856, + "epoch": 0.3468510446415151, + "flos": 63233116548480.0, + "grad_norm": 0.790209355758782, + "language_loss": 0.57625508, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59762669, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.109375, + "step": 5769, + "time_per_iteration": 3.080239772796631 + }, + { + "auxiliary_loss_clip": 0.01101225, + "auxiliary_loss_mlp": 0.01052133, + "balance_loss_clip": 1.01708555, + "balance_loss_mlp": 1.04926467, + "epoch": 0.3469111678941831, + "flos": 12120168038400.0, + "grad_norm": 3.5835534670198808, + "language_loss": 0.88215768, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.90369129, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.51953125, + "step": 5770, + "time_per_iteration": 2.401235342025757 + }, + { + "auxiliary_loss_clip": 0.01120012, + "auxiliary_loss_mlp": 0.01008597, + "balance_loss_clip": 1.00502121, + "balance_loss_mlp": 1.10899377, + "epoch": 0.34697129114685105, + "flos": 65931134728320.0, + "grad_norm": 0.7877526445140526, + "language_loss": 0.59856343, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61984956, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.11035156, + "step": 5771, + "time_per_iteration": 2.8367111682891846 + }, + { + "auxiliary_loss_clip": 0.01090666, + "auxiliary_loss_mlp": 0.01058775, + "balance_loss_clip": 1.02878189, + "balance_loss_mlp": 1.0451349, + "epoch": 0.347031414399519, + "flos": 34452920183040.0, + "grad_norm": 2.0748850720451593, + "language_loss": 0.72962284, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.75111723, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45507812, + "step": 5772, + "time_per_iteration": 2.5532376766204834 + }, + { + "auxiliary_loss_clip": 0.01089597, + "auxiliary_loss_mlp": 0.01069608, + "balance_loss_clip": 1.03822029, + "balance_loss_mlp": 1.04147112, + "epoch": 0.347091537652187, + "flos": 26942892733440.0, + "grad_norm": 1.5255470421047013, + "language_loss": 0.77800405, + "learning_rate": 3.034758950632507e-06, + "loss": 0.79959607, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.48046875, + "step": 5773, + "time_per_iteration": 2.489081621170044 + }, + { + "auxiliary_loss_clip": 0.01081958, + "auxiliary_loss_mlp": 0.01078766, + "balance_loss_clip": 1.04641247, + "balance_loss_mlp": 1.03408098, + "epoch": 0.34715166090485494, + "flos": 21141165841920.0, + "grad_norm": 2.232845841914769, + "language_loss": 0.72458911, + "learning_rate": 3.034425646811396e-06, + "loss": 0.74619639, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47851562, + "step": 5774, + "time_per_iteration": 2.458618402481079 + }, + { + "auxiliary_loss_clip": 0.01078337, + "auxiliary_loss_mlp": 0.01098602, + "balance_loss_clip": 1.06900287, + "balance_loss_mlp": 1.03329992, + "epoch": 0.3472117841575229, + "flos": 23476855265280.0, + "grad_norm": 1.762891794177019, + "language_loss": 0.77353138, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.79530078, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 5775, + "time_per_iteration": 2.4566147327423096 + }, + { + "auxiliary_loss_clip": 0.01078285, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_clip": 1.06768227, + "balance_loss_mlp": 1.02934825, + "epoch": 0.34727190741019087, + "flos": 17491869313920.0, + "grad_norm": 2.008549467131087, + "language_loss": 0.80387831, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.82569122, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.48828125, + "step": 5776, + "time_per_iteration": 2.408928155899048 + }, + { + "auxiliary_loss_clip": 0.01048063, + "auxiliary_loss_mlp": 0.01088732, + "balance_loss_clip": 1.08451223, + "balance_loss_mlp": 1.0369215, + "epoch": 0.34733203066285884, + "flos": 65261848498560.0, + "grad_norm": 0.9117067832794445, + "language_loss": 0.63429564, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65566361, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.04223633, + "router_z_loss_mlp": 0.11132812, + "step": 5777, + "time_per_iteration": 3.0589051246643066 + }, + { + "auxiliary_loss_clip": 0.01073588, + "auxiliary_loss_mlp": 0.01097974, + "balance_loss_clip": 1.06631231, + "balance_loss_mlp": 1.02584147, + "epoch": 0.3473921539155268, + "flos": 28657442424960.0, + "grad_norm": 1.7667876768086608, + "language_loss": 0.66678029, + "learning_rate": 3.033092039398119e-06, + "loss": 0.68849593, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4765625, + "step": 5778, + "time_per_iteration": 2.5368399620056152 + }, + { + "auxiliary_loss_clip": 0.01076264, + "auxiliary_loss_mlp": 0.01095614, + "balance_loss_clip": 1.06276, + "balance_loss_mlp": 1.02797508, + "epoch": 0.3474522771681948, + "flos": 40835497230720.0, + "grad_norm": 1.801942470518592, + "language_loss": 0.74106085, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.76277965, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48242188, + "step": 5779, + "time_per_iteration": 2.5849602222442627 + }, + { + "auxiliary_loss_clip": 0.01079472, + "auxiliary_loss_mlp": 0.01089844, + "balance_loss_clip": 1.05701399, + "balance_loss_mlp": 1.0308435, + "epoch": 0.3475124004208628, + "flos": 24607412778240.0, + "grad_norm": 1.8460594367499052, + "language_loss": 0.64182055, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.66351372, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.484375, + "step": 5780, + "time_per_iteration": 2.546299934387207 + }, + { + "auxiliary_loss_clip": 0.01082602, + "auxiliary_loss_mlp": 0.01081822, + "balance_loss_clip": 1.051054, + "balance_loss_mlp": 1.03695893, + "epoch": 0.34757252367353075, + "flos": 22710197520000.0, + "grad_norm": 1.6784908181188403, + "language_loss": 0.72763121, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74927545, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 5781, + "time_per_iteration": 2.4660961627960205 + }, + { + "auxiliary_loss_clip": 0.01092704, + "auxiliary_loss_mlp": 0.01078364, + "balance_loss_clip": 1.04415083, + "balance_loss_mlp": 1.04488182, + "epoch": 0.3476326469261987, + "flos": 19827174712320.0, + "grad_norm": 2.0576391041474738, + "language_loss": 0.79391694, + "learning_rate": 3.031757805185612e-06, + "loss": 0.81562757, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.47851562, + "step": 5782, + "time_per_iteration": 2.467360496520996 + }, + { + "auxiliary_loss_clip": 0.01094587, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03486419, + "balance_loss_mlp": 1.0485729, + "epoch": 0.3476927701788667, + "flos": 19937081272320.0, + "grad_norm": 3.7160584077882852, + "language_loss": 0.63997865, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.66156977, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4609375, + "step": 5783, + "time_per_iteration": 2.438946485519409 + }, + { + "auxiliary_loss_clip": 0.01105588, + "auxiliary_loss_mlp": 0.01053358, + "balance_loss_clip": 1.02422309, + "balance_loss_mlp": 1.05944991, + "epoch": 0.34775289343153465, + "flos": 20734218501120.0, + "grad_norm": 2.1068858006369204, + "language_loss": 0.89118379, + "learning_rate": 3.031090453282605e-06, + "loss": 0.91277325, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4609375, + "step": 5784, + "time_per_iteration": 2.5052058696746826 + }, + { + "auxiliary_loss_clip": 0.01110561, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_clip": 1.01605225, + "balance_loss_mlp": 1.06364048, + "epoch": 0.3478130166842026, + "flos": 19353822232320.0, + "grad_norm": 1.7733480970983828, + "language_loss": 0.83180451, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.85338241, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46875, + "step": 5785, + "time_per_iteration": 2.4215545654296875 + }, + { + "auxiliary_loss_clip": 0.01115716, + "auxiliary_loss_mlp": 0.01054244, + "balance_loss_clip": 1.02553821, + "balance_loss_mlp": 1.0663892, + "epoch": 0.3478731399368706, + "flos": 22050199578240.0, + "grad_norm": 1.711744055374755, + "language_loss": 0.81426966, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.83596927, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.49414062, + "step": 5786, + "time_per_iteration": 2.468496799468994 + }, + { + "auxiliary_loss_clip": 0.01117511, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_clip": 1.02009261, + "balance_loss_mlp": 1.07008278, + "epoch": 0.34793326318953854, + "flos": 18040459507200.0, + "grad_norm": 2.188406086539704, + "language_loss": 0.76106763, + "learning_rate": 3.030089132216836e-06, + "loss": 0.78274632, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.47460938, + "step": 5787, + "time_per_iteration": 2.425279140472412 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01055549, + "balance_loss_clip": 1.02679563, + "balance_loss_mlp": 1.0695622, + "epoch": 0.3479933864422065, + "flos": 29313390648960.0, + "grad_norm": 1.5681236698017509, + "language_loss": 0.83004665, + "learning_rate": 3.029755280389203e-06, + "loss": 0.8518014, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.5, + "step": 5788, + "time_per_iteration": 2.531289577484131 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.0106496, + "balance_loss_clip": 1.03170121, + "balance_loss_mlp": 1.07165122, + "epoch": 0.3480535096948745, + "flos": 20119677016320.0, + "grad_norm": 2.361560001324782, + "language_loss": 0.87768996, + "learning_rate": 3.029421389513147e-06, + "loss": 0.8995859, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.53125, + "step": 5789, + "time_per_iteration": 2.459383726119995 + }, + { + "auxiliary_loss_clip": 0.01125235, + "auxiliary_loss_mlp": 0.01085725, + "balance_loss_clip": 1.05265641, + "balance_loss_mlp": 1.07369626, + "epoch": 0.34811363294754244, + "flos": 18548061897600.0, + "grad_norm": 1.854966163884327, + "language_loss": 0.860726, + "learning_rate": 3.029087459601328e-06, + "loss": 0.88283557, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.515625, + "step": 5790, + "time_per_iteration": 2.484607696533203 + }, + { + "auxiliary_loss_clip": 0.01122616, + "auxiliary_loss_mlp": 0.01084353, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07115054, + "epoch": 0.3481737562002104, + "flos": 26869086385920.0, + "grad_norm": 2.0005381945502663, + "language_loss": 0.82757109, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.84964073, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.515625, + "step": 5791, + "time_per_iteration": 3.9457123279571533 + }, + { + "auxiliary_loss_clip": 0.01118577, + "auxiliary_loss_mlp": 0.01073195, + "balance_loss_clip": 1.03936386, + "balance_loss_mlp": 1.0673548, + "epoch": 0.3482338794528784, + "flos": 28907525560320.0, + "grad_norm": 1.6859826228784331, + "language_loss": 0.78889602, + "learning_rate": 3.028419482721056e-06, + "loss": 0.81081378, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.51171875, + "step": 5792, + "time_per_iteration": 3.9383633136749268 + }, + { + "auxiliary_loss_clip": 0.01111965, + "auxiliary_loss_mlp": 0.01078371, + "balance_loss_clip": 1.04695928, + "balance_loss_mlp": 1.06287336, + "epoch": 0.3482940027055464, + "flos": 22199662575360.0, + "grad_norm": 1.4940570846892904, + "language_loss": 0.82832783, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.85023117, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4921875, + "step": 5793, + "time_per_iteration": 2.513511896133423 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01076586, + "balance_loss_clip": 1.04361308, + "balance_loss_mlp": 1.06441689, + "epoch": 0.34835412595821436, + "flos": 20301679267200.0, + "grad_norm": 1.8946959068397815, + "language_loss": 0.77535689, + "learning_rate": 3.027751349849706e-06, + "loss": 0.79726493, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.49609375, + "step": 5794, + "time_per_iteration": 2.465667247772217 + }, + { + "auxiliary_loss_clip": 0.01109408, + "auxiliary_loss_mlp": 0.01069368, + "balance_loss_clip": 1.03863549, + "balance_loss_mlp": 1.06124735, + "epoch": 0.3484142492108823, + "flos": 20448628646400.0, + "grad_norm": 1.963909151380665, + "language_loss": 0.57904178, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.60082948, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.48046875, + "step": 5795, + "time_per_iteration": 2.496779441833496 + }, + { + "auxiliary_loss_clip": 0.01104478, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_clip": 1.01984334, + "balance_loss_mlp": 1.05808794, + "epoch": 0.3484743724635503, + "flos": 24351778736640.0, + "grad_norm": 1.639251149015611, + "language_loss": 0.83856297, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.86008584, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.46484375, + "step": 5796, + "time_per_iteration": 3.887704849243164 + }, + { + "auxiliary_loss_clip": 0.01099486, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.02728462, + "balance_loss_mlp": 1.05388904, + "epoch": 0.34853449571621825, + "flos": 24351848559360.0, + "grad_norm": 2.392264744977286, + "language_loss": 0.84519941, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.86676276, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.45507812, + "step": 5797, + "time_per_iteration": 3.8614859580993652 + }, + { + "auxiliary_loss_clip": 0.01095737, + "auxiliary_loss_mlp": 0.0105148, + "balance_loss_clip": 1.02054513, + "balance_loss_mlp": 1.05125594, + "epoch": 0.3485946189688862, + "flos": 27266572748160.0, + "grad_norm": 1.629144811364495, + "language_loss": 0.74175072, + "learning_rate": 3.026414616539167e-06, + "loss": 0.76322287, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 5798, + "time_per_iteration": 2.5441410541534424 + }, + { + "auxiliary_loss_clip": 0.0108839, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.02058578, + "balance_loss_mlp": 1.04271555, + "epoch": 0.3486547422215542, + "flos": 20155672494720.0, + "grad_norm": 1.839228890646014, + "language_loss": 0.77217484, + "learning_rate": 3.026080335875485e-06, + "loss": 0.79356539, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45703125, + "step": 5799, + "time_per_iteration": 2.4297337532043457 + }, + { + "auxiliary_loss_clip": 0.01088434, + "auxiliary_loss_mlp": 0.01050925, + "balance_loss_clip": 1.02045512, + "balance_loss_mlp": 1.04215097, + "epoch": 0.34871486547422215, + "flos": 20229304285440.0, + "grad_norm": 1.9143831638052071, + "language_loss": 0.76959896, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7909925, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.46289062, + "step": 5800, + "time_per_iteration": 2.513524293899536 + }, + { + "auxiliary_loss_clip": 0.01085408, + "auxiliary_loss_mlp": 0.01051368, + "balance_loss_clip": 1.01906276, + "balance_loss_mlp": 1.03784585, + "epoch": 0.3487749887268901, + "flos": 44051591208960.0, + "grad_norm": 2.968128044711718, + "language_loss": 0.68710959, + "learning_rate": 3.025411657833591e-06, + "loss": 0.70847732, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.47460938, + "step": 5801, + "time_per_iteration": 2.623863458633423 + }, + { + "auxiliary_loss_clip": 0.01079768, + "auxiliary_loss_mlp": 0.01053664, + "balance_loss_clip": 1.02097714, + "balance_loss_mlp": 1.03553104, + "epoch": 0.3488351119795581, + "flos": 23294015141760.0, + "grad_norm": 2.0251315136323544, + "language_loss": 0.77438146, + "learning_rate": 3.025077260480735e-06, + "loss": 0.79571581, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44335938, + "step": 5802, + "time_per_iteration": 2.5143930912017822 + }, + { + "auxiliary_loss_clip": 0.01074823, + "auxiliary_loss_mlp": 0.01046755, + "balance_loss_clip": 1.01695275, + "balance_loss_mlp": 1.03140593, + "epoch": 0.34889523523222604, + "flos": 19933904338560.0, + "grad_norm": 1.8092280897434303, + "language_loss": 0.7998361, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.8210519, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43359375, + "step": 5803, + "time_per_iteration": 2.445662260055542 + }, + { + "auxiliary_loss_clip": 0.01077751, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_clip": 1.0201292, + "balance_loss_mlp": 1.02976847, + "epoch": 0.348955358484894, + "flos": 30444855857280.0, + "grad_norm": 1.7566057106547157, + "language_loss": 0.68784267, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.70915002, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5804, + "time_per_iteration": 2.5335896015167236 + }, + { + "auxiliary_loss_clip": 0.01072207, + "auxiliary_loss_mlp": 0.01049267, + "balance_loss_clip": 1.02099025, + "balance_loss_mlp": 1.02983963, + "epoch": 0.349015481737562, + "flos": 17999122590720.0, + "grad_norm": 1.9277410812636442, + "language_loss": 0.78667593, + "learning_rate": 3.024073835246702e-06, + "loss": 0.80789071, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42382812, + "step": 5805, + "time_per_iteration": 2.3964343070983887 + }, + { + "auxiliary_loss_clip": 0.01075776, + "auxiliary_loss_mlp": 0.01048611, + "balance_loss_clip": 1.01554191, + "balance_loss_mlp": 1.03013408, + "epoch": 0.34907560499023, + "flos": 27197269966080.0, + "grad_norm": 2.8325112695221346, + "language_loss": 0.69236374, + "learning_rate": 3.023739282485814e-06, + "loss": 0.71360755, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45703125, + "step": 5806, + "time_per_iteration": 2.5256264209747314 + }, + { + "auxiliary_loss_clip": 0.01072899, + "auxiliary_loss_mlp": 0.01043246, + "balance_loss_clip": 1.01295519, + "balance_loss_mlp": 1.02797508, + "epoch": 0.34913572824289796, + "flos": 30225566407680.0, + "grad_norm": 1.4595041924837142, + "language_loss": 0.73442417, + "learning_rate": 3.023404690904629e-06, + "loss": 0.75558555, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44921875, + "step": 5807, + "time_per_iteration": 2.4796314239501953 + }, + { + "auxiliary_loss_clip": 0.01070951, + "auxiliary_loss_mlp": 0.0105087, + "balance_loss_clip": 1.0178256, + "balance_loss_mlp": 1.02415967, + "epoch": 0.3491958514955659, + "flos": 29970595681920.0, + "grad_norm": 1.9628529179786274, + "language_loss": 0.7554847, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.77670294, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.46875, + "step": 5808, + "time_per_iteration": 2.5145537853240967 + }, + { + "auxiliary_loss_clip": 0.01069236, + "auxiliary_loss_mlp": 0.01050388, + "balance_loss_clip": 1.01960826, + "balance_loss_mlp": 1.02499926, + "epoch": 0.3492559747482339, + "flos": 22782188476800.0, + "grad_norm": 1.8314000773603696, + "language_loss": 0.85023177, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.87142801, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44140625, + "step": 5809, + "time_per_iteration": 2.429583787918091 + }, + { + "auxiliary_loss_clip": 0.0106765, + "auxiliary_loss_mlp": 0.01049657, + "balance_loss_clip": 1.02028346, + "balance_loss_mlp": 1.02416658, + "epoch": 0.34931609800090185, + "flos": 26066817187200.0, + "grad_norm": 1.9821434360238217, + "language_loss": 0.8134715, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.83464456, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 5810, + "time_per_iteration": 2.487069845199585 + }, + { + "auxiliary_loss_clip": 0.01069497, + "auxiliary_loss_mlp": 0.01051567, + "balance_loss_clip": 1.01949978, + "balance_loss_mlp": 1.02412069, + "epoch": 0.3493762212535698, + "flos": 29240736376320.0, + "grad_norm": 1.728257900291536, + "language_loss": 0.76588452, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.78709519, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45507812, + "step": 5811, + "time_per_iteration": 2.4658031463623047 + }, + { + "auxiliary_loss_clip": 0.01072955, + "auxiliary_loss_mlp": 0.01050297, + "balance_loss_clip": 1.01857531, + "balance_loss_mlp": 1.0242238, + "epoch": 0.3494363445062378, + "flos": 27124825161600.0, + "grad_norm": 1.6241625702323288, + "language_loss": 0.80598879, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82722133, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.48828125, + "step": 5812, + "time_per_iteration": 2.461359977722168 + }, + { + "auxiliary_loss_clip": 0.01069946, + "auxiliary_loss_mlp": 0.01049701, + "balance_loss_clip": 1.01884985, + "balance_loss_mlp": 1.02379894, + "epoch": 0.34949646775890575, + "flos": 12275391409920.0, + "grad_norm": 1.95677368743276, + "language_loss": 0.7167778, + "learning_rate": 3.021396326901918e-06, + "loss": 0.73797429, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4609375, + "step": 5813, + "time_per_iteration": 2.373339891433716 + }, + { + "auxiliary_loss_clip": 0.01067811, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.01590896, + "balance_loss_mlp": 1.02293003, + "epoch": 0.3495565910115737, + "flos": 17164558517760.0, + "grad_norm": 2.0369109248681463, + "language_loss": 0.77316213, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79431504, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 5814, + "time_per_iteration": 2.3924319744110107 + }, + { + "auxiliary_loss_clip": 0.01072467, + "auxiliary_loss_mlp": 0.01059153, + "balance_loss_clip": 1.02396214, + "balance_loss_mlp": 1.02405119, + "epoch": 0.3496167142642417, + "flos": 26464547928960.0, + "grad_norm": 1.5191922182525877, + "language_loss": 0.85942668, + "learning_rate": 3.020726562247328e-06, + "loss": 0.88074291, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.484375, + "step": 5815, + "time_per_iteration": 2.4504432678222656 + }, + { + "auxiliary_loss_clip": 0.0106857, + "auxiliary_loss_mlp": 0.01052559, + "balance_loss_clip": 1.02092075, + "balance_loss_mlp": 1.02305543, + "epoch": 0.34967683751690964, + "flos": 17414048160000.0, + "grad_norm": 1.9438861608842157, + "language_loss": 0.79244387, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.81365514, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45507812, + "step": 5816, + "time_per_iteration": 2.3882343769073486 + }, + { + "auxiliary_loss_clip": 0.01074945, + "auxiliary_loss_mlp": 0.01055248, + "balance_loss_clip": 1.02244151, + "balance_loss_mlp": 1.02699399, + "epoch": 0.3497369607695776, + "flos": 22598964328320.0, + "grad_norm": 2.259040377787045, + "language_loss": 0.61393118, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.63523316, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.48046875, + "step": 5817, + "time_per_iteration": 2.4290599822998047 + }, + { + "auxiliary_loss_clip": 0.01016321, + "auxiliary_loss_mlp": 0.01021094, + "balance_loss_clip": 1.01785171, + "balance_loss_mlp": 1.00670481, + "epoch": 0.34979708402224563, + "flos": 68526193708800.0, + "grad_norm": 0.8927622231140444, + "language_loss": 0.59962922, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.62000334, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.09619141, + "step": 5818, + "time_per_iteration": 3.125664234161377 + }, + { + "auxiliary_loss_clip": 0.01067554, + "auxiliary_loss_mlp": 0.01055638, + "balance_loss_clip": 1.02209258, + "balance_loss_mlp": 1.02396178, + "epoch": 0.3498572072749136, + "flos": 18988630744320.0, + "grad_norm": 2.3915474440455147, + "language_loss": 0.84068668, + "learning_rate": 3.019386568567123e-06, + "loss": 0.86191857, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4375, + "step": 5819, + "time_per_iteration": 2.383964776992798 + }, + { + "auxiliary_loss_clip": 0.01069921, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.01928961, + "balance_loss_mlp": 1.02367663, + "epoch": 0.34991733052758156, + "flos": 27817641648000.0, + "grad_norm": 1.704943036394434, + "language_loss": 0.71330804, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.73452652, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.46289062, + "step": 5820, + "time_per_iteration": 2.480193853378296 + }, + { + "auxiliary_loss_clip": 0.01068976, + "auxiliary_loss_mlp": 0.01059052, + "balance_loss_clip": 1.02760458, + "balance_loss_mlp": 1.02294505, + "epoch": 0.3499774537802495, + "flos": 33582779568000.0, + "grad_norm": 1.689992856138994, + "language_loss": 0.70676184, + "learning_rate": 3.018716339744759e-06, + "loss": 0.72804213, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4609375, + "step": 5821, + "time_per_iteration": 2.49688982963562 + }, + { + "auxiliary_loss_clip": 0.01077178, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_clip": 1.03235364, + "balance_loss_mlp": 1.02698457, + "epoch": 0.3500375770329175, + "flos": 23475633367680.0, + "grad_norm": 2.0218095178282103, + "language_loss": 0.75640047, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.7778877, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.50390625, + "step": 5822, + "time_per_iteration": 2.4625141620635986 + }, + { + "auxiliary_loss_clip": 0.01072347, + "auxiliary_loss_mlp": 0.01066652, + "balance_loss_clip": 1.02874398, + "balance_loss_mlp": 1.0242672, + "epoch": 0.35009770028558546, + "flos": 19025045159040.0, + "grad_norm": 1.7937322377805607, + "language_loss": 0.79493153, + "learning_rate": 3.018045956403094e-06, + "loss": 0.81632149, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.48046875, + "step": 5823, + "time_per_iteration": 2.381380081176758 + }, + { + "auxiliary_loss_clip": 0.01013318, + "auxiliary_loss_mlp": 0.01009023, + "balance_loss_clip": 1.00561333, + "balance_loss_mlp": 1.00342298, + "epoch": 0.3501578235382534, + "flos": 68348555377920.0, + "grad_norm": 0.716615717363725, + "language_loss": 0.59391475, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61413813, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.03417969, + "router_z_loss_mlp": 0.09863281, + "step": 5824, + "time_per_iteration": 3.0502030849456787 + }, + { + "auxiliary_loss_clip": 0.01068371, + "auxiliary_loss_mlp": 0.01056095, + "balance_loss_clip": 1.02113152, + "balance_loss_mlp": 1.02188325, + "epoch": 0.3502179467909214, + "flos": 21249850504320.0, + "grad_norm": 2.5098443285060106, + "language_loss": 0.86593747, + "learning_rate": 3.017375418643811e-06, + "loss": 0.88718212, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.46484375, + "step": 5825, + "time_per_iteration": 2.406557559967041 + }, + { + "auxiliary_loss_clip": 0.01069702, + "auxiliary_loss_mlp": 0.01063073, + "balance_loss_clip": 1.03045774, + "balance_loss_mlp": 1.02350545, + "epoch": 0.35027807004358935, + "flos": 11942285328000.0, + "grad_norm": 2.171718314278295, + "language_loss": 0.8445484, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.8658762, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.4609375, + "step": 5826, + "time_per_iteration": 2.36505389213562 + }, + { + "auxiliary_loss_clip": 0.01072013, + "auxiliary_loss_mlp": 0.01064306, + "balance_loss_clip": 1.03105879, + "balance_loss_mlp": 1.02581215, + "epoch": 0.3503381932962573, + "flos": 21469838181120.0, + "grad_norm": 1.6082830086500342, + "language_loss": 0.81813109, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83949435, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4609375, + "step": 5827, + "time_per_iteration": 2.4067065715789795 + }, + { + "auxiliary_loss_clip": 0.01070342, + "auxiliary_loss_mlp": 0.01061582, + "balance_loss_clip": 1.03019381, + "balance_loss_mlp": 1.02428186, + "epoch": 0.3503983165489253, + "flos": 21250059972480.0, + "grad_norm": 2.103535874486085, + "language_loss": 0.72731251, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.74863172, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4609375, + "step": 5828, + "time_per_iteration": 2.442513942718506 + }, + { + "auxiliary_loss_clip": 0.01073836, + "auxiliary_loss_mlp": 0.01074403, + "balance_loss_clip": 1.03551662, + "balance_loss_mlp": 1.02555621, + "epoch": 0.35045843980159325, + "flos": 27814569448320.0, + "grad_norm": 21.868245058164973, + "language_loss": 0.80895257, + "learning_rate": 3.016033880279248e-06, + "loss": 0.83043492, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.48242188, + "step": 5829, + "time_per_iteration": 2.436824321746826 + }, + { + "auxiliary_loss_clip": 0.01077436, + "auxiliary_loss_mlp": 0.01069214, + "balance_loss_clip": 1.02997041, + "balance_loss_mlp": 1.02777731, + "epoch": 0.3505185630542612, + "flos": 25919972542080.0, + "grad_norm": 2.4841017402925143, + "language_loss": 0.73498297, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.75644946, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.49609375, + "step": 5830, + "time_per_iteration": 3.869079828262329 + }, + { + "auxiliary_loss_clip": 0.0107125, + "auxiliary_loss_mlp": 0.0105768, + "balance_loss_clip": 1.02509999, + "balance_loss_mlp": 1.02551258, + "epoch": 0.35057868630692923, + "flos": 20520724337280.0, + "grad_norm": 2.1028258510182884, + "language_loss": 0.89524508, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.91653442, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.45703125, + "step": 5831, + "time_per_iteration": 2.4045944213867188 + }, + { + "auxiliary_loss_clip": 0.01072337, + "auxiliary_loss_mlp": 0.01057534, + "balance_loss_clip": 1.0254662, + "balance_loss_mlp": 1.02552593, + "epoch": 0.3506388095595972, + "flos": 20447616216960.0, + "grad_norm": 2.4926967062826866, + "language_loss": 0.79624629, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.81754494, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46875, + "step": 5832, + "time_per_iteration": 3.8219618797302246 + }, + { + "auxiliary_loss_clip": 0.01073874, + "auxiliary_loss_mlp": 0.01069771, + "balance_loss_clip": 1.03396058, + "balance_loss_mlp": 1.02619267, + "epoch": 0.35069893281226516, + "flos": 23108626488960.0, + "grad_norm": 2.050430462970009, + "language_loss": 0.73158526, + "learning_rate": 3.014691725465008e-06, + "loss": 0.75302184, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.4765625, + "step": 5833, + "time_per_iteration": 2.4380710124969482 + }, + { + "auxiliary_loss_clip": 0.01070897, + "auxiliary_loss_mlp": 0.01053767, + "balance_loss_clip": 1.02198529, + "balance_loss_mlp": 1.02632165, + "epoch": 0.35075905606493313, + "flos": 27270762111360.0, + "grad_norm": 1.3183549200075686, + "language_loss": 0.81812167, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83936834, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4453125, + "step": 5834, + "time_per_iteration": 4.001646518707275 + }, + { + "auxiliary_loss_clip": 0.01075107, + "auxiliary_loss_mlp": 0.01056426, + "balance_loss_clip": 1.02344048, + "balance_loss_mlp": 1.02957249, + "epoch": 0.3508191793176011, + "flos": 19127794890240.0, + "grad_norm": 2.7909438498610744, + "language_loss": 0.85360861, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.87492394, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45507812, + "step": 5835, + "time_per_iteration": 2.38238263130188 + }, + { + "auxiliary_loss_clip": 0.01072686, + "auxiliary_loss_mlp": 0.01053783, + "balance_loss_clip": 1.02293134, + "balance_loss_mlp": 1.02703309, + "epoch": 0.35087930257026906, + "flos": 25556386976640.0, + "grad_norm": 1.6237561918683274, + "language_loss": 0.77774358, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.79900825, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 5836, + "time_per_iteration": 3.8332507610321045 + }, + { + "auxiliary_loss_clip": 0.01076074, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.01596761, + "balance_loss_mlp": 1.02941525, + "epoch": 0.350939425822937, + "flos": 18003277042560.0, + "grad_norm": 2.1377329023050295, + "language_loss": 0.7868551, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.80811375, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46679688, + "step": 5837, + "time_per_iteration": 2.3806662559509277 + }, + { + "auxiliary_loss_clip": 0.01073167, + "auxiliary_loss_mlp": 0.01047668, + "balance_loss_clip": 1.01486123, + "balance_loss_mlp": 1.02905583, + "epoch": 0.350999549075605, + "flos": 22272107379840.0, + "grad_norm": 1.68650829982697, + "language_loss": 0.69786578, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.71907413, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44140625, + "step": 5838, + "time_per_iteration": 2.4404561519622803 + }, + { + "auxiliary_loss_clip": 0.01073304, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.01380134, + "balance_loss_mlp": 1.02742851, + "epoch": 0.35105967232827295, + "flos": 14391407358720.0, + "grad_norm": 2.0484376735249374, + "language_loss": 0.84954381, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.87073529, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45703125, + "step": 5839, + "time_per_iteration": 2.372685432434082 + }, + { + "auxiliary_loss_clip": 0.01075219, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.02125764, + "balance_loss_mlp": 1.02731419, + "epoch": 0.3511197955809409, + "flos": 25081184194560.0, + "grad_norm": 2.0777803660218988, + "language_loss": 0.60660422, + "learning_rate": 3.012341473657572e-06, + "loss": 0.62791407, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48046875, + "step": 5840, + "time_per_iteration": 2.4933981895446777 + }, + { + "auxiliary_loss_clip": 0.01074103, + "auxiliary_loss_mlp": 0.01060412, + "balance_loss_clip": 1.02557862, + "balance_loss_mlp": 1.02763915, + "epoch": 0.3511799188336089, + "flos": 25882999545600.0, + "grad_norm": 2.5483035567991013, + "language_loss": 0.8878755, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.90922064, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.46484375, + "step": 5841, + "time_per_iteration": 2.4199652671813965 + }, + { + "auxiliary_loss_clip": 0.0107551, + "auxiliary_loss_mlp": 0.01062296, + "balance_loss_clip": 1.0269146, + "balance_loss_mlp": 1.0269978, + "epoch": 0.35124004208627685, + "flos": 20082704019840.0, + "grad_norm": 1.8144776608022941, + "language_loss": 0.76243138, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.78380942, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.48632812, + "step": 5842, + "time_per_iteration": 2.463029623031616 + }, + { + "auxiliary_loss_clip": 0.0107189, + "auxiliary_loss_mlp": 0.01063571, + "balance_loss_clip": 1.0289526, + "balance_loss_mlp": 1.02532101, + "epoch": 0.3513001653389448, + "flos": 17782521315840.0, + "grad_norm": 2.0559801936783555, + "language_loss": 0.70819163, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.72954619, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.46484375, + "step": 5843, + "time_per_iteration": 2.371574640274048 + }, + { + "auxiliary_loss_clip": 0.01068952, + "auxiliary_loss_mlp": 0.01057795, + "balance_loss_clip": 1.02450013, + "balance_loss_mlp": 1.02398062, + "epoch": 0.3513602885916128, + "flos": 29385870364800.0, + "grad_norm": 3.106905195881391, + "language_loss": 0.66746396, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68873143, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44921875, + "step": 5844, + "time_per_iteration": 2.508284568786621 + }, + { + "auxiliary_loss_clip": 0.01069272, + "auxiliary_loss_mlp": 0.01064867, + "balance_loss_clip": 1.03170264, + "balance_loss_mlp": 1.02366662, + "epoch": 0.3514204118442808, + "flos": 16178960436480.0, + "grad_norm": 2.029065472002873, + "language_loss": 0.77351749, + "learning_rate": 3.010661570469245e-06, + "loss": 0.79485881, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.45507812, + "step": 5845, + "time_per_iteration": 2.3708672523498535 + }, + { + "auxiliary_loss_clip": 0.01069303, + "auxiliary_loss_mlp": 0.01071077, + "balance_loss_clip": 1.03877163, + "balance_loss_mlp": 1.02419877, + "epoch": 0.35148053509694877, + "flos": 23833737849600.0, + "grad_norm": 2.754368169222888, + "language_loss": 0.755198, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.77660179, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.45117188, + "step": 5846, + "time_per_iteration": 2.4591095447540283 + }, + { + "auxiliary_loss_clip": 0.01069659, + "auxiliary_loss_mlp": 0.0106874, + "balance_loss_clip": 1.03464627, + "balance_loss_mlp": 1.0242362, + "epoch": 0.35154065834961673, + "flos": 20990376213120.0, + "grad_norm": 1.7124426127153407, + "language_loss": 0.76363039, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78501433, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.45507812, + "step": 5847, + "time_per_iteration": 2.4986507892608643 + }, + { + "auxiliary_loss_clip": 0.01066618, + "auxiliary_loss_mlp": 0.01067266, + "balance_loss_clip": 1.03541338, + "balance_loss_mlp": 1.02045465, + "epoch": 0.3516007816022847, + "flos": 33254072317440.0, + "grad_norm": 2.3514699440812414, + "language_loss": 0.73501819, + "learning_rate": 3.009653168561666e-06, + "loss": 0.75635707, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4609375, + "step": 5848, + "time_per_iteration": 2.530108690261841 + }, + { + "auxiliary_loss_clip": 0.01071461, + "auxiliary_loss_mlp": 0.01065392, + "balance_loss_clip": 1.0310955, + "balance_loss_mlp": 1.02405024, + "epoch": 0.35166090485495266, + "flos": 11726207723520.0, + "grad_norm": 2.3659354834879416, + "language_loss": 0.91719878, + "learning_rate": 3.009316958003178e-06, + "loss": 0.93856728, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47460938, + "step": 5849, + "time_per_iteration": 2.350961446762085 + }, + { + "auxiliary_loss_clip": 0.01066368, + "auxiliary_loss_mlp": 0.01061013, + "balance_loss_clip": 1.03090048, + "balance_loss_mlp": 1.02238429, + "epoch": 0.3517210281076206, + "flos": 22637333779200.0, + "grad_norm": 1.9798091018719801, + "language_loss": 0.76457191, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.7858457, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43945312, + "step": 5850, + "time_per_iteration": 2.462162494659424 + }, + { + "auxiliary_loss_clip": 0.01066568, + "auxiliary_loss_mlp": 0.01063041, + "balance_loss_clip": 1.03145015, + "balance_loss_mlp": 1.02296233, + "epoch": 0.3517811513602886, + "flos": 21321736727040.0, + "grad_norm": 1.4271543594638207, + "language_loss": 0.77033454, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.79163063, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43554688, + "step": 5851, + "time_per_iteration": 2.4188969135284424 + }, + { + "auxiliary_loss_clip": 0.01070571, + "auxiliary_loss_mlp": 0.01067216, + "balance_loss_clip": 1.0351249, + "balance_loss_mlp": 1.02505326, + "epoch": 0.35184127461295656, + "flos": 21031817863680.0, + "grad_norm": 1.9630680247349028, + "language_loss": 0.88776636, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.90914416, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45507812, + "step": 5852, + "time_per_iteration": 2.4202194213867188 + }, + { + "auxiliary_loss_clip": 0.01067408, + "auxiliary_loss_mlp": 0.01046332, + "balance_loss_clip": 1.01792479, + "balance_loss_mlp": 1.02460563, + "epoch": 0.3519013978656245, + "flos": 22454179453440.0, + "grad_norm": 2.5511955901605523, + "language_loss": 0.69199812, + "learning_rate": 3.007971733162737e-06, + "loss": 0.7131356, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42773438, + "step": 5853, + "time_per_iteration": 2.4069316387176514 + }, + { + "auxiliary_loss_clip": 0.01068539, + "auxiliary_loss_mlp": 0.01054725, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.02428567, + "epoch": 0.3519615211182925, + "flos": 13114459048320.0, + "grad_norm": 1.6479695697027004, + "language_loss": 0.82341397, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.84464669, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44335938, + "step": 5854, + "time_per_iteration": 2.386979103088379 + }, + { + "auxiliary_loss_clip": 0.01067326, + "auxiliary_loss_mlp": 0.01046318, + "balance_loss_clip": 1.01709974, + "balance_loss_mlp": 1.02433443, + "epoch": 0.35202164437096045, + "flos": 19134148757760.0, + "grad_norm": 3.05291086643193, + "language_loss": 0.74650729, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.76764369, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 5855, + "time_per_iteration": 2.3978748321533203 + }, + { + "auxiliary_loss_clip": 0.01067845, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_clip": 1.02220201, + "balance_loss_mlp": 1.02574563, + "epoch": 0.3520817676236284, + "flos": 26540972628480.0, + "grad_norm": 2.1274468151501935, + "language_loss": 0.72918022, + "learning_rate": 3.006962413152691e-06, + "loss": 0.75037575, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.421875, + "step": 5856, + "time_per_iteration": 2.453897476196289 + }, + { + "auxiliary_loss_clip": 0.01071365, + "auxiliary_loss_mlp": 0.01057696, + "balance_loss_clip": 1.02429318, + "balance_loss_mlp": 1.02664888, + "epoch": 0.3521418908762964, + "flos": 44891776010880.0, + "grad_norm": 1.5980651001252413, + "language_loss": 0.63332146, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.65461206, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.44726562, + "step": 5857, + "time_per_iteration": 2.622370719909668 + }, + { + "auxiliary_loss_clip": 0.01071055, + "auxiliary_loss_mlp": 0.0104439, + "balance_loss_clip": 1.01344371, + "balance_loss_mlp": 1.02710831, + "epoch": 0.3522020141289644, + "flos": 20186536003200.0, + "grad_norm": 2.063636088411625, + "language_loss": 0.73862195, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75977635, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43945312, + "step": 5858, + "time_per_iteration": 2.453098773956299 + }, + { + "auxiliary_loss_clip": 0.01071276, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.014539, + "balance_loss_mlp": 1.02655339, + "epoch": 0.35226213738163237, + "flos": 27562670922240.0, + "grad_norm": 1.6638805473600786, + "language_loss": 0.78389305, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.80503261, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.44726562, + "step": 5859, + "time_per_iteration": 2.5030131340026855 + }, + { + "auxiliary_loss_clip": 0.0107748, + "auxiliary_loss_mlp": 0.01058835, + "balance_loss_clip": 1.02714896, + "balance_loss_mlp": 1.03051591, + "epoch": 0.35232226063430033, + "flos": 22965203157120.0, + "grad_norm": 1.722573559506577, + "language_loss": 0.73505527, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.75641841, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46875, + "step": 5860, + "time_per_iteration": 2.4506642818450928 + }, + { + "auxiliary_loss_clip": 0.01074685, + "auxiliary_loss_mlp": 0.0105284, + "balance_loss_clip": 1.02191687, + "balance_loss_mlp": 1.02744627, + "epoch": 0.3523823838869683, + "flos": 19167386238720.0, + "grad_norm": 2.396963391109648, + "language_loss": 0.69064307, + "learning_rate": 3.005279449623811e-06, + "loss": 0.71191823, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.47265625, + "step": 5861, + "time_per_iteration": 2.390152931213379 + }, + { + "auxiliary_loss_clip": 0.01072299, + "auxiliary_loss_mlp": 0.01050618, + "balance_loss_clip": 1.02279496, + "balance_loss_mlp": 1.02963996, + "epoch": 0.35244250713963626, + "flos": 17930029276800.0, + "grad_norm": 2.0247560652829937, + "language_loss": 0.67655236, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.69778156, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42578125, + "step": 5862, + "time_per_iteration": 2.4151179790496826 + }, + { + "auxiliary_loss_clip": 0.01072004, + "auxiliary_loss_mlp": 0.01057379, + "balance_loss_clip": 1.02642059, + "balance_loss_mlp": 1.02720714, + "epoch": 0.35250263039230423, + "flos": 21431503641600.0, + "grad_norm": 1.9731807861699378, + "language_loss": 0.78101951, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.80231333, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44921875, + "step": 5863, + "time_per_iteration": 2.413172483444214 + }, + { + "auxiliary_loss_clip": 0.01071304, + "auxiliary_loss_mlp": 0.01053211, + "balance_loss_clip": 1.02482724, + "balance_loss_mlp": 1.02740407, + "epoch": 0.3525627536449722, + "flos": 27415651720320.0, + "grad_norm": 1.878992995111161, + "language_loss": 0.76735032, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.7885955, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4375, + "step": 5864, + "time_per_iteration": 2.4646921157836914 + }, + { + "auxiliary_loss_clip": 0.01069451, + "auxiliary_loss_mlp": 0.01063094, + "balance_loss_clip": 1.03262401, + "balance_loss_mlp": 1.02553535, + "epoch": 0.35262287689764016, + "flos": 24788681890560.0, + "grad_norm": 2.3340722691844595, + "language_loss": 0.81399935, + "learning_rate": 3.003932392558793e-06, + "loss": 0.83532482, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43945312, + "step": 5865, + "time_per_iteration": 2.4382455348968506 + }, + { + "auxiliary_loss_clip": 0.01074619, + "auxiliary_loss_mlp": 0.01063684, + "balance_loss_clip": 1.02980423, + "balance_loss_mlp": 1.02789545, + "epoch": 0.3526830001503081, + "flos": 17820646387200.0, + "grad_norm": 2.230431138181942, + "language_loss": 0.83339953, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.85478258, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.46679688, + "step": 5866, + "time_per_iteration": 2.384157419204712 + }, + { + "auxiliary_loss_clip": 0.01075429, + "auxiliary_loss_mlp": 0.01068242, + "balance_loss_clip": 1.03171659, + "balance_loss_mlp": 1.02688789, + "epoch": 0.3527431234029761, + "flos": 18077118301440.0, + "grad_norm": 2.110367290972529, + "language_loss": 0.86571938, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.88715613, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.48632812, + "step": 5867, + "time_per_iteration": 2.4504923820495605 + }, + { + "auxiliary_loss_clip": 0.01070498, + "auxiliary_loss_mlp": 0.01058947, + "balance_loss_clip": 1.02763033, + "balance_loss_mlp": 1.02542102, + "epoch": 0.35280324665564405, + "flos": 19426336859520.0, + "grad_norm": 2.003807251238924, + "language_loss": 0.75284964, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.77414405, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45117188, + "step": 5868, + "time_per_iteration": 2.3886945247650146 + }, + { + "auxiliary_loss_clip": 0.01069903, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.0277133, + "balance_loss_mlp": 1.02362132, + "epoch": 0.352863369908312, + "flos": 21503040750720.0, + "grad_norm": 2.5811910562553564, + "language_loss": 0.63547289, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.65677547, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.46484375, + "step": 5869, + "time_per_iteration": 2.4295918941497803 + }, + { + "auxiliary_loss_clip": 0.01065218, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.02550793, + "balance_loss_mlp": 1.02060056, + "epoch": 0.35292349316098, + "flos": 22308417060480.0, + "grad_norm": 1.9411310933428056, + "language_loss": 0.76186657, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.78307509, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4453125, + "step": 5870, + "time_per_iteration": 3.831289291381836 + }, + { + "auxiliary_loss_clip": 0.01064778, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_clip": 1.017555, + "balance_loss_mlp": 1.02045274, + "epoch": 0.352983616413648, + "flos": 33108344835840.0, + "grad_norm": 1.4832416753881874, + "language_loss": 0.73011786, + "learning_rate": 3.001910665140316e-06, + "loss": 0.75124705, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44335938, + "step": 5871, + "time_per_iteration": 2.5230624675750732 + }, + { + "auxiliary_loss_clip": 0.01063679, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_clip": 1.01565492, + "balance_loss_mlp": 1.02080894, + "epoch": 0.35304373966631597, + "flos": 18695639681280.0, + "grad_norm": 2.8041596327889367, + "language_loss": 0.75396693, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.77504534, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 5872, + "time_per_iteration": 3.785813093185425 + }, + { + "auxiliary_loss_clip": 0.01063378, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_clip": 1.02085757, + "balance_loss_mlp": 1.020383, + "epoch": 0.35310386291898394, + "flos": 23363911416960.0, + "grad_norm": 2.0325381720787665, + "language_loss": 0.83384073, + "learning_rate": 3.001236451924089e-06, + "loss": 0.85498422, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 5873, + "time_per_iteration": 2.4024715423583984 + }, + { + "auxiliary_loss_clip": 0.01068663, + "auxiliary_loss_mlp": 0.01048845, + "balance_loss_clip": 1.01751697, + "balance_loss_mlp": 1.0218302, + "epoch": 0.3531639861716519, + "flos": 24460812512640.0, + "grad_norm": 2.0341476463751844, + "language_loss": 0.67914867, + "learning_rate": 3.000899288359104e-06, + "loss": 0.70032376, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46875, + "step": 5874, + "time_per_iteration": 3.811715602874756 + }, + { + "auxiliary_loss_clip": 0.01013987, + "auxiliary_loss_mlp": 0.01005445, + "balance_loss_clip": 1.00213075, + "balance_loss_mlp": 1.00461388, + "epoch": 0.35322410942431987, + "flos": 70309347955200.0, + "grad_norm": 1.2716897174479562, + "language_loss": 0.61611688, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63631123, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.09375, + "step": 5875, + "time_per_iteration": 2.958010673522949 + }, + { + "auxiliary_loss_clip": 0.01067565, + "auxiliary_loss_mlp": 0.01050895, + "balance_loss_clip": 1.02044868, + "balance_loss_mlp": 1.02245986, + "epoch": 0.35328423267698783, + "flos": 19820087706240.0, + "grad_norm": 1.795337470128282, + "language_loss": 0.80736089, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82854551, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.45117188, + "step": 5876, + "time_per_iteration": 3.7829110622406006 + }, + { + "auxiliary_loss_clip": 0.01014476, + "auxiliary_loss_mlp": 0.01006788, + "balance_loss_clip": 1.00333107, + "balance_loss_mlp": 1.00486827, + "epoch": 0.3533443559296558, + "flos": 60823516043520.0, + "grad_norm": 0.6767174665494686, + "language_loss": 0.56788683, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58809948, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.09619141, + "step": 5877, + "time_per_iteration": 3.101242780685425 + }, + { + "auxiliary_loss_clip": 0.01070625, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.02174437, + "balance_loss_mlp": 1.02403033, + "epoch": 0.35340447918232376, + "flos": 24754571625600.0, + "grad_norm": 1.416096287835821, + "language_loss": 0.73383081, + "learning_rate": 2.999550254685024e-06, + "loss": 0.75507402, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46484375, + "step": 5878, + "time_per_iteration": 2.492274284362793 + }, + { + "auxiliary_loss_clip": 0.01067711, + "auxiliary_loss_mlp": 0.01060141, + "balance_loss_clip": 1.02604675, + "balance_loss_mlp": 1.02101851, + "epoch": 0.3534646024349917, + "flos": 21795298675200.0, + "grad_norm": 1.7827743698549832, + "language_loss": 0.79380178, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.81508034, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.46679688, + "step": 5879, + "time_per_iteration": 2.3939504623413086 + }, + { + "auxiliary_loss_clip": 0.01071349, + "auxiliary_loss_mlp": 0.01062873, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.02296233, + "epoch": 0.3535247256876597, + "flos": 20011062176640.0, + "grad_norm": 2.3373779713219376, + "language_loss": 0.65776545, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.67910767, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.484375, + "step": 5880, + "time_per_iteration": 2.4295570850372314 + }, + { + "auxiliary_loss_clip": 0.01068663, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_clip": 1.01446283, + "balance_loss_mlp": 1.02235603, + "epoch": 0.35358484894032766, + "flos": 18186920127360.0, + "grad_norm": 2.6210819054728267, + "language_loss": 0.68265188, + "learning_rate": 2.998538081402727e-06, + "loss": 0.70381838, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46289062, + "step": 5881, + "time_per_iteration": 2.3914356231689453 + }, + { + "auxiliary_loss_clip": 0.01063686, + "auxiliary_loss_mlp": 0.01040784, + "balance_loss_clip": 1.01287663, + "balance_loss_mlp": 1.02084339, + "epoch": 0.3536449721929956, + "flos": 22819266207360.0, + "grad_norm": 1.4117962647762172, + "language_loss": 0.76876795, + "learning_rate": 2.998200614562239e-06, + "loss": 0.78981268, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42773438, + "step": 5882, + "time_per_iteration": 2.423184394836426 + }, + { + "auxiliary_loss_clip": 0.01071949, + "auxiliary_loss_mlp": 0.01055425, + "balance_loss_clip": 1.02397776, + "balance_loss_mlp": 1.02431321, + "epoch": 0.3537050954456636, + "flos": 26431135891200.0, + "grad_norm": 2.184619424078453, + "language_loss": 0.71764052, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.73891431, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4765625, + "step": 5883, + "time_per_iteration": 2.4260873794555664 + }, + { + "auxiliary_loss_clip": 0.01073587, + "auxiliary_loss_mlp": 0.0105591, + "balance_loss_clip": 1.02298474, + "balance_loss_mlp": 1.02380657, + "epoch": 0.3537652186983316, + "flos": 17196329721600.0, + "grad_norm": 1.9924791416319723, + "language_loss": 0.79631013, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.81760514, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.49609375, + "step": 5884, + "time_per_iteration": 2.386765718460083 + }, + { + "auxiliary_loss_clip": 0.01069176, + "auxiliary_loss_mlp": 0.01045403, + "balance_loss_clip": 1.01540947, + "balance_loss_mlp": 1.02257681, + "epoch": 0.3538253419509996, + "flos": 19535754660480.0, + "grad_norm": 2.05622164937663, + "language_loss": 0.75919867, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.78034449, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.46679688, + "step": 5885, + "time_per_iteration": 2.404128074645996 + }, + { + "auxiliary_loss_clip": 0.01070696, + "auxiliary_loss_mlp": 0.01052158, + "balance_loss_clip": 1.01882744, + "balance_loss_mlp": 1.02305651, + "epoch": 0.35388546520366754, + "flos": 12127813626240.0, + "grad_norm": 2.4076030117275615, + "language_loss": 0.86429131, + "learning_rate": 2.996850368809606e-06, + "loss": 0.8855198, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4765625, + "step": 5886, + "time_per_iteration": 2.394165515899658 + }, + { + "auxiliary_loss_clip": 0.01070238, + "auxiliary_loss_mlp": 0.01055166, + "balance_loss_clip": 1.01971364, + "balance_loss_mlp": 1.02368748, + "epoch": 0.3539455884563355, + "flos": 19677257867520.0, + "grad_norm": 2.7399385378990067, + "language_loss": 0.79963672, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.82089078, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.46484375, + "step": 5887, + "time_per_iteration": 2.423200845718384 + }, + { + "auxiliary_loss_clip": 0.01066792, + "auxiliary_loss_mlp": 0.01054028, + "balance_loss_clip": 1.01964796, + "balance_loss_mlp": 1.02080846, + "epoch": 0.35400571170900347, + "flos": 18071218281600.0, + "grad_norm": 1.8209362484386444, + "language_loss": 0.66795528, + "learning_rate": 2.996175019078089e-06, + "loss": 0.68916351, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.4609375, + "step": 5888, + "time_per_iteration": 2.356095552444458 + }, + { + "auxiliary_loss_clip": 0.01068605, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.01608098, + "balance_loss_mlp": 1.02273107, + "epoch": 0.35406583496167143, + "flos": 26066852098560.0, + "grad_norm": 1.8228429992549593, + "language_loss": 0.78310168, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.80426019, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45898438, + "step": 5889, + "time_per_iteration": 2.4676947593688965 + }, + { + "auxiliary_loss_clip": 0.01068735, + "auxiliary_loss_mlp": 0.01050947, + "balance_loss_clip": 1.01749706, + "balance_loss_mlp": 1.02378702, + "epoch": 0.3541259582143394, + "flos": 19791423613440.0, + "grad_norm": 1.831387166224742, + "language_loss": 0.82527131, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.84646815, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.44921875, + "step": 5890, + "time_per_iteration": 2.394465446472168 + }, + { + "auxiliary_loss_clip": 0.01066286, + "auxiliary_loss_mlp": 0.01048276, + "balance_loss_clip": 1.01678038, + "balance_loss_mlp": 1.02164626, + "epoch": 0.35418608146700736, + "flos": 24021011715840.0, + "grad_norm": 2.030284473592816, + "language_loss": 0.81008446, + "learning_rate": 2.99516171119991e-06, + "loss": 0.83123004, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44726562, + "step": 5891, + "time_per_iteration": 2.424584150314331 + }, + { + "auxiliary_loss_clip": 0.01066491, + "auxiliary_loss_mlp": 0.01053575, + "balance_loss_clip": 1.02031589, + "balance_loss_mlp": 1.02117062, + "epoch": 0.35424620471967533, + "flos": 12384948856320.0, + "grad_norm": 2.4270771381580407, + "language_loss": 0.74889505, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.77009571, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.453125, + "step": 5892, + "time_per_iteration": 2.3653953075408936 + }, + { + "auxiliary_loss_clip": 0.01070666, + "auxiliary_loss_mlp": 0.01055943, + "balance_loss_clip": 1.0208478, + "balance_loss_mlp": 1.02411246, + "epoch": 0.3543063279723433, + "flos": 19672859036160.0, + "grad_norm": 2.3215454876851522, + "language_loss": 0.67737931, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69864547, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.46484375, + "step": 5893, + "time_per_iteration": 2.3852875232696533 + }, + { + "auxiliary_loss_clip": 0.01067166, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_clip": 1.01992798, + "balance_loss_mlp": 1.02145123, + "epoch": 0.35436645122501126, + "flos": 21908102878080.0, + "grad_norm": 3.581656501081007, + "language_loss": 0.70871371, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.72994345, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.45703125, + "step": 5894, + "time_per_iteration": 2.4012954235076904 + }, + { + "auxiliary_loss_clip": 0.0106782, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.01274228, + "balance_loss_mlp": 1.02299309, + "epoch": 0.3544265744776792, + "flos": 21718629596160.0, + "grad_norm": 1.6136607431682202, + "language_loss": 0.75712818, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.7782346, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44726562, + "step": 5895, + "time_per_iteration": 2.398312568664551 + }, + { + "auxiliary_loss_clip": 0.01066601, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.01423311, + "balance_loss_mlp": 1.0211575, + "epoch": 0.3544866977303472, + "flos": 21212214192000.0, + "grad_norm": 1.779760119110029, + "language_loss": 0.8460359, + "learning_rate": 2.993472110174491e-06, + "loss": 0.86715961, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.453125, + "step": 5896, + "time_per_iteration": 2.3839147090911865 + }, + { + "auxiliary_loss_clip": 0.01065845, + "auxiliary_loss_mlp": 0.01058741, + "balance_loss_clip": 1.02303791, + "balance_loss_mlp": 1.02094567, + "epoch": 0.35454682098301515, + "flos": 29310213715200.0, + "grad_norm": 1.648376702765252, + "language_loss": 0.71419418, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.73544002, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.44921875, + "step": 5897, + "time_per_iteration": 2.4598047733306885 + }, + { + "auxiliary_loss_clip": 0.01063499, + "auxiliary_loss_mlp": 0.01044979, + "balance_loss_clip": 1.01295972, + "balance_loss_mlp": 1.01949406, + "epoch": 0.3546069442356832, + "flos": 24315434144640.0, + "grad_norm": 1.7744692632218764, + "language_loss": 0.827389, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84847385, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.43945312, + "step": 5898, + "time_per_iteration": 2.4591777324676514 + }, + { + "auxiliary_loss_clip": 0.01064279, + "auxiliary_loss_mlp": 0.01055333, + "balance_loss_clip": 1.02456534, + "balance_loss_mlp": 1.02034807, + "epoch": 0.35466706748835114, + "flos": 22856169381120.0, + "grad_norm": 1.523384857366134, + "language_loss": 0.75781089, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.77900702, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43945312, + "step": 5899, + "time_per_iteration": 2.4753365516662598 + }, + { + "auxiliary_loss_clip": 0.01066842, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.01804292, + "balance_loss_mlp": 1.02061927, + "epoch": 0.3547271907410191, + "flos": 28328839908480.0, + "grad_norm": 1.7560839000387838, + "language_loss": 0.80802709, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.82919037, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46289062, + "step": 5900, + "time_per_iteration": 2.4402267932891846 + }, + { + "auxiliary_loss_clip": 0.01066134, + "auxiliary_loss_mlp": 0.01049795, + "balance_loss_clip": 1.01615405, + "balance_loss_mlp": 1.02062726, + "epoch": 0.35478731399368707, + "flos": 23512955477760.0, + "grad_norm": 1.9008015261539222, + "language_loss": 0.82821137, + "learning_rate": 2.991781567335093e-06, + "loss": 0.8493706, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.45507812, + "step": 5901, + "time_per_iteration": 2.4487879276275635 + }, + { + "auxiliary_loss_clip": 0.01069999, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.01510942, + "balance_loss_mlp": 1.02154148, + "epoch": 0.35484743724635504, + "flos": 18623334522240.0, + "grad_norm": 2.162189409690262, + "language_loss": 0.77455759, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.79573864, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.484375, + "step": 5902, + "time_per_iteration": 2.4125750064849854 + }, + { + "auxiliary_loss_clip": 0.01064323, + "auxiliary_loss_mlp": 0.01053135, + "balance_loss_clip": 1.01875532, + "balance_loss_mlp": 1.01922405, + "epoch": 0.354907560499023, + "flos": 17383533765120.0, + "grad_norm": 1.8517856797034262, + "language_loss": 0.7217384, + "learning_rate": 2.991105086850381e-06, + "loss": 0.74291301, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.45117188, + "step": 5903, + "time_per_iteration": 2.392615556716919 + }, + { + "auxiliary_loss_clip": 0.01067691, + "auxiliary_loss_mlp": 0.01049086, + "balance_loss_clip": 1.0161128, + "balance_loss_mlp": 1.02037311, + "epoch": 0.35496768375169097, + "flos": 19207536168960.0, + "grad_norm": 2.3880845475986865, + "language_loss": 0.76929057, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.79045832, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47265625, + "step": 5904, + "time_per_iteration": 2.380796194076538 + }, + { + "auxiliary_loss_clip": 0.01066593, + "auxiliary_loss_mlp": 0.01056219, + "balance_loss_clip": 1.02464008, + "balance_loss_mlp": 1.02081418, + "epoch": 0.35502780700435893, + "flos": 18331809736320.0, + "grad_norm": 2.155687167612292, + "language_loss": 0.80951416, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.83074224, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45703125, + "step": 5905, + "time_per_iteration": 2.3985402584075928 + }, + { + "auxiliary_loss_clip": 0.01058806, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.01496482, + "balance_loss_mlp": 1.01815021, + "epoch": 0.3550879302570269, + "flos": 15447704676480.0, + "grad_norm": 2.3482271372688923, + "language_loss": 0.73329389, + "learning_rate": 2.990090084284356e-06, + "loss": 0.75430334, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 5906, + "time_per_iteration": 2.3453664779663086 + }, + { + "auxiliary_loss_clip": 0.01067315, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.0180341, + "balance_loss_mlp": 1.02065253, + "epoch": 0.35514805350969486, + "flos": 21978173710080.0, + "grad_norm": 2.017282851695568, + "language_loss": 0.76692712, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.78811342, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46679688, + "step": 5907, + "time_per_iteration": 2.408691644668579 + }, + { + "auxiliary_loss_clip": 0.01064809, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.01644278, + "balance_loss_mlp": 1.02040839, + "epoch": 0.3552081767623628, + "flos": 29860654210560.0, + "grad_norm": 1.7290802599883692, + "language_loss": 0.76374829, + "learning_rate": 2.989413228164047e-06, + "loss": 0.78486717, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4453125, + "step": 5908, + "time_per_iteration": 2.4615933895111084 + }, + { + "auxiliary_loss_clip": 0.01064956, + "auxiliary_loss_mlp": 0.01047343, + "balance_loss_clip": 1.01546645, + "balance_loss_mlp": 1.02050781, + "epoch": 0.3552683000150308, + "flos": 26431066068480.0, + "grad_norm": 1.7696360803915674, + "language_loss": 0.69150472, + "learning_rate": 2.989074743819502e-06, + "loss": 0.71262771, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4453125, + "step": 5909, + "time_per_iteration": 2.4923958778381348 + }, + { + "auxiliary_loss_clip": 0.01062111, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.01649714, + "balance_loss_mlp": 1.02016985, + "epoch": 0.35532842326769876, + "flos": 19785139568640.0, + "grad_norm": 2.3952664861992257, + "language_loss": 0.80038774, + "learning_rate": 2.988736221969144e-06, + "loss": 0.82145804, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41992188, + "step": 5910, + "time_per_iteration": 3.8053572177886963 + }, + { + "auxiliary_loss_clip": 0.01066024, + "auxiliary_loss_mlp": 0.01054541, + "balance_loss_clip": 1.01741898, + "balance_loss_mlp": 1.02028179, + "epoch": 0.3553885465203668, + "flos": 17238295042560.0, + "grad_norm": 1.7258018015921595, + "language_loss": 0.72651649, + "learning_rate": 2.98839766262581e-06, + "loss": 0.74772215, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.45703125, + "step": 5911, + "time_per_iteration": 2.4054253101348877 + }, + { + "auxiliary_loss_clip": 0.01064874, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.01529646, + "balance_loss_mlp": 1.0211798, + "epoch": 0.35544866977303474, + "flos": 14933608773120.0, + "grad_norm": 2.1147236413616812, + "language_loss": 0.88431484, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.90541077, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4375, + "step": 5912, + "time_per_iteration": 3.868826389312744 + }, + { + "auxiliary_loss_clip": 0.01065692, + "auxiliary_loss_mlp": 0.01047948, + "balance_loss_clip": 1.01626253, + "balance_loss_mlp": 1.02080011, + "epoch": 0.3555087930257027, + "flos": 19755009198720.0, + "grad_norm": 2.1107034595955763, + "language_loss": 0.78161645, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.80275291, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 5913, + "time_per_iteration": 2.3935678005218506 + }, + { + "auxiliary_loss_clip": 0.01063648, + "auxiliary_loss_mlp": 0.01046729, + "balance_loss_clip": 1.01565111, + "balance_loss_mlp": 1.02022147, + "epoch": 0.3555689162783707, + "flos": 21067219848960.0, + "grad_norm": 1.4940040987155385, + "language_loss": 0.83378738, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.85489118, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43359375, + "step": 5914, + "time_per_iteration": 3.8625292778015137 + }, + { + "auxiliary_loss_clip": 0.01065104, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_clip": 1.02168608, + "balance_loss_mlp": 1.02026892, + "epoch": 0.35562903953103864, + "flos": 33068334551040.0, + "grad_norm": 2.2037554311518894, + "language_loss": 0.72017616, + "learning_rate": 2.98704305057949e-06, + "loss": 0.74136353, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44921875, + "step": 5915, + "time_per_iteration": 3.8841097354888916 + }, + { + "auxiliary_loss_clip": 0.01063432, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_clip": 1.01820672, + "balance_loss_mlp": 1.01886773, + "epoch": 0.3556891627837066, + "flos": 20556824549760.0, + "grad_norm": 1.7044241663243156, + "language_loss": 0.76911569, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.79024184, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 5916, + "time_per_iteration": 2.371776819229126 + }, + { + "auxiliary_loss_clip": 0.01065847, + "auxiliary_loss_mlp": 0.01045247, + "balance_loss_clip": 1.01420522, + "balance_loss_mlp": 1.02047861, + "epoch": 0.35574928603637457, + "flos": 20702307651840.0, + "grad_norm": 2.59399549557887, + "language_loss": 0.89360362, + "learning_rate": 2.986365519932332e-06, + "loss": 0.91471457, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.453125, + "step": 5917, + "time_per_iteration": 2.432219982147217 + }, + { + "auxiliary_loss_clip": 0.01064752, + "auxiliary_loss_mlp": 0.0104503, + "balance_loss_clip": 1.01473927, + "balance_loss_mlp": 1.02012825, + "epoch": 0.35580940928904253, + "flos": 15193711468800.0, + "grad_norm": 2.3542577751938683, + "language_loss": 0.77640092, + "learning_rate": 2.98602669849771e-06, + "loss": 0.79749876, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4453125, + "step": 5918, + "time_per_iteration": 2.353813648223877 + }, + { + "auxiliary_loss_clip": 0.01019447, + "auxiliary_loss_mlp": 0.01008188, + "balance_loss_clip": 1.00506461, + "balance_loss_mlp": 1.0099169, + "epoch": 0.3558695325417105, + "flos": 58636312099200.0, + "grad_norm": 0.9117779080903505, + "language_loss": 0.63950503, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65978134, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.03125, + "router_z_loss_mlp": 0.09570312, + "step": 5919, + "time_per_iteration": 2.793416738510132 + }, + { + "auxiliary_loss_clip": 0.01066521, + "auxiliary_loss_mlp": 0.0105378, + "balance_loss_clip": 1.02185559, + "balance_loss_mlp": 1.02011609, + "epoch": 0.35592965579437846, + "flos": 22017136654080.0, + "grad_norm": 2.262989004329835, + "language_loss": 0.75350797, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.77471101, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46289062, + "step": 5920, + "time_per_iteration": 2.4035110473632812 + }, + { + "auxiliary_loss_clip": 0.01062656, + "auxiliary_loss_mlp": 0.01042159, + "balance_loss_clip": 1.01292932, + "balance_loss_mlp": 1.01898956, + "epoch": 0.35598977904704643, + "flos": 23366564680320.0, + "grad_norm": 1.8774481929967304, + "language_loss": 0.78455412, + "learning_rate": 2.985010009903857e-06, + "loss": 0.80560231, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 5921, + "time_per_iteration": 2.437350273132324 + }, + { + "auxiliary_loss_clip": 0.01063537, + "auxiliary_loss_mlp": 0.01052253, + "balance_loss_clip": 1.02290416, + "balance_loss_mlp": 1.01961923, + "epoch": 0.3560499022997144, + "flos": 17784371617920.0, + "grad_norm": 1.989483671326524, + "language_loss": 0.68982154, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.7109794, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43945312, + "step": 5922, + "time_per_iteration": 2.3625378608703613 + }, + { + "auxiliary_loss_clip": 0.01065749, + "auxiliary_loss_mlp": 0.01046342, + "balance_loss_clip": 1.01471591, + "balance_loss_mlp": 1.02030969, + "epoch": 0.35611002555238236, + "flos": 20739420293760.0, + "grad_norm": 3.0009698723860914, + "language_loss": 0.80234349, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.82346433, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45507812, + "step": 5923, + "time_per_iteration": 2.429516077041626 + }, + { + "auxiliary_loss_clip": 0.01064949, + "auxiliary_loss_mlp": 0.01051573, + "balance_loss_clip": 1.02264106, + "balance_loss_mlp": 1.0201757, + "epoch": 0.3561701488050504, + "flos": 19461250085760.0, + "grad_norm": 1.7010726793893929, + "language_loss": 0.86465871, + "learning_rate": 2.983992985144908e-06, + "loss": 0.88582397, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44726562, + "step": 5924, + "time_per_iteration": 2.3667945861816406 + }, + { + "auxiliary_loss_clip": 0.01063377, + "auxiliary_loss_mlp": 0.0104622, + "balance_loss_clip": 1.01604843, + "balance_loss_mlp": 1.01984954, + "epoch": 0.35623027205771834, + "flos": 30773598019200.0, + "grad_norm": 2.206200186468841, + "language_loss": 0.79515958, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.81625557, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43359375, + "step": 5925, + "time_per_iteration": 2.473376750946045 + }, + { + "auxiliary_loss_clip": 0.01062794, + "auxiliary_loss_mlp": 0.01051672, + "balance_loss_clip": 1.02124977, + "balance_loss_mlp": 1.01800299, + "epoch": 0.3562903953103863, + "flos": 16980182294400.0, + "grad_norm": 1.8541525807553443, + "language_loss": 0.76972282, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.79086751, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44921875, + "step": 5926, + "time_per_iteration": 2.3773396015167236 + }, + { + "auxiliary_loss_clip": 0.01065259, + "auxiliary_loss_mlp": 0.01049155, + "balance_loss_clip": 1.01863766, + "balance_loss_mlp": 1.02068722, + "epoch": 0.3563505185630543, + "flos": 23838765085440.0, + "grad_norm": 1.879992644150106, + "language_loss": 0.70775855, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.7289027, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4453125, + "step": 5927, + "time_per_iteration": 2.410907030105591 + }, + { + "auxiliary_loss_clip": 0.01062549, + "auxiliary_loss_mlp": 0.01047819, + "balance_loss_clip": 1.01787353, + "balance_loss_mlp": 1.01940703, + "epoch": 0.35641064181572224, + "flos": 22272351759360.0, + "grad_norm": 2.2760989515387333, + "language_loss": 0.80555177, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.82665545, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43164062, + "step": 5928, + "time_per_iteration": 2.393364429473877 + }, + { + "auxiliary_loss_clip": 0.0106311, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.01348972, + "balance_loss_mlp": 1.01903343, + "epoch": 0.3564707650683902, + "flos": 23000186206080.0, + "grad_norm": 1.3298212415451185, + "language_loss": 0.82470512, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84578586, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 5929, + "time_per_iteration": 2.4448800086975098 + }, + { + "auxiliary_loss_clip": 0.0106153, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.01760328, + "balance_loss_mlp": 1.01900959, + "epoch": 0.35653088832105817, + "flos": 14683385992320.0, + "grad_norm": 1.6011721813081803, + "language_loss": 0.71885234, + "learning_rate": 2.981957928520201e-06, + "loss": 0.73991597, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.42578125, + "step": 5930, + "time_per_iteration": 2.4034078121185303 + }, + { + "auxiliary_loss_clip": 0.01068014, + "auxiliary_loss_mlp": 0.01052118, + "balance_loss_clip": 1.02048016, + "balance_loss_mlp": 1.02144051, + "epoch": 0.35659101157372614, + "flos": 23475947569920.0, + "grad_norm": 2.0355608899358772, + "language_loss": 0.69793952, + "learning_rate": 2.981618622015244e-06, + "loss": 0.71914077, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46484375, + "step": 5931, + "time_per_iteration": 2.389711380004883 + }, + { + "auxiliary_loss_clip": 0.01063117, + "auxiliary_loss_mlp": 0.01050125, + "balance_loss_clip": 1.02072823, + "balance_loss_mlp": 1.0196166, + "epoch": 0.3566511348263941, + "flos": 26577456865920.0, + "grad_norm": 1.6999782461956683, + "language_loss": 0.69833702, + "learning_rate": 2.981279278287211e-06, + "loss": 0.71946949, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43554688, + "step": 5932, + "time_per_iteration": 2.458024501800537 + }, + { + "auxiliary_loss_clip": 0.01064145, + "auxiliary_loss_mlp": 0.0104097, + "balance_loss_clip": 1.01333737, + "balance_loss_mlp": 1.02085912, + "epoch": 0.35671125807906207, + "flos": 13114179757440.0, + "grad_norm": 2.1869938108248035, + "language_loss": 0.80576915, + "learning_rate": 2.980939897348969e-06, + "loss": 0.82682031, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.43359375, + "step": 5933, + "time_per_iteration": 2.356184482574463 + }, + { + "auxiliary_loss_clip": 0.01068207, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_clip": 1.01857209, + "balance_loss_mlp": 1.02263999, + "epoch": 0.35677138133173003, + "flos": 32999171414400.0, + "grad_norm": 1.729261809243827, + "language_loss": 0.71065176, + "learning_rate": 2.980600479213388e-06, + "loss": 0.73182142, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.45507812, + "step": 5934, + "time_per_iteration": 2.5117685794830322 + }, + { + "auxiliary_loss_clip": 0.0107058, + "auxiliary_loss_mlp": 0.01051117, + "balance_loss_clip": 1.01623583, + "balance_loss_mlp": 1.02258396, + "epoch": 0.356831504584398, + "flos": 20776777315200.0, + "grad_norm": 1.9215553334968103, + "language_loss": 0.72177815, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.74299514, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48046875, + "step": 5935, + "time_per_iteration": 2.4047012329101562 + }, + { + "auxiliary_loss_clip": 0.01065027, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.01551914, + "balance_loss_mlp": 1.02043939, + "epoch": 0.35689162783706596, + "flos": 12164786622720.0, + "grad_norm": 2.2996971793763237, + "language_loss": 0.79758334, + "learning_rate": 2.979921531401692e-06, + "loss": 0.81869704, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 5936, + "time_per_iteration": 2.371316909790039 + }, + { + "auxiliary_loss_clip": 0.01064579, + "auxiliary_loss_mlp": 0.01049369, + "balance_loss_clip": 1.01926875, + "balance_loss_mlp": 1.02088523, + "epoch": 0.356951751089734, + "flos": 23840371008000.0, + "grad_norm": 1.468998668375143, + "language_loss": 0.66043937, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.68157887, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4375, + "step": 5937, + "time_per_iteration": 2.399121046066284 + }, + { + "auxiliary_loss_clip": 0.01067614, + "auxiliary_loss_mlp": 0.01049209, + "balance_loss_clip": 1.01808393, + "balance_loss_mlp": 1.0220623, + "epoch": 0.35701187434240195, + "flos": 11721564512640.0, + "grad_norm": 3.302620549499157, + "language_loss": 0.79807794, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.81924617, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.45507812, + "step": 5938, + "time_per_iteration": 2.3717987537384033 + }, + { + "auxiliary_loss_clip": 0.01068421, + "auxiliary_loss_mlp": 0.01055643, + "balance_loss_clip": 1.02582908, + "balance_loss_mlp": 1.02283466, + "epoch": 0.3570719975950699, + "flos": 24897750577920.0, + "grad_norm": 1.5444790169734557, + "language_loss": 0.80613828, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82737899, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.45703125, + "step": 5939, + "time_per_iteration": 2.4104559421539307 + }, + { + "auxiliary_loss_clip": 0.01069795, + "auxiliary_loss_mlp": 0.01049544, + "balance_loss_clip": 1.01671338, + "balance_loss_mlp": 1.02136254, + "epoch": 0.3571321208477379, + "flos": 25993639244160.0, + "grad_norm": 1.694199067265966, + "language_loss": 0.80435586, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.82554924, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.484375, + "step": 5940, + "time_per_iteration": 2.448390007019043 + }, + { + "auxiliary_loss_clip": 0.0106845, + "auxiliary_loss_mlp": 0.01046072, + "balance_loss_clip": 1.01445735, + "balance_loss_mlp": 1.0222789, + "epoch": 0.35719224410040584, + "flos": 14500790248320.0, + "grad_norm": 1.9205955218097863, + "language_loss": 0.74857092, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.76971614, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4609375, + "step": 5941, + "time_per_iteration": 2.4040355682373047 + }, + { + "auxiliary_loss_clip": 0.01067902, + "auxiliary_loss_mlp": 0.01049161, + "balance_loss_clip": 1.01726043, + "balance_loss_mlp": 1.02213752, + "epoch": 0.3572523673530738, + "flos": 31174121669760.0, + "grad_norm": 5.281039865402096, + "language_loss": 0.66156113, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.68273181, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45703125, + "step": 5942, + "time_per_iteration": 2.5005507469177246 + }, + { + "auxiliary_loss_clip": 0.01065252, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_clip": 1.01700008, + "balance_loss_mlp": 1.02080858, + "epoch": 0.3573124906057418, + "flos": 15851056147200.0, + "grad_norm": 2.0383366769961357, + "language_loss": 0.7514838, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.77262765, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 5943, + "time_per_iteration": 2.3491477966308594 + }, + { + "auxiliary_loss_clip": 0.01012011, + "auxiliary_loss_mlp": 0.01002918, + "balance_loss_clip": 1.00016475, + "balance_loss_mlp": 1.00254035, + "epoch": 0.35737261385840974, + "flos": 60819989996160.0, + "grad_norm": 0.7858810965743278, + "language_loss": 0.60776722, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62791651, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.09472656, + "step": 5944, + "time_per_iteration": 3.1353371143341064 + }, + { + "auxiliary_loss_clip": 0.01063852, + "auxiliary_loss_mlp": 0.01045954, + "balance_loss_clip": 1.01678348, + "balance_loss_mlp": 1.02031922, + "epoch": 0.3574327371110777, + "flos": 18842763617280.0, + "grad_norm": 1.804717507623184, + "language_loss": 0.74034417, + "learning_rate": 2.976864428379655e-06, + "loss": 0.76144218, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43554688, + "step": 5945, + "time_per_iteration": 2.3500185012817383 + }, + { + "auxiliary_loss_clip": 0.01064433, + "auxiliary_loss_mlp": 0.0105176, + "balance_loss_clip": 1.02133822, + "balance_loss_mlp": 1.02019489, + "epoch": 0.35749286036374567, + "flos": 23548566931200.0, + "grad_norm": 1.6594313822798994, + "language_loss": 0.82639319, + "learning_rate": 2.976524564880326e-06, + "loss": 0.84755504, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 5946, + "time_per_iteration": 2.4454147815704346 + }, + { + "auxiliary_loss_clip": 0.01068131, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_clip": 1.01980948, + "balance_loss_mlp": 1.02235746, + "epoch": 0.35755298361641363, + "flos": 21104437224960.0, + "grad_norm": 1.5457970269799857, + "language_loss": 0.70786452, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.72904474, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45703125, + "step": 5947, + "time_per_iteration": 2.391890287399292 + }, + { + "auxiliary_loss_clip": 0.01060558, + "auxiliary_loss_mlp": 0.01053816, + "balance_loss_clip": 1.02458596, + "balance_loss_mlp": 1.01871848, + "epoch": 0.3576131068690816, + "flos": 19244020406400.0, + "grad_norm": 1.7034024497160882, + "language_loss": 0.77138948, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.79253322, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41796875, + "step": 5948, + "time_per_iteration": 2.394029378890991 + }, + { + "auxiliary_loss_clip": 0.01063668, + "auxiliary_loss_mlp": 0.01046409, + "balance_loss_clip": 1.01697636, + "balance_loss_mlp": 1.01908684, + "epoch": 0.35767323012174956, + "flos": 28653532352640.0, + "grad_norm": 1.8775238969478159, + "language_loss": 0.72110635, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.74220711, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4453125, + "step": 5949, + "time_per_iteration": 2.428607225418091 + }, + { + "auxiliary_loss_clip": 0.01063862, + "auxiliary_loss_mlp": 0.01054838, + "balance_loss_clip": 1.02312791, + "balance_loss_mlp": 1.01895857, + "epoch": 0.35773335337441753, + "flos": 17084607770880.0, + "grad_norm": 1.870643689759122, + "language_loss": 0.78834623, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.80953324, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 5950, + "time_per_iteration": 3.821986675262451 + }, + { + "auxiliary_loss_clip": 0.01065551, + "auxiliary_loss_mlp": 0.01046991, + "balance_loss_clip": 1.01548374, + "balance_loss_mlp": 1.01914477, + "epoch": 0.35779347662708555, + "flos": 15887680030080.0, + "grad_norm": 1.6445095727594887, + "language_loss": 0.74353653, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.76466197, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46484375, + "step": 5951, + "time_per_iteration": 3.7664127349853516 + }, + { + "auxiliary_loss_clip": 0.01065693, + "auxiliary_loss_mlp": 0.01048004, + "balance_loss_clip": 1.01617503, + "balance_loss_mlp": 1.01916647, + "epoch": 0.3578535998797535, + "flos": 28657547159040.0, + "grad_norm": 2.4147427324980577, + "language_loss": 0.70516396, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.7263009, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46679688, + "step": 5952, + "time_per_iteration": 2.4257729053497314 + }, + { + "auxiliary_loss_clip": 0.01063868, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_clip": 1.01680827, + "balance_loss_mlp": 1.01978588, + "epoch": 0.3579137231324215, + "flos": 37850911678080.0, + "grad_norm": 1.6566585885803995, + "language_loss": 0.71127021, + "learning_rate": 2.974144484269449e-06, + "loss": 0.732364, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.44140625, + "step": 5953, + "time_per_iteration": 2.558886766433716 + }, + { + "auxiliary_loss_clip": 0.01063992, + "auxiliary_loss_mlp": 0.01046988, + "balance_loss_clip": 1.01587427, + "balance_loss_mlp": 1.01936126, + "epoch": 0.35797384638508944, + "flos": 22345739170560.0, + "grad_norm": 1.6991887591867438, + "language_loss": 0.68051183, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.70162165, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 5954, + "time_per_iteration": 3.8225338459014893 + }, + { + "auxiliary_loss_clip": 0.01063925, + "auxiliary_loss_mlp": 0.01053193, + "balance_loss_clip": 1.02275908, + "balance_loss_mlp": 1.01980245, + "epoch": 0.3580339696377574, + "flos": 13588858869120.0, + "grad_norm": 2.6541281938852355, + "language_loss": 0.76800513, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.78917634, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 5955, + "time_per_iteration": 3.7609376907348633 + }, + { + "auxiliary_loss_clip": 0.01062277, + "auxiliary_loss_mlp": 0.01041863, + "balance_loss_clip": 1.01405144, + "balance_loss_mlp": 1.01980114, + "epoch": 0.3580940928904254, + "flos": 23767123242240.0, + "grad_norm": 1.7661992695853792, + "language_loss": 0.76903951, + "learning_rate": 2.973123895369182e-06, + "loss": 0.7900809, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.42382812, + "step": 5956, + "time_per_iteration": 2.420297622680664 + }, + { + "auxiliary_loss_clip": 0.01063856, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_clip": 1.016186, + "balance_loss_mlp": 1.02039599, + "epoch": 0.35815421614309334, + "flos": 19462856008320.0, + "grad_norm": 1.8226764865554828, + "language_loss": 0.7470327, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.76811385, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.43359375, + "step": 5957, + "time_per_iteration": 2.363208293914795 + }, + { + "auxiliary_loss_clip": 0.01065878, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_clip": 1.01622605, + "balance_loss_mlp": 1.02104878, + "epoch": 0.3582143393957613, + "flos": 23367053439360.0, + "grad_norm": 1.895376695454308, + "language_loss": 0.73304701, + "learning_rate": 2.972443318242726e-06, + "loss": 0.75417936, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44726562, + "step": 5958, + "time_per_iteration": 2.4336812496185303 + }, + { + "auxiliary_loss_clip": 0.01062786, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.01260674, + "balance_loss_mlp": 1.01990092, + "epoch": 0.35827446264842927, + "flos": 26322067203840.0, + "grad_norm": 1.724359855474124, + "language_loss": 0.89889646, + "learning_rate": 2.972102974360324e-06, + "loss": 0.91994679, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42773438, + "step": 5959, + "time_per_iteration": 2.427905559539795 + }, + { + "auxiliary_loss_clip": 0.01065615, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.01554418, + "balance_loss_mlp": 1.02170694, + "epoch": 0.35833458590109724, + "flos": 30445274793600.0, + "grad_norm": 2.1434773633272775, + "language_loss": 0.59805429, + "learning_rate": 2.971762593615679e-06, + "loss": 0.61917436, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43945312, + "step": 5960, + "time_per_iteration": 2.472266435623169 + }, + { + "auxiliary_loss_clip": 0.01066156, + "auxiliary_loss_mlp": 0.01049976, + "balance_loss_clip": 1.01616859, + "balance_loss_mlp": 1.02083147, + "epoch": 0.3583947091537652, + "flos": 14829008739840.0, + "grad_norm": 2.7142633682281456, + "language_loss": 0.77716482, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.79832619, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.453125, + "step": 5961, + "time_per_iteration": 2.3718082904815674 + }, + { + "auxiliary_loss_clip": 0.01067374, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.01186538, + "balance_loss_mlp": 1.02274656, + "epoch": 0.35845483240643317, + "flos": 34239216551040.0, + "grad_norm": 1.718158708216115, + "language_loss": 0.72130698, + "learning_rate": 2.971081721591294e-06, + "loss": 0.74241865, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44726562, + "step": 5962, + "time_per_iteration": 2.536874294281006 + }, + { + "auxiliary_loss_clip": 0.0106359, + "auxiliary_loss_mlp": 0.01044537, + "balance_loss_clip": 1.01788211, + "balance_loss_mlp": 1.02134705, + "epoch": 0.35851495565910113, + "flos": 20959023945600.0, + "grad_norm": 1.9110352712841838, + "language_loss": 0.75610709, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.77718836, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 5963, + "time_per_iteration": 2.386296272277832 + }, + { + "auxiliary_loss_clip": 0.0106565, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.01377559, + "balance_loss_mlp": 1.02154005, + "epoch": 0.35857507891176915, + "flos": 22308766174080.0, + "grad_norm": 2.812090751538277, + "language_loss": 0.80012524, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.82122111, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44140625, + "step": 5964, + "time_per_iteration": 2.445861339569092 + }, + { + "auxiliary_loss_clip": 0.01066114, + "auxiliary_loss_mlp": 0.01053269, + "balance_loss_clip": 1.02046275, + "balance_loss_mlp": 1.02024233, + "epoch": 0.3586352021644371, + "flos": 23366739237120.0, + "grad_norm": 1.7965609813693006, + "language_loss": 0.67619348, + "learning_rate": 2.970060137410626e-06, + "loss": 0.69738734, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.45898438, + "step": 5965, + "time_per_iteration": 2.406238317489624 + }, + { + "auxiliary_loss_clip": 0.01065276, + "auxiliary_loss_mlp": 0.01052499, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.02156281, + "epoch": 0.3586953254171051, + "flos": 27848156042880.0, + "grad_norm": 1.8118013127259531, + "language_loss": 0.79856896, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81974673, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4375, + "step": 5966, + "time_per_iteration": 2.4770190715789795 + }, + { + "auxiliary_loss_clip": 0.01064513, + "auxiliary_loss_mlp": 0.01063297, + "balance_loss_clip": 1.02916765, + "balance_loss_mlp": 1.01951218, + "epoch": 0.35875544866977305, + "flos": 19499479891200.0, + "grad_norm": 1.8985582454095418, + "language_loss": 0.92483246, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.94611055, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.44921875, + "step": 5967, + "time_per_iteration": 2.3709988594055176 + }, + { + "auxiliary_loss_clip": 0.01067191, + "auxiliary_loss_mlp": 0.01060639, + "balance_loss_clip": 1.02821398, + "balance_loss_mlp": 1.0209744, + "epoch": 0.358815571922441, + "flos": 21470047649280.0, + "grad_norm": 2.194333888765889, + "language_loss": 0.82124805, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.84252632, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46289062, + "step": 5968, + "time_per_iteration": 2.4276368618011475 + }, + { + "auxiliary_loss_clip": 0.01066325, + "auxiliary_loss_mlp": 0.01066301, + "balance_loss_clip": 1.03165925, + "balance_loss_mlp": 1.01982462, + "epoch": 0.358875695175109, + "flos": 21834331441920.0, + "grad_norm": 2.4259509985793333, + "language_loss": 0.85823482, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.87956107, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.46484375, + "step": 5969, + "time_per_iteration": 2.381028652191162 + }, + { + "auxiliary_loss_clip": 0.01064737, + "auxiliary_loss_mlp": 0.01045604, + "balance_loss_clip": 1.01558721, + "balance_loss_mlp": 1.0206995, + "epoch": 0.35893581842777694, + "flos": 32010361488000.0, + "grad_norm": 1.695335481041504, + "language_loss": 0.72954571, + "learning_rate": 2.968356761586202e-06, + "loss": 0.75064909, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43945312, + "step": 5970, + "time_per_iteration": 2.5112342834472656 + }, + { + "auxiliary_loss_clip": 0.01061994, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.0158453, + "balance_loss_mlp": 1.01860046, + "epoch": 0.3589959416804449, + "flos": 20484763770240.0, + "grad_norm": 1.7517814846282334, + "language_loss": 0.81432605, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.8354075, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43359375, + "step": 5971, + "time_per_iteration": 2.3918800354003906 + }, + { + "auxiliary_loss_clip": 0.01065662, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.01526546, + "balance_loss_mlp": 1.01912987, + "epoch": 0.3590560649331129, + "flos": 16179728486400.0, + "grad_norm": 2.048734967973474, + "language_loss": 0.80042195, + "learning_rate": 2.967675154124696e-06, + "loss": 0.82154787, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46484375, + "step": 5972, + "time_per_iteration": 2.3790743350982666 + }, + { + "auxiliary_loss_clip": 0.01063872, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_clip": 1.01751053, + "balance_loss_mlp": 1.01885724, + "epoch": 0.35911618818578084, + "flos": 20374368451200.0, + "grad_norm": 2.189375427553425, + "language_loss": 0.82083088, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.84196228, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44921875, + "step": 5973, + "time_per_iteration": 2.3881454467773438 + }, + { + "auxiliary_loss_clip": 0.01014969, + "auxiliary_loss_mlp": 0.0100434, + "balance_loss_clip": 1.00158679, + "balance_loss_mlp": 1.00470555, + "epoch": 0.3591763114384488, + "flos": 41234308358400.0, + "grad_norm": 0.9160861868809462, + "language_loss": 0.56810904, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58830214, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.10253906, + "step": 5974, + "time_per_iteration": 2.908655881881714 + }, + { + "auxiliary_loss_clip": 0.0106577, + "auxiliary_loss_mlp": 0.0105349, + "balance_loss_clip": 1.02275801, + "balance_loss_mlp": 1.02006531, + "epoch": 0.35923643469111677, + "flos": 18694522517760.0, + "grad_norm": 1.8223911727452269, + "language_loss": 0.70773208, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.72892463, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45703125, + "step": 5975, + "time_per_iteration": 2.361236572265625 + }, + { + "auxiliary_loss_clip": 0.01064672, + "auxiliary_loss_mlp": 0.01053179, + "balance_loss_clip": 1.02353132, + "balance_loss_mlp": 1.01989007, + "epoch": 0.35929655794378473, + "flos": 25008774301440.0, + "grad_norm": 1.7324167990027837, + "language_loss": 0.80952322, + "learning_rate": 2.96631149897303e-06, + "loss": 0.83070171, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44726562, + "step": 5976, + "time_per_iteration": 2.424760580062866 + }, + { + "auxiliary_loss_clip": 0.01063806, + "auxiliary_loss_mlp": 0.01052682, + "balance_loss_clip": 1.02037668, + "balance_loss_mlp": 1.0189147, + "epoch": 0.35935668119645275, + "flos": 14974701310080.0, + "grad_norm": 1.9522397669206484, + "language_loss": 0.79895121, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.82011604, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44921875, + "step": 5977, + "time_per_iteration": 2.3556342124938965 + }, + { + "auxiliary_loss_clip": 0.0106475, + "auxiliary_loss_mlp": 0.01050065, + "balance_loss_clip": 1.02140665, + "balance_loss_mlp": 1.02046311, + "epoch": 0.3594168044491207, + "flos": 21177091497600.0, + "grad_norm": 2.734496974948363, + "language_loss": 0.81951249, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.84066057, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.44140625, + "step": 5978, + "time_per_iteration": 2.4259114265441895 + }, + { + "auxiliary_loss_clip": 0.0106587, + "auxiliary_loss_mlp": 0.01054655, + "balance_loss_clip": 1.02228928, + "balance_loss_mlp": 1.02003312, + "epoch": 0.3594769277017887, + "flos": 27670936648320.0, + "grad_norm": 1.5229743504862585, + "language_loss": 0.68062621, + "learning_rate": 2.965288372816436e-06, + "loss": 0.70183146, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45703125, + "step": 5979, + "time_per_iteration": 2.4538497924804688 + }, + { + "auxiliary_loss_clip": 0.01065174, + "auxiliary_loss_mlp": 0.01060736, + "balance_loss_clip": 1.03061152, + "balance_loss_mlp": 1.02009785, + "epoch": 0.35953705095445665, + "flos": 23001233546880.0, + "grad_norm": 2.315164296365949, + "language_loss": 0.68840331, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.70966244, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45117188, + "step": 5980, + "time_per_iteration": 2.416557550430298 + }, + { + "auxiliary_loss_clip": 0.01069336, + "auxiliary_loss_mlp": 0.01059082, + "balance_loss_clip": 1.02494025, + "balance_loss_mlp": 1.02145433, + "epoch": 0.3595971742071246, + "flos": 25512990289920.0, + "grad_norm": 1.6848261917683713, + "language_loss": 0.72430789, + "learning_rate": 2.964606105671327e-06, + "loss": 0.74559212, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.47851562, + "step": 5981, + "time_per_iteration": 2.4358015060424805 + }, + { + "auxiliary_loss_clip": 0.0106748, + "auxiliary_loss_mlp": 0.01060661, + "balance_loss_clip": 1.02635241, + "balance_loss_mlp": 1.02184319, + "epoch": 0.3596572974597926, + "flos": 29861247703680.0, + "grad_norm": 1.700784967385501, + "language_loss": 0.72277397, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.74405539, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.45703125, + "step": 5982, + "time_per_iteration": 2.4497992992401123 + }, + { + "auxiliary_loss_clip": 0.0106492, + "auxiliary_loss_mlp": 0.01052304, + "balance_loss_clip": 1.02307439, + "balance_loss_mlp": 1.02184844, + "epoch": 0.35971742071246054, + "flos": 23111419397760.0, + "grad_norm": 1.8796246492144724, + "language_loss": 0.76857209, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78974438, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 5983, + "time_per_iteration": 2.4587509632110596 + }, + { + "auxiliary_loss_clip": 0.01070278, + "auxiliary_loss_mlp": 0.01053895, + "balance_loss_clip": 1.0179894, + "balance_loss_mlp": 1.02212763, + "epoch": 0.3597775439651285, + "flos": 16724478430080.0, + "grad_norm": 1.7392171900075237, + "language_loss": 0.77354467, + "learning_rate": 2.96358243065131e-06, + "loss": 0.79478633, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.48242188, + "step": 5984, + "time_per_iteration": 2.357228994369507 + }, + { + "auxiliary_loss_clip": 0.01066758, + "auxiliary_loss_mlp": 0.01054138, + "balance_loss_clip": 1.02172458, + "balance_loss_mlp": 1.02158594, + "epoch": 0.3598376672177965, + "flos": 19718455138560.0, + "grad_norm": 1.7184436444566886, + "language_loss": 0.87551719, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.89672613, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.453125, + "step": 5985, + "time_per_iteration": 2.4332563877105713 + }, + { + "auxiliary_loss_clip": 0.01066234, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0178256, + "balance_loss_mlp": 1.02180851, + "epoch": 0.35989779047046444, + "flos": 17310565290240.0, + "grad_norm": 1.588851209134777, + "language_loss": 0.73748875, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.75862569, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4453125, + "step": 5986, + "time_per_iteration": 2.3704192638397217 + }, + { + "auxiliary_loss_clip": 0.01070095, + "auxiliary_loss_mlp": 0.01055168, + "balance_loss_clip": 1.01890421, + "balance_loss_mlp": 1.0218972, + "epoch": 0.3599579137231324, + "flos": 22710127697280.0, + "grad_norm": 1.7906121574254825, + "language_loss": 0.75163788, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.77289051, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.48046875, + "step": 5987, + "time_per_iteration": 2.4077744483947754 + }, + { + "auxiliary_loss_clip": 0.01069798, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.0147301, + "balance_loss_mlp": 1.02218843, + "epoch": 0.36001803697580037, + "flos": 20958814477440.0, + "grad_norm": 1.7819949714729768, + "language_loss": 0.71182537, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.73300946, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4765625, + "step": 5988, + "time_per_iteration": 2.37961745262146 + }, + { + "auxiliary_loss_clip": 0.01070509, + "auxiliary_loss_mlp": 0.01049388, + "balance_loss_clip": 1.01623583, + "balance_loss_mlp": 1.02223706, + "epoch": 0.36007816022846834, + "flos": 20484519390720.0, + "grad_norm": 2.5930777769272395, + "language_loss": 0.74443376, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.76563275, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.48242188, + "step": 5989, + "time_per_iteration": 2.392566680908203 + }, + { + "auxiliary_loss_clip": 0.01063388, + "auxiliary_loss_mlp": 0.01050184, + "balance_loss_clip": 1.01940393, + "balance_loss_mlp": 1.01888919, + "epoch": 0.36013828348113636, + "flos": 27999993012480.0, + "grad_norm": 1.4835589710238526, + "language_loss": 0.80874628, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82988203, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 5990, + "time_per_iteration": 3.9099249839782715 + }, + { + "auxiliary_loss_clip": 0.01067808, + "auxiliary_loss_mlp": 0.01052296, + "balance_loss_clip": 1.01809454, + "balance_loss_mlp": 1.02162313, + "epoch": 0.3601984067338043, + "flos": 20081202831360.0, + "grad_norm": 1.7890676150293896, + "language_loss": 0.85160816, + "learning_rate": 2.961192577338698e-06, + "loss": 0.87280917, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.4609375, + "step": 5991, + "time_per_iteration": 2.374112844467163 + }, + { + "auxiliary_loss_clip": 0.01068916, + "auxiliary_loss_mlp": 0.01055656, + "balance_loss_clip": 1.01917791, + "balance_loss_mlp": 1.02115655, + "epoch": 0.3602585299864723, + "flos": 18616806097920.0, + "grad_norm": 2.0679591543757243, + "language_loss": 0.77003568, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.79128146, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.4765625, + "step": 5992, + "time_per_iteration": 2.407437562942505 + }, + { + "auxiliary_loss_clip": 0.01063593, + "auxiliary_loss_mlp": 0.01053041, + "balance_loss_clip": 1.02025867, + "balance_loss_mlp": 1.01918459, + "epoch": 0.36031865323914025, + "flos": 19571994518400.0, + "grad_norm": 2.0842862370685435, + "language_loss": 0.79251701, + "learning_rate": 2.960509433875627e-06, + "loss": 0.81368327, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44335938, + "step": 5993, + "time_per_iteration": 2.4100444316864014 + }, + { + "auxiliary_loss_clip": 0.010701, + "auxiliary_loss_mlp": 0.01055436, + "balance_loss_clip": 1.01993513, + "balance_loss_mlp": 1.02180481, + "epoch": 0.3603787764918082, + "flos": 17489739720960.0, + "grad_norm": 2.3676338883182977, + "language_loss": 0.75961196, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.78086734, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48242188, + "step": 5994, + "time_per_iteration": 3.905600070953369 + }, + { + "auxiliary_loss_clip": 0.01067679, + "auxiliary_loss_mlp": 0.01056967, + "balance_loss_clip": 1.02108502, + "balance_loss_mlp": 1.02043438, + "epoch": 0.3604388997444762, + "flos": 15522488542080.0, + "grad_norm": 1.7629234950078223, + "language_loss": 0.71027625, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.73152268, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.47265625, + "step": 5995, + "time_per_iteration": 3.783785820007324 + }, + { + "auxiliary_loss_clip": 0.01069679, + "auxiliary_loss_mlp": 0.01057622, + "balance_loss_clip": 1.02033377, + "balance_loss_mlp": 1.02088356, + "epoch": 0.36049902299714415, + "flos": 17309936885760.0, + "grad_norm": 1.8406735319765823, + "language_loss": 0.83927298, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.86054599, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.48828125, + "step": 5996, + "time_per_iteration": 2.3577911853790283 + }, + { + "auxiliary_loss_clip": 0.01066854, + "auxiliary_loss_mlp": 0.01052389, + "balance_loss_clip": 1.01954734, + "balance_loss_mlp": 1.02068782, + "epoch": 0.3605591462498121, + "flos": 17055070894080.0, + "grad_norm": 1.5776884139582494, + "language_loss": 0.74332714, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76451951, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4609375, + "step": 5997, + "time_per_iteration": 2.374495267868042 + }, + { + "auxiliary_loss_clip": 0.01064596, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.02004993, + "balance_loss_mlp": 1.01939976, + "epoch": 0.3606192695024801, + "flos": 16835921089920.0, + "grad_norm": 2.651191356984711, + "language_loss": 0.71166515, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.73283702, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.453125, + "step": 5998, + "time_per_iteration": 2.3559012413024902 + }, + { + "auxiliary_loss_clip": 0.01066578, + "auxiliary_loss_mlp": 0.01049578, + "balance_loss_clip": 1.01567507, + "balance_loss_mlp": 1.02029157, + "epoch": 0.36067939275514804, + "flos": 12128860967040.0, + "grad_norm": 2.4768192273040612, + "language_loss": 0.79070413, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.81186563, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.46289062, + "step": 5999, + "time_per_iteration": 2.358144521713257 + }, + { + "auxiliary_loss_clip": 0.01068347, + "auxiliary_loss_mlp": 0.01056508, + "balance_loss_clip": 1.02391613, + "balance_loss_mlp": 1.02092934, + "epoch": 0.360739516007816, + "flos": 18040459507200.0, + "grad_norm": 1.743335082232669, + "language_loss": 0.79624999, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.81749856, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47265625, + "step": 6000, + "time_per_iteration": 2.4855520725250244 + }, + { + "auxiliary_loss_clip": 0.01065314, + "auxiliary_loss_mlp": 0.01052557, + "balance_loss_clip": 1.01839125, + "balance_loss_mlp": 1.01975477, + "epoch": 0.360799639260484, + "flos": 18548864858880.0, + "grad_norm": 1.6347947278559263, + "language_loss": 0.79332066, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.81449938, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.45507812, + "step": 6001, + "time_per_iteration": 2.3546643257141113 + }, + { + "auxiliary_loss_clip": 0.01065188, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.01195335, + "balance_loss_mlp": 1.02053404, + "epoch": 0.36085976251315194, + "flos": 19681028294400.0, + "grad_norm": 2.7908463790166635, + "language_loss": 0.84233183, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.8634091, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4453125, + "step": 6002, + "time_per_iteration": 2.3881120681762695 + }, + { + "auxiliary_loss_clip": 0.01062484, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.01669002, + "balance_loss_mlp": 1.01905847, + "epoch": 0.3609198857658199, + "flos": 24198021642240.0, + "grad_norm": 2.3355286927910193, + "language_loss": 0.92213798, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.94323862, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43359375, + "step": 6003, + "time_per_iteration": 2.379826784133911 + }, + { + "auxiliary_loss_clip": 0.01021841, + "auxiliary_loss_mlp": 0.01006049, + "balance_loss_clip": 1.00261593, + "balance_loss_mlp": 1.01119137, + "epoch": 0.3609800090184879, + "flos": 57112946346240.0, + "grad_norm": 0.9664460917388455, + "language_loss": 0.53413963, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55441856, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.03442383, + "router_z_loss_mlp": 0.10644531, + "step": 6004, + "time_per_iteration": 2.9869070053100586 + }, + { + "auxiliary_loss_clip": 0.01068101, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.01617146, + "balance_loss_mlp": 1.02059591, + "epoch": 0.3610401322711559, + "flos": 20810259175680.0, + "grad_norm": 1.7901569410440712, + "language_loss": 0.78828871, + "learning_rate": 2.956407517225883e-06, + "loss": 0.80950916, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 0.47265625, + "step": 6005, + "time_per_iteration": 2.408513069152832 + }, + { + "auxiliary_loss_clip": 0.01066106, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.01514864, + "balance_loss_mlp": 1.02110124, + "epoch": 0.36110025552382385, + "flos": 13698311581440.0, + "grad_norm": 1.9245382792614767, + "language_loss": 0.799564, + "learning_rate": 2.956065454793429e-06, + "loss": 0.82070267, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44921875, + "step": 6006, + "time_per_iteration": 2.3796803951263428 + }, + { + "auxiliary_loss_clip": 0.01068586, + "auxiliary_loss_mlp": 0.01056608, + "balance_loss_clip": 1.01855588, + "balance_loss_mlp": 1.02148604, + "epoch": 0.3611603787764918, + "flos": 22453935073920.0, + "grad_norm": 1.8154335021057288, + "language_loss": 0.85952306, + "learning_rate": 2.955723356106876e-06, + "loss": 0.88077497, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 0.47070312, + "step": 6007, + "time_per_iteration": 2.400137186050415 + }, + { + "auxiliary_loss_clip": 0.01071155, + "auxiliary_loss_mlp": 0.01052984, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.02083087, + "epoch": 0.3612205020291598, + "flos": 20885601623040.0, + "grad_norm": 3.2168944122961833, + "language_loss": 0.74254429, + "learning_rate": 2.955381221179198e-06, + "loss": 0.76378572, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 0.5, + "step": 6008, + "time_per_iteration": 2.4336371421813965 + }, + { + "auxiliary_loss_clip": 0.01065501, + "auxiliary_loss_mlp": 0.01053115, + "balance_loss_clip": 1.01880622, + "balance_loss_mlp": 1.02067256, + "epoch": 0.36128062528182775, + "flos": 15741079764480.0, + "grad_norm": 2.8965197859536005, + "language_loss": 0.84822464, + "learning_rate": 2.955039050023368e-06, + "loss": 0.86941087, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.44921875, + "step": 6009, + "time_per_iteration": 2.3686957359313965 + }, + { + "auxiliary_loss_clip": 0.01067681, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_clip": 1.01813424, + "balance_loss_mlp": 1.02122915, + "epoch": 0.3613407485344957, + "flos": 16763546108160.0, + "grad_norm": 1.874431776735353, + "language_loss": 0.78563225, + "learning_rate": 2.954696842652362e-06, + "loss": 0.80682659, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46484375, + "step": 6010, + "time_per_iteration": 2.3846652507781982 + }, + { + "auxiliary_loss_clip": 0.01067735, + "auxiliary_loss_mlp": 0.01049969, + "balance_loss_clip": 1.0168761, + "balance_loss_mlp": 1.02130866, + "epoch": 0.3614008717871637, + "flos": 20370283822080.0, + "grad_norm": 1.60495570058458, + "language_loss": 0.83838862, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85956562, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46289062, + "step": 6011, + "time_per_iteration": 2.4071991443634033 + }, + { + "auxiliary_loss_clip": 0.01070637, + "auxiliary_loss_mlp": 0.01056015, + "balance_loss_clip": 1.01937056, + "balance_loss_mlp": 1.0223186, + "epoch": 0.36146099503983165, + "flos": 22775764786560.0, + "grad_norm": 1.9155660515828825, + "language_loss": 0.6373263, + "learning_rate": 2.954012319316727e-06, + "loss": 0.65859282, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.48242188, + "step": 6012, + "time_per_iteration": 2.4455463886260986 + }, + { + "auxiliary_loss_clip": 0.01062819, + "auxiliary_loss_mlp": 0.01057649, + "balance_loss_clip": 1.02582002, + "balance_loss_mlp": 1.01888442, + "epoch": 0.3615211182924996, + "flos": 22995717552000.0, + "grad_norm": 1.854027394348883, + "language_loss": 0.85233831, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.87354302, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.43945312, + "step": 6013, + "time_per_iteration": 2.40134596824646 + }, + { + "auxiliary_loss_clip": 0.01064926, + "auxiliary_loss_mlp": 0.01050441, + "balance_loss_clip": 1.01532197, + "balance_loss_mlp": 1.01918006, + "epoch": 0.3615812415451676, + "flos": 16647320592000.0, + "grad_norm": 1.6979708046258233, + "language_loss": 0.92642605, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.94757968, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.45703125, + "step": 6014, + "time_per_iteration": 2.3910655975341797 + }, + { + "auxiliary_loss_clip": 0.01063371, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.01518571, + "balance_loss_mlp": 1.01853967, + "epoch": 0.36164136479783554, + "flos": 21319153286400.0, + "grad_norm": 1.8960365431521347, + "language_loss": 0.75032294, + "learning_rate": 2.95298526302391e-06, + "loss": 0.77143824, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.44921875, + "step": 6015, + "time_per_iteration": 2.4150164127349854 + }, + { + "auxiliary_loss_clip": 0.01067279, + "auxiliary_loss_mlp": 0.0105135, + "balance_loss_clip": 1.01830471, + "balance_loss_mlp": 1.01988709, + "epoch": 0.3617014880505035, + "flos": 24168449854080.0, + "grad_norm": 1.8021535183297066, + "language_loss": 0.67053616, + "learning_rate": 2.9526428386344e-06, + "loss": 0.69172251, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47460938, + "step": 6016, + "time_per_iteration": 2.4747092723846436 + }, + { + "auxiliary_loss_clip": 0.01069097, + "auxiliary_loss_mlp": 0.01059103, + "balance_loss_clip": 1.01962101, + "balance_loss_mlp": 1.0212388, + "epoch": 0.3617616113031715, + "flos": 39013414951680.0, + "grad_norm": 1.8483925414323463, + "language_loss": 0.73043448, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.75171649, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 0.47851562, + "step": 6017, + "time_per_iteration": 2.552084445953369 + }, + { + "auxiliary_loss_clip": 0.01068711, + "auxiliary_loss_mlp": 0.01052887, + "balance_loss_clip": 1.01807785, + "balance_loss_mlp": 1.02026224, + "epoch": 0.3618217345558395, + "flos": 12130013041920.0, + "grad_norm": 2.0411449496127942, + "language_loss": 0.75059456, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.77181053, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.48632812, + "step": 6018, + "time_per_iteration": 2.3631505966186523 + }, + { + "auxiliary_loss_clip": 0.01062903, + "auxiliary_loss_mlp": 0.01050017, + "balance_loss_clip": 1.016662, + "balance_loss_mlp": 1.01910746, + "epoch": 0.36188185780850746, + "flos": 24933885701760.0, + "grad_norm": 1.8574084016516483, + "language_loss": 0.70084006, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.72196925, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4375, + "step": 6019, + "time_per_iteration": 2.449065923690796 + }, + { + "auxiliary_loss_clip": 0.01070801, + "auxiliary_loss_mlp": 0.01054784, + "balance_loss_clip": 1.01928365, + "balance_loss_mlp": 1.02175879, + "epoch": 0.3619419810611754, + "flos": 20957802048000.0, + "grad_norm": 1.5689644611582465, + "language_loss": 0.77784818, + "learning_rate": 2.95127277996311e-06, + "loss": 0.79910398, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.49023438, + "step": 6020, + "time_per_iteration": 2.4063119888305664 + }, + { + "auxiliary_loss_clip": 0.01068636, + "auxiliary_loss_mlp": 0.01058916, + "balance_loss_clip": 1.02103162, + "balance_loss_mlp": 1.02161789, + "epoch": 0.3620021043138434, + "flos": 22527776332800.0, + "grad_norm": 1.9400272119151518, + "language_loss": 0.74886525, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.77014077, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.47070312, + "step": 6021, + "time_per_iteration": 2.4168219566345215 + }, + { + "auxiliary_loss_clip": 0.01066231, + "auxiliary_loss_mlp": 0.01053458, + "balance_loss_clip": 1.0185771, + "balance_loss_mlp": 1.01976502, + "epoch": 0.36206222756651135, + "flos": 15595771219200.0, + "grad_norm": 2.5642410008152576, + "language_loss": 0.82155049, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.84274733, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.46484375, + "step": 6022, + "time_per_iteration": 2.415053606033325 + }, + { + "auxiliary_loss_clip": 0.01065803, + "auxiliary_loss_mlp": 0.01052201, + "balance_loss_clip": 1.01939464, + "balance_loss_mlp": 1.02084279, + "epoch": 0.3621223508191793, + "flos": 23586028686720.0, + "grad_norm": 1.5152122707145106, + "language_loss": 0.82923388, + "learning_rate": 2.950244857154417e-06, + "loss": 0.85041398, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44921875, + "step": 6023, + "time_per_iteration": 2.4238691329956055 + }, + { + "auxiliary_loss_clip": 0.01069081, + "auxiliary_loss_mlp": 0.0105603, + "balance_loss_clip": 1.02031541, + "balance_loss_mlp": 1.020661, + "epoch": 0.3621824740718473, + "flos": 22308801085440.0, + "grad_norm": 1.826065443629867, + "language_loss": 0.81824923, + "learning_rate": 2.9499021441341e-06, + "loss": 0.83950031, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48632812, + "step": 6024, + "time_per_iteration": 2.418917655944824 + }, + { + "auxiliary_loss_clip": 0.01064743, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.01842403, + "balance_loss_mlp": 1.0209198, + "epoch": 0.36224259732451525, + "flos": 16762708235520.0, + "grad_norm": 2.295432912708728, + "language_loss": 0.75984889, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.78099501, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43945312, + "step": 6025, + "time_per_iteration": 2.3947601318359375 + }, + { + "auxiliary_loss_clip": 0.01065909, + "auxiliary_loss_mlp": 0.01050213, + "balance_loss_clip": 1.01880181, + "balance_loss_mlp": 1.02104068, + "epoch": 0.3623027205771832, + "flos": 23148601862400.0, + "grad_norm": 1.5840849188232182, + "language_loss": 0.73856223, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.75972342, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44921875, + "step": 6026, + "time_per_iteration": 2.4243900775909424 + }, + { + "auxiliary_loss_clip": 0.01072416, + "auxiliary_loss_mlp": 0.01063836, + "balance_loss_clip": 1.02561736, + "balance_loss_mlp": 1.02213848, + "epoch": 0.3623628438298512, + "flos": 28547884978560.0, + "grad_norm": 1.985710687317433, + "language_loss": 0.80281603, + "learning_rate": 2.948873789002833e-06, + "loss": 0.82417858, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.50390625, + "step": 6027, + "time_per_iteration": 2.4597713947296143 + }, + { + "auxiliary_loss_clip": 0.01069112, + "auxiliary_loss_mlp": 0.0105266, + "balance_loss_clip": 1.01656389, + "balance_loss_mlp": 1.02115369, + "epoch": 0.36242296708251914, + "flos": 25483732704000.0, + "grad_norm": 1.8167003319544601, + "language_loss": 0.69465673, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.71587443, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.48046875, + "step": 6028, + "time_per_iteration": 2.4455742835998535 + }, + { + "auxiliary_loss_clip": 0.01065494, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.01545012, + "balance_loss_mlp": 1.02092052, + "epoch": 0.3624830903351871, + "flos": 16289425578240.0, + "grad_norm": 2.5719377544118025, + "language_loss": 0.8673234, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.88844353, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 6029, + "time_per_iteration": 5.243697643280029 + }, + { + "auxiliary_loss_clip": 0.0106436, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.01600814, + "balance_loss_mlp": 1.02086985, + "epoch": 0.36254321358785513, + "flos": 18295325498880.0, + "grad_norm": 1.564412714912511, + "language_loss": 0.73721516, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.75832546, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43554688, + "step": 6030, + "time_per_iteration": 2.355849266052246 + }, + { + "auxiliary_loss_clip": 0.01069514, + "auxiliary_loss_mlp": 0.01054315, + "balance_loss_clip": 1.01705015, + "balance_loss_mlp": 1.02105188, + "epoch": 0.3626033368405231, + "flos": 14864445636480.0, + "grad_norm": 2.1051685295351374, + "language_loss": 0.76701778, + "learning_rate": 2.94750214514905e-06, + "loss": 0.78825611, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.484375, + "step": 6031, + "time_per_iteration": 2.413341522216797 + }, + { + "auxiliary_loss_clip": 0.01064157, + "auxiliary_loss_mlp": 0.01049704, + "balance_loss_clip": 1.01719582, + "balance_loss_mlp": 1.01966405, + "epoch": 0.36266346009319106, + "flos": 22305589240320.0, + "grad_norm": 3.468221423645251, + "language_loss": 0.74852788, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.76966655, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4453125, + "step": 6032, + "time_per_iteration": 2.3793890476226807 + }, + { + "auxiliary_loss_clip": 0.01066467, + "auxiliary_loss_mlp": 0.01047569, + "balance_loss_clip": 1.01600266, + "balance_loss_mlp": 1.02003443, + "epoch": 0.362723583345859, + "flos": 18221379505920.0, + "grad_norm": 3.4707212765275077, + "language_loss": 0.79304862, + "learning_rate": 2.946816107593884e-06, + "loss": 0.81418902, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46484375, + "step": 6033, + "time_per_iteration": 2.3979406356811523 + }, + { + "auxiliary_loss_clip": 0.01014312, + "auxiliary_loss_mlp": 0.01011718, + "balance_loss_clip": 1.00804591, + "balance_loss_mlp": 1.00398135, + "epoch": 0.362783706598527, + "flos": 68495818959360.0, + "grad_norm": 0.7970406028189138, + "language_loss": 0.648929, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66918921, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.10351562, + "step": 6034, + "time_per_iteration": 4.411875247955322 + }, + { + "auxiliary_loss_clip": 0.01063474, + "auxiliary_loss_mlp": 0.01049987, + "balance_loss_clip": 1.01665616, + "balance_loss_mlp": 1.018538, + "epoch": 0.36284382985119495, + "flos": 26575432007040.0, + "grad_norm": 1.64374318272035, + "language_loss": 0.90672064, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92785531, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44921875, + "step": 6035, + "time_per_iteration": 3.8427653312683105 + }, + { + "auxiliary_loss_clip": 0.01067381, + "auxiliary_loss_mlp": 0.01048833, + "balance_loss_clip": 1.01457191, + "balance_loss_mlp": 1.02002573, + "epoch": 0.3629039531038629, + "flos": 20155742317440.0, + "grad_norm": 2.251950160992578, + "language_loss": 0.74839061, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76955271, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.47460938, + "step": 6036, + "time_per_iteration": 2.397188425064087 + }, + { + "auxiliary_loss_clip": 0.01067143, + "auxiliary_loss_mlp": 0.01056364, + "balance_loss_clip": 1.02055371, + "balance_loss_mlp": 1.02056324, + "epoch": 0.3629640763565309, + "flos": 18624696065280.0, + "grad_norm": 1.9179781810052645, + "language_loss": 0.76948333, + "learning_rate": 2.945443601747297e-06, + "loss": 0.79071844, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.46679688, + "step": 6037, + "time_per_iteration": 2.3758556842803955 + }, + { + "auxiliary_loss_clip": 0.0106298, + "auxiliary_loss_mlp": 0.01054975, + "balance_loss_clip": 1.02311003, + "balance_loss_mlp": 1.02056813, + "epoch": 0.36302419960919885, + "flos": 19570493329920.0, + "grad_norm": 1.8352873530613996, + "language_loss": 0.7927835, + "learning_rate": 2.945100385624828e-06, + "loss": 0.81396306, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.42382812, + "step": 6038, + "time_per_iteration": 2.4023067951202393 + }, + { + "auxiliary_loss_clip": 0.01014351, + "auxiliary_loss_mlp": 0.0100412, + "balance_loss_clip": 1.00080621, + "balance_loss_mlp": 1.00455213, + "epoch": 0.3630843228618668, + "flos": 63794239920000.0, + "grad_norm": 0.8418757327163344, + "language_loss": 0.63619882, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65638357, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.09814453, + "step": 6039, + "time_per_iteration": 3.102839231491089 + }, + { + "auxiliary_loss_clip": 0.01066149, + "auxiliary_loss_mlp": 0.01056744, + "balance_loss_clip": 1.02496231, + "balance_loss_mlp": 1.02185643, + "epoch": 0.3631444461145348, + "flos": 21834087062400.0, + "grad_norm": 2.1931429674313483, + "language_loss": 0.72609472, + "learning_rate": 2.944413845878002e-06, + "loss": 0.74732363, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44335938, + "step": 6040, + "time_per_iteration": 2.3959462642669678 + }, + { + "auxiliary_loss_clip": 0.01067871, + "auxiliary_loss_mlp": 0.01052749, + "balance_loss_clip": 1.01839328, + "balance_loss_mlp": 1.02096641, + "epoch": 0.36320456936720275, + "flos": 21721073391360.0, + "grad_norm": 1.6664630938019536, + "language_loss": 0.82849962, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.84970582, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46875, + "step": 6041, + "time_per_iteration": 2.4016969203948975 + }, + { + "auxiliary_loss_clip": 0.01068418, + "auxiliary_loss_mlp": 0.01052964, + "balance_loss_clip": 1.01949024, + "balance_loss_mlp": 1.02072299, + "epoch": 0.3632646926198707, + "flos": 17018132808960.0, + "grad_norm": 2.006842735356281, + "language_loss": 0.86020947, + "learning_rate": 2.943727162882107e-06, + "loss": 0.88142323, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4765625, + "step": 6042, + "time_per_iteration": 2.4044528007507324 + }, + { + "auxiliary_loss_clip": 0.01064587, + "auxiliary_loss_mlp": 0.01055291, + "balance_loss_clip": 1.02353382, + "balance_loss_mlp": 1.02026761, + "epoch": 0.36332481587253873, + "flos": 23330045531520.0, + "grad_norm": 1.9146668490934713, + "language_loss": 0.79420626, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.81540513, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44335938, + "step": 6043, + "time_per_iteration": 2.409888505935669 + }, + { + "auxiliary_loss_clip": 0.01062983, + "auxiliary_loss_mlp": 0.01058853, + "balance_loss_clip": 1.02530777, + "balance_loss_mlp": 1.02033389, + "epoch": 0.3633849391252067, + "flos": 10742774146560.0, + "grad_norm": 2.0777489442279045, + "language_loss": 0.67665374, + "learning_rate": 2.943040336741298e-06, + "loss": 0.69787204, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.42578125, + "step": 6044, + "time_per_iteration": 2.3758041858673096 + }, + { + "auxiliary_loss_clip": 0.01064969, + "auxiliary_loss_mlp": 0.01050608, + "balance_loss_clip": 1.01858842, + "balance_loss_mlp": 1.02146935, + "epoch": 0.36344506237787466, + "flos": 25847946673920.0, + "grad_norm": 1.7118556862556429, + "language_loss": 0.82494754, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.84610331, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.43554688, + "step": 6045, + "time_per_iteration": 2.451549768447876 + }, + { + "auxiliary_loss_clip": 0.01066653, + "auxiliary_loss_mlp": 0.01052679, + "balance_loss_clip": 1.01996815, + "balance_loss_mlp": 1.02171302, + "epoch": 0.3635051856305426, + "flos": 30152737578240.0, + "grad_norm": 3.0986418979590997, + "language_loss": 0.67059064, + "learning_rate": 2.942353367559755e-06, + "loss": 0.69178396, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44921875, + "step": 6046, + "time_per_iteration": 2.4797322750091553 + }, + { + "auxiliary_loss_clip": 0.01065578, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.0187633, + "balance_loss_mlp": 1.02054262, + "epoch": 0.3635653088832106, + "flos": 22197358425600.0, + "grad_norm": 1.5701601978803301, + "language_loss": 0.78448278, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.80564582, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44921875, + "step": 6047, + "time_per_iteration": 2.439427137374878 + }, + { + "auxiliary_loss_clip": 0.01069475, + "auxiliary_loss_mlp": 0.01058973, + "balance_loss_clip": 1.02361584, + "balance_loss_mlp": 1.02075171, + "epoch": 0.36362543213587856, + "flos": 24785993715840.0, + "grad_norm": 1.59592476816687, + "language_loss": 0.80765581, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.82894033, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.48632812, + "step": 6048, + "time_per_iteration": 2.449002265930176 + }, + { + "auxiliary_loss_clip": 0.01012747, + "auxiliary_loss_mlp": 0.01013999, + "balance_loss_clip": 1.0102793, + "balance_loss_mlp": 1.00319743, + "epoch": 0.3636855553885465, + "flos": 62522877427200.0, + "grad_norm": 0.768398638911346, + "language_loss": 0.52602255, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54629004, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.09570312, + "step": 6049, + "time_per_iteration": 3.1021018028259277 + }, + { + "auxiliary_loss_clip": 0.010673, + "auxiliary_loss_mlp": 0.01053669, + "balance_loss_clip": 1.02136385, + "balance_loss_mlp": 1.0226531, + "epoch": 0.3637456786412145, + "flos": 24059520812160.0, + "grad_norm": 1.8215225142766682, + "language_loss": 0.88060814, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.9018178, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44726562, + "step": 6050, + "time_per_iteration": 2.4546291828155518 + }, + { + "auxiliary_loss_clip": 0.01065596, + "auxiliary_loss_mlp": 0.0104751, + "balance_loss_clip": 1.01704037, + "balance_loss_mlp": 1.02206826, + "epoch": 0.36380580189388245, + "flos": 16690542721920.0, + "grad_norm": 1.7905212725623627, + "language_loss": 0.79699004, + "learning_rate": 2.940635319486546e-06, + "loss": 0.81812114, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43554688, + "step": 6051, + "time_per_iteration": 2.452287197113037 + }, + { + "auxiliary_loss_clip": 0.01065189, + "auxiliary_loss_mlp": 0.010461, + "balance_loss_clip": 1.01722789, + "balance_loss_mlp": 1.02158749, + "epoch": 0.3638659251465504, + "flos": 25113060132480.0, + "grad_norm": 1.8044377090517456, + "language_loss": 0.84169936, + "learning_rate": 2.940291602812822e-06, + "loss": 0.86281222, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43554688, + "step": 6052, + "time_per_iteration": 2.464834690093994 + }, + { + "auxiliary_loss_clip": 0.01062815, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.01497316, + "balance_loss_mlp": 1.02169895, + "epoch": 0.3639260483992184, + "flos": 23001896862720.0, + "grad_norm": 1.6479627544398416, + "language_loss": 0.73463857, + "learning_rate": 2.939947850483145e-06, + "loss": 0.75568402, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 6053, + "time_per_iteration": 2.4610178470611572 + }, + { + "auxiliary_loss_clip": 0.01014951, + "auxiliary_loss_mlp": 0.01003151, + "balance_loss_clip": 1.00002789, + "balance_loss_mlp": 1.00536585, + "epoch": 0.36398617165188635, + "flos": 70712839071360.0, + "grad_norm": 0.7678438056466644, + "language_loss": 0.61335182, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63353288, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.03125, + "router_z_loss_mlp": 0.09570312, + "step": 6054, + "time_per_iteration": 3.069837808609009 + }, + { + "auxiliary_loss_clip": 0.01066553, + "auxiliary_loss_mlp": 0.01059686, + "balance_loss_clip": 1.02603269, + "balance_loss_mlp": 1.02161443, + "epoch": 0.3640462949045543, + "flos": 22234401244800.0, + "grad_norm": 1.9277682401823124, + "language_loss": 0.77414942, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.79541183, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.44921875, + "step": 6055, + "time_per_iteration": 2.4518473148345947 + }, + { + "auxiliary_loss_clip": 0.01064281, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.02424932, + "balance_loss_mlp": 1.02099764, + "epoch": 0.3641064181572223, + "flos": 21542457542400.0, + "grad_norm": 1.770214631831233, + "language_loss": 0.76862454, + "learning_rate": 2.938916379688765e-06, + "loss": 0.78982198, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43164062, + "step": 6056, + "time_per_iteration": 2.400132894515991 + }, + { + "auxiliary_loss_clip": 0.01064352, + "auxiliary_loss_mlp": 0.01054139, + "balance_loss_clip": 1.02443159, + "balance_loss_mlp": 1.02101016, + "epoch": 0.3641665414098903, + "flos": 22272212113920.0, + "grad_norm": 2.1215672124157017, + "language_loss": 0.81835747, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.83954245, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 6057, + "time_per_iteration": 2.414478302001953 + }, + { + "auxiliary_loss_clip": 0.01065362, + "auxiliary_loss_mlp": 0.01058418, + "balance_loss_clip": 1.02744746, + "balance_loss_mlp": 1.02142787, + "epoch": 0.36422666466255826, + "flos": 28328420972160.0, + "grad_norm": 2.2866156848367676, + "language_loss": 0.8198843, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.84112215, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43945312, + "step": 6058, + "time_per_iteration": 2.4605324268341064 + }, + { + "auxiliary_loss_clip": 0.01065325, + "auxiliary_loss_mlp": 0.01064226, + "balance_loss_clip": 1.03474534, + "balance_loss_mlp": 1.02128923, + "epoch": 0.36428678791522623, + "flos": 24169357549440.0, + "grad_norm": 1.6914433938816842, + "language_loss": 0.86271828, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.88401377, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44140625, + "step": 6059, + "time_per_iteration": 2.432072401046753 + }, + { + "auxiliary_loss_clip": 0.01064811, + "auxiliary_loss_mlp": 0.01060814, + "balance_loss_clip": 1.02874684, + "balance_loss_mlp": 1.02048981, + "epoch": 0.3643469111678942, + "flos": 22527357396480.0, + "grad_norm": 1.4735670381864518, + "language_loss": 0.88610202, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90735829, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44335938, + "step": 6060, + "time_per_iteration": 2.415597915649414 + }, + { + "auxiliary_loss_clip": 0.01065619, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03197145, + "balance_loss_mlp": 1.0219543, + "epoch": 0.36440703442056216, + "flos": 19425603720960.0, + "grad_norm": 2.078039531231747, + "language_loss": 0.68328512, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70456076, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4375, + "step": 6061, + "time_per_iteration": 2.4130678176879883 + }, + { + "auxiliary_loss_clip": 0.01065735, + "auxiliary_loss_mlp": 0.01057996, + "balance_loss_clip": 1.02673936, + "balance_loss_mlp": 1.02128601, + "epoch": 0.3644671576732301, + "flos": 18039551811840.0, + "grad_norm": 3.502665948180738, + "language_loss": 0.78082329, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.8020606, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4453125, + "step": 6062, + "time_per_iteration": 2.4070374965667725 + }, + { + "auxiliary_loss_clip": 0.0106352, + "auxiliary_loss_mlp": 0.01058834, + "balance_loss_clip": 1.02748227, + "balance_loss_mlp": 1.02019298, + "epoch": 0.3645272809258981, + "flos": 21541759315200.0, + "grad_norm": 2.1379264398893145, + "language_loss": 0.73841053, + "learning_rate": 2.936508368977432e-06, + "loss": 0.75963408, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43359375, + "step": 6063, + "time_per_iteration": 2.457726001739502 + }, + { + "auxiliary_loss_clip": 0.01061718, + "auxiliary_loss_mlp": 0.01056195, + "balance_loss_clip": 1.0258801, + "balance_loss_mlp": 1.01906681, + "epoch": 0.36458740417856605, + "flos": 22745774062080.0, + "grad_norm": 1.8443569761998375, + "language_loss": 0.69214797, + "learning_rate": 2.936164225292901e-06, + "loss": 0.71332711, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42578125, + "step": 6064, + "time_per_iteration": 2.4396913051605225 + }, + { + "auxiliary_loss_clip": 0.0106547, + "auxiliary_loss_mlp": 0.01062452, + "balance_loss_clip": 1.03263831, + "balance_loss_mlp": 1.02045178, + "epoch": 0.364647527431234, + "flos": 26139471459840.0, + "grad_norm": 1.7593490485803371, + "language_loss": 0.76793051, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.78920984, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 6065, + "time_per_iteration": 2.455273151397705 + }, + { + "auxiliary_loss_clip": 0.01066681, + "auxiliary_loss_mlp": 0.01055564, + "balance_loss_clip": 1.0236398, + "balance_loss_mlp": 1.02045679, + "epoch": 0.364707650683902, + "flos": 31028568744960.0, + "grad_norm": 1.925013973970139, + "language_loss": 0.76474226, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.78596473, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4609375, + "step": 6066, + "time_per_iteration": 2.506743907928467 + }, + { + "auxiliary_loss_clip": 0.0106359, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_clip": 1.02513051, + "balance_loss_mlp": 1.02125239, + "epoch": 0.36476777393656995, + "flos": 19571889784320.0, + "grad_norm": 2.010062039731278, + "language_loss": 0.77890325, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.80006289, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42382812, + "step": 6067, + "time_per_iteration": 2.3956353664398193 + }, + { + "auxiliary_loss_clip": 0.01062706, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_clip": 1.0182147, + "balance_loss_mlp": 1.02154708, + "epoch": 0.3648278971892379, + "flos": 17747887380480.0, + "grad_norm": 1.8467826870386332, + "language_loss": 0.72651887, + "learning_rate": 2.934787295690886e-06, + "loss": 0.74758291, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.41015625, + "step": 6068, + "time_per_iteration": 3.824467182159424 + }, + { + "auxiliary_loss_clip": 0.01067341, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.01939392, + "balance_loss_mlp": 1.02337158, + "epoch": 0.3648880204419059, + "flos": 17930203833600.0, + "grad_norm": 1.8473229036917926, + "language_loss": 0.75718153, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.77833736, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43945312, + "step": 6069, + "time_per_iteration": 3.840327501296997 + }, + { + "auxiliary_loss_clip": 0.01069561, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.0148797, + "balance_loss_mlp": 1.02440143, + "epoch": 0.3649481436945739, + "flos": 22637159222400.0, + "grad_norm": 1.7536621086701136, + "language_loss": 0.67510307, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.69625837, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.45117188, + "step": 6070, + "time_per_iteration": 2.4036340713500977 + }, + { + "auxiliary_loss_clip": 0.01067361, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.01290286, + "balance_loss_mlp": 1.02502704, + "epoch": 0.36500826694724187, + "flos": 21578592666240.0, + "grad_norm": 1.6255840910848005, + "language_loss": 0.75975692, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.78082359, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.42382812, + "step": 6071, + "time_per_iteration": 2.471506357192993 + }, + { + "auxiliary_loss_clip": 0.01067391, + "auxiliary_loss_mlp": 0.01044211, + "balance_loss_clip": 1.01377654, + "balance_loss_mlp": 1.02469277, + "epoch": 0.36506839019990983, + "flos": 13771664081280.0, + "grad_norm": 1.8518747771017972, + "language_loss": 0.90138894, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.92250496, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42578125, + "step": 6072, + "time_per_iteration": 2.3868484497070312 + }, + { + "auxiliary_loss_clip": 0.01067377, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_clip": 1.02161932, + "balance_loss_mlp": 1.02386093, + "epoch": 0.3651285134525778, + "flos": 17274011230080.0, + "grad_norm": 2.146757077798453, + "language_loss": 0.74609745, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.76727498, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.43554688, + "step": 6073, + "time_per_iteration": 2.4388935565948486 + }, + { + "auxiliary_loss_clip": 0.01068004, + "auxiliary_loss_mlp": 0.01060515, + "balance_loss_clip": 1.02869785, + "balance_loss_mlp": 1.02340364, + "epoch": 0.36518863670524576, + "flos": 21906915891840.0, + "grad_norm": 2.0135079940506193, + "language_loss": 0.68521392, + "learning_rate": 2.932720838132236e-06, + "loss": 0.7064991, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4453125, + "step": 6074, + "time_per_iteration": 5.213298082351685 + }, + { + "auxiliary_loss_clip": 0.01067531, + "auxiliary_loss_mlp": 0.01048505, + "balance_loss_clip": 1.01960886, + "balance_loss_mlp": 1.02430725, + "epoch": 0.3652487599579137, + "flos": 27121054734720.0, + "grad_norm": 1.528138869056988, + "language_loss": 0.73739088, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75855124, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 6075, + "time_per_iteration": 2.488754987716675 + }, + { + "auxiliary_loss_clip": 0.01072305, + "auxiliary_loss_mlp": 0.01058836, + "balance_loss_clip": 1.02536249, + "balance_loss_mlp": 1.02454519, + "epoch": 0.3653088832105817, + "flos": 19754555351040.0, + "grad_norm": 3.574055218905957, + "language_loss": 0.90777487, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.92908633, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.47851562, + "step": 6076, + "time_per_iteration": 2.4288127422332764 + }, + { + "auxiliary_loss_clip": 0.01064791, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.02143073, + "balance_loss_mlp": 1.02200627, + "epoch": 0.36536900646324966, + "flos": 13114179757440.0, + "grad_norm": 2.0519411467333835, + "language_loss": 0.7189377, + "learning_rate": 2.931687131696872e-06, + "loss": 0.74011272, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.42773438, + "step": 6077, + "time_per_iteration": 2.39399790763855 + }, + { + "auxiliary_loss_clip": 0.01020268, + "auxiliary_loss_mlp": 0.01005211, + "balance_loss_clip": 1.0018971, + "balance_loss_mlp": 1.0110836, + "epoch": 0.3654291297159176, + "flos": 71096743048320.0, + "grad_norm": 0.7567554185115096, + "language_loss": 0.61759639, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63785118, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.09179688, + "step": 6078, + "time_per_iteration": 3.1324706077575684 + }, + { + "auxiliary_loss_clip": 0.01062852, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_clip": 1.02631402, + "balance_loss_mlp": 1.01915991, + "epoch": 0.3654892529685856, + "flos": 23616508170240.0, + "grad_norm": 4.845880266774484, + "language_loss": 0.79115009, + "learning_rate": 2.930997817403173e-06, + "loss": 0.8123498, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4375, + "step": 6079, + "time_per_iteration": 2.3886826038360596 + }, + { + "auxiliary_loss_clip": 0.01066096, + "auxiliary_loss_mlp": 0.01057785, + "balance_loss_clip": 1.0267663, + "balance_loss_mlp": 1.02121508, + "epoch": 0.36554937622125355, + "flos": 43469135130240.0, + "grad_norm": 1.8721465913692896, + "language_loss": 0.63794357, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65918243, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44921875, + "step": 6080, + "time_per_iteration": 2.5787179470062256 + }, + { + "auxiliary_loss_clip": 0.01067919, + "auxiliary_loss_mlp": 0.01060622, + "balance_loss_clip": 1.02621806, + "balance_loss_mlp": 1.020715, + "epoch": 0.3656094994739215, + "flos": 23293526382720.0, + "grad_norm": 2.354454420305791, + "language_loss": 0.69951957, + "learning_rate": 2.930308361895352e-06, + "loss": 0.72080493, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47265625, + "step": 6081, + "time_per_iteration": 2.4036571979522705 + }, + { + "auxiliary_loss_clip": 0.01067844, + "auxiliary_loss_mlp": 0.01065918, + "balance_loss_clip": 1.03408909, + "balance_loss_mlp": 1.02130568, + "epoch": 0.3656696227265895, + "flos": 24570823806720.0, + "grad_norm": 1.8596062752508533, + "language_loss": 0.76019931, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.78153694, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46484375, + "step": 6082, + "time_per_iteration": 2.458595037460327 + }, + { + "auxiliary_loss_clip": 0.01066317, + "auxiliary_loss_mlp": 0.01051314, + "balance_loss_clip": 1.02120209, + "balance_loss_mlp": 1.02165902, + "epoch": 0.3657297459792575, + "flos": 27927129271680.0, + "grad_norm": 1.6370117728677909, + "language_loss": 0.83628601, + "learning_rate": 2.929618765277987e-06, + "loss": 0.85746229, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4453125, + "step": 6083, + "time_per_iteration": 2.455702304840088 + }, + { + "auxiliary_loss_clip": 0.01013406, + "auxiliary_loss_mlp": 0.01010645, + "balance_loss_clip": 1.00737834, + "balance_loss_mlp": 1.00347567, + "epoch": 0.36578986923192547, + "flos": 67389631441920.0, + "grad_norm": 0.8125260517007118, + "language_loss": 0.59365356, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.6138941, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.03271484, + "router_z_loss_mlp": 0.09960938, + "step": 6084, + "time_per_iteration": 3.097857713699341 + }, + { + "auxiliary_loss_clip": 0.01066287, + "auxiliary_loss_mlp": 0.01057579, + "balance_loss_clip": 1.0263344, + "balance_loss_mlp": 1.02203441, + "epoch": 0.36584999248459343, + "flos": 20226546288000.0, + "grad_norm": 1.6638076004526672, + "language_loss": 0.73885995, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.76009858, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.44335938, + "step": 6085, + "time_per_iteration": 2.404226779937744 + }, + { + "auxiliary_loss_clip": 0.01065458, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.01799321, + "balance_loss_mlp": 1.02243543, + "epoch": 0.3659101157372614, + "flos": 19061459573760.0, + "grad_norm": 2.0052961671392913, + "language_loss": 0.79932994, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.82045227, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 6086, + "time_per_iteration": 2.4030630588531494 + }, + { + "auxiliary_loss_clip": 0.01064246, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.02273238, + "balance_loss_mlp": 1.02186811, + "epoch": 0.36597023898992936, + "flos": 30809384029440.0, + "grad_norm": 1.8043978069176132, + "language_loss": 0.78781724, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42382812, + "step": 6087, + "time_per_iteration": 2.501413106918335 + }, + { + "auxiliary_loss_clip": 0.01069796, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.01526546, + "balance_loss_mlp": 1.02309918, + "epoch": 0.36603036224259733, + "flos": 20520759248640.0, + "grad_norm": 3.2317345540405995, + "language_loss": 0.72018051, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.74134719, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46484375, + "step": 6088, + "time_per_iteration": 2.4131362438201904 + }, + { + "auxiliary_loss_clip": 0.01075078, + "auxiliary_loss_mlp": 0.01054079, + "balance_loss_clip": 1.0186497, + "balance_loss_mlp": 1.02518821, + "epoch": 0.3660904854952653, + "flos": 38327790205440.0, + "grad_norm": 1.5664606467148123, + "language_loss": 0.80397844, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82527, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.5, + "step": 6089, + "time_per_iteration": 2.5901448726654053 + }, + { + "auxiliary_loss_clip": 0.01069924, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_clip": 1.01720786, + "balance_loss_mlp": 1.02541947, + "epoch": 0.36615060874793326, + "flos": 21834471087360.0, + "grad_norm": 1.7808666210710236, + "language_loss": 0.72054511, + "learning_rate": 2.927204067389884e-06, + "loss": 0.74173039, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 6090, + "time_per_iteration": 2.461784839630127 + }, + { + "auxiliary_loss_clip": 0.01071652, + "auxiliary_loss_mlp": 0.01058014, + "balance_loss_clip": 1.0275923, + "balance_loss_mlp": 1.02820945, + "epoch": 0.3662107320006012, + "flos": 16580601250560.0, + "grad_norm": 1.8750627795033012, + "language_loss": 0.75042987, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.77172649, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43554688, + "step": 6091, + "time_per_iteration": 2.408586263656616 + }, + { + "auxiliary_loss_clip": 0.01071457, + "auxiliary_loss_mlp": 0.01054375, + "balance_loss_clip": 1.02348769, + "balance_loss_mlp": 1.02661419, + "epoch": 0.3662708552532692, + "flos": 20957348200320.0, + "grad_norm": 1.9661640758421983, + "language_loss": 0.74014294, + "learning_rate": 2.926513837074284e-06, + "loss": 0.76140124, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44921875, + "step": 6092, + "time_per_iteration": 2.4520299434661865 + }, + { + "auxiliary_loss_clip": 0.01073715, + "auxiliary_loss_mlp": 0.01051263, + "balance_loss_clip": 1.0200783, + "balance_loss_mlp": 1.02811217, + "epoch": 0.36633097850593715, + "flos": 21901783921920.0, + "grad_norm": 1.8855275427413294, + "language_loss": 0.79772562, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.81897539, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45703125, + "step": 6093, + "time_per_iteration": 2.4207775592803955 + }, + { + "auxiliary_loss_clip": 0.0107309, + "auxiliary_loss_mlp": 0.01057832, + "balance_loss_clip": 1.02738595, + "balance_loss_mlp": 1.02599955, + "epoch": 0.3663911017586051, + "flos": 32852745705600.0, + "grad_norm": 1.8846376262521234, + "language_loss": 0.75919402, + "learning_rate": 2.925823466224696e-06, + "loss": 0.78050327, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.47265625, + "step": 6094, + "time_per_iteration": 2.5181753635406494 + }, + { + "auxiliary_loss_clip": 0.01073283, + "auxiliary_loss_mlp": 0.01053115, + "balance_loss_clip": 1.0231936, + "balance_loss_mlp": 1.02656221, + "epoch": 0.3664512250112731, + "flos": 27270517731840.0, + "grad_norm": 1.546973379311129, + "language_loss": 0.80133414, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.82259816, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.46679688, + "step": 6095, + "time_per_iteration": 2.4765758514404297 + }, + { + "auxiliary_loss_clip": 0.01073861, + "auxiliary_loss_mlp": 0.01055009, + "balance_loss_clip": 1.01965189, + "balance_loss_mlp": 1.02663636, + "epoch": 0.3665113482639411, + "flos": 17783498833920.0, + "grad_norm": 2.904092921047173, + "language_loss": 0.75969583, + "learning_rate": 2.925132954945834e-06, + "loss": 0.78098452, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.47265625, + "step": 6096, + "time_per_iteration": 2.4183590412139893 + }, + { + "auxiliary_loss_clip": 0.01070694, + "auxiliary_loss_mlp": 0.01056408, + "balance_loss_clip": 1.02271914, + "balance_loss_mlp": 1.02400792, + "epoch": 0.36657147151660907, + "flos": 27853392746880.0, + "grad_norm": 1.9658426794854906, + "language_loss": 0.68755102, + "learning_rate": 2.924787646678155e-06, + "loss": 0.70882207, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46679688, + "step": 6097, + "time_per_iteration": 2.493474245071411 + }, + { + "auxiliary_loss_clip": 0.01071263, + "auxiliary_loss_mlp": 0.01062378, + "balance_loss_clip": 1.03073978, + "balance_loss_mlp": 1.0246048, + "epoch": 0.36663159476927704, + "flos": 25372848625920.0, + "grad_norm": 1.7046302990297402, + "language_loss": 0.77896994, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.80030632, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46679688, + "step": 6098, + "time_per_iteration": 2.4577386379241943 + }, + { + "auxiliary_loss_clip": 0.0106765, + "auxiliary_loss_mlp": 0.01054776, + "balance_loss_clip": 1.0256654, + "balance_loss_mlp": 1.023561, + "epoch": 0.366691718021945, + "flos": 21356265928320.0, + "grad_norm": 2.2363802359625504, + "language_loss": 0.74572456, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.76694882, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44140625, + "step": 6099, + "time_per_iteration": 2.425905704498291 + }, + { + "auxiliary_loss_clip": 0.01064477, + "auxiliary_loss_mlp": 0.01050915, + "balance_loss_clip": 1.02020669, + "balance_loss_mlp": 1.02275848, + "epoch": 0.36675184127461297, + "flos": 16799436852480.0, + "grad_norm": 1.6181954774365455, + "language_loss": 0.85991752, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.88107145, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.41796875, + "step": 6100, + "time_per_iteration": 2.397919178009033 + }, + { + "auxiliary_loss_clip": 0.01068904, + "auxiliary_loss_mlp": 0.01052414, + "balance_loss_clip": 1.02107441, + "balance_loss_mlp": 1.02219105, + "epoch": 0.36681196452728093, + "flos": 21905484526080.0, + "grad_norm": 2.808386622479719, + "language_loss": 0.72471547, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.74592859, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46679688, + "step": 6101, + "time_per_iteration": 2.51114821434021 + }, + { + "auxiliary_loss_clip": 0.01067583, + "auxiliary_loss_mlp": 0.01062445, + "balance_loss_clip": 1.02799356, + "balance_loss_mlp": 1.02173615, + "epoch": 0.3668720877799489, + "flos": 17711472965760.0, + "grad_norm": 2.7559493690168946, + "language_loss": 0.77778792, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.79908824, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.45703125, + "step": 6102, + "time_per_iteration": 2.4034926891326904 + }, + { + "auxiliary_loss_clip": 0.01069802, + "auxiliary_loss_mlp": 0.01051508, + "balance_loss_clip": 1.01789141, + "balance_loss_mlp": 1.02251482, + "epoch": 0.36693221103261686, + "flos": 47043717615360.0, + "grad_norm": 1.575081330549062, + "language_loss": 0.71712977, + "learning_rate": 2.922715061101625e-06, + "loss": 0.73834288, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.47265625, + "step": 6103, + "time_per_iteration": 2.6397671699523926 + }, + { + "auxiliary_loss_clip": 0.01066075, + "auxiliary_loss_mlp": 0.01056262, + "balance_loss_clip": 1.02471948, + "balance_loss_mlp": 1.02061212, + "epoch": 0.3669923342852848, + "flos": 15960020100480.0, + "grad_norm": 2.040876330325197, + "language_loss": 0.73165011, + "learning_rate": 2.922369507632716e-06, + "loss": 0.75287348, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45507812, + "step": 6104, + "time_per_iteration": 2.3996357917785645 + }, + { + "auxiliary_loss_clip": 0.0106516, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_clip": 1.02210832, + "balance_loss_mlp": 1.02086663, + "epoch": 0.3670524575379528, + "flos": 19973460775680.0, + "grad_norm": 1.9476178729516682, + "language_loss": 0.83299804, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.85418522, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44335938, + "step": 6105, + "time_per_iteration": 2.423210859298706 + }, + { + "auxiliary_loss_clip": 0.0106967, + "auxiliary_loss_mlp": 0.01059242, + "balance_loss_clip": 1.02455223, + "balance_loss_mlp": 1.02255261, + "epoch": 0.36711258079062076, + "flos": 25701765344640.0, + "grad_norm": 2.2218104234480704, + "language_loss": 0.8215847, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.84287381, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.47070312, + "step": 6106, + "time_per_iteration": 2.4490904808044434 + }, + { + "auxiliary_loss_clip": 0.01013243, + "auxiliary_loss_mlp": 0.01011934, + "balance_loss_clip": 1.00864351, + "balance_loss_mlp": 1.00414026, + "epoch": 0.3671727040432887, + "flos": 60769364791680.0, + "grad_norm": 0.706685652765088, + "language_loss": 0.59255385, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.6128056, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.03295898, + "router_z_loss_mlp": 0.09082031, + "step": 6107, + "time_per_iteration": 4.4786036014556885 + }, + { + "auxiliary_loss_clip": 0.01066048, + "auxiliary_loss_mlp": 0.010472, + "balance_loss_clip": 1.01642048, + "balance_loss_mlp": 1.02150452, + "epoch": 0.3672328272959567, + "flos": 18660307518720.0, + "grad_norm": 1.8724020790845193, + "language_loss": 0.75217694, + "learning_rate": 2.92098694412469e-06, + "loss": 0.77330947, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 6108, + "time_per_iteration": 3.8907783031463623 + }, + { + "auxiliary_loss_clip": 0.01066283, + "auxiliary_loss_mlp": 0.01050172, + "balance_loss_clip": 1.0179379, + "balance_loss_mlp": 1.02109504, + "epoch": 0.3672929505486247, + "flos": 15048158544000.0, + "grad_norm": 2.35039636279293, + "language_loss": 0.74537027, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.76653481, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45117188, + "step": 6109, + "time_per_iteration": 2.420531988143921 + }, + { + "auxiliary_loss_clip": 0.01066192, + "auxiliary_loss_mlp": 0.01048881, + "balance_loss_clip": 1.01915002, + "balance_loss_mlp": 1.0228982, + "epoch": 0.3673530738012927, + "flos": 20588456108160.0, + "grad_norm": 2.343074016976726, + "language_loss": 0.5441153, + "learning_rate": 2.920295452774744e-06, + "loss": 0.56526601, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 6110, + "time_per_iteration": 2.396131753921509 + }, + { + "auxiliary_loss_clip": 0.01067051, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.01431537, + "balance_loss_mlp": 1.02320945, + "epoch": 0.36741319705396064, + "flos": 21688743605760.0, + "grad_norm": 1.406488697497103, + "language_loss": 0.81175441, + "learning_rate": 2.919949654746672e-06, + "loss": 0.83285749, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4375, + "step": 6111, + "time_per_iteration": 2.456507682800293 + }, + { + "auxiliary_loss_clip": 0.01067763, + "auxiliary_loss_mlp": 0.0104822, + "balance_loss_clip": 1.0192759, + "balance_loss_mlp": 1.02509117, + "epoch": 0.3674733203066286, + "flos": 29860898590080.0, + "grad_norm": 1.5883862628222674, + "language_loss": 0.73711699, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.75827682, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 6112, + "time_per_iteration": 2.4635517597198486 + }, + { + "auxiliary_loss_clip": 0.01071587, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.01689196, + "balance_loss_mlp": 1.02714002, + "epoch": 0.36753344355929657, + "flos": 18256118175360.0, + "grad_norm": 1.6118316632136163, + "language_loss": 0.86402845, + "learning_rate": 2.919257954049892e-06, + "loss": 0.8852073, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4453125, + "step": 6113, + "time_per_iteration": 3.9492545127868652 + }, + { + "auxiliary_loss_clip": 0.01073459, + "auxiliary_loss_mlp": 0.01051643, + "balance_loss_clip": 1.02138758, + "balance_loss_mlp": 1.02738047, + "epoch": 0.36759356681196453, + "flos": 25299984885120.0, + "grad_norm": 1.8439952285035381, + "language_loss": 0.80146039, + "learning_rate": 2.918912051407413e-06, + "loss": 0.82271147, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4609375, + "step": 6114, + "time_per_iteration": 3.8554916381835938 + }, + { + "auxiliary_loss_clip": 0.01075693, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_clip": 1.02078152, + "balance_loss_mlp": 1.02884316, + "epoch": 0.3676536900646325, + "flos": 21031887686400.0, + "grad_norm": 1.919928596833533, + "language_loss": 0.68252921, + "learning_rate": 2.918566113919698e-06, + "loss": 0.70381415, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46679688, + "step": 6115, + "time_per_iteration": 2.443690538406372 + }, + { + "auxiliary_loss_clip": 0.01068969, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.01108384, + "balance_loss_mlp": 1.02687156, + "epoch": 0.36771381331730046, + "flos": 16287610187520.0, + "grad_norm": 2.161560255759521, + "language_loss": 0.78492749, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.80600071, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.421875, + "step": 6116, + "time_per_iteration": 2.408116340637207 + }, + { + "auxiliary_loss_clip": 0.01071934, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_clip": 1.02146888, + "balance_loss_mlp": 1.02833533, + "epoch": 0.36777393656996843, + "flos": 22308870908160.0, + "grad_norm": 1.9128547754386835, + "language_loss": 0.64435792, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.66557455, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.43554688, + "step": 6117, + "time_per_iteration": 2.445099353790283 + }, + { + "auxiliary_loss_clip": 0.01070943, + "auxiliary_loss_mlp": 0.01043999, + "balance_loss_clip": 1.01642549, + "balance_loss_mlp": 1.02850389, + "epoch": 0.3678340598226364, + "flos": 26832846528000.0, + "grad_norm": 1.6978635103553648, + "language_loss": 0.7515465, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.77269584, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.42578125, + "step": 6118, + "time_per_iteration": 2.4761810302734375 + }, + { + "auxiliary_loss_clip": 0.01072155, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.02191472, + "balance_loss_mlp": 1.02498412, + "epoch": 0.36789418307530436, + "flos": 21760664739840.0, + "grad_norm": 1.9251888962679011, + "language_loss": 0.74467659, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7659421, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47265625, + "step": 6119, + "time_per_iteration": 2.412123441696167 + }, + { + "auxiliary_loss_clip": 0.0107344, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.01930904, + "balance_loss_mlp": 1.02902484, + "epoch": 0.3679543063279723, + "flos": 15923291483520.0, + "grad_norm": 1.7331851609408127, + "language_loss": 0.81716973, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.83838928, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4453125, + "step": 6120, + "time_per_iteration": 2.4039199352264404 + }, + { + "auxiliary_loss_clip": 0.01069377, + "auxiliary_loss_mlp": 0.01055912, + "balance_loss_clip": 1.02603865, + "balance_loss_mlp": 1.02523565, + "epoch": 0.3680144295806403, + "flos": 24274516164480.0, + "grad_norm": 1.8786069124218971, + "language_loss": 0.66439641, + "learning_rate": 2.916489757978126e-06, + "loss": 0.68564928, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44140625, + "step": 6121, + "time_per_iteration": 2.424571990966797 + }, + { + "auxiliary_loss_clip": 0.01070621, + "auxiliary_loss_mlp": 0.01054846, + "balance_loss_clip": 1.02357697, + "balance_loss_mlp": 1.02467358, + "epoch": 0.36807455283330826, + "flos": 26102952311040.0, + "grad_norm": 2.0375896004438205, + "language_loss": 0.72793376, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.74918854, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45898438, + "step": 6122, + "time_per_iteration": 2.471782922744751 + }, + { + "auxiliary_loss_clip": 0.01065249, + "auxiliary_loss_mlp": 0.0105587, + "balance_loss_clip": 1.02528083, + "balance_loss_mlp": 1.02309883, + "epoch": 0.3681346760859763, + "flos": 24643827192960.0, + "grad_norm": 3.401637180253066, + "language_loss": 0.70576191, + "learning_rate": 2.915797361163875e-06, + "loss": 0.72697306, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.421875, + "step": 6123, + "time_per_iteration": 2.4221949577331543 + }, + { + "auxiliary_loss_clip": 0.01067776, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_clip": 1.01830518, + "balance_loss_mlp": 1.0222702, + "epoch": 0.36819479933864424, + "flos": 23877239270400.0, + "grad_norm": 3.0376166548059675, + "language_loss": 0.76262015, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.78379881, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45507812, + "step": 6124, + "time_per_iteration": 2.444472551345825 + }, + { + "auxiliary_loss_clip": 0.01066465, + "auxiliary_loss_mlp": 0.01064279, + "balance_loss_clip": 1.03112662, + "balance_loss_mlp": 1.02261627, + "epoch": 0.3682549225913122, + "flos": 25552895840640.0, + "grad_norm": 2.0459387767283244, + "language_loss": 0.75789881, + "learning_rate": 2.915104825441114e-06, + "loss": 0.77920628, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4375, + "step": 6125, + "time_per_iteration": 2.434241771697998 + }, + { + "auxiliary_loss_clip": 0.01066792, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.02175546, + "epoch": 0.36831504584398017, + "flos": 16945653093120.0, + "grad_norm": 1.900138156787967, + "language_loss": 0.79959518, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.8208521, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.44921875, + "step": 6126, + "time_per_iteration": 2.3881282806396484 + }, + { + "auxiliary_loss_clip": 0.01066667, + "auxiliary_loss_mlp": 0.01057804, + "balance_loss_clip": 1.0228281, + "balance_loss_mlp": 1.01907766, + "epoch": 0.36837516909664814, + "flos": 19864042974720.0, + "grad_norm": 3.004124935259161, + "language_loss": 0.6774466, + "learning_rate": 2.914412150914888e-06, + "loss": 0.69869131, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.4765625, + "step": 6127, + "time_per_iteration": 2.3881824016571045 + }, + { + "auxiliary_loss_clip": 0.01069792, + "auxiliary_loss_mlp": 0.01059393, + "balance_loss_clip": 1.02665782, + "balance_loss_mlp": 1.02393103, + "epoch": 0.3684352923493161, + "flos": 37625652385920.0, + "grad_norm": 1.9940336169421327, + "language_loss": 0.7143752, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.73566705, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.45898438, + "step": 6128, + "time_per_iteration": 2.5547914505004883 + }, + { + "auxiliary_loss_clip": 0.01067875, + "auxiliary_loss_mlp": 0.01054994, + "balance_loss_clip": 1.02156758, + "balance_loss_mlp": 1.02192438, + "epoch": 0.36849541560198407, + "flos": 14464620213120.0, + "grad_norm": 1.7559817398920063, + "language_loss": 0.77116317, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.79239184, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4609375, + "step": 6129, + "time_per_iteration": 2.3613719940185547 + }, + { + "auxiliary_loss_clip": 0.01064785, + "auxiliary_loss_mlp": 0.01059236, + "balance_loss_clip": 1.02918303, + "balance_loss_mlp": 1.01999426, + "epoch": 0.36855553885465203, + "flos": 25769706583680.0, + "grad_norm": 1.837245159408968, + "language_loss": 0.86047435, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.88171458, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44921875, + "step": 6130, + "time_per_iteration": 2.461215019226074 + }, + { + "auxiliary_loss_clip": 0.01013877, + "auxiliary_loss_mlp": 0.01009453, + "balance_loss_clip": 1.00611496, + "balance_loss_mlp": 1.00453305, + "epoch": 0.36861566210732, + "flos": 65045701071360.0, + "grad_norm": 0.8202653869065714, + "language_loss": 0.60382074, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62405401, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.03344727, + "router_z_loss_mlp": 0.09375, + "step": 6131, + "time_per_iteration": 3.1161012649536133 + }, + { + "auxiliary_loss_clip": 0.01064969, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_clip": 1.01625443, + "balance_loss_mlp": 1.02088976, + "epoch": 0.36867578535998796, + "flos": 30953226297600.0, + "grad_norm": 1.5890844020343478, + "language_loss": 0.74248856, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.76361597, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 6132, + "time_per_iteration": 2.4971365928649902 + }, + { + "auxiliary_loss_clip": 0.01070685, + "auxiliary_loss_mlp": 0.01048888, + "balance_loss_clip": 1.01457906, + "balance_loss_mlp": 1.02374256, + "epoch": 0.3687359086126559, + "flos": 28836756501120.0, + "grad_norm": 1.78950213190559, + "language_loss": 0.74688691, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76808262, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46875, + "step": 6133, + "time_per_iteration": 2.472233772277832 + }, + { + "auxiliary_loss_clip": 0.01066005, + "auxiliary_loss_mlp": 0.01044548, + "balance_loss_clip": 1.01438856, + "balance_loss_mlp": 1.02338886, + "epoch": 0.3687960318653239, + "flos": 21395752542720.0, + "grad_norm": 1.6083685145791635, + "language_loss": 0.72312057, + "learning_rate": 2.911986698512874e-06, + "loss": 0.7442261, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42578125, + "step": 6134, + "time_per_iteration": 2.4402964115142822 + }, + { + "auxiliary_loss_clip": 0.01068561, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.01604187, + "balance_loss_mlp": 1.02413654, + "epoch": 0.36885615511799186, + "flos": 20265020472960.0, + "grad_norm": 1.7253574883360732, + "language_loss": 0.76449239, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.78566819, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4453125, + "step": 6135, + "time_per_iteration": 2.4217441082000732 + }, + { + "auxiliary_loss_clip": 0.01018866, + "auxiliary_loss_mlp": 0.01007223, + "balance_loss_clip": 1.00381374, + "balance_loss_mlp": 1.00975716, + "epoch": 0.3689162783706599, + "flos": 63085922789760.0, + "grad_norm": 0.8264676951619004, + "language_loss": 0.58846605, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60872698, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.03417969, + "router_z_loss_mlp": 0.09082031, + "step": 6136, + "time_per_iteration": 3.0040581226348877 + }, + { + "auxiliary_loss_clip": 0.01068511, + "auxiliary_loss_mlp": 0.01044753, + "balance_loss_clip": 1.01330566, + "balance_loss_mlp": 1.02447867, + "epoch": 0.36897640162332784, + "flos": 10961225723520.0, + "grad_norm": 2.059980659289682, + "language_loss": 0.80070698, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.82183969, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 6137, + "time_per_iteration": 2.387873649597168 + }, + { + "auxiliary_loss_clip": 0.01069178, + "auxiliary_loss_mlp": 0.01049547, + "balance_loss_clip": 1.01788521, + "balance_loss_mlp": 1.02424884, + "epoch": 0.3690365248759958, + "flos": 20703250258560.0, + "grad_norm": 1.9211346136341438, + "language_loss": 0.76243854, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.78362572, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 6138, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.01071555, + "auxiliary_loss_mlp": 0.01056292, + "balance_loss_clip": 1.02265108, + "balance_loss_mlp": 1.02439022, + "epoch": 0.3690966481286638, + "flos": 31825182303360.0, + "grad_norm": 2.1887521067975233, + "language_loss": 0.67017686, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.6914553, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.47265625, + "step": 6139, + "time_per_iteration": 2.50856351852417 + }, + { + "auxiliary_loss_clip": 0.01065991, + "auxiliary_loss_mlp": 0.01062953, + "balance_loss_clip": 1.03092098, + "balance_loss_mlp": 1.02283525, + "epoch": 0.36915677138133174, + "flos": 13114109934720.0, + "grad_norm": 1.887048012794129, + "language_loss": 0.73099154, + "learning_rate": 2.909906390418006e-06, + "loss": 0.75228095, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.43164062, + "step": 6140, + "time_per_iteration": 2.4058003425598145 + }, + { + "auxiliary_loss_clip": 0.01016156, + "auxiliary_loss_mlp": 0.01014575, + "balance_loss_clip": 1.01126099, + "balance_loss_mlp": 1.00699437, + "epoch": 0.3692168946339997, + "flos": 68683372116480.0, + "grad_norm": 0.7705507448124861, + "language_loss": 0.59369546, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61400276, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.09179688, + "step": 6141, + "time_per_iteration": 3.1289751529693604 + }, + { + "auxiliary_loss_clip": 0.01064302, + "auxiliary_loss_mlp": 0.01056852, + "balance_loss_clip": 1.02726436, + "balance_loss_mlp": 1.02057505, + "epoch": 0.36927701788666767, + "flos": 22016787540480.0, + "grad_norm": 1.692762143847503, + "language_loss": 0.76384413, + "learning_rate": 2.909212678216192e-06, + "loss": 0.7850557, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4375, + "step": 6142, + "time_per_iteration": 2.4079134464263916 + }, + { + "auxiliary_loss_clip": 0.01063982, + "auxiliary_loss_mlp": 0.0105425, + "balance_loss_clip": 1.02631891, + "balance_loss_mlp": 1.02086949, + "epoch": 0.36933714113933563, + "flos": 21834505998720.0, + "grad_norm": 1.8443137075397662, + "language_loss": 0.77675653, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79793882, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.43164062, + "step": 6143, + "time_per_iteration": 2.3961758613586426 + }, + { + "auxiliary_loss_clip": 0.01062538, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.02484393, + "balance_loss_mlp": 1.02068448, + "epoch": 0.3693972643920036, + "flos": 23690698542720.0, + "grad_norm": 2.438947204468492, + "language_loss": 0.83103943, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.85219043, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 6144, + "time_per_iteration": 2.4417450428009033 + }, + { + "auxiliary_loss_clip": 0.01063981, + "auxiliary_loss_mlp": 0.0105621, + "balance_loss_clip": 1.02674174, + "balance_loss_mlp": 1.01956797, + "epoch": 0.36945738764467156, + "flos": 22855645710720.0, + "grad_norm": 2.696169316757798, + "language_loss": 0.78866816, + "learning_rate": 2.908171851365593e-06, + "loss": 0.80987, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.44335938, + "step": 6145, + "time_per_iteration": 2.38777232170105 + }, + { + "auxiliary_loss_clip": 0.01065749, + "auxiliary_loss_mlp": 0.01052962, + "balance_loss_clip": 1.02256346, + "balance_loss_mlp": 1.02118909, + "epoch": 0.36951751089733953, + "flos": 16615060629120.0, + "grad_norm": 2.170351949388902, + "language_loss": 0.78648913, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8076762, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4453125, + "step": 6146, + "time_per_iteration": 2.3977911472320557 + }, + { + "auxiliary_loss_clip": 0.01067246, + "auxiliary_loss_mlp": 0.01058278, + "balance_loss_clip": 1.02628219, + "balance_loss_mlp": 1.02149916, + "epoch": 0.3695776341500075, + "flos": 18913602499200.0, + "grad_norm": 1.9516697308121533, + "language_loss": 0.82439345, + "learning_rate": 2.907477794586761e-06, + "loss": 0.84564865, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45703125, + "step": 6147, + "time_per_iteration": 3.825615644454956 + }, + { + "auxiliary_loss_clip": 0.01066073, + "auxiliary_loss_mlp": 0.01052283, + "balance_loss_clip": 1.02294588, + "balance_loss_mlp": 1.02146327, + "epoch": 0.36963775740267546, + "flos": 20807571000960.0, + "grad_norm": 1.8434205428015917, + "language_loss": 0.84942818, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.87061167, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4453125, + "step": 6148, + "time_per_iteration": 3.8347694873809814 + }, + { + "auxiliary_loss_clip": 0.01065107, + "auxiliary_loss_mlp": 0.01052444, + "balance_loss_clip": 1.02371466, + "balance_loss_mlp": 1.02251434, + "epoch": 0.3696978806553435, + "flos": 26060847344640.0, + "grad_norm": 2.0440454156607113, + "language_loss": 0.76464391, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.78581935, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42578125, + "step": 6149, + "time_per_iteration": 2.4530186653137207 + }, + { + "auxiliary_loss_clip": 0.01069623, + "auxiliary_loss_mlp": 0.01055621, + "balance_loss_clip": 1.02302885, + "balance_loss_mlp": 1.0249449, + "epoch": 0.36975800390801145, + "flos": 26832706882560.0, + "grad_norm": 1.901272129384223, + "language_loss": 0.72442937, + "learning_rate": 2.906436451364054e-06, + "loss": 0.74568176, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44726562, + "step": 6150, + "time_per_iteration": 2.4892961978912354 + }, + { + "auxiliary_loss_clip": 0.01070689, + "auxiliary_loss_mlp": 0.01059403, + "balance_loss_clip": 1.02855134, + "balance_loss_mlp": 1.02547276, + "epoch": 0.3698181271606794, + "flos": 21141549866880.0, + "grad_norm": 1.8746921540941004, + "language_loss": 0.82638478, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84768569, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.453125, + "step": 6151, + "time_per_iteration": 2.458369493484497 + }, + { + "auxiliary_loss_clip": 0.01036833, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.03569031, + "balance_loss_mlp": 1.02756858, + "epoch": 0.3698782504133474, + "flos": 66739478503680.0, + "grad_norm": 0.8037791212956692, + "language_loss": 0.6317476, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65250123, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.02832031, + "router_z_loss_mlp": 0.09277344, + "step": 6152, + "time_per_iteration": 3.153697967529297 + }, + { + "auxiliary_loss_clip": 0.01068656, + "auxiliary_loss_mlp": 0.01045781, + "balance_loss_clip": 1.01647902, + "balance_loss_mlp": 1.0263834, + "epoch": 0.36993837366601534, + "flos": 24310511642880.0, + "grad_norm": 1.7676397683107228, + "language_loss": 0.7143712, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.73551559, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 6153, + "time_per_iteration": 5.215890407562256 + }, + { + "auxiliary_loss_clip": 0.01072028, + "auxiliary_loss_mlp": 0.01044425, + "balance_loss_clip": 1.01484919, + "balance_loss_mlp": 1.02768803, + "epoch": 0.3699984969186833, + "flos": 24348147955200.0, + "grad_norm": 1.6785630045355642, + "language_loss": 0.73570907, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.75687361, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44335938, + "step": 6154, + "time_per_iteration": 2.4871866703033447 + }, + { + "auxiliary_loss_clip": 0.01069503, + "auxiliary_loss_mlp": 0.01050396, + "balance_loss_clip": 1.02144027, + "balance_loss_mlp": 1.02600348, + "epoch": 0.37005862017135127, + "flos": 19828117319040.0, + "grad_norm": 1.6499191852251607, + "language_loss": 0.68577218, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70697117, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43554688, + "step": 6155, + "time_per_iteration": 2.405717611312866 + }, + { + "auxiliary_loss_clip": 0.01070544, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_clip": 1.01854074, + "balance_loss_mlp": 1.02645636, + "epoch": 0.37011874342401924, + "flos": 19572762568320.0, + "grad_norm": 1.862080410167843, + "language_loss": 0.68977058, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.71095473, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 6156, + "time_per_iteration": 2.4118590354919434 + }, + { + "auxiliary_loss_clip": 0.0106766, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_clip": 1.02591658, + "balance_loss_mlp": 1.02505362, + "epoch": 0.3701788666766872, + "flos": 20373356021760.0, + "grad_norm": 1.7599078946132636, + "language_loss": 0.82606161, + "learning_rate": 2.904005448099916e-06, + "loss": 0.8472814, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42578125, + "step": 6157, + "time_per_iteration": 2.419261932373047 + }, + { + "auxiliary_loss_clip": 0.01070154, + "auxiliary_loss_mlp": 0.01069087, + "balance_loss_clip": 1.03330064, + "balance_loss_mlp": 1.02394986, + "epoch": 0.37023898992935517, + "flos": 15340032443520.0, + "grad_norm": 2.4842314951334363, + "language_loss": 0.77984583, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.80123818, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.4609375, + "step": 6158, + "time_per_iteration": 2.3943405151367188 + }, + { + "auxiliary_loss_clip": 0.01069614, + "auxiliary_loss_mlp": 0.01066183, + "balance_loss_clip": 1.03380537, + "balance_loss_mlp": 1.02360225, + "epoch": 0.37029911318202313, + "flos": 19572902213760.0, + "grad_norm": 2.067656315048353, + "language_loss": 0.70281291, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.72417092, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4609375, + "step": 6159, + "time_per_iteration": 2.4557595252990723 + }, + { + "auxiliary_loss_clip": 0.01066414, + "auxiliary_loss_mlp": 0.01063875, + "balance_loss_clip": 1.03501463, + "balance_loss_mlp": 1.02234006, + "epoch": 0.3703592364346911, + "flos": 26212160643840.0, + "grad_norm": 1.702158640034723, + "language_loss": 0.71766269, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73896563, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44140625, + "step": 6160, + "time_per_iteration": 2.4414467811584473 + }, + { + "auxiliary_loss_clip": 0.01064062, + "auxiliary_loss_mlp": 0.01060588, + "balance_loss_clip": 1.03009439, + "balance_loss_mlp": 1.02208185, + "epoch": 0.37041935968735906, + "flos": 20047267123200.0, + "grad_norm": 1.6897245268620424, + "language_loss": 0.80329818, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.82454467, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41992188, + "step": 6161, + "time_per_iteration": 2.3780417442321777 + }, + { + "auxiliary_loss_clip": 0.01065215, + "auxiliary_loss_mlp": 0.01077084, + "balance_loss_clip": 1.04392004, + "balance_loss_mlp": 1.0211904, + "epoch": 0.3704794829400271, + "flos": 24132663843840.0, + "grad_norm": 2.553293121484324, + "language_loss": 0.80170417, + "learning_rate": 2.902267988534295e-06, + "loss": 0.82312715, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.43945312, + "step": 6162, + "time_per_iteration": 2.4441370964050293 + }, + { + "auxiliary_loss_clip": 0.01068416, + "auxiliary_loss_mlp": 0.01068081, + "balance_loss_clip": 1.03634763, + "balance_loss_mlp": 1.02306557, + "epoch": 0.37053960619269505, + "flos": 14865981736320.0, + "grad_norm": 2.9065930936496804, + "language_loss": 0.81483763, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.83620262, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.453125, + "step": 6163, + "time_per_iteration": 2.3540284633636475 + }, + { + "auxiliary_loss_clip": 0.01068208, + "auxiliary_loss_mlp": 0.01072616, + "balance_loss_clip": 1.04014349, + "balance_loss_mlp": 1.02285171, + "epoch": 0.370599729445363, + "flos": 21360420380160.0, + "grad_norm": 1.676608654993983, + "language_loss": 0.6990751, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.72048342, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.453125, + "step": 6164, + "time_per_iteration": 2.416917562484741 + }, + { + "auxiliary_loss_clip": 0.0106999, + "auxiliary_loss_mlp": 0.01072617, + "balance_loss_clip": 1.0403111, + "balance_loss_mlp": 1.02353787, + "epoch": 0.370659852698031, + "flos": 26827958937600.0, + "grad_norm": 2.5611454280670856, + "language_loss": 0.85089707, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.87232316, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46484375, + "step": 6165, + "time_per_iteration": 2.4528987407684326 + }, + { + "auxiliary_loss_clip": 0.010713, + "auxiliary_loss_mlp": 0.01064463, + "balance_loss_clip": 1.02803266, + "balance_loss_mlp": 1.02353621, + "epoch": 0.37071997595069894, + "flos": 19098013633920.0, + "grad_norm": 2.1772146820153058, + "language_loss": 0.70892346, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.73028111, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.4765625, + "step": 6166, + "time_per_iteration": 2.4417853355407715 + }, + { + "auxiliary_loss_clip": 0.01014822, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.02684963, + "balance_loss_mlp": 1.00522506, + "epoch": 0.3707800992033669, + "flos": 52175809163520.0, + "grad_norm": 0.8050277017671205, + "language_loss": 0.57130075, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59174895, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.09570312, + "step": 6167, + "time_per_iteration": 2.8692944049835205 + }, + { + "auxiliary_loss_clip": 0.01066752, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.02368486, + "balance_loss_mlp": 1.0230602, + "epoch": 0.3708402224560349, + "flos": 19900806503040.0, + "grad_norm": 1.826747107734393, + "language_loss": 0.7775805, + "learning_rate": 2.900181908135584e-06, + "loss": 0.79879528, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43554688, + "step": 6168, + "time_per_iteration": 2.4361259937286377 + }, + { + "auxiliary_loss_clip": 0.01069444, + "auxiliary_loss_mlp": 0.0105672, + "balance_loss_clip": 1.02541566, + "balance_loss_mlp": 1.02393234, + "epoch": 0.37090034570870284, + "flos": 20006698256640.0, + "grad_norm": 3.58720840676508, + "language_loss": 0.75917268, + "learning_rate": 2.899834108519755e-06, + "loss": 0.78043431, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45507812, + "step": 6169, + "time_per_iteration": 2.3874013423919678 + }, + { + "auxiliary_loss_clip": 0.01070083, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_clip": 1.0154407, + "balance_loss_mlp": 1.02533138, + "epoch": 0.3709604689613708, + "flos": 24133536627840.0, + "grad_norm": 2.0425189723282027, + "language_loss": 0.80549508, + "learning_rate": 2.899486274782127e-06, + "loss": 0.8266387, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44726562, + "step": 6170, + "time_per_iteration": 2.4548234939575195 + }, + { + "auxiliary_loss_clip": 0.01074999, + "auxiliary_loss_mlp": 0.01053695, + "balance_loss_clip": 1.0211513, + "balance_loss_mlp": 1.02823782, + "epoch": 0.37102059221403877, + "flos": 23875004943360.0, + "grad_norm": 1.481214911072211, + "language_loss": 0.77842087, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.79970783, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46875, + "step": 6171, + "time_per_iteration": 2.429335832595825 + }, + { + "auxiliary_loss_clip": 0.01075572, + "auxiliary_loss_mlp": 0.01049152, + "balance_loss_clip": 1.01715672, + "balance_loss_mlp": 1.02919888, + "epoch": 0.37108071546670673, + "flos": 14500406223360.0, + "grad_norm": 1.7748763089769146, + "language_loss": 0.82144058, + "learning_rate": 2.898790504994232e-06, + "loss": 0.84268779, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46484375, + "step": 6172, + "time_per_iteration": 2.460134983062744 + }, + { + "auxiliary_loss_clip": 0.01077774, + "auxiliary_loss_mlp": 0.01059347, + "balance_loss_clip": 1.02639771, + "balance_loss_mlp": 1.03037548, + "epoch": 0.3711408387193747, + "flos": 34561360465920.0, + "grad_norm": 1.8940442935869428, + "language_loss": 0.61105078, + "learning_rate": 2.89844256897035e-06, + "loss": 0.63242197, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47460938, + "step": 6173, + "time_per_iteration": 2.548811435699463 + }, + { + "auxiliary_loss_clip": 0.01075866, + "auxiliary_loss_mlp": 0.01053197, + "balance_loss_clip": 1.02046227, + "balance_loss_mlp": 1.02948356, + "epoch": 0.37120096197204266, + "flos": 17309762328960.0, + "grad_norm": 1.8924876567783964, + "language_loss": 0.82736403, + "learning_rate": 2.898094598877435e-06, + "loss": 0.84865463, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46484375, + "step": 6174, + "time_per_iteration": 2.450430154800415 + }, + { + "auxiliary_loss_clip": 0.01073677, + "auxiliary_loss_mlp": 0.01053547, + "balance_loss_clip": 1.02348268, + "balance_loss_mlp": 1.02922392, + "epoch": 0.37126108522471063, + "flos": 30662748852480.0, + "grad_norm": 1.806166983091659, + "language_loss": 0.81665623, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.83792841, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44335938, + "step": 6175, + "time_per_iteration": 2.4967219829559326 + }, + { + "auxiliary_loss_clip": 0.01080572, + "auxiliary_loss_mlp": 0.01065571, + "balance_loss_clip": 1.03316975, + "balance_loss_mlp": 1.0340277, + "epoch": 0.37132120847737865, + "flos": 25154466871680.0, + "grad_norm": 1.7661680279857317, + "language_loss": 0.90479302, + "learning_rate": 2.89739855653729e-06, + "loss": 0.92625445, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46484375, + "step": 6176, + "time_per_iteration": 2.503720998764038 + }, + { + "auxiliary_loss_clip": 0.01076777, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_clip": 1.02141929, + "balance_loss_mlp": 1.02992129, + "epoch": 0.3713813317300466, + "flos": 21212458571520.0, + "grad_norm": 1.4794812519409979, + "language_loss": 0.74540699, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.76669151, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.46875, + "step": 6177, + "time_per_iteration": 2.4569990634918213 + }, + { + "auxiliary_loss_clip": 0.01075954, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.02974284, + "epoch": 0.3714414549827146, + "flos": 21615565662720.0, + "grad_norm": 1.7282440854251964, + "language_loss": 0.76534122, + "learning_rate": 2.896702378079374e-06, + "loss": 0.78668672, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.46289062, + "step": 6178, + "time_per_iteration": 2.451380491256714 + }, + { + "auxiliary_loss_clip": 0.01075186, + "auxiliary_loss_mlp": 0.01076778, + "balance_loss_clip": 1.04499674, + "balance_loss_mlp": 1.02925539, + "epoch": 0.37150157823538255, + "flos": 19971331182720.0, + "grad_norm": 1.8118013172328051, + "language_loss": 0.73222107, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.75374073, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45898438, + "step": 6179, + "time_per_iteration": 2.438295602798462 + }, + { + "auxiliary_loss_clip": 0.010731, + "auxiliary_loss_mlp": 0.0105813, + "balance_loss_clip": 1.02625299, + "balance_loss_mlp": 1.02685177, + "epoch": 0.3715617014880505, + "flos": 24859485861120.0, + "grad_norm": 3.2335674360587956, + "language_loss": 0.70807952, + "learning_rate": 2.896006063609283e-06, + "loss": 0.72939181, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46289062, + "step": 6180, + "time_per_iteration": 2.4974782466888428 + }, + { + "auxiliary_loss_clip": 0.01071482, + "auxiliary_loss_mlp": 0.01057924, + "balance_loss_clip": 1.02685809, + "balance_loss_mlp": 1.0259943, + "epoch": 0.3716218247407185, + "flos": 20448035153280.0, + "grad_norm": 1.705477140120759, + "language_loss": 0.79165876, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.81295282, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.45507812, + "step": 6181, + "time_per_iteration": 2.4232723712921143 + }, + { + "auxiliary_loss_clip": 0.0106951, + "auxiliary_loss_mlp": 0.01064755, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.02476525, + "epoch": 0.37168194799338644, + "flos": 24132349641600.0, + "grad_norm": 1.9530702750784004, + "language_loss": 0.79755151, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.81889415, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.44921875, + "step": 6182, + "time_per_iteration": 2.4527652263641357 + }, + { + "auxiliary_loss_clip": 0.01015908, + "auxiliary_loss_mlp": 0.01009171, + "balance_loss_clip": 1.00557065, + "balance_loss_mlp": 1.00645137, + "epoch": 0.3717420712460544, + "flos": 67405481199360.0, + "grad_norm": 0.7891171117585289, + "language_loss": 0.57511938, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59537017, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.03588867, + "router_z_loss_mlp": 0.09472656, + "step": 6183, + "time_per_iteration": 3.006164312362671 + }, + { + "auxiliary_loss_clip": 0.01072675, + "auxiliary_loss_mlp": 0.01065878, + "balance_loss_clip": 1.02930439, + "balance_loss_mlp": 1.02358079, + "epoch": 0.37180219449872237, + "flos": 22375974274560.0, + "grad_norm": 1.70237307689659, + "language_loss": 0.78278875, + "learning_rate": 2.894613027055066e-06, + "loss": 0.8041743, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.49023438, + "step": 6184, + "time_per_iteration": 2.521357536315918 + }, + { + "auxiliary_loss_clip": 0.01065785, + "auxiliary_loss_mlp": 0.01052073, + "balance_loss_clip": 1.02006483, + "balance_loss_mlp": 1.02190566, + "epoch": 0.37186231775139034, + "flos": 21868860643200.0, + "grad_norm": 2.1164873875871613, + "language_loss": 0.74055469, + "learning_rate": 2.894264683073954e-06, + "loss": 0.76173329, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4375, + "step": 6185, + "time_per_iteration": 2.4144961833953857 + }, + { + "auxiliary_loss_clip": 0.01065697, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_clip": 1.01350331, + "balance_loss_mlp": 1.02150226, + "epoch": 0.3719224410040583, + "flos": 22414238991360.0, + "grad_norm": 1.5021778050344743, + "language_loss": 0.78418303, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.80530733, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44140625, + "step": 6186, + "time_per_iteration": 3.9281699657440186 + }, + { + "auxiliary_loss_clip": 0.01068419, + "auxiliary_loss_mlp": 0.01052937, + "balance_loss_clip": 1.01855719, + "balance_loss_mlp": 1.0211997, + "epoch": 0.37198256425672627, + "flos": 25150172774400.0, + "grad_norm": 2.030387640240308, + "language_loss": 0.85434473, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.87555826, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47265625, + "step": 6187, + "time_per_iteration": 2.4561898708343506 + }, + { + "auxiliary_loss_clip": 0.01066283, + "auxiliary_loss_mlp": 0.01049363, + "balance_loss_clip": 1.01846373, + "balance_loss_mlp": 1.02114487, + "epoch": 0.37204268750939423, + "flos": 21137360503680.0, + "grad_norm": 2.1211545290682072, + "language_loss": 0.85168588, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87284237, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45117188, + "step": 6188, + "time_per_iteration": 3.7974941730499268 + }, + { + "auxiliary_loss_clip": 0.01066926, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_clip": 1.01807451, + "balance_loss_mlp": 1.02209294, + "epoch": 0.37210281076206225, + "flos": 21505763836800.0, + "grad_norm": 1.719364336120611, + "language_loss": 0.6657846, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.68694365, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44726562, + "step": 6189, + "time_per_iteration": 2.4921469688415527 + }, + { + "auxiliary_loss_clip": 0.01070079, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.02224588, + "balance_loss_mlp": 1.02409148, + "epoch": 0.3721629340147302, + "flos": 17346874970880.0, + "grad_norm": 2.1507482791206867, + "language_loss": 0.85157299, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.87282181, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.45898438, + "step": 6190, + "time_per_iteration": 2.363616943359375 + }, + { + "auxiliary_loss_clip": 0.01070091, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.02306008, + "epoch": 0.3722230572673982, + "flos": 16431557189760.0, + "grad_norm": 2.534085884848953, + "language_loss": 0.91087818, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.93207389, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.47070312, + "step": 6191, + "time_per_iteration": 2.4337618350982666 + }, + { + "auxiliary_loss_clip": 0.01071639, + "auxiliary_loss_mlp": 0.01047524, + "balance_loss_clip": 1.01240516, + "balance_loss_mlp": 1.02384233, + "epoch": 0.37228318052006615, + "flos": 22673608548480.0, + "grad_norm": 1.7144899418570887, + "language_loss": 0.75094247, + "learning_rate": 2.891825326449073e-06, + "loss": 0.77213413, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.47851562, + "step": 6192, + "time_per_iteration": 3.870932102203369 + }, + { + "auxiliary_loss_clip": 0.01069855, + "auxiliary_loss_mlp": 0.01052129, + "balance_loss_clip": 1.0201571, + "balance_loss_mlp": 1.02460933, + "epoch": 0.3723433037727341, + "flos": 25264303608960.0, + "grad_norm": 2.380520786741067, + "language_loss": 0.81311119, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.83433104, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.453125, + "step": 6193, + "time_per_iteration": 3.841954231262207 + }, + { + "auxiliary_loss_clip": 0.0107215, + "auxiliary_loss_mlp": 0.0105188, + "balance_loss_clip": 1.02096868, + "balance_loss_mlp": 1.02594995, + "epoch": 0.3724034270254021, + "flos": 10523903633280.0, + "grad_norm": 1.927325504245247, + "language_loss": 0.85983491, + "learning_rate": 2.891128062852194e-06, + "loss": 0.88107526, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4609375, + "step": 6194, + "time_per_iteration": 2.3971354961395264 + }, + { + "auxiliary_loss_clip": 0.01070545, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.02187419, + "balance_loss_mlp": 1.02504909, + "epoch": 0.37246355027807004, + "flos": 20265195029760.0, + "grad_norm": 2.918751222440559, + "language_loss": 0.79629201, + "learning_rate": 2.890779380359646e-06, + "loss": 0.81752515, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45507812, + "step": 6195, + "time_per_iteration": 2.4235358238220215 + }, + { + "auxiliary_loss_clip": 0.01071988, + "auxiliary_loss_mlp": 0.01053634, + "balance_loss_clip": 1.01911092, + "balance_loss_mlp": 1.02802038, + "epoch": 0.372523673530738, + "flos": 19499549713920.0, + "grad_norm": 4.008457467251298, + "language_loss": 0.80612642, + "learning_rate": 2.890430664088655e-06, + "loss": 0.82738268, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.43945312, + "step": 6196, + "time_per_iteration": 2.40454363822937 + }, + { + "auxiliary_loss_clip": 0.0107053, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_clip": 1.02178097, + "balance_loss_mlp": 1.02627885, + "epoch": 0.372583796783406, + "flos": 16763301728640.0, + "grad_norm": 1.9988524397205056, + "language_loss": 0.86537284, + "learning_rate": 2.890081914052443e-06, + "loss": 0.88660192, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44335938, + "step": 6197, + "time_per_iteration": 2.404353618621826 + }, + { + "auxiliary_loss_clip": 0.01068681, + "auxiliary_loss_mlp": 0.01052242, + "balance_loss_clip": 1.01900601, + "balance_loss_mlp": 1.02486157, + "epoch": 0.37264392003607394, + "flos": 22636879931520.0, + "grad_norm": 1.5328579089843617, + "language_loss": 0.65693605, + "learning_rate": 2.889733130264237e-06, + "loss": 0.67814523, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4375, + "step": 6198, + "time_per_iteration": 2.431446075439453 + }, + { + "auxiliary_loss_clip": 0.0106899, + "auxiliary_loss_mlp": 0.01053639, + "balance_loss_clip": 1.02226329, + "balance_loss_mlp": 1.02549064, + "epoch": 0.3727040432887419, + "flos": 19972134144000.0, + "grad_norm": 1.3916492545597519, + "language_loss": 0.75027448, + "learning_rate": 2.889384312737261e-06, + "loss": 0.77150077, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43554688, + "step": 6199, + "time_per_iteration": 2.4463441371917725 + }, + { + "auxiliary_loss_clip": 0.01070019, + "auxiliary_loss_mlp": 0.01047823, + "balance_loss_clip": 1.0168761, + "balance_loss_mlp": 1.02628052, + "epoch": 0.37276416654140987, + "flos": 63896991517440.0, + "grad_norm": 1.8408988952734409, + "language_loss": 0.82236296, + "learning_rate": 2.889035461484742e-06, + "loss": 0.84354138, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4375, + "step": 6200, + "time_per_iteration": 2.863687753677368 + }, + { + "auxiliary_loss_clip": 0.01069151, + "auxiliary_loss_mlp": 0.0105778, + "balance_loss_clip": 1.02416325, + "balance_loss_mlp": 1.02312398, + "epoch": 0.37282428979407783, + "flos": 39784401705600.0, + "grad_norm": 1.8531203086096317, + "language_loss": 0.61748129, + "learning_rate": 2.88868657651991e-06, + "loss": 0.63875061, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4609375, + "step": 6201, + "time_per_iteration": 2.581199884414673 + }, + { + "auxiliary_loss_clip": 0.01071133, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.01899946, + "balance_loss_mlp": 1.02455878, + "epoch": 0.37288441304674586, + "flos": 22707998104320.0, + "grad_norm": 1.6353477671011873, + "language_loss": 0.74941546, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.77066076, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46484375, + "step": 6202, + "time_per_iteration": 2.449441909790039 + }, + { + "auxiliary_loss_clip": 0.01068406, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.02557731, + "balance_loss_mlp": 1.0239985, + "epoch": 0.3729445362994138, + "flos": 18769306383360.0, + "grad_norm": 1.7628705184126339, + "language_loss": 0.75196481, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.77321315, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 6203, + "time_per_iteration": 2.409721612930298 + }, + { + "auxiliary_loss_clip": 0.01064768, + "auxiliary_loss_mlp": 0.01049483, + "balance_loss_clip": 1.02018142, + "balance_loss_mlp": 1.02085614, + "epoch": 0.3730046595520818, + "flos": 22455087148800.0, + "grad_norm": 2.116806731411721, + "language_loss": 0.83118474, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.85232723, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43945312, + "step": 6204, + "time_per_iteration": 2.498588800430298 + }, + { + "auxiliary_loss_clip": 0.01067886, + "auxiliary_loss_mlp": 0.01053162, + "balance_loss_clip": 1.02161956, + "balance_loss_mlp": 1.02205193, + "epoch": 0.37306478280474975, + "flos": 24315224676480.0, + "grad_norm": 1.616420142614673, + "language_loss": 0.76209074, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.78330123, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45898438, + "step": 6205, + "time_per_iteration": 2.430117607116699 + }, + { + "auxiliary_loss_clip": 0.01065792, + "auxiliary_loss_mlp": 0.01052314, + "balance_loss_clip": 1.01977026, + "balance_loss_mlp": 1.0208869, + "epoch": 0.3731249060574177, + "flos": 15814257707520.0, + "grad_norm": 1.8836810814841336, + "language_loss": 0.795555, + "learning_rate": 2.886941646474128e-06, + "loss": 0.81673604, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44921875, + "step": 6206, + "time_per_iteration": 2.409475803375244 + }, + { + "auxiliary_loss_clip": 0.01066102, + "auxiliary_loss_mlp": 0.01051745, + "balance_loss_clip": 1.01929665, + "balance_loss_mlp": 1.02007437, + "epoch": 0.3731850293100857, + "flos": 19827069978240.0, + "grad_norm": 1.936852968841416, + "language_loss": 0.94352925, + "learning_rate": 2.886592559513283e-06, + "loss": 0.96470767, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4609375, + "step": 6207, + "time_per_iteration": 2.3767848014831543 + }, + { + "auxiliary_loss_clip": 0.01067998, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.01400304, + "balance_loss_mlp": 1.02196062, + "epoch": 0.37324515256275365, + "flos": 19061354839680.0, + "grad_norm": 2.358150721094156, + "language_loss": 0.8432225, + "learning_rate": 2.886243438932759e-06, + "loss": 0.86436152, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4609375, + "step": 6208, + "time_per_iteration": 2.467061996459961 + }, + { + "auxiliary_loss_clip": 0.01066499, + "auxiliary_loss_mlp": 0.01054054, + "balance_loss_clip": 1.02237976, + "balance_loss_mlp": 1.02011347, + "epoch": 0.3733052758154216, + "flos": 20703285169920.0, + "grad_norm": 2.2976712733657085, + "language_loss": 0.74257135, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.7637769, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.46484375, + "step": 6209, + "time_per_iteration": 2.3781378269195557 + }, + { + "auxiliary_loss_clip": 0.01066112, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.02132392, + "balance_loss_mlp": 1.01994395, + "epoch": 0.3733653990680896, + "flos": 20192470934400.0, + "grad_norm": 1.648909382213249, + "language_loss": 0.7185986, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.73983836, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.4609375, + "step": 6210, + "time_per_iteration": 2.4245078563690186 + }, + { + "auxiliary_loss_clip": 0.01066611, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.01392162, + "balance_loss_mlp": 1.02027512, + "epoch": 0.37342552232075754, + "flos": 20338617352320.0, + "grad_norm": 1.5695027775139112, + "language_loss": 0.79041934, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.81154728, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.46484375, + "step": 6211, + "time_per_iteration": 2.3790764808654785 + }, + { + "auxiliary_loss_clip": 0.01067016, + "auxiliary_loss_mlp": 0.01054076, + "balance_loss_clip": 1.02062559, + "balance_loss_mlp": 1.02109897, + "epoch": 0.3734856455734255, + "flos": 35516409240960.0, + "grad_norm": 2.922005530314484, + "language_loss": 0.74687624, + "learning_rate": 2.884846620678668e-06, + "loss": 0.76808715, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.45898438, + "step": 6212, + "time_per_iteration": 2.52748966217041 + }, + { + "auxiliary_loss_clip": 0.01072929, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.02217174, + "balance_loss_mlp": 1.02215147, + "epoch": 0.37354576882609347, + "flos": 21141235664640.0, + "grad_norm": 1.9390132899422765, + "language_loss": 0.83572477, + "learning_rate": 2.884497332198356e-06, + "loss": 0.85705894, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.5078125, + "step": 6213, + "time_per_iteration": 2.3797342777252197 + }, + { + "auxiliary_loss_clip": 0.01068745, + "auxiliary_loss_mlp": 0.01049528, + "balance_loss_clip": 1.01579142, + "balance_loss_mlp": 1.02218342, + "epoch": 0.37360589207876144, + "flos": 21505728925440.0, + "grad_norm": 2.2015986392589806, + "language_loss": 0.79071647, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81189919, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.46679688, + "step": 6214, + "time_per_iteration": 2.4351308345794678 + }, + { + "auxiliary_loss_clip": 0.0106597, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.02407086, + "balance_loss_mlp": 1.02061081, + "epoch": 0.37366601533142946, + "flos": 38434275452160.0, + "grad_norm": 1.6022821623426917, + "language_loss": 0.85672796, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87794733, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.453125, + "step": 6215, + "time_per_iteration": 2.5300703048706055 + }, + { + "auxiliary_loss_clip": 0.01069724, + "auxiliary_loss_mlp": 0.01060652, + "balance_loss_clip": 1.02391219, + "balance_loss_mlp": 1.02218485, + "epoch": 0.3737261385840974, + "flos": 18440215107840.0, + "grad_norm": 1.8902648735577556, + "language_loss": 0.6948837, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.71618748, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.47460938, + "step": 6216, + "time_per_iteration": 2.4300568103790283 + }, + { + "auxiliary_loss_clip": 0.01069329, + "auxiliary_loss_mlp": 0.01051706, + "balance_loss_clip": 1.01894784, + "balance_loss_mlp": 1.02228165, + "epoch": 0.3737862618367654, + "flos": 22928753831040.0, + "grad_norm": 2.0437986155403447, + "language_loss": 0.68222743, + "learning_rate": 2.883099843007303e-06, + "loss": 0.70343781, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.47070312, + "step": 6217, + "time_per_iteration": 2.4007680416107178 + }, + { + "auxiliary_loss_clip": 0.01070756, + "auxiliary_loss_mlp": 0.01051156, + "balance_loss_clip": 1.01789713, + "balance_loss_mlp": 1.02337146, + "epoch": 0.37384638508943335, + "flos": 15408881377920.0, + "grad_norm": 1.6306091636435542, + "language_loss": 0.81944299, + "learning_rate": 2.88275038695833e-06, + "loss": 0.840662, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.47265625, + "step": 6218, + "time_per_iteration": 2.4123613834381104 + }, + { + "auxiliary_loss_clip": 0.01063802, + "auxiliary_loss_mlp": 0.01054473, + "balance_loss_clip": 1.02283525, + "balance_loss_mlp": 1.02094841, + "epoch": 0.3739065083421013, + "flos": 24279648134400.0, + "grad_norm": 1.3990121445651011, + "language_loss": 0.79888797, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.82007074, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4296875, + "step": 6219, + "time_per_iteration": 2.4346718788146973 + }, + { + "auxiliary_loss_clip": 0.01067102, + "auxiliary_loss_mlp": 0.01058699, + "balance_loss_clip": 1.02372289, + "balance_loss_mlp": 1.02153301, + "epoch": 0.3739666315947693, + "flos": 23001722305920.0, + "grad_norm": 1.7930942315456824, + "language_loss": 0.77719218, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79845023, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.45703125, + "step": 6220, + "time_per_iteration": 2.459007740020752 + }, + { + "auxiliary_loss_clip": 0.01068263, + "auxiliary_loss_mlp": 0.01055042, + "balance_loss_clip": 1.02175879, + "balance_loss_mlp": 1.021487, + "epoch": 0.37402675484743725, + "flos": 19390097001600.0, + "grad_norm": 2.1427071880354718, + "language_loss": 0.8469767, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.86820972, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46679688, + "step": 6221, + "time_per_iteration": 2.3790395259857178 + }, + { + "auxiliary_loss_clip": 0.01066232, + "auxiliary_loss_mlp": 0.01056569, + "balance_loss_clip": 1.02309513, + "balance_loss_mlp": 1.02072048, + "epoch": 0.3740868781001052, + "flos": 17125281371520.0, + "grad_norm": 1.7437742802669203, + "language_loss": 0.77330327, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.79453129, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.45507812, + "step": 6222, + "time_per_iteration": 2.412693500518799 + }, + { + "auxiliary_loss_clip": 0.01069078, + "auxiliary_loss_mlp": 0.01050343, + "balance_loss_clip": 1.0172739, + "balance_loss_mlp": 1.02309752, + "epoch": 0.3741470013527732, + "flos": 20042589000960.0, + "grad_norm": 1.7792024918747102, + "language_loss": 0.71955836, + "learning_rate": 2.881002604868789e-06, + "loss": 0.74075258, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45898438, + "step": 6223, + "time_per_iteration": 2.3846848011016846 + }, + { + "auxiliary_loss_clip": 0.01066467, + "auxiliary_loss_mlp": 0.01051638, + "balance_loss_clip": 1.01947546, + "balance_loss_mlp": 1.02137601, + "epoch": 0.37420712460544114, + "flos": 36895967637120.0, + "grad_norm": 2.0209762566288303, + "language_loss": 0.70235574, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.72353673, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.45117188, + "step": 6224, + "time_per_iteration": 2.5586984157562256 + }, + { + "auxiliary_loss_clip": 0.01065223, + "auxiliary_loss_mlp": 0.0105116, + "balance_loss_clip": 1.01800776, + "balance_loss_mlp": 1.02149105, + "epoch": 0.3742672478581091, + "flos": 22200081511680.0, + "grad_norm": 1.7284532789267233, + "language_loss": 0.71420431, + "learning_rate": 2.880303258086228e-06, + "loss": 0.73536813, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4375, + "step": 6225, + "time_per_iteration": 2.3968114852905273 + }, + { + "auxiliary_loss_clip": 0.01064218, + "auxiliary_loss_mlp": 0.01044722, + "balance_loss_clip": 1.01203465, + "balance_loss_mlp": 1.02008033, + "epoch": 0.3743273711107771, + "flos": 24680381253120.0, + "grad_norm": 1.84504222583044, + "language_loss": 0.80361867, + "learning_rate": 2.879953534616536e-06, + "loss": 0.8247081, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44140625, + "step": 6226, + "time_per_iteration": 2.4245471954345703 + }, + { + "auxiliary_loss_clip": 0.01065888, + "auxiliary_loss_mlp": 0.01049089, + "balance_loss_clip": 1.01728392, + "balance_loss_mlp": 1.02037835, + "epoch": 0.37438749436344504, + "flos": 24458543274240.0, + "grad_norm": 1.9515622136634245, + "language_loss": 0.69754744, + "learning_rate": 2.879603777778917e-06, + "loss": 0.71869719, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45507812, + "step": 6227, + "time_per_iteration": 5.318897247314453 + }, + { + "auxiliary_loss_clip": 0.01065307, + "auxiliary_loss_mlp": 0.01053124, + "balance_loss_clip": 1.01919687, + "balance_loss_mlp": 1.02176023, + "epoch": 0.374447617616113, + "flos": 21797672647680.0, + "grad_norm": 1.6704511929756602, + "language_loss": 0.84708637, + "learning_rate": 2.879253987586635e-06, + "loss": 0.86827064, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.43554688, + "step": 6228, + "time_per_iteration": 2.376373767852783 + }, + { + "auxiliary_loss_clip": 0.01064895, + "auxiliary_loss_mlp": 0.01052046, + "balance_loss_clip": 1.02045572, + "balance_loss_mlp": 1.0213387, + "epoch": 0.374507740868781, + "flos": 17967211741440.0, + "grad_norm": 2.630225515341044, + "language_loss": 0.75861514, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.77978456, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43554688, + "step": 6229, + "time_per_iteration": 2.380281686782837 + }, + { + "auxiliary_loss_clip": 0.01066783, + "auxiliary_loss_mlp": 0.01056664, + "balance_loss_clip": 1.02068615, + "balance_loss_mlp": 1.0202297, + "epoch": 0.374567864121449, + "flos": 16104944620800.0, + "grad_norm": 1.880821784624589, + "language_loss": 0.85410851, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.87534297, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.46484375, + "step": 6230, + "time_per_iteration": 2.3458127975463867 + }, + { + "auxiliary_loss_clip": 0.01066891, + "auxiliary_loss_mlp": 0.0105432, + "balance_loss_clip": 1.01934433, + "balance_loss_mlp": 1.02089047, + "epoch": 0.37462798737411696, + "flos": 25772045644800.0, + "grad_norm": 2.1184858002424014, + "language_loss": 0.74262863, + "learning_rate": 2.878204417014456e-06, + "loss": 0.76384079, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.4609375, + "step": 6231, + "time_per_iteration": 2.491569995880127 + }, + { + "auxiliary_loss_clip": 0.01067218, + "auxiliary_loss_mlp": 0.01060761, + "balance_loss_clip": 1.0249027, + "balance_loss_mlp": 1.02164984, + "epoch": 0.3746881106267849, + "flos": 16653569725440.0, + "grad_norm": 2.028678012703331, + "language_loss": 0.74439406, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.76567388, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.45507812, + "step": 6232, + "time_per_iteration": 3.8391807079315186 + }, + { + "auxiliary_loss_clip": 0.01067235, + "auxiliary_loss_mlp": 0.01047392, + "balance_loss_clip": 1.01353693, + "balance_loss_mlp": 1.02025926, + "epoch": 0.3747482338794529, + "flos": 26176758658560.0, + "grad_norm": 1.6132503418436666, + "language_loss": 0.78033262, + "learning_rate": 2.877504536769561e-06, + "loss": 0.80147892, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.47070312, + "step": 6233, + "time_per_iteration": 2.4672532081604004 + }, + { + "auxiliary_loss_clip": 0.01066919, + "auxiliary_loss_mlp": 0.01059634, + "balance_loss_clip": 1.02602935, + "balance_loss_mlp": 1.01988816, + "epoch": 0.37480835713212085, + "flos": 12020246127360.0, + "grad_norm": 1.7446080938443214, + "language_loss": 0.70766717, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.72893268, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.47070312, + "step": 6234, + "time_per_iteration": 2.3572475910186768 + }, + { + "auxiliary_loss_clip": 0.01067511, + "auxiliary_loss_mlp": 0.01055624, + "balance_loss_clip": 1.02353251, + "balance_loss_mlp": 1.02117097, + "epoch": 0.3748684803847888, + "flos": 19678340119680.0, + "grad_norm": 2.682285120805443, + "language_loss": 0.84310681, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.86433816, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46289062, + "step": 6235, + "time_per_iteration": 2.378056764602661 + }, + { + "auxiliary_loss_clip": 0.01068504, + "auxiliary_loss_mlp": 0.01057277, + "balance_loss_clip": 1.02218223, + "balance_loss_mlp": 1.02149773, + "epoch": 0.3749286036374568, + "flos": 20520165755520.0, + "grad_norm": 1.812605518058928, + "language_loss": 0.79172707, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.81298494, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.47070312, + "step": 6236, + "time_per_iteration": 2.384197473526001 + }, + { + "auxiliary_loss_clip": 0.01070716, + "auxiliary_loss_mlp": 0.01057921, + "balance_loss_clip": 1.0189395, + "balance_loss_mlp": 1.02155304, + "epoch": 0.37498872689012475, + "flos": 20703564460800.0, + "grad_norm": 2.182206234020845, + "language_loss": 0.76136672, + "learning_rate": 2.876104377085234e-06, + "loss": 0.78265309, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.4921875, + "step": 6237, + "time_per_iteration": 2.3894240856170654 + }, + { + "auxiliary_loss_clip": 0.01067021, + "auxiliary_loss_mlp": 0.01055146, + "balance_loss_clip": 1.01964521, + "balance_loss_mlp": 1.0206399, + "epoch": 0.3750488501427927, + "flos": 21573914544000.0, + "grad_norm": 2.06132737624561, + "language_loss": 0.94167602, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.96289778, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.46484375, + "step": 6238, + "time_per_iteration": 2.374831199645996 + }, + { + "auxiliary_loss_clip": 0.01065824, + "auxiliary_loss_mlp": 0.01051387, + "balance_loss_clip": 1.01698279, + "balance_loss_mlp": 1.02043951, + "epoch": 0.3751089733954607, + "flos": 15922977281280.0, + "grad_norm": 2.0178993440246527, + "language_loss": 0.72488832, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.74606037, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.453125, + "step": 6239, + "time_per_iteration": 2.393449306488037 + }, + { + "auxiliary_loss_clip": 0.0106838, + "auxiliary_loss_mlp": 0.01052677, + "balance_loss_clip": 1.01824927, + "balance_loss_mlp": 1.02094388, + "epoch": 0.37516909664812864, + "flos": 36283136808960.0, + "grad_norm": 1.6856725075254644, + "language_loss": 0.66573024, + "learning_rate": 2.875053908444895e-06, + "loss": 0.68694079, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47460938, + "step": 6240, + "time_per_iteration": 2.514086961746216 + }, + { + "auxiliary_loss_clip": 0.0106825, + "auxiliary_loss_mlp": 0.01049535, + "balance_loss_clip": 1.01696682, + "balance_loss_mlp": 1.02086878, + "epoch": 0.3752292199007966, + "flos": 13515087432960.0, + "grad_norm": 1.8421040015712367, + "language_loss": 0.77276742, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.79394531, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.47265625, + "step": 6241, + "time_per_iteration": 2.394563913345337 + }, + { + "auxiliary_loss_clip": 0.01068751, + "auxiliary_loss_mlp": 0.01057852, + "balance_loss_clip": 1.02070618, + "balance_loss_mlp": 1.02104306, + "epoch": 0.3752893431534646, + "flos": 27196885941120.0, + "grad_norm": 2.1482945059354583, + "language_loss": 0.8506968, + "learning_rate": 2.874353430085213e-06, + "loss": 0.87196279, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.4765625, + "step": 6242, + "time_per_iteration": 2.426455020904541 + }, + { + "auxiliary_loss_clip": 0.01067429, + "auxiliary_loss_mlp": 0.01059545, + "balance_loss_clip": 1.02649975, + "balance_loss_mlp": 1.02035677, + "epoch": 0.3753494664061326, + "flos": 30006381692160.0, + "grad_norm": 2.3595082021661753, + "language_loss": 0.7003125, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.72158229, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.47070312, + "step": 6243, + "time_per_iteration": 2.5027406215667725 + }, + { + "auxiliary_loss_clip": 0.01067423, + "auxiliary_loss_mlp": 0.01060538, + "balance_loss_clip": 1.02503717, + "balance_loss_mlp": 1.02149236, + "epoch": 0.37540958965880056, + "flos": 24460812512640.0, + "grad_norm": 1.7974587483423823, + "language_loss": 0.85199177, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.87327135, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.45898438, + "step": 6244, + "time_per_iteration": 2.4073851108551025 + }, + { + "auxiliary_loss_clip": 0.01065205, + "auxiliary_loss_mlp": 0.01053455, + "balance_loss_clip": 1.02148342, + "balance_loss_mlp": 1.02065992, + "epoch": 0.3754697129114685, + "flos": 16507458218880.0, + "grad_norm": 2.268234925644364, + "language_loss": 0.84269309, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.86387968, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 6245, + "time_per_iteration": 2.364856243133545 + }, + { + "auxiliary_loss_clip": 0.01067242, + "auxiliary_loss_mlp": 0.01055673, + "balance_loss_clip": 1.02041113, + "balance_loss_mlp": 1.02091718, + "epoch": 0.3755298361641365, + "flos": 19389887533440.0, + "grad_norm": 1.9609093072141877, + "language_loss": 0.65281802, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.67404717, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.46289062, + "step": 6246, + "time_per_iteration": 2.3741250038146973 + }, + { + "auxiliary_loss_clip": 0.01068272, + "auxiliary_loss_mlp": 0.01059176, + "balance_loss_clip": 1.02167261, + "balance_loss_mlp": 1.02070451, + "epoch": 0.37558995941680445, + "flos": 14719521116160.0, + "grad_norm": 2.364650680035281, + "language_loss": 0.76789892, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.78917336, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.47460938, + "step": 6247, + "time_per_iteration": 2.3882367610931396 + }, + { + "auxiliary_loss_clip": 0.01066752, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.01551223, + "balance_loss_mlp": 1.02072394, + "epoch": 0.3756500826694724, + "flos": 21688813428480.0, + "grad_norm": 2.783516448399697, + "language_loss": 0.56067353, + "learning_rate": 2.872251199697598e-06, + "loss": 0.58182847, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4609375, + "step": 6248, + "time_per_iteration": 2.380462884902954 + }, + { + "auxiliary_loss_clip": 0.01066415, + "auxiliary_loss_mlp": 0.01056665, + "balance_loss_clip": 1.02154636, + "balance_loss_mlp": 1.0211885, + "epoch": 0.3757102059221404, + "flos": 26504453479680.0, + "grad_norm": 1.7391372630195248, + "language_loss": 0.84835106, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86958188, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.45117188, + "step": 6249, + "time_per_iteration": 2.4346978664398193 + }, + { + "auxiliary_loss_clip": 0.01064622, + "auxiliary_loss_mlp": 0.01046631, + "balance_loss_clip": 1.01517153, + "balance_loss_mlp": 1.0202204, + "epoch": 0.37577032917480835, + "flos": 37336676129280.0, + "grad_norm": 1.4910373581303633, + "language_loss": 0.70093608, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.72204864, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 6250, + "time_per_iteration": 2.5270063877105713 + }, + { + "auxiliary_loss_clip": 0.01067681, + "auxiliary_loss_mlp": 0.01049659, + "balance_loss_clip": 1.01907039, + "balance_loss_mlp": 1.02197146, + "epoch": 0.3758304524274763, + "flos": 21907509384960.0, + "grad_norm": 2.0057259287560316, + "language_loss": 0.79832339, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.81949675, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45703125, + "step": 6251, + "time_per_iteration": 2.4000701904296875 + }, + { + "auxiliary_loss_clip": 0.01067087, + "auxiliary_loss_mlp": 0.01050939, + "balance_loss_clip": 1.01677406, + "balance_loss_mlp": 1.02132511, + "epoch": 0.3758905756801443, + "flos": 36568028436480.0, + "grad_norm": 1.8276540268372234, + "language_loss": 0.60398042, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.62516069, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.45703125, + "step": 6252, + "time_per_iteration": 2.564404010772705 + }, + { + "auxiliary_loss_clip": 0.01068993, + "auxiliary_loss_mlp": 0.01055509, + "balance_loss_clip": 1.01981783, + "balance_loss_mlp": 1.02141726, + "epoch": 0.37595069893281224, + "flos": 24527811144960.0, + "grad_norm": 1.7305884944017336, + "language_loss": 0.9003861, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.9216311, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.4765625, + "step": 6253, + "time_per_iteration": 2.450216770172119 + }, + { + "auxiliary_loss_clip": 0.01066228, + "auxiliary_loss_mlp": 0.01047064, + "balance_loss_clip": 1.01548564, + "balance_loss_mlp": 1.02182281, + "epoch": 0.3760108221854802, + "flos": 16434105719040.0, + "grad_norm": 2.0215580701775373, + "language_loss": 0.78036916, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.80150211, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44335938, + "step": 6254, + "time_per_iteration": 2.3608360290527344 + }, + { + "auxiliary_loss_clip": 0.01070555, + "auxiliary_loss_mlp": 0.01056218, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.02344608, + "epoch": 0.37607094543814823, + "flos": 13770896031360.0, + "grad_norm": 2.1698051231002755, + "language_loss": 0.64051855, + "learning_rate": 2.869797092829169e-06, + "loss": 0.66178632, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.47070312, + "step": 6255, + "time_per_iteration": 2.3979666233062744 + }, + { + "auxiliary_loss_clip": 0.01068029, + "auxiliary_loss_mlp": 0.01056319, + "balance_loss_clip": 1.02258229, + "balance_loss_mlp": 1.02013636, + "epoch": 0.3761310686908162, + "flos": 19857095614080.0, + "grad_norm": 3.1374547950796194, + "language_loss": 0.75517309, + "learning_rate": 2.869446374096135e-06, + "loss": 0.77641654, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.47851562, + "step": 6256, + "time_per_iteration": 2.4308218955993652 + }, + { + "auxiliary_loss_clip": 0.01068214, + "auxiliary_loss_mlp": 0.01053159, + "balance_loss_clip": 1.01992345, + "balance_loss_mlp": 1.02096319, + "epoch": 0.37619119194348416, + "flos": 12749965787520.0, + "grad_norm": 2.042367298924999, + "language_loss": 0.72261637, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.74383008, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.47265625, + "step": 6257, + "time_per_iteration": 2.37766170501709 + }, + { + "auxiliary_loss_clip": 0.01066802, + "auxiliary_loss_mlp": 0.01046198, + "balance_loss_clip": 1.01317692, + "balance_loss_mlp": 1.02140546, + "epoch": 0.3762513151961521, + "flos": 17529575448960.0, + "grad_norm": 1.641680476477804, + "language_loss": 0.8535673, + "learning_rate": 2.868744837734889e-06, + "loss": 0.87469733, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.453125, + "step": 6258, + "time_per_iteration": 2.454693078994751 + }, + { + "auxiliary_loss_clip": 0.01065369, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.02125382, + "epoch": 0.3763114384488201, + "flos": 23616438347520.0, + "grad_norm": 1.525372446921158, + "language_loss": 0.81626534, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83743072, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.44140625, + "step": 6259, + "time_per_iteration": 2.4088242053985596 + }, + { + "auxiliary_loss_clip": 0.01068338, + "auxiliary_loss_mlp": 0.01057692, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.02101123, + "epoch": 0.37637156170148806, + "flos": 25405911550080.0, + "grad_norm": 2.1853350900028494, + "language_loss": 0.72949708, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.75075734, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.47265625, + "step": 6260, + "time_per_iteration": 2.468419075012207 + }, + { + "auxiliary_loss_clip": 0.01066877, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_clip": 1.0153017, + "balance_loss_mlp": 1.0202136, + "epoch": 0.376431684954156, + "flos": 23439777534720.0, + "grad_norm": 1.612588053038594, + "language_loss": 0.79255712, + "learning_rate": 2.867692286154594e-06, + "loss": 0.81370556, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.46679688, + "step": 6261, + "time_per_iteration": 2.416405439376831 + }, + { + "auxiliary_loss_clip": 0.01066632, + "auxiliary_loss_mlp": 0.01053955, + "balance_loss_clip": 1.02026653, + "balance_loss_mlp": 1.01977158, + "epoch": 0.376491808206824, + "flos": 34203046515840.0, + "grad_norm": 1.6210896241644364, + "language_loss": 0.81544989, + "learning_rate": 2.867341369804132e-06, + "loss": 0.8366558, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46875, + "step": 6262, + "time_per_iteration": 2.536407470703125 + }, + { + "auxiliary_loss_clip": 0.01065226, + "auxiliary_loss_mlp": 0.01052748, + "balance_loss_clip": 1.01981068, + "balance_loss_mlp": 1.02061236, + "epoch": 0.37655193145949195, + "flos": 35184315588480.0, + "grad_norm": 1.6092611316804604, + "language_loss": 0.81939054, + "learning_rate": 2.866990420563998e-06, + "loss": 0.84057027, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.4453125, + "step": 6263, + "time_per_iteration": 2.532015085220337 + }, + { + "auxiliary_loss_clip": 0.01067178, + "auxiliary_loss_mlp": 0.01056067, + "balance_loss_clip": 1.0240953, + "balance_loss_mlp": 1.02186108, + "epoch": 0.3766120547121599, + "flos": 16760962667520.0, + "grad_norm": 1.795921989462818, + "language_loss": 0.81230104, + "learning_rate": 2.866639438447501e-06, + "loss": 0.83353353, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.453125, + "step": 6264, + "time_per_iteration": 2.4235222339630127 + }, + { + "auxiliary_loss_clip": 0.01064574, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.01877165, + "balance_loss_mlp": 1.01965404, + "epoch": 0.3766721779648279, + "flos": 23549230247040.0, + "grad_norm": 1.9345790281617024, + "language_loss": 0.75485885, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.77601588, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44921875, + "step": 6265, + "time_per_iteration": 2.3970727920532227 + }, + { + "auxiliary_loss_clip": 0.01065541, + "auxiliary_loss_mlp": 0.01055233, + "balance_loss_clip": 1.02538347, + "balance_loss_mlp": 1.02141476, + "epoch": 0.37673230121749585, + "flos": 29128001996160.0, + "grad_norm": 1.652327539800267, + "language_loss": 0.69216037, + "learning_rate": 2.865937375638654e-06, + "loss": 0.71336812, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44140625, + "step": 6266, + "time_per_iteration": 3.908853530883789 + }, + { + "auxiliary_loss_clip": 0.01069156, + "auxiliary_loss_mlp": 0.01054988, + "balance_loss_clip": 1.02034581, + "balance_loss_mlp": 1.02082372, + "epoch": 0.3767924244701638, + "flos": 28145545937280.0, + "grad_norm": 2.864214598475721, + "language_loss": 0.64740586, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.66864735, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.484375, + "step": 6267, + "time_per_iteration": 3.8403568267822266 + }, + { + "auxiliary_loss_clip": 0.01013508, + "auxiliary_loss_mlp": 0.0100422, + "balance_loss_clip": 1.00123954, + "balance_loss_mlp": 1.00409913, + "epoch": 0.37685254772283183, + "flos": 60794153723520.0, + "grad_norm": 0.7207785014702951, + "language_loss": 0.58892345, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.6091007, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.02978516, + "router_z_loss_mlp": 0.09423828, + "step": 6268, + "time_per_iteration": 3.1662817001342773 + }, + { + "auxiliary_loss_clip": 0.01065428, + "auxiliary_loss_mlp": 0.01051691, + "balance_loss_clip": 1.01833642, + "balance_loss_mlp": 1.01957476, + "epoch": 0.3769126709754998, + "flos": 26031310467840.0, + "grad_norm": 1.575821130084188, + "language_loss": 0.65905333, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.68022454, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.45898438, + "step": 6269, + "time_per_iteration": 2.4698076248168945 + }, + { + "auxiliary_loss_clip": 0.01065254, + "auxiliary_loss_mlp": 0.01056281, + "balance_loss_clip": 1.02290201, + "balance_loss_mlp": 1.02268052, + "epoch": 0.37697279422816776, + "flos": 23578941680640.0, + "grad_norm": 1.7048816638798925, + "language_loss": 0.72268957, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.74390495, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.42578125, + "step": 6270, + "time_per_iteration": 2.4572460651397705 + }, + { + "auxiliary_loss_clip": 0.01012341, + "auxiliary_loss_mlp": 0.01008164, + "balance_loss_clip": 1.00520718, + "balance_loss_mlp": 1.00295258, + "epoch": 0.3770329174808357, + "flos": 64742550802560.0, + "grad_norm": 0.6989844473959727, + "language_loss": 0.56116152, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.5813666, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.02954102, + "router_z_loss_mlp": 0.09375, + "step": 6271, + "time_per_iteration": 4.364473819732666 + }, + { + "auxiliary_loss_clip": 0.01062511, + "auxiliary_loss_mlp": 0.01049644, + "balance_loss_clip": 1.0158366, + "balance_loss_mlp": 1.01886916, + "epoch": 0.3770930407335037, + "flos": 21834226707840.0, + "grad_norm": 1.829100297405524, + "language_loss": 0.80801922, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82914078, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.4375, + "step": 6272, + "time_per_iteration": 3.8443620204925537 + }, + { + "auxiliary_loss_clip": 0.01063756, + "auxiliary_loss_mlp": 0.01045077, + "balance_loss_clip": 1.0154655, + "balance_loss_mlp": 1.02059281, + "epoch": 0.37715316398617166, + "flos": 22746786491520.0, + "grad_norm": 1.7378585241649005, + "language_loss": 0.75885665, + "learning_rate": 2.863479122159103e-06, + "loss": 0.77994502, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43164062, + "step": 6273, + "time_per_iteration": 2.4371728897094727 + }, + { + "auxiliary_loss_clip": 0.01066972, + "auxiliary_loss_mlp": 0.01053564, + "balance_loss_clip": 1.02289104, + "balance_loss_mlp": 1.0222609, + "epoch": 0.3772132872388396, + "flos": 18913637410560.0, + "grad_norm": 1.4299582561236608, + "language_loss": 0.72915286, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.75035816, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44726562, + "step": 6274, + "time_per_iteration": 2.369401454925537 + }, + { + "auxiliary_loss_clip": 0.01067994, + "auxiliary_loss_mlp": 0.01049089, + "balance_loss_clip": 1.01709306, + "balance_loss_mlp": 1.0217042, + "epoch": 0.3772734104915076, + "flos": 17345303959680.0, + "grad_norm": 2.1638136669224934, + "language_loss": 0.85311675, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.87428761, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46289062, + "step": 6275, + "time_per_iteration": 2.3967461585998535 + }, + { + "auxiliary_loss_clip": 0.01065522, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.01074302, + "balance_loss_mlp": 1.02305365, + "epoch": 0.37733353374417555, + "flos": 32341023774720.0, + "grad_norm": 1.4934556151906513, + "language_loss": 0.76426077, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.78528666, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.42382812, + "step": 6276, + "time_per_iteration": 2.4768617153167725 + }, + { + "auxiliary_loss_clip": 0.01068419, + "auxiliary_loss_mlp": 0.0104794, + "balance_loss_clip": 1.014943, + "balance_loss_mlp": 1.02219319, + "epoch": 0.3773936569968435, + "flos": 23359756965120.0, + "grad_norm": 1.9532391330712027, + "language_loss": 0.87457734, + "learning_rate": 2.862073685241366e-06, + "loss": 0.89574099, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.46289062, + "step": 6277, + "time_per_iteration": 2.4229183197021484 + }, + { + "auxiliary_loss_clip": 0.01066599, + "auxiliary_loss_mlp": 0.01043107, + "balance_loss_clip": 1.01323342, + "balance_loss_mlp": 1.02369452, + "epoch": 0.3774537802495115, + "flos": 21465823374720.0, + "grad_norm": 1.8100537727044947, + "language_loss": 0.79400361, + "learning_rate": 2.861722244253818e-06, + "loss": 0.81510067, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4296875, + "step": 6278, + "time_per_iteration": 2.3939003944396973 + }, + { + "auxiliary_loss_clip": 0.01068895, + "auxiliary_loss_mlp": 0.01048901, + "balance_loss_clip": 1.01642823, + "balance_loss_mlp": 1.02259314, + "epoch": 0.37751390350217945, + "flos": 24972534443520.0, + "grad_norm": 1.6771018995456737, + "language_loss": 0.84281611, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.86399406, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46289062, + "step": 6279, + "time_per_iteration": 2.4570884704589844 + }, + { + "auxiliary_loss_clip": 0.010672, + "auxiliary_loss_mlp": 0.01049945, + "balance_loss_clip": 1.02082193, + "balance_loss_mlp": 1.02332246, + "epoch": 0.3775740267548474, + "flos": 27817851116160.0, + "grad_norm": 2.1072148208776853, + "language_loss": 0.75894618, + "learning_rate": 2.861019264262269e-06, + "loss": 0.78011763, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4375, + "step": 6280, + "time_per_iteration": 2.4425361156463623 + }, + { + "auxiliary_loss_clip": 0.01065651, + "auxiliary_loss_mlp": 0.01049658, + "balance_loss_clip": 1.02071357, + "balance_loss_mlp": 1.02315283, + "epoch": 0.3776341500075154, + "flos": 22564120924800.0, + "grad_norm": 1.5529264121633932, + "language_loss": 0.76904577, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.79019886, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 6281, + "time_per_iteration": 2.44450306892395 + }, + { + "auxiliary_loss_clip": 0.01067442, + "auxiliary_loss_mlp": 0.0104868, + "balance_loss_clip": 1.01787674, + "balance_loss_mlp": 1.02366865, + "epoch": 0.3776942732601834, + "flos": 23076087235200.0, + "grad_norm": 1.4406665332336894, + "language_loss": 0.85086787, + "learning_rate": 2.860316153670974e-06, + "loss": 0.87202913, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4375, + "step": 6282, + "time_per_iteration": 2.4288699626922607 + }, + { + "auxiliary_loss_clip": 0.0106464, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_clip": 1.01620793, + "balance_loss_mlp": 1.02193642, + "epoch": 0.37775439651285136, + "flos": 21723377541120.0, + "grad_norm": 1.784319296612173, + "language_loss": 0.70224154, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72335279, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42578125, + "step": 6283, + "time_per_iteration": 2.438530445098877 + }, + { + "auxiliary_loss_clip": 0.01065457, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_clip": 1.02285075, + "balance_loss_mlp": 1.02174926, + "epoch": 0.37781451976551933, + "flos": 23986622160000.0, + "grad_norm": 1.62608186544779, + "language_loss": 0.77940738, + "learning_rate": 2.859612912586581e-06, + "loss": 0.80061555, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4375, + "step": 6284, + "time_per_iteration": 2.4105803966522217 + }, + { + "auxiliary_loss_clip": 0.01070207, + "auxiliary_loss_mlp": 0.01053734, + "balance_loss_clip": 1.01964045, + "balance_loss_mlp": 1.02249491, + "epoch": 0.3778746430181873, + "flos": 13727324787840.0, + "grad_norm": 2.07146892832885, + "language_loss": 0.87627763, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.89751709, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4765625, + "step": 6285, + "time_per_iteration": 2.3789777755737305 + }, + { + "auxiliary_loss_clip": 0.01066609, + "auxiliary_loss_mlp": 0.01054929, + "balance_loss_clip": 1.01975036, + "balance_loss_mlp": 1.02094889, + "epoch": 0.37793476627085526, + "flos": 19459574340480.0, + "grad_norm": 1.8795040463874788, + "language_loss": 0.85952216, + "learning_rate": 2.858909541115758e-06, + "loss": 0.88073754, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.45703125, + "step": 6286, + "time_per_iteration": 2.3936920166015625 + }, + { + "auxiliary_loss_clip": 0.01064656, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_clip": 1.02413249, + "balance_loss_mlp": 1.0206778, + "epoch": 0.3779948895235232, + "flos": 10706254997760.0, + "grad_norm": 1.9317425163782684, + "language_loss": 0.83554327, + "learning_rate": 2.858557806518775e-06, + "loss": 0.85674518, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43945312, + "step": 6287, + "time_per_iteration": 2.387052297592163 + }, + { + "auxiliary_loss_clip": 0.01065703, + "auxiliary_loss_mlp": 0.01054144, + "balance_loss_clip": 1.02244627, + "balance_loss_mlp": 1.02066159, + "epoch": 0.3780550127761912, + "flos": 22308905819520.0, + "grad_norm": 2.225176955727695, + "language_loss": 0.74023616, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.76143461, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 6288, + "time_per_iteration": 2.410839557647705 + }, + { + "auxiliary_loss_clip": 0.01066777, + "auxiliary_loss_mlp": 0.01047668, + "balance_loss_clip": 1.01618457, + "balance_loss_mlp": 1.02139544, + "epoch": 0.37811513602885916, + "flos": 28949351235840.0, + "grad_norm": 1.7953625808178562, + "language_loss": 0.76793075, + "learning_rate": 2.857854239668352e-06, + "loss": 0.78907514, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45507812, + "step": 6289, + "time_per_iteration": 2.4465088844299316 + }, + { + "auxiliary_loss_clip": 0.01066954, + "auxiliary_loss_mlp": 0.01051539, + "balance_loss_clip": 1.01999593, + "balance_loss_mlp": 1.02171993, + "epoch": 0.3781752592815271, + "flos": 23111803422720.0, + "grad_norm": 1.7761964396691463, + "language_loss": 0.75377709, + "learning_rate": 2.857502407441593e-06, + "loss": 0.77496207, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.453125, + "step": 6290, + "time_per_iteration": 2.4401636123657227 + }, + { + "auxiliary_loss_clip": 0.0106696, + "auxiliary_loss_mlp": 0.010547, + "balance_loss_clip": 1.01850796, + "balance_loss_mlp": 1.0200181, + "epoch": 0.3782353825341951, + "flos": 19754904464640.0, + "grad_norm": 3.465297990544801, + "language_loss": 0.8114723, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.83268893, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.46875, + "step": 6291, + "time_per_iteration": 2.3626456260681152 + }, + { + "auxiliary_loss_clip": 0.01065365, + "auxiliary_loss_mlp": 0.01055074, + "balance_loss_clip": 1.01983547, + "balance_loss_mlp": 1.01926041, + "epoch": 0.37829550578686305, + "flos": 22049850464640.0, + "grad_norm": 1.8276864339882875, + "language_loss": 0.77511525, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.79631966, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.4609375, + "step": 6292, + "time_per_iteration": 2.4162251949310303 + }, + { + "auxiliary_loss_clip": 0.0106677, + "auxiliary_loss_mlp": 0.01056524, + "balance_loss_clip": 1.02104688, + "balance_loss_mlp": 1.0207665, + "epoch": 0.378355629039531, + "flos": 16469472792960.0, + "grad_norm": 2.007667866964689, + "language_loss": 0.71129817, + "learning_rate": 2.856446715715224e-06, + "loss": 0.73253107, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.45898438, + "step": 6293, + "time_per_iteration": 2.351641893386841 + }, + { + "auxiliary_loss_clip": 0.0106412, + "auxiliary_loss_mlp": 0.0104938, + "balance_loss_clip": 1.01719332, + "balance_loss_mlp": 1.0199219, + "epoch": 0.378415752292199, + "flos": 19973809889280.0, + "grad_norm": 2.3552797735134123, + "language_loss": 0.72523773, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.74637282, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.44140625, + "step": 6294, + "time_per_iteration": 2.387784481048584 + }, + { + "auxiliary_loss_clip": 0.01068907, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.01793194, + "balance_loss_mlp": 1.02053368, + "epoch": 0.378475875544867, + "flos": 14646517729920.0, + "grad_norm": 2.8926078395787704, + "language_loss": 0.84911764, + "learning_rate": 2.855742758826011e-06, + "loss": 0.87033439, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.484375, + "step": 6295, + "time_per_iteration": 2.347059488296509 + }, + { + "auxiliary_loss_clip": 0.01067038, + "auxiliary_loss_mlp": 0.01051783, + "balance_loss_clip": 1.01726007, + "balance_loss_mlp": 1.02031887, + "epoch": 0.37853599879753497, + "flos": 26649796936320.0, + "grad_norm": 1.6359105726619436, + "language_loss": 0.72792768, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.74911594, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46679688, + "step": 6296, + "time_per_iteration": 2.4955921173095703 + }, + { + "auxiliary_loss_clip": 0.01064683, + "auxiliary_loss_mlp": 0.01053214, + "balance_loss_clip": 1.02202868, + "balance_loss_mlp": 1.02049005, + "epoch": 0.37859612205020293, + "flos": 17310984226560.0, + "grad_norm": 1.9054788308708104, + "language_loss": 0.78887182, + "learning_rate": 2.855038672137396e-06, + "loss": 0.81005079, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.44140625, + "step": 6297, + "time_per_iteration": 2.367610454559326 + }, + { + "auxiliary_loss_clip": 0.010675, + "auxiliary_loss_mlp": 0.01055963, + "balance_loss_clip": 1.02229857, + "balance_loss_mlp": 1.02071762, + "epoch": 0.3786562453028709, + "flos": 18219494292480.0, + "grad_norm": 1.6094739209044648, + "language_loss": 0.80276668, + "learning_rate": 2.854686580151684e-06, + "loss": 0.82400131, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46875, + "step": 6298, + "time_per_iteration": 2.3968329429626465 + }, + { + "auxiliary_loss_clip": 0.01064484, + "auxiliary_loss_mlp": 0.01054407, + "balance_loss_clip": 1.02176762, + "balance_loss_mlp": 1.02078462, + "epoch": 0.37871636855553886, + "flos": 21213820114560.0, + "grad_norm": 1.6364420352955638, + "language_loss": 0.85778964, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.87897861, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.4375, + "step": 6299, + "time_per_iteration": 2.381861448287964 + }, + { + "auxiliary_loss_clip": 0.01067072, + "auxiliary_loss_mlp": 0.0105289, + "balance_loss_clip": 1.01812816, + "balance_loss_mlp": 1.02078271, + "epoch": 0.3787764918082068, + "flos": 20951867116800.0, + "grad_norm": 2.0093958790635846, + "language_loss": 0.77738166, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.7985813, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.46289062, + "step": 6300, + "time_per_iteration": 2.410543918609619 + }, + { + "auxiliary_loss_clip": 0.01068992, + "auxiliary_loss_mlp": 0.01061714, + "balance_loss_clip": 1.02251828, + "balance_loss_mlp": 1.02061212, + "epoch": 0.3788366150608748, + "flos": 17307143976960.0, + "grad_norm": 2.514136677066058, + "language_loss": 0.84479249, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.86609948, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 0.484375, + "step": 6301, + "time_per_iteration": 2.3485982418060303 + }, + { + "auxiliary_loss_clip": 0.01066779, + "auxiliary_loss_mlp": 0.0105212, + "balance_loss_clip": 1.01843143, + "balance_loss_mlp": 1.02045894, + "epoch": 0.37889673831354276, + "flos": 24310092706560.0, + "grad_norm": 1.729840790087598, + "language_loss": 0.69753867, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.71872759, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46289062, + "step": 6302, + "time_per_iteration": 2.4267046451568604 + }, + { + "auxiliary_loss_clip": 0.01066386, + "auxiliary_loss_mlp": 0.01045826, + "balance_loss_clip": 1.01418757, + "balance_loss_mlp": 1.02226102, + "epoch": 0.3789568615662107, + "flos": 26682510746880.0, + "grad_norm": 2.4602512519607322, + "language_loss": 0.69484949, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.71597165, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44140625, + "step": 6303, + "time_per_iteration": 2.451859474182129 + }, + { + "auxiliary_loss_clip": 0.01066019, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.01633608, + "balance_loss_mlp": 1.02030981, + "epoch": 0.3790169848188787, + "flos": 23584108561920.0, + "grad_norm": 2.743955822820057, + "language_loss": 0.78649014, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.8076365, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.45703125, + "step": 6304, + "time_per_iteration": 2.4545810222625732 + }, + { + "auxiliary_loss_clip": 0.01071037, + "auxiliary_loss_mlp": 0.01054656, + "balance_loss_clip": 1.01782084, + "balance_loss_mlp": 1.02212691, + "epoch": 0.37907710807154665, + "flos": 18436584326400.0, + "grad_norm": 2.036611620209349, + "language_loss": 0.81321007, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.83446705, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.48828125, + "step": 6305, + "time_per_iteration": 3.807636260986328 + }, + { + "auxiliary_loss_clip": 0.01016841, + "auxiliary_loss_mlp": 0.01021233, + "balance_loss_clip": 1.0180856, + "balance_loss_mlp": 1.00679088, + "epoch": 0.3791372313242146, + "flos": 50104411799040.0, + "grad_norm": 0.9894978101182651, + "language_loss": 0.64617914, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66655988, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.10058594, + "step": 6306, + "time_per_iteration": 2.956925868988037 + }, + { + "auxiliary_loss_clip": 0.01068416, + "auxiliary_loss_mlp": 0.01059392, + "balance_loss_clip": 1.02300882, + "balance_loss_mlp": 1.02123976, + "epoch": 0.3791973545768826, + "flos": 24315399233280.0, + "grad_norm": 1.7029137520996735, + "language_loss": 0.74525696, + "learning_rate": 2.851516295441817e-06, + "loss": 0.76653504, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.47265625, + "step": 6307, + "time_per_iteration": 3.844367504119873 + }, + { + "auxiliary_loss_clip": 0.01070184, + "auxiliary_loss_mlp": 0.01054925, + "balance_loss_clip": 1.02049732, + "balance_loss_mlp": 1.02183902, + "epoch": 0.3792574778295506, + "flos": 21578837045760.0, + "grad_norm": 1.506832752334691, + "language_loss": 0.79649723, + "learning_rate": 2.851163879959112e-06, + "loss": 0.81774831, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.48242188, + "step": 6308, + "time_per_iteration": 2.3809924125671387 + }, + { + "auxiliary_loss_clip": 0.01066287, + "auxiliary_loss_mlp": 0.0106189, + "balance_loss_clip": 1.02729523, + "balance_loss_mlp": 1.02055907, + "epoch": 0.37931760108221857, + "flos": 22271653532160.0, + "grad_norm": 2.0892643323818576, + "language_loss": 0.74139762, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.7626794, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.45703125, + "step": 6309, + "time_per_iteration": 2.409688711166382 + }, + { + "auxiliary_loss_clip": 0.01065771, + "auxiliary_loss_mlp": 0.01061145, + "balance_loss_clip": 1.02929246, + "balance_loss_mlp": 1.02093077, + "epoch": 0.37937772433488653, + "flos": 19681970901120.0, + "grad_norm": 1.5477468705862467, + "language_loss": 0.79913521, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.82040441, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44921875, + "step": 6310, + "time_per_iteration": 2.382229804992676 + }, + { + "auxiliary_loss_clip": 0.01065702, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_clip": 1.01903915, + "balance_loss_mlp": 1.02001595, + "epoch": 0.3794378475875545, + "flos": 19098362747520.0, + "grad_norm": 2.8061987353218973, + "language_loss": 0.77711737, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.79827523, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.45703125, + "step": 6311, + "time_per_iteration": 3.869875431060791 + }, + { + "auxiliary_loss_clip": 0.01067398, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.01770926, + "balance_loss_mlp": 1.02162623, + "epoch": 0.37949797084022246, + "flos": 20338617352320.0, + "grad_norm": 1.4563280323269752, + "language_loss": 0.71684313, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73800695, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45703125, + "step": 6312, + "time_per_iteration": 3.8048007488250732 + }, + { + "auxiliary_loss_clip": 0.01019048, + "auxiliary_loss_mlp": 0.01005988, + "balance_loss_clip": 1.0020299, + "balance_loss_mlp": 1.00866461, + "epoch": 0.37955809409289043, + "flos": 63969050430720.0, + "grad_norm": 0.7757654654775705, + "language_loss": 0.56200629, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58225667, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.10400391, + "step": 6313, + "time_per_iteration": 2.9740657806396484 + }, + { + "auxiliary_loss_clip": 0.01065328, + "auxiliary_loss_mlp": 0.01059606, + "balance_loss_clip": 1.02927935, + "balance_loss_mlp": 1.02100158, + "epoch": 0.3796182173455584, + "flos": 31539313157760.0, + "grad_norm": 1.6808608542223547, + "language_loss": 0.72655118, + "learning_rate": 2.849048709730083e-06, + "loss": 0.74780047, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44335938, + "step": 6314, + "time_per_iteration": 2.470212697982788 + }, + { + "auxiliary_loss_clip": 0.01069784, + "auxiliary_loss_mlp": 0.01060839, + "balance_loss_clip": 1.02412236, + "balance_loss_mlp": 1.02158046, + "epoch": 0.37967834059822636, + "flos": 12129978130560.0, + "grad_norm": 1.729500347840395, + "language_loss": 0.75076461, + "learning_rate": 2.848696068594545e-06, + "loss": 0.77207088, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.48242188, + "step": 6315, + "time_per_iteration": 2.3571832180023193 + }, + { + "auxiliary_loss_clip": 0.01067338, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_clip": 1.03812551, + "balance_loss_mlp": 1.02247238, + "epoch": 0.3797384638508943, + "flos": 39347009792640.0, + "grad_norm": 1.8409880623758668, + "language_loss": 0.72363824, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.74501717, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44921875, + "step": 6316, + "time_per_iteration": 2.5326409339904785 + }, + { + "auxiliary_loss_clip": 0.01067257, + "auxiliary_loss_mlp": 0.01060036, + "balance_loss_clip": 1.02808833, + "balance_loss_mlp": 1.02164793, + "epoch": 0.3797985871035623, + "flos": 34052710734720.0, + "grad_norm": 1.6586920180608467, + "language_loss": 0.66374815, + "learning_rate": 2.847990689788923e-06, + "loss": 0.6850211, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45703125, + "step": 6317, + "time_per_iteration": 2.4765305519104004 + }, + { + "auxiliary_loss_clip": 0.0106471, + "auxiliary_loss_mlp": 0.01057707, + "balance_loss_clip": 1.02616405, + "balance_loss_mlp": 1.0201869, + "epoch": 0.37985871035623026, + "flos": 23221046666880.0, + "grad_norm": 1.9335744307232055, + "language_loss": 0.86988914, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.89111328, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4453125, + "step": 6318, + "time_per_iteration": 2.4248201847076416 + }, + { + "auxiliary_loss_clip": 0.01068041, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.02798951, + "balance_loss_mlp": 1.02132106, + "epoch": 0.3799188336088982, + "flos": 18113951652480.0, + "grad_norm": 1.9416959683633455, + "language_loss": 0.78793848, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.80924296, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46875, + "step": 6319, + "time_per_iteration": 2.3741204738616943 + }, + { + "auxiliary_loss_clip": 0.01067666, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_clip": 1.03632212, + "balance_loss_mlp": 1.02185655, + "epoch": 0.3799789568615662, + "flos": 21870815679360.0, + "grad_norm": 4.013086761829991, + "language_loss": 0.65197408, + "learning_rate": 2.846932380444744e-06, + "loss": 0.67333198, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45898438, + "step": 6320, + "time_per_iteration": 2.4420294761657715 + }, + { + "auxiliary_loss_clip": 0.01066248, + "auxiliary_loss_mlp": 0.0105637, + "balance_loss_clip": 1.02528059, + "balance_loss_mlp": 1.02136958, + "epoch": 0.3800390801142342, + "flos": 32961570013440.0, + "grad_norm": 1.80919709956783, + "language_loss": 0.72368693, + "learning_rate": 2.846579546413992e-06, + "loss": 0.7449131, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44921875, + "step": 6321, + "time_per_iteration": 2.4924166202545166 + }, + { + "auxiliary_loss_clip": 0.01070213, + "auxiliary_loss_mlp": 0.01053551, + "balance_loss_clip": 1.01955295, + "balance_loss_mlp": 1.02235079, + "epoch": 0.38009920336690217, + "flos": 26905849914240.0, + "grad_norm": 1.8886251520325552, + "language_loss": 0.76338959, + "learning_rate": 2.846226680280859e-06, + "loss": 0.7846272, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.47851562, + "step": 6322, + "time_per_iteration": 2.467200517654419 + }, + { + "auxiliary_loss_clip": 0.01066934, + "auxiliary_loss_mlp": 0.01049909, + "balance_loss_clip": 1.0186044, + "balance_loss_mlp": 1.02279329, + "epoch": 0.38015932661957014, + "flos": 22487905693440.0, + "grad_norm": 2.3352981935432036, + "language_loss": 0.86752677, + "learning_rate": 2.845873782058725e-06, + "loss": 0.88869512, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.44140625, + "step": 6323, + "time_per_iteration": 2.391192674636841 + }, + { + "auxiliary_loss_clip": 0.01070798, + "auxiliary_loss_mlp": 0.01053538, + "balance_loss_clip": 1.01794207, + "balance_loss_mlp": 1.02416015, + "epoch": 0.3802194498722381, + "flos": 21979919278080.0, + "grad_norm": 1.8540548598746636, + "language_loss": 0.7446003, + "learning_rate": 2.845520851760973e-06, + "loss": 0.76584363, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.46679688, + "step": 6324, + "time_per_iteration": 2.4184179306030273 + }, + { + "auxiliary_loss_clip": 0.0107192, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.01196134, + "balance_loss_mlp": 1.02524018, + "epoch": 0.38027957312490607, + "flos": 21323796497280.0, + "grad_norm": 1.642924366701546, + "language_loss": 0.85864735, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.87980992, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46679688, + "step": 6325, + "time_per_iteration": 2.4248766899108887 + }, + { + "auxiliary_loss_clip": 0.01070572, + "auxiliary_loss_mlp": 0.01047374, + "balance_loss_clip": 1.01733363, + "balance_loss_mlp": 1.02503896, + "epoch": 0.38033969637757403, + "flos": 16690298342400.0, + "grad_norm": 1.7444616707184535, + "language_loss": 0.81557178, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.83675122, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45703125, + "step": 6326, + "time_per_iteration": 2.4223783016204834 + }, + { + "auxiliary_loss_clip": 0.01070561, + "auxiliary_loss_mlp": 0.01054069, + "balance_loss_clip": 1.02209687, + "balance_loss_mlp": 1.0242914, + "epoch": 0.380399819630242, + "flos": 36209365372800.0, + "grad_norm": 1.8677358331334701, + "language_loss": 0.7480104, + "learning_rate": 2.844461868547842e-06, + "loss": 0.76925671, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46289062, + "step": 6327, + "time_per_iteration": 2.545905590057373 + }, + { + "auxiliary_loss_clip": 0.01069646, + "auxiliary_loss_mlp": 0.01053487, + "balance_loss_clip": 1.02089477, + "balance_loss_mlp": 1.02468538, + "epoch": 0.38045994288290996, + "flos": 21287766107520.0, + "grad_norm": 1.5200753606149444, + "language_loss": 0.8396244, + "learning_rate": 2.844108810081459e-06, + "loss": 0.8608557, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44921875, + "step": 6328, + "time_per_iteration": 2.4522266387939453 + }, + { + "auxiliary_loss_clip": 0.0106837, + "auxiliary_loss_mlp": 0.01046664, + "balance_loss_clip": 1.0169692, + "balance_loss_mlp": 1.02366161, + "epoch": 0.38052006613557793, + "flos": 20921841480960.0, + "grad_norm": 1.7701961905439585, + "language_loss": 0.62636852, + "learning_rate": 2.843755719606385e-06, + "loss": 0.64751887, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44726562, + "step": 6329, + "time_per_iteration": 2.403658151626587 + }, + { + "auxiliary_loss_clip": 0.01069017, + "auxiliary_loss_mlp": 0.01054443, + "balance_loss_clip": 1.02220857, + "balance_loss_mlp": 1.0234493, + "epoch": 0.3805801893882459, + "flos": 20989817631360.0, + "grad_norm": 1.818396707629304, + "language_loss": 0.56721675, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58845139, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.45507812, + "step": 6330, + "time_per_iteration": 2.4120686054229736 + }, + { + "auxiliary_loss_clip": 0.0106849, + "auxiliary_loss_mlp": 0.01051497, + "balance_loss_clip": 1.02301836, + "balance_loss_mlp": 1.02549982, + "epoch": 0.38064031264091386, + "flos": 25557364494720.0, + "grad_norm": 5.9765305694747735, + "language_loss": 0.66909832, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.6902982, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 6331, + "time_per_iteration": 2.466381549835205 + }, + { + "auxiliary_loss_clip": 0.01070467, + "auxiliary_loss_mlp": 0.01064851, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.02485907, + "epoch": 0.3807004358935818, + "flos": 15084956983680.0, + "grad_norm": 1.5624522512316206, + "language_loss": 0.77715003, + "learning_rate": 2.842696256262919e-06, + "loss": 0.79850316, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.45703125, + "step": 6332, + "time_per_iteration": 2.4440338611602783 + }, + { + "auxiliary_loss_clip": 0.01068318, + "auxiliary_loss_mlp": 0.01054128, + "balance_loss_clip": 1.02175117, + "balance_loss_mlp": 1.02208877, + "epoch": 0.3807605591462498, + "flos": 16398459354240.0, + "grad_norm": 1.7021448613142254, + "language_loss": 0.83089787, + "learning_rate": 2.842343037886987e-06, + "loss": 0.85212231, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46289062, + "step": 6333, + "time_per_iteration": 2.412381172180176 + }, + { + "auxiliary_loss_clip": 0.01065961, + "auxiliary_loss_mlp": 0.0105219, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.02142429, + "epoch": 0.3808206823989178, + "flos": 29055871393920.0, + "grad_norm": 2.0185880754559298, + "language_loss": 0.87206191, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.89324343, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 6334, + "time_per_iteration": 2.506929874420166 + }, + { + "auxiliary_loss_clip": 0.01067474, + "auxiliary_loss_mlp": 0.01053722, + "balance_loss_clip": 1.02126169, + "balance_loss_mlp": 1.0223372, + "epoch": 0.3808808056515858, + "flos": 15704944640640.0, + "grad_norm": 1.7016586030709253, + "language_loss": 0.80686384, + "learning_rate": 2.841636505323321e-06, + "loss": 0.82807589, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45117188, + "step": 6335, + "time_per_iteration": 2.3895583152770996 + }, + { + "auxiliary_loss_clip": 0.01067994, + "auxiliary_loss_mlp": 0.01047479, + "balance_loss_clip": 1.01454115, + "balance_loss_mlp": 1.02215719, + "epoch": 0.38094092890425374, + "flos": 20703529549440.0, + "grad_norm": 1.8434398832748637, + "language_loss": 0.74351168, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.76466638, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.45703125, + "step": 6336, + "time_per_iteration": 2.4231884479522705 + }, + { + "auxiliary_loss_clip": 0.01064282, + "auxiliary_loss_mlp": 0.01051236, + "balance_loss_clip": 1.02179146, + "balance_loss_mlp": 1.020666, + "epoch": 0.3810010521569217, + "flos": 20666905666560.0, + "grad_norm": 3.390208048128217, + "language_loss": 0.7059328, + "learning_rate": 2.840929845099894e-06, + "loss": 0.72708797, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43554688, + "step": 6337, + "time_per_iteration": 2.376833438873291 + }, + { + "auxiliary_loss_clip": 0.01065661, + "auxiliary_loss_mlp": 0.01048866, + "balance_loss_clip": 1.01853919, + "balance_loss_mlp": 1.02183032, + "epoch": 0.38106117540958967, + "flos": 31826404200960.0, + "grad_norm": 1.9207580719580042, + "language_loss": 0.64978153, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.67092681, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4375, + "step": 6338, + "time_per_iteration": 2.503746747970581 + }, + { + "auxiliary_loss_clip": 0.01066746, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.01907802, + "balance_loss_mlp": 1.02170396, + "epoch": 0.38112129866225763, + "flos": 16902012026880.0, + "grad_norm": 2.1501939640328933, + "language_loss": 0.70759183, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.72876358, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44921875, + "step": 6339, + "time_per_iteration": 2.398677349090576 + }, + { + "auxiliary_loss_clip": 0.01066686, + "auxiliary_loss_mlp": 0.01048542, + "balance_loss_clip": 1.01809633, + "balance_loss_mlp": 1.02118564, + "epoch": 0.3811814219149256, + "flos": 20886160204800.0, + "grad_norm": 2.252975005128591, + "language_loss": 0.70024508, + "learning_rate": 2.839869615637177e-06, + "loss": 0.7213974, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.45507812, + "step": 6340, + "time_per_iteration": 2.4322092533111572 + }, + { + "auxiliary_loss_clip": 0.0106751, + "auxiliary_loss_mlp": 0.01047294, + "balance_loss_clip": 1.01544166, + "balance_loss_mlp": 1.0207907, + "epoch": 0.38124154516759357, + "flos": 16689879406080.0, + "grad_norm": 1.9345164811928526, + "language_loss": 0.90834975, + "learning_rate": 2.839516142102522e-06, + "loss": 0.92949778, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46679688, + "step": 6341, + "time_per_iteration": 2.4129791259765625 + }, + { + "auxiliary_loss_clip": 0.01067292, + "auxiliary_loss_mlp": 0.01048122, + "balance_loss_clip": 1.01705551, + "balance_loss_mlp": 1.02125537, + "epoch": 0.38130166842026153, + "flos": 19680958471680.0, + "grad_norm": 1.6718687298097696, + "language_loss": 0.76592928, + "learning_rate": 2.83916263673333e-06, + "loss": 0.78708345, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4609375, + "step": 6342, + "time_per_iteration": 2.4632701873779297 + }, + { + "auxiliary_loss_clip": 0.01064351, + "auxiliary_loss_mlp": 0.01050392, + "balance_loss_clip": 1.01954079, + "balance_loss_mlp": 1.01911342, + "epoch": 0.3813617916729295, + "flos": 22197393336960.0, + "grad_norm": 1.7168407437481723, + "language_loss": 0.8495968, + "learning_rate": 2.838809099543007e-06, + "loss": 0.87074423, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.453125, + "step": 6343, + "time_per_iteration": 2.4077558517456055 + }, + { + "auxiliary_loss_clip": 0.01064875, + "auxiliary_loss_mlp": 0.01050682, + "balance_loss_clip": 1.02037871, + "balance_loss_mlp": 1.0199101, + "epoch": 0.38142191492559746, + "flos": 19095953863680.0, + "grad_norm": 1.4913530285958556, + "language_loss": 0.77976036, + "learning_rate": 2.838455530544959e-06, + "loss": 0.80091596, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44921875, + "step": 6344, + "time_per_iteration": 2.401063919067383 + }, + { + "auxiliary_loss_clip": 0.01066349, + "auxiliary_loss_mlp": 0.01049828, + "balance_loss_clip": 1.01799917, + "balance_loss_mlp": 1.02163112, + "epoch": 0.3814820381782654, + "flos": 24096598542720.0, + "grad_norm": 2.140447811961153, + "language_loss": 0.75411963, + "learning_rate": 2.838101929752593e-06, + "loss": 0.77528143, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44726562, + "step": 6345, + "time_per_iteration": 3.8701794147491455 + }, + { + "auxiliary_loss_clip": 0.01065358, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_clip": 1.01690507, + "balance_loss_mlp": 1.0220145, + "epoch": 0.3815421614309334, + "flos": 15777598913280.0, + "grad_norm": 3.3563392758260253, + "language_loss": 0.71975738, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.74086279, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.43359375, + "step": 6346, + "time_per_iteration": 3.846802234649658 + }, + { + "auxiliary_loss_clip": 0.01067732, + "auxiliary_loss_mlp": 0.01044372, + "balance_loss_clip": 1.01408148, + "balance_loss_mlp": 1.02243328, + "epoch": 0.38160228468360136, + "flos": 19898781644160.0, + "grad_norm": 1.8221601250934318, + "language_loss": 0.76291704, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78403813, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.453125, + "step": 6347, + "time_per_iteration": 2.411039113998413 + }, + { + "auxiliary_loss_clip": 0.01067187, + "auxiliary_loss_mlp": 0.01052115, + "balance_loss_clip": 1.02054811, + "balance_loss_mlp": 1.02209723, + "epoch": 0.3816624079362694, + "flos": 19280050796160.0, + "grad_norm": 1.6194568762565813, + "language_loss": 0.75829351, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.77948654, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45117188, + "step": 6348, + "time_per_iteration": 2.382446765899658 + }, + { + "auxiliary_loss_clip": 0.01066113, + "auxiliary_loss_mlp": 0.01044759, + "balance_loss_clip": 1.0140264, + "balance_loss_mlp": 1.02146482, + "epoch": 0.38172253118893734, + "flos": 21176532915840.0, + "grad_norm": 1.827558476096686, + "language_loss": 0.88350892, + "learning_rate": 2.836687208908142e-06, + "loss": 0.90461761, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4453125, + "step": 6349, + "time_per_iteration": 2.384216785430908 + }, + { + "auxiliary_loss_clip": 0.01065671, + "auxiliary_loss_mlp": 0.01055158, + "balance_loss_clip": 1.02264929, + "balance_loss_mlp": 1.02216959, + "epoch": 0.3817826544416053, + "flos": 17528283728640.0, + "grad_norm": 1.8437953100507407, + "language_loss": 0.78058434, + "learning_rate": 2.836333449345341e-06, + "loss": 0.80179262, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.43359375, + "step": 6350, + "time_per_iteration": 3.775536298751831 + }, + { + "auxiliary_loss_clip": 0.01068095, + "auxiliary_loss_mlp": 0.01048643, + "balance_loss_clip": 1.01800644, + "balance_loss_mlp": 1.02307534, + "epoch": 0.38184277769427327, + "flos": 16325595613440.0, + "grad_norm": 2.028221769568398, + "language_loss": 0.76954812, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.79071558, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44921875, + "step": 6351, + "time_per_iteration": 3.7839794158935547 + }, + { + "auxiliary_loss_clip": 0.01068139, + "auxiliary_loss_mlp": 0.01053785, + "balance_loss_clip": 1.02181244, + "balance_loss_mlp": 1.02281117, + "epoch": 0.38190290094694124, + "flos": 30442202593920.0, + "grad_norm": 1.9078827029400325, + "language_loss": 0.76561964, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.78683889, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.453125, + "step": 6352, + "time_per_iteration": 2.495309829711914 + }, + { + "auxiliary_loss_clip": 0.01064367, + "auxiliary_loss_mlp": 0.01039388, + "balance_loss_clip": 1.0118866, + "balance_loss_mlp": 1.02204013, + "epoch": 0.3819630241996092, + "flos": 14209055994240.0, + "grad_norm": 1.8314766206282933, + "language_loss": 0.6526643, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.67370188, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.42382812, + "step": 6353, + "time_per_iteration": 2.3915152549743652 + }, + { + "auxiliary_loss_clip": 0.01064658, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.01567388, + "balance_loss_mlp": 1.02162826, + "epoch": 0.38202314745227717, + "flos": 25008529921920.0, + "grad_norm": 8.083305430002785, + "language_loss": 0.84286642, + "learning_rate": 2.834918094089816e-06, + "loss": 0.86396009, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4296875, + "step": 6354, + "time_per_iteration": 2.4214165210723877 + }, + { + "auxiliary_loss_clip": 0.01064604, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.01474345, + "balance_loss_mlp": 1.02247119, + "epoch": 0.38208327070494513, + "flos": 20813436109440.0, + "grad_norm": 1.7648439249317445, + "language_loss": 0.81733942, + "learning_rate": 2.834564176091943e-06, + "loss": 0.83839804, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 6355, + "time_per_iteration": 2.4377002716064453 + }, + { + "auxiliary_loss_clip": 0.01066532, + "auxiliary_loss_mlp": 0.01047284, + "balance_loss_clip": 1.01698065, + "balance_loss_mlp": 1.0225842, + "epoch": 0.3821433939576131, + "flos": 22636635552000.0, + "grad_norm": 1.9760656026346997, + "language_loss": 0.76074558, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.78188372, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43945312, + "step": 6356, + "time_per_iteration": 2.414358377456665 + }, + { + "auxiliary_loss_clip": 0.01064973, + "auxiliary_loss_mlp": 0.01044075, + "balance_loss_clip": 1.01422513, + "balance_loss_mlp": 1.02153397, + "epoch": 0.38220351721028106, + "flos": 26868667449600.0, + "grad_norm": 1.9228274079777237, + "language_loss": 0.82766181, + "learning_rate": 2.833856245169348e-06, + "loss": 0.84875226, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43359375, + "step": 6357, + "time_per_iteration": 2.4571309089660645 + }, + { + "auxiliary_loss_clip": 0.01066866, + "auxiliary_loss_mlp": 0.0105301, + "balance_loss_clip": 1.01750898, + "balance_loss_mlp": 1.02091551, + "epoch": 0.38226364046294903, + "flos": 23366355212160.0, + "grad_norm": 2.1738762692089484, + "language_loss": 0.7922101, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.81340885, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.45898438, + "step": 6358, + "time_per_iteration": 2.3965306282043457 + }, + { + "auxiliary_loss_clip": 0.01064735, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.02019906, + "balance_loss_mlp": 1.02125216, + "epoch": 0.382323763715617, + "flos": 19645207372800.0, + "grad_norm": 2.7366478665744753, + "language_loss": 0.8048383, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.82598734, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43554688, + "step": 6359, + "time_per_iteration": 2.4002082347869873 + }, + { + "auxiliary_loss_clip": 0.0106536, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_clip": 1.01751685, + "balance_loss_mlp": 1.02070129, + "epoch": 0.38238388696828496, + "flos": 54122776842240.0, + "grad_norm": 1.6147639843334827, + "language_loss": 0.70607865, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.72722685, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 6360, + "time_per_iteration": 2.681946039199829 + }, + { + "auxiliary_loss_clip": 0.01063898, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.01103926, + "balance_loss_mlp": 1.01981962, + "epoch": 0.382444010220953, + "flos": 24935037776640.0, + "grad_norm": 1.814162532659825, + "language_loss": 0.79235154, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81338835, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.44140625, + "step": 6361, + "time_per_iteration": 2.4558351039886475 + }, + { + "auxiliary_loss_clip": 0.01061235, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_clip": 1.01630092, + "balance_loss_mlp": 1.02028894, + "epoch": 0.38250413347362094, + "flos": 42335784708480.0, + "grad_norm": 1.486384485879323, + "language_loss": 0.66602397, + "learning_rate": 2.832085864749337e-06, + "loss": 0.68707472, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 6362, + "time_per_iteration": 2.599665403366089 + }, + { + "auxiliary_loss_clip": 0.01061194, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.01136279, + "balance_loss_mlp": 1.01952636, + "epoch": 0.3825642567262889, + "flos": 16288308414720.0, + "grad_norm": 1.8905823481182726, + "language_loss": 0.83128035, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.85231626, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.41601562, + "step": 6363, + "time_per_iteration": 2.4060213565826416 + }, + { + "auxiliary_loss_clip": 0.01059851, + "auxiliary_loss_mlp": 0.01044997, + "balance_loss_clip": 1.01452672, + "balance_loss_mlp": 1.01922321, + "epoch": 0.3826243799789569, + "flos": 45653197052160.0, + "grad_norm": 1.693829331783628, + "language_loss": 0.6074121, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.62846053, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.40625, + "step": 6364, + "time_per_iteration": 2.6088473796844482 + }, + { + "auxiliary_loss_clip": 0.0106466, + "auxiliary_loss_mlp": 0.01048511, + "balance_loss_clip": 1.01723003, + "balance_loss_mlp": 1.02094114, + "epoch": 0.38268450323162484, + "flos": 25300403821440.0, + "grad_norm": 2.46118969117612, + "language_loss": 0.69980389, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.72093558, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4375, + "step": 6365, + "time_per_iteration": 2.455159902572632 + }, + { + "auxiliary_loss_clip": 0.01064206, + "auxiliary_loss_mlp": 0.01049877, + "balance_loss_clip": 1.01779735, + "balance_loss_mlp": 1.01979184, + "epoch": 0.3827446264842928, + "flos": 21834924935040.0, + "grad_norm": 2.09147588833453, + "language_loss": 0.74673223, + "learning_rate": 2.830668992382758e-06, + "loss": 0.76787305, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 6366, + "time_per_iteration": 2.421412944793701 + }, + { + "auxiliary_loss_clip": 0.01063964, + "auxiliary_loss_mlp": 0.0104503, + "balance_loss_clip": 1.01377344, + "balance_loss_mlp": 1.02028751, + "epoch": 0.38280474973696077, + "flos": 25733536548480.0, + "grad_norm": 2.709213940654289, + "language_loss": 0.70052671, + "learning_rate": 2.830314695509902e-06, + "loss": 0.72161663, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4375, + "step": 6367, + "time_per_iteration": 2.4637439250946045 + }, + { + "auxiliary_loss_clip": 0.01061162, + "auxiliary_loss_mlp": 0.01048802, + "balance_loss_clip": 1.01737869, + "balance_loss_mlp": 1.01924586, + "epoch": 0.38286487298962874, + "flos": 24894887846400.0, + "grad_norm": 2.06428370367374, + "language_loss": 0.65331328, + "learning_rate": 2.82996036715143e-06, + "loss": 0.67441297, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.41796875, + "step": 6368, + "time_per_iteration": 2.4155375957489014 + }, + { + "auxiliary_loss_clip": 0.01062633, + "auxiliary_loss_mlp": 0.01053532, + "balance_loss_clip": 1.02229881, + "balance_loss_mlp": 1.01918411, + "epoch": 0.3829249962422967, + "flos": 28542578451840.0, + "grad_norm": 1.3597876252514884, + "language_loss": 0.69298339, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.71414495, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43359375, + "step": 6369, + "time_per_iteration": 2.462728977203369 + }, + { + "auxiliary_loss_clip": 0.01064682, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_clip": 1.01481915, + "balance_loss_mlp": 1.02048624, + "epoch": 0.38298511949496467, + "flos": 21470117472000.0, + "grad_norm": 1.7324415173371763, + "language_loss": 0.79319, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.8143056, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44140625, + "step": 6370, + "time_per_iteration": 2.3887529373168945 + }, + { + "auxiliary_loss_clip": 0.01062374, + "auxiliary_loss_mlp": 0.01052446, + "balance_loss_clip": 1.02076054, + "balance_loss_mlp": 1.01989603, + "epoch": 0.38304524274763263, + "flos": 31678826417280.0, + "grad_norm": 3.0926876599712254, + "language_loss": 0.65513623, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.67628443, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42382812, + "step": 6371, + "time_per_iteration": 2.457451343536377 + }, + { + "auxiliary_loss_clip": 0.01067739, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.0241611, + "balance_loss_mlp": 1.02112949, + "epoch": 0.3831053660003006, + "flos": 25075807845120.0, + "grad_norm": 2.6250462774194467, + "language_loss": 0.73929954, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.76055205, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.46679688, + "step": 6372, + "time_per_iteration": 2.4331464767456055 + }, + { + "auxiliary_loss_clip": 0.01064349, + "auxiliary_loss_mlp": 0.01046202, + "balance_loss_clip": 1.01592314, + "balance_loss_mlp": 1.01972926, + "epoch": 0.38316548925296856, + "flos": 23257880017920.0, + "grad_norm": 1.7543730173292114, + "language_loss": 0.86057878, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.8816843, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4453125, + "step": 6373, + "time_per_iteration": 2.413870096206665 + }, + { + "auxiliary_loss_clip": 0.01066679, + "auxiliary_loss_mlp": 0.01056717, + "balance_loss_clip": 1.02521038, + "balance_loss_mlp": 1.02103448, + "epoch": 0.3832256125056366, + "flos": 34422021763200.0, + "grad_norm": 1.9653266640598404, + "language_loss": 0.75857466, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77980864, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45703125, + "step": 6374, + "time_per_iteration": 2.4860427379608154 + }, + { + "auxiliary_loss_clip": 0.01067166, + "auxiliary_loss_mlp": 0.01052605, + "balance_loss_clip": 1.02022767, + "balance_loss_mlp": 1.0206573, + "epoch": 0.38328573575830455, + "flos": 21761677169280.0, + "grad_norm": 2.3615810988998924, + "language_loss": 0.77953351, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.80073118, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.46484375, + "step": 6375, + "time_per_iteration": 2.4259936809539795 + }, + { + "auxiliary_loss_clip": 0.01065004, + "auxiliary_loss_mlp": 0.01043809, + "balance_loss_clip": 1.01389921, + "balance_loss_mlp": 1.02151334, + "epoch": 0.3833458590109725, + "flos": 17379169845120.0, + "grad_norm": 2.1398987950061974, + "language_loss": 0.7457844, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.76687258, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43554688, + "step": 6376, + "time_per_iteration": 2.3516883850097656 + }, + { + "auxiliary_loss_clip": 0.01061277, + "auxiliary_loss_mlp": 0.01048802, + "balance_loss_clip": 1.01885676, + "balance_loss_mlp": 1.01860666, + "epoch": 0.3834059822636405, + "flos": 29423262297600.0, + "grad_norm": 1.501776080683288, + "language_loss": 0.68600667, + "learning_rate": 2.826769997289796e-06, + "loss": 0.70710742, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.42578125, + "step": 6377, + "time_per_iteration": 2.4720356464385986 + }, + { + "auxiliary_loss_clip": 0.01064729, + "auxiliary_loss_mlp": 0.01048289, + "balance_loss_clip": 1.01610267, + "balance_loss_mlp": 1.01992869, + "epoch": 0.38346610551630844, + "flos": 21469663624320.0, + "grad_norm": 3.3393269197829283, + "language_loss": 0.74764234, + "learning_rate": 2.826415354814344e-06, + "loss": 0.76877254, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.44921875, + "step": 6378, + "time_per_iteration": 2.3663418292999268 + }, + { + "auxiliary_loss_clip": 0.01063408, + "auxiliary_loss_mlp": 0.0104844, + "balance_loss_clip": 1.01808965, + "balance_loss_mlp": 1.01897883, + "epoch": 0.3835262287689764, + "flos": 27560052570240.0, + "grad_norm": 1.8980896128201121, + "language_loss": 0.71280247, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.73392093, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4453125, + "step": 6379, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01062376, + "auxiliary_loss_mlp": 0.01048972, + "balance_loss_clip": 1.0188241, + "balance_loss_mlp": 1.01961577, + "epoch": 0.3835863520216444, + "flos": 15522802744320.0, + "grad_norm": 1.9968826945430231, + "language_loss": 0.84540856, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.86652207, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42773438, + "step": 6380, + "time_per_iteration": 2.365417957305908 + }, + { + "auxiliary_loss_clip": 0.01062078, + "auxiliary_loss_mlp": 0.01042791, + "balance_loss_clip": 1.01375151, + "balance_loss_mlp": 1.01904714, + "epoch": 0.38364647527431234, + "flos": 21903948426240.0, + "grad_norm": 1.449300061590241, + "language_loss": 0.82186115, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.84290981, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 6381, + "time_per_iteration": 2.434833288192749 + }, + { + "auxiliary_loss_clip": 0.01016491, + "auxiliary_loss_mlp": 0.01008767, + "balance_loss_clip": 1.00449908, + "balance_loss_mlp": 1.00632811, + "epoch": 0.3837065985269803, + "flos": 65531902798080.0, + "grad_norm": 0.8237663286722084, + "language_loss": 0.60560811, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62586069, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.1015625, + "step": 6382, + "time_per_iteration": 3.013780355453491 + }, + { + "auxiliary_loss_clip": 0.01066526, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_clip": 1.01561022, + "balance_loss_mlp": 1.01986933, + "epoch": 0.38376672177964827, + "flos": 28255347763200.0, + "grad_norm": 2.3073193012700344, + "language_loss": 0.68002689, + "learning_rate": 2.824641672639794e-06, + "loss": 0.7011615, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.46679688, + "step": 6383, + "time_per_iteration": 2.475337266921997 + }, + { + "auxiliary_loss_clip": 0.01065204, + "auxiliary_loss_mlp": 0.01049381, + "balance_loss_clip": 1.01814806, + "balance_loss_mlp": 1.020123, + "epoch": 0.38382684503231623, + "flos": 20630316695040.0, + "grad_norm": 2.0372074969148035, + "language_loss": 0.76438439, + "learning_rate": 2.824286842339587e-06, + "loss": 0.78553027, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.45117188, + "step": 6384, + "time_per_iteration": 2.4091949462890625 + }, + { + "auxiliary_loss_clip": 0.01062888, + "auxiliary_loss_mlp": 0.01045624, + "balance_loss_clip": 1.01433158, + "balance_loss_mlp": 1.01988971, + "epoch": 0.3838869682849842, + "flos": 19604917797120.0, + "grad_norm": 1.7954730707304243, + "language_loss": 0.77515501, + "learning_rate": 2.823931980782341e-06, + "loss": 0.79624009, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4296875, + "step": 6385, + "time_per_iteration": 3.9195058345794678 + }, + { + "auxiliary_loss_clip": 0.01016162, + "auxiliary_loss_mlp": 0.01005397, + "balance_loss_clip": 1.0012958, + "balance_loss_mlp": 1.00570953, + "epoch": 0.38394709153765216, + "flos": 56553428897280.0, + "grad_norm": 0.8940279882877806, + "language_loss": 0.67136705, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69158268, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.10449219, + "step": 6386, + "time_per_iteration": 4.315301895141602 + }, + { + "auxiliary_loss_clip": 0.0106181, + "auxiliary_loss_mlp": 0.01049645, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.0198437, + "epoch": 0.3840072147903202, + "flos": 15887819675520.0, + "grad_norm": 1.96451650797348, + "language_loss": 0.73763233, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.75874692, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41992188, + "step": 6387, + "time_per_iteration": 2.3827855587005615 + }, + { + "auxiliary_loss_clip": 0.01063424, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.02053678, + "balance_loss_mlp": 1.02133405, + "epoch": 0.38406733804298815, + "flos": 28216838666880.0, + "grad_norm": 1.6233387617232438, + "language_loss": 0.82276624, + "learning_rate": 2.822867208702932e-06, + "loss": 0.84390819, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.421875, + "step": 6388, + "time_per_iteration": 2.490025043487549 + }, + { + "auxiliary_loss_clip": 0.0106012, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.01877344, + "balance_loss_mlp": 1.0183084, + "epoch": 0.3841274612956561, + "flos": 18222601403520.0, + "grad_norm": 1.7551562563109815, + "language_loss": 0.77066195, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.79172796, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 6389, + "time_per_iteration": 2.3614163398742676 + }, + { + "auxiliary_loss_clip": 0.01066274, + "auxiliary_loss_mlp": 0.01055613, + "balance_loss_clip": 1.02235329, + "balance_loss_mlp": 1.02040482, + "epoch": 0.3841875845483241, + "flos": 19791842549760.0, + "grad_norm": 1.6580719878320376, + "language_loss": 0.77902436, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.80024326, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.45898438, + "step": 6390, + "time_per_iteration": 3.7473785877227783 + }, + { + "auxiliary_loss_clip": 0.01064379, + "auxiliary_loss_mlp": 0.0105425, + "balance_loss_clip": 1.01996541, + "balance_loss_mlp": 1.0193913, + "epoch": 0.38424770780099204, + "flos": 29897522472960.0, + "grad_norm": 2.1421026135593744, + "language_loss": 0.71170557, + "learning_rate": 2.821802155794668e-06, + "loss": 0.73289186, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.44921875, + "step": 6391, + "time_per_iteration": 3.989543914794922 + }, + { + "auxiliary_loss_clip": 0.0106406, + "auxiliary_loss_mlp": 0.01056971, + "balance_loss_clip": 1.02185225, + "balance_loss_mlp": 1.01903033, + "epoch": 0.38430783105366, + "flos": 20812668059520.0, + "grad_norm": 1.9508558970128071, + "language_loss": 0.8540138, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.87522411, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.44921875, + "step": 6392, + "time_per_iteration": 2.444394111633301 + }, + { + "auxiliary_loss_clip": 0.01062602, + "auxiliary_loss_mlp": 0.0104663, + "balance_loss_clip": 1.01649404, + "balance_loss_mlp": 1.01866114, + "epoch": 0.384367954306328, + "flos": 10997814695040.0, + "grad_norm": 2.188232238702554, + "language_loss": 0.63300234, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.65409464, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43945312, + "step": 6393, + "time_per_iteration": 2.399193286895752 + }, + { + "auxiliary_loss_clip": 0.01067034, + "auxiliary_loss_mlp": 0.01053708, + "balance_loss_clip": 1.01851773, + "balance_loss_mlp": 1.01971102, + "epoch": 0.38442807755899594, + "flos": 25336853147520.0, + "grad_norm": 2.198181211409453, + "language_loss": 0.7290647, + "learning_rate": 2.820736822421029e-06, + "loss": 0.75027215, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.47265625, + "step": 6394, + "time_per_iteration": 2.3978912830352783 + }, + { + "auxiliary_loss_clip": 0.01068591, + "auxiliary_loss_mlp": 0.01057534, + "balance_loss_clip": 1.02053189, + "balance_loss_mlp": 1.02128005, + "epoch": 0.3844882008116639, + "flos": 21068686126080.0, + "grad_norm": 7.7055853384041555, + "language_loss": 0.8282401, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.84950137, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.47265625, + "step": 6395, + "time_per_iteration": 2.412632703781128 + }, + { + "auxiliary_loss_clip": 0.01065883, + "auxiliary_loss_mlp": 0.01051509, + "balance_loss_clip": 1.01827383, + "balance_loss_mlp": 1.02153254, + "epoch": 0.38454832406433187, + "flos": 17962393973760.0, + "grad_norm": 2.0689666648180225, + "language_loss": 0.72166789, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.74284184, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44335938, + "step": 6396, + "time_per_iteration": 2.3330984115600586 + }, + { + "auxiliary_loss_clip": 0.01015853, + "auxiliary_loss_mlp": 0.01013449, + "balance_loss_clip": 1.00860918, + "balance_loss_mlp": 1.00461471, + "epoch": 0.38460844731699984, + "flos": 67921392493440.0, + "grad_norm": 0.8987857639002326, + "language_loss": 0.59740496, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61769801, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.04833984, + "router_z_loss_mlp": 0.11230469, + "step": 6397, + "time_per_iteration": 3.1016032695770264 + }, + { + "auxiliary_loss_clip": 0.01064135, + "auxiliary_loss_mlp": 0.0104959, + "balance_loss_clip": 1.01556718, + "balance_loss_mlp": 1.01905894, + "epoch": 0.3846685705696678, + "flos": 25847876851200.0, + "grad_norm": 1.7570372434564208, + "language_loss": 0.86077309, + "learning_rate": 2.819315942271794e-06, + "loss": 0.88191026, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.45117188, + "step": 6398, + "time_per_iteration": 2.4448041915893555 + }, + { + "auxiliary_loss_clip": 0.01062352, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.01318753, + "balance_loss_mlp": 1.01913118, + "epoch": 0.38472869382233577, + "flos": 16289251021440.0, + "grad_norm": 1.9322418579900946, + "language_loss": 0.81198251, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.83302867, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43164062, + "step": 6399, + "time_per_iteration": 2.422811269760132 + }, + { + "auxiliary_loss_clip": 0.01065067, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.01401043, + "balance_loss_mlp": 1.02025008, + "epoch": 0.38478881707500373, + "flos": 19352146487040.0, + "grad_norm": 2.0272011162936083, + "language_loss": 0.69000775, + "learning_rate": 2.818605315732038e-06, + "loss": 0.7111361, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.44921875, + "step": 6400, + "time_per_iteration": 2.388669490814209 + }, + { + "auxiliary_loss_clip": 0.01067918, + "auxiliary_loss_mlp": 0.01052425, + "balance_loss_clip": 1.02050054, + "balance_loss_mlp": 1.02183008, + "epoch": 0.38484894032767175, + "flos": 24859765152000.0, + "grad_norm": 1.9174719971968093, + "language_loss": 0.74230522, + "learning_rate": 2.81824995589303e-06, + "loss": 0.76350868, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4609375, + "step": 6401, + "time_per_iteration": 2.443650245666504 + }, + { + "auxiliary_loss_clip": 0.01063204, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.01654434, + "balance_loss_mlp": 1.01913059, + "epoch": 0.3849090635803397, + "flos": 14500929893760.0, + "grad_norm": 1.833380167793557, + "language_loss": 0.74343777, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.76457405, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.44140625, + "step": 6402, + "time_per_iteration": 2.4122135639190674 + }, + { + "auxiliary_loss_clip": 0.01061513, + "auxiliary_loss_mlp": 0.01049036, + "balance_loss_clip": 1.01820862, + "balance_loss_mlp": 1.0192889, + "epoch": 0.3849691868330077, + "flos": 18514859328000.0, + "grad_norm": 2.307765745820898, + "language_loss": 0.83897817, + "learning_rate": 2.817539143144128e-06, + "loss": 0.8600837, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.421875, + "step": 6403, + "time_per_iteration": 2.475731611251831 + }, + { + "auxiliary_loss_clip": 0.0106236, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.01510513, + "balance_loss_mlp": 1.01974082, + "epoch": 0.38502931008567565, + "flos": 21615321283200.0, + "grad_norm": 2.00314150572524, + "language_loss": 0.84062684, + "learning_rate": 2.817183690261189e-06, + "loss": 0.8617124, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.42578125, + "step": 6404, + "time_per_iteration": 2.3955297470092773 + }, + { + "auxiliary_loss_clip": 0.01064429, + "auxiliary_loss_mlp": 0.0104549, + "balance_loss_clip": 1.01425719, + "balance_loss_mlp": 1.01970232, + "epoch": 0.3850894333383436, + "flos": 25414045896960.0, + "grad_norm": 1.5542454540892872, + "language_loss": 0.71014583, + "learning_rate": 2.816828206390563e-06, + "loss": 0.73124504, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.44726562, + "step": 6405, + "time_per_iteration": 2.5049026012420654 + }, + { + "auxiliary_loss_clip": 0.01062797, + "auxiliary_loss_mlp": 0.01047251, + "balance_loss_clip": 1.0183785, + "balance_loss_mlp": 1.01973033, + "epoch": 0.3851495565910116, + "flos": 20226895401600.0, + "grad_norm": 2.3360123992548116, + "language_loss": 0.80900347, + "learning_rate": 2.816472691545729e-06, + "loss": 0.83010393, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43164062, + "step": 6406, + "time_per_iteration": 2.419074296951294 + }, + { + "auxiliary_loss_clip": 0.01064878, + "auxiliary_loss_mlp": 0.01048844, + "balance_loss_clip": 1.01584697, + "balance_loss_mlp": 1.02037525, + "epoch": 0.38520967984367954, + "flos": 16507528041600.0, + "grad_norm": 2.201573896813602, + "language_loss": 0.86056358, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.88170075, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4453125, + "step": 6407, + "time_per_iteration": 2.4199435710906982 + }, + { + "auxiliary_loss_clip": 0.0101579, + "auxiliary_loss_mlp": 0.01010922, + "balance_loss_clip": 1.00696397, + "balance_loss_mlp": 1.00430369, + "epoch": 0.3852698030963475, + "flos": 61310553776640.0, + "grad_norm": 0.8598289708519227, + "language_loss": 0.64987934, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67014647, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.11523438, + "step": 6408, + "time_per_iteration": 3.090538501739502 + }, + { + "auxiliary_loss_clip": 0.01065761, + "auxiliary_loss_mlp": 0.01052847, + "balance_loss_clip": 1.02104235, + "balance_loss_mlp": 1.02089024, + "epoch": 0.3853299263490155, + "flos": 22891920480000.0, + "grad_norm": 1.4716750777352787, + "language_loss": 0.742993, + "learning_rate": 2.8154059613008e-06, + "loss": 0.76417911, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44921875, + "step": 6409, + "time_per_iteration": 2.455472469329834 + }, + { + "auxiliary_loss_clip": 0.01067954, + "auxiliary_loss_mlp": 0.01058466, + "balance_loss_clip": 1.02577877, + "balance_loss_mlp": 1.02062023, + "epoch": 0.38539004960168344, + "flos": 20046464161920.0, + "grad_norm": 2.0554562544717947, + "language_loss": 0.72029173, + "learning_rate": 2.81505032269396e-06, + "loss": 0.74155593, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47265625, + "step": 6410, + "time_per_iteration": 2.3855443000793457 + }, + { + "auxiliary_loss_clip": 0.01015199, + "auxiliary_loss_mlp": 0.01004319, + "balance_loss_clip": 1.00038493, + "balance_loss_mlp": 1.00300407, + "epoch": 0.3854501728543514, + "flos": 68726978271360.0, + "grad_norm": 0.6849419074217806, + "language_loss": 0.60400832, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62420356, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.12207031, + "step": 6411, + "time_per_iteration": 3.1318328380584717 + }, + { + "auxiliary_loss_clip": 0.01063728, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_clip": 1.01506639, + "balance_loss_mlp": 1.02028, + "epoch": 0.38551029610701937, + "flos": 20483995720320.0, + "grad_norm": 2.010700687131897, + "language_loss": 0.79516947, + "learning_rate": 2.814338952773397e-06, + "loss": 0.81624091, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.43554688, + "step": 6412, + "time_per_iteration": 2.409669876098633 + }, + { + "auxiliary_loss_clip": 0.01067574, + "auxiliary_loss_mlp": 0.01053677, + "balance_loss_clip": 1.02034593, + "balance_loss_mlp": 1.02153361, + "epoch": 0.38557041935968733, + "flos": 23470815600000.0, + "grad_norm": 2.074192521123628, + "language_loss": 0.79563963, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.81685209, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.4609375, + "step": 6413, + "time_per_iteration": 2.4261488914489746 + }, + { + "auxiliary_loss_clip": 0.01014819, + "auxiliary_loss_mlp": 0.01006729, + "balance_loss_clip": 1.00269938, + "balance_loss_mlp": 1.00273275, + "epoch": 0.38563054261235535, + "flos": 63963639169920.0, + "grad_norm": 0.8078189058373881, + "language_loss": 0.61315024, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63336563, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.12109375, + "step": 6414, + "time_per_iteration": 2.9398422241210938 + }, + { + "auxiliary_loss_clip": 0.01066688, + "auxiliary_loss_mlp": 0.01055756, + "balance_loss_clip": 1.02538133, + "balance_loss_mlp": 1.02129841, + "epoch": 0.3856906658650233, + "flos": 23986657071360.0, + "grad_norm": 2.5914086214084824, + "language_loss": 0.79332387, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.81454837, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.453125, + "step": 6415, + "time_per_iteration": 2.402998924255371 + }, + { + "auxiliary_loss_clip": 0.01058947, + "auxiliary_loss_mlp": 0.01043443, + "balance_loss_clip": 1.01755035, + "balance_loss_mlp": 1.01924753, + "epoch": 0.3857507891176913, + "flos": 25006330506240.0, + "grad_norm": 1.6884870856894691, + "language_loss": 0.80835587, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82937974, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39648438, + "step": 6416, + "time_per_iteration": 2.4967572689056396 + }, + { + "auxiliary_loss_clip": 0.01063913, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.0198561, + "balance_loss_mlp": 1.02030325, + "epoch": 0.38581091237035925, + "flos": 21535894206720.0, + "grad_norm": 2.0235730532194913, + "language_loss": 0.80235428, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.82348126, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43554688, + "step": 6417, + "time_per_iteration": 2.37949538230896 + }, + { + "auxiliary_loss_clip": 0.01062091, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.02047443, + "balance_loss_mlp": 1.01885247, + "epoch": 0.3858710356230272, + "flos": 17382940272000.0, + "grad_norm": 7.623268048276753, + "language_loss": 0.82308674, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.84419012, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.43164062, + "step": 6418, + "time_per_iteration": 2.4818711280822754 + }, + { + "auxiliary_loss_clip": 0.01061157, + "auxiliary_loss_mlp": 0.01044826, + "balance_loss_clip": 1.01573896, + "balance_loss_mlp": 1.01881564, + "epoch": 0.3859311588756952, + "flos": 20338547529600.0, + "grad_norm": 1.6871021047957437, + "language_loss": 0.81322068, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.83428049, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42382812, + "step": 6419, + "time_per_iteration": 2.396074056625366 + }, + { + "auxiliary_loss_clip": 0.010614, + "auxiliary_loss_mlp": 0.01048545, + "balance_loss_clip": 1.01662064, + "balance_loss_mlp": 1.0191462, + "epoch": 0.38599128212836314, + "flos": 26320007433600.0, + "grad_norm": 2.1384364006340584, + "language_loss": 0.69733268, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.71843207, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.421875, + "step": 6420, + "time_per_iteration": 2.5066535472869873 + }, + { + "auxiliary_loss_clip": 0.01060387, + "auxiliary_loss_mlp": 0.01046765, + "balance_loss_clip": 1.0205276, + "balance_loss_mlp": 1.0198915, + "epoch": 0.3860514053810311, + "flos": 13552968124800.0, + "grad_norm": 2.0898081661168764, + "language_loss": 0.82438052, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.84545207, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 6421, + "time_per_iteration": 2.3562872409820557 + }, + { + "auxiliary_loss_clip": 0.0106149, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.01444244, + "balance_loss_mlp": 1.01890802, + "epoch": 0.3861115286336991, + "flos": 20953368305280.0, + "grad_norm": 2.0316399267435936, + "language_loss": 0.73737895, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.758412, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42578125, + "step": 6422, + "time_per_iteration": 2.397467851638794 + }, + { + "auxiliary_loss_clip": 0.0105944, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.01993275, + "balance_loss_mlp": 1.01859713, + "epoch": 0.38617165188636704, + "flos": 16361765648640.0, + "grad_norm": 1.5579358128845253, + "language_loss": 0.68014836, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.7012136, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40820312, + "step": 6423, + "time_per_iteration": 2.3771145343780518 + }, + { + "auxiliary_loss_clip": 0.01063185, + "auxiliary_loss_mlp": 0.01054819, + "balance_loss_clip": 1.02616072, + "balance_loss_mlp": 1.01971579, + "epoch": 0.386231775139035, + "flos": 34785851708160.0, + "grad_norm": 1.7898353472915558, + "language_loss": 0.70504463, + "learning_rate": 2.810068143123449e-06, + "loss": 0.72622466, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 6424, + "time_per_iteration": 2.4902095794677734 + }, + { + "auxiliary_loss_clip": 0.01060568, + "auxiliary_loss_mlp": 0.01048735, + "balance_loss_clip": 1.02024412, + "balance_loss_mlp": 1.01890373, + "epoch": 0.38629189839170297, + "flos": 21725088197760.0, + "grad_norm": 1.4461908066186457, + "language_loss": 0.7344408, + "learning_rate": 2.809712042331429e-06, + "loss": 0.75553381, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41601562, + "step": 6425, + "time_per_iteration": 5.260988473892212 + }, + { + "auxiliary_loss_clip": 0.01065668, + "auxiliary_loss_mlp": 0.01047084, + "balance_loss_clip": 1.01642299, + "balance_loss_mlp": 1.02040005, + "epoch": 0.38635202164437094, + "flos": 27922520972160.0, + "grad_norm": 2.347155200877132, + "language_loss": 0.81012344, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.83125091, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.453125, + "step": 6426, + "time_per_iteration": 2.481254816055298 + }, + { + "auxiliary_loss_clip": 0.01062005, + "auxiliary_loss_mlp": 0.01045331, + "balance_loss_clip": 1.01635098, + "balance_loss_mlp": 1.01971936, + "epoch": 0.38641214489703896, + "flos": 23585505016320.0, + "grad_norm": 2.049402789011015, + "language_loss": 0.76495999, + "learning_rate": 2.80899974864781e-06, + "loss": 0.78603327, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.421875, + "step": 6427, + "time_per_iteration": 2.4035422801971436 + }, + { + "auxiliary_loss_clip": 0.01060366, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_clip": 1.01767468, + "balance_loss_mlp": 1.01917183, + "epoch": 0.3864722681497069, + "flos": 12640408341120.0, + "grad_norm": 2.0717810882806846, + "language_loss": 0.70686519, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72792786, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 6428, + "time_per_iteration": 2.4081335067749023 + }, + { + "auxiliary_loss_clip": 0.01063402, + "auxiliary_loss_mlp": 0.01048087, + "balance_loss_clip": 1.01990557, + "balance_loss_mlp": 1.02087831, + "epoch": 0.3865323914023749, + "flos": 17598075269760.0, + "grad_norm": 1.9747152575837872, + "language_loss": 0.85430783, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.87542272, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42578125, + "step": 6429, + "time_per_iteration": 2.3632004261016846 + }, + { + "auxiliary_loss_clip": 0.01062031, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.0148114, + "balance_loss_mlp": 1.02033734, + "epoch": 0.38659251465504285, + "flos": 18477956154240.0, + "grad_norm": 1.9152502264010145, + "language_loss": 0.82584089, + "learning_rate": 2.807931078076015e-06, + "loss": 0.8468926, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41601562, + "step": 6430, + "time_per_iteration": 3.9454731941223145 + }, + { + "auxiliary_loss_clip": 0.01017662, + "auxiliary_loss_mlp": 0.01005693, + "balance_loss_clip": 1.00166416, + "balance_loss_mlp": 1.00750947, + "epoch": 0.3866526379077108, + "flos": 64162259228160.0, + "grad_norm": 0.7214581572232176, + "language_loss": 0.58885682, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60909033, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.1015625, + "step": 6431, + "time_per_iteration": 3.095679759979248 + }, + { + "auxiliary_loss_clip": 0.01063823, + "auxiliary_loss_mlp": 0.01049117, + "balance_loss_clip": 1.0164777, + "balance_loss_mlp": 1.01969934, + "epoch": 0.3867127611603788, + "flos": 14387532197760.0, + "grad_norm": 1.9066484634153322, + "language_loss": 0.79923284, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.82036221, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44140625, + "step": 6432, + "time_per_iteration": 2.4057750701904297 + }, + { + "auxiliary_loss_clip": 0.01065408, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.01394439, + "balance_loss_mlp": 1.02040148, + "epoch": 0.38677288441304675, + "flos": 20009735544960.0, + "grad_norm": 2.1716257499306804, + "language_loss": 0.82504368, + "learning_rate": 2.806862131772779e-06, + "loss": 0.84612989, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44921875, + "step": 6433, + "time_per_iteration": 2.4107301235198975 + }, + { + "auxiliary_loss_clip": 0.01063587, + "auxiliary_loss_mlp": 0.01050401, + "balance_loss_clip": 1.01727307, + "balance_loss_mlp": 1.02024972, + "epoch": 0.3868330076657147, + "flos": 22235797699200.0, + "grad_norm": 1.6875365109040825, + "language_loss": 0.72012973, + "learning_rate": 2.806505755127765e-06, + "loss": 0.74126959, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.43359375, + "step": 6434, + "time_per_iteration": 2.4042165279388428 + }, + { + "auxiliary_loss_clip": 0.01066107, + "auxiliary_loss_mlp": 0.01049667, + "balance_loss_clip": 1.01810014, + "balance_loss_mlp": 1.02002609, + "epoch": 0.3868931309183827, + "flos": 16726503288960.0, + "grad_norm": 1.9219448438613114, + "language_loss": 0.79497194, + "learning_rate": 2.806149347899972e-06, + "loss": 0.81612968, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4609375, + "step": 6435, + "time_per_iteration": 2.39243745803833 + }, + { + "auxiliary_loss_clip": 0.01060914, + "auxiliary_loss_mlp": 0.01044371, + "balance_loss_clip": 1.0163691, + "balance_loss_mlp": 1.01896596, + "epoch": 0.38695325417105064, + "flos": 22673608548480.0, + "grad_norm": 1.7717351618753494, + "language_loss": 0.81013906, + "learning_rate": 2.805792910102915e-06, + "loss": 0.8311919, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 6436, + "time_per_iteration": 2.392469882965088 + }, + { + "auxiliary_loss_clip": 0.01060357, + "auxiliary_loss_mlp": 0.01044926, + "balance_loss_clip": 1.01829481, + "balance_loss_mlp": 1.01910722, + "epoch": 0.3870133774237186, + "flos": 23110930638720.0, + "grad_norm": 1.6637151325115598, + "language_loss": 0.77513492, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79618776, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 6437, + "time_per_iteration": 2.4074625968933105 + }, + { + "auxiliary_loss_clip": 0.01061726, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_clip": 1.02163339, + "balance_loss_mlp": 1.02015519, + "epoch": 0.3870735006763866, + "flos": 17674744348800.0, + "grad_norm": 2.01287512020745, + "language_loss": 0.82794023, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84903705, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41601562, + "step": 6438, + "time_per_iteration": 2.3617351055145264 + }, + { + "auxiliary_loss_clip": 0.01062094, + "auxiliary_loss_mlp": 0.01050344, + "balance_loss_clip": 1.02262819, + "balance_loss_mlp": 1.0194459, + "epoch": 0.38713362392905454, + "flos": 23294643546240.0, + "grad_norm": 1.4012944113029324, + "language_loss": 0.76252091, + "learning_rate": 2.804723413431326e-06, + "loss": 0.78364527, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42578125, + "step": 6439, + "time_per_iteration": 2.430793285369873 + }, + { + "auxiliary_loss_clip": 0.0105733, + "auxiliary_loss_mlp": 0.01043878, + "balance_loss_clip": 1.0182004, + "balance_loss_mlp": 1.01809251, + "epoch": 0.38719374718172256, + "flos": 21030177029760.0, + "grad_norm": 1.4636289750574472, + "language_loss": 0.7507655, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.77177763, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 6440, + "time_per_iteration": 2.3811075687408447 + }, + { + "auxiliary_loss_clip": 0.01061714, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.02056265, + "balance_loss_mlp": 1.01837993, + "epoch": 0.3872538704343905, + "flos": 19608758046720.0, + "grad_norm": 1.8543697247829694, + "language_loss": 0.82964098, + "learning_rate": 2.804010263051774e-06, + "loss": 0.85076249, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43359375, + "step": 6441, + "time_per_iteration": 2.3872947692871094 + }, + { + "auxiliary_loss_clip": 0.01060987, + "auxiliary_loss_mlp": 0.01057104, + "balance_loss_clip": 1.03041279, + "balance_loss_mlp": 1.01961088, + "epoch": 0.3873139936870585, + "flos": 17529086689920.0, + "grad_norm": 21.527129005084454, + "language_loss": 0.82202709, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.84320801, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 6442, + "time_per_iteration": 2.361820936203003 + }, + { + "auxiliary_loss_clip": 0.01060249, + "auxiliary_loss_mlp": 0.01044017, + "balance_loss_clip": 1.01529944, + "balance_loss_mlp": 1.01914263, + "epoch": 0.38737411693972645, + "flos": 17785558604160.0, + "grad_norm": 1.673484587668203, + "language_loss": 0.85736901, + "learning_rate": 2.803296990719624e-06, + "loss": 0.87841165, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41015625, + "step": 6443, + "time_per_iteration": 2.3715953826904297 + }, + { + "auxiliary_loss_clip": 0.01012298, + "auxiliary_loss_mlp": 0.0100887, + "balance_loss_clip": 1.00465024, + "balance_loss_mlp": 1.0031724, + "epoch": 0.3874342401923944, + "flos": 58301984119680.0, + "grad_norm": 0.7634005985629908, + "language_loss": 0.50225204, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52246368, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.04223633, + "router_z_loss_mlp": 0.09082031, + "step": 6444, + "time_per_iteration": 3.030442714691162 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.0103998, + "balance_loss_clip": 1.01579249, + "balance_loss_mlp": 1.01795161, + "epoch": 0.3874943634450624, + "flos": 17710984206720.0, + "grad_norm": 1.490345437053535, + "language_loss": 0.80078948, + "learning_rate": 2.802583596543065e-06, + "loss": 0.82175446, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38671875, + "step": 6445, + "time_per_iteration": 2.3960797786712646 + }, + { + "auxiliary_loss_clip": 0.01061226, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.01571035, + "balance_loss_mlp": 1.02110052, + "epoch": 0.38755448669773035, + "flos": 19243845849600.0, + "grad_norm": 1.831930497781827, + "language_loss": 0.82948619, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.85051215, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 6446, + "time_per_iteration": 2.3920023441314697 + }, + { + "auxiliary_loss_clip": 0.01060827, + "auxiliary_loss_mlp": 0.01044929, + "balance_loss_clip": 1.01817894, + "balance_loss_mlp": 1.01972294, + "epoch": 0.3876146099503983, + "flos": 20593238964480.0, + "grad_norm": 1.6757828249897224, + "language_loss": 0.78865302, + "learning_rate": 2.801870080630306e-06, + "loss": 0.80971056, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 6447, + "time_per_iteration": 2.409745216369629 + }, + { + "auxiliary_loss_clip": 0.01061149, + "auxiliary_loss_mlp": 0.01040037, + "balance_loss_clip": 1.01409721, + "balance_loss_mlp": 1.02108371, + "epoch": 0.3876747332030663, + "flos": 19280120618880.0, + "grad_norm": 1.8894455545854087, + "language_loss": 0.78239703, + "learning_rate": 2.801513277056671e-06, + "loss": 0.80340886, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40039062, + "step": 6448, + "time_per_iteration": 2.3877058029174805 + }, + { + "auxiliary_loss_clip": 0.01060076, + "auxiliary_loss_mlp": 0.01039273, + "balance_loss_clip": 1.01472807, + "balance_loss_mlp": 1.0203433, + "epoch": 0.38773485645573424, + "flos": 18945094412160.0, + "grad_norm": 1.7702356710381366, + "language_loss": 0.76773614, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78872955, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3984375, + "step": 6449, + "time_per_iteration": 2.38065242767334 + }, + { + "auxiliary_loss_clip": 0.01062292, + "auxiliary_loss_mlp": 0.01046772, + "balance_loss_clip": 1.01891279, + "balance_loss_mlp": 1.01953363, + "epoch": 0.3877949797084022, + "flos": 23070361772160.0, + "grad_norm": 1.6732670503903657, + "language_loss": 0.79767632, + "learning_rate": 2.800799578742542e-06, + "loss": 0.81876695, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42773438, + "step": 6450, + "time_per_iteration": 2.422934055328369 + }, + { + "auxiliary_loss_clip": 0.01065663, + "auxiliary_loss_mlp": 0.01051454, + "balance_loss_clip": 1.02260482, + "balance_loss_mlp": 1.02066088, + "epoch": 0.3878551029610702, + "flos": 29094275756160.0, + "grad_norm": 3.0251453218986195, + "language_loss": 0.7911284, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.81229955, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44921875, + "step": 6451, + "time_per_iteration": 2.4713737964630127 + }, + { + "auxiliary_loss_clip": 0.01060095, + "auxiliary_loss_mlp": 0.01045733, + "balance_loss_clip": 1.02047253, + "balance_loss_mlp": 1.0198648, + "epoch": 0.38791522621373814, + "flos": 20995333626240.0, + "grad_norm": 1.8219948832117312, + "language_loss": 0.77993232, + "learning_rate": 2.800085758962812e-06, + "loss": 0.80099064, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40234375, + "step": 6452, + "time_per_iteration": 2.4121556282043457 + }, + { + "auxiliary_loss_clip": 0.01063254, + "auxiliary_loss_mlp": 0.01049153, + "balance_loss_clip": 1.02198517, + "balance_loss_mlp": 1.02200794, + "epoch": 0.3879753494664061, + "flos": 15485934481920.0, + "grad_norm": 1.470247511791582, + "language_loss": 0.80659413, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82771814, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 6453, + "time_per_iteration": 2.475649356842041 + }, + { + "auxiliary_loss_clip": 0.0106681, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.01807261, + "balance_loss_mlp": 1.0214504, + "epoch": 0.3880354727190741, + "flos": 22052887752960.0, + "grad_norm": 1.5891469118744868, + "language_loss": 0.72584623, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.74701148, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.453125, + "step": 6454, + "time_per_iteration": 2.4295310974121094 + }, + { + "auxiliary_loss_clip": 0.01067298, + "auxiliary_loss_mlp": 0.01047969, + "balance_loss_clip": 1.01644993, + "balance_loss_mlp": 1.02258372, + "epoch": 0.3880955959717421, + "flos": 20339245756800.0, + "grad_norm": 1.7023499961137403, + "language_loss": 0.78772491, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80887765, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44726562, + "step": 6455, + "time_per_iteration": 2.5071732997894287 + }, + { + "auxiliary_loss_clip": 0.01061406, + "auxiliary_loss_mlp": 0.01053657, + "balance_loss_clip": 1.02521348, + "balance_loss_mlp": 1.0200671, + "epoch": 0.38815571922441006, + "flos": 23074306755840.0, + "grad_norm": 1.5222459092012772, + "language_loss": 0.77230108, + "learning_rate": 2.798657755439662e-06, + "loss": 0.79345179, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4140625, + "step": 6456, + "time_per_iteration": 2.3809947967529297 + }, + { + "auxiliary_loss_clip": 0.01064584, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_clip": 1.01672161, + "balance_loss_mlp": 1.02071691, + "epoch": 0.388215842477078, + "flos": 20775904531200.0, + "grad_norm": 2.316604987255037, + "language_loss": 0.6326412, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.6537323, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.43945312, + "step": 6457, + "time_per_iteration": 2.376450300216675 + }, + { + "auxiliary_loss_clip": 0.01062619, + "auxiliary_loss_mlp": 0.01055382, + "balance_loss_clip": 1.02326655, + "balance_loss_mlp": 1.0182606, + "epoch": 0.388275965729746, + "flos": 20447162369280.0, + "grad_norm": 2.1638512303771344, + "language_loss": 0.81780559, + "learning_rate": 2.797943571912841e-06, + "loss": 0.83898562, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44335938, + "step": 6458, + "time_per_iteration": 2.4131245613098145 + }, + { + "auxiliary_loss_clip": 0.01063265, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.01395965, + "balance_loss_mlp": 1.01965165, + "epoch": 0.38833608898241395, + "flos": 27891133793280.0, + "grad_norm": 2.045223033388776, + "language_loss": 0.83118916, + "learning_rate": 2.797586434755509e-06, + "loss": 0.85225391, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43554688, + "step": 6459, + "time_per_iteration": 2.43587589263916 + }, + { + "auxiliary_loss_clip": 0.01062843, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.0150516, + "balance_loss_mlp": 1.01989114, + "epoch": 0.3883962122350819, + "flos": 18075442556160.0, + "grad_norm": 1.5853541933325563, + "language_loss": 0.63265228, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.65371192, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4296875, + "step": 6460, + "time_per_iteration": 2.3841888904571533 + }, + { + "auxiliary_loss_clip": 0.01061872, + "auxiliary_loss_mlp": 0.01046753, + "balance_loss_clip": 1.01885831, + "balance_loss_mlp": 1.01979089, + "epoch": 0.3884563354877499, + "flos": 23621151381120.0, + "grad_norm": 1.577150847689544, + "language_loss": 0.87101662, + "learning_rate": 2.796872069720717e-06, + "loss": 0.8921029, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.421875, + "step": 6461, + "time_per_iteration": 2.4171931743621826 + }, + { + "auxiliary_loss_clip": 0.01064089, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.01762712, + "balance_loss_mlp": 1.02028394, + "epoch": 0.38851645874041785, + "flos": 27452310514560.0, + "grad_norm": 2.371371523605077, + "language_loss": 0.7390269, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.76016307, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4375, + "step": 6462, + "time_per_iteration": 2.439781427383423 + }, + { + "auxiliary_loss_clip": 0.01062176, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.01819491, + "balance_loss_mlp": 1.01860368, + "epoch": 0.3885765819930858, + "flos": 25226911676160.0, + "grad_norm": 1.9216882191753863, + "language_loss": 0.77695143, + "learning_rate": 2.796157583816052e-06, + "loss": 0.79804939, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43554688, + "step": 6463, + "time_per_iteration": 2.413299083709717 + }, + { + "auxiliary_loss_clip": 0.010653, + "auxiliary_loss_mlp": 0.01054726, + "balance_loss_clip": 1.01986969, + "balance_loss_mlp": 1.02054679, + "epoch": 0.3886367052457538, + "flos": 16945653093120.0, + "grad_norm": 2.017860276909241, + "language_loss": 0.7214905, + "learning_rate": 2.795800295571382e-06, + "loss": 0.7426908, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.4453125, + "step": 6464, + "time_per_iteration": 2.391571044921875 + }, + { + "auxiliary_loss_clip": 0.01063279, + "auxiliary_loss_mlp": 0.01044569, + "balance_loss_clip": 1.01632857, + "balance_loss_mlp": 1.02201366, + "epoch": 0.38869682849842174, + "flos": 27153140140800.0, + "grad_norm": 2.442741168952276, + "language_loss": 0.70621943, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.7272979, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4140625, + "step": 6465, + "time_per_iteration": 3.8800346851348877 + }, + { + "auxiliary_loss_clip": 0.01063244, + "auxiliary_loss_mlp": 0.0104923, + "balance_loss_clip": 1.01773429, + "balance_loss_mlp": 1.01934361, + "epoch": 0.3887569517510897, + "flos": 21062716283520.0, + "grad_norm": 2.2732253091596526, + "language_loss": 0.79298919, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.81411386, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43945312, + "step": 6466, + "time_per_iteration": 2.3741443157196045 + }, + { + "auxiliary_loss_clip": 0.0106486, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.01673412, + "balance_loss_mlp": 1.02086878, + "epoch": 0.38881707500375773, + "flos": 29496091127040.0, + "grad_norm": 1.5520402516450096, + "language_loss": 0.70784736, + "learning_rate": 2.794728249830611e-06, + "loss": 0.72897637, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43945312, + "step": 6467, + "time_per_iteration": 2.485211133956909 + }, + { + "auxiliary_loss_clip": 0.01063962, + "auxiliary_loss_mlp": 0.01054622, + "balance_loss_clip": 1.02014625, + "balance_loss_mlp": 1.01937342, + "epoch": 0.3888771982564257, + "flos": 17487470482560.0, + "grad_norm": 2.3776768765907765, + "language_loss": 0.84457862, + "learning_rate": 2.794370840959936e-06, + "loss": 0.8657645, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.4453125, + "step": 6468, + "time_per_iteration": 2.3377246856689453 + }, + { + "auxiliary_loss_clip": 0.01064021, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.01533604, + "balance_loss_mlp": 1.01981914, + "epoch": 0.38893732150909366, + "flos": 21941410181760.0, + "grad_norm": 1.710475754605988, + "language_loss": 0.84923679, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.8703233, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 6469, + "time_per_iteration": 3.780565023422241 + }, + { + "auxiliary_loss_clip": 0.01061598, + "auxiliary_loss_mlp": 0.01051014, + "balance_loss_clip": 1.01768279, + "balance_loss_mlp": 1.01834941, + "epoch": 0.3889974447617616, + "flos": 24275319125760.0, + "grad_norm": 1.836174943494901, + "language_loss": 0.76081049, + "learning_rate": 2.793655932864273e-06, + "loss": 0.78193659, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.43359375, + "step": 6470, + "time_per_iteration": 3.917071580886841 + }, + { + "auxiliary_loss_clip": 0.01064658, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.0184536, + "balance_loss_mlp": 1.01891446, + "epoch": 0.3890575680144296, + "flos": 25665909511680.0, + "grad_norm": 1.585689080283161, + "language_loss": 0.75694549, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77812839, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.45703125, + "step": 6471, + "time_per_iteration": 2.443952798843384 + }, + { + "auxiliary_loss_clip": 0.01064861, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.01132679, + "balance_loss_mlp": 1.02125216, + "epoch": 0.38911769126709755, + "flos": 22854214344960.0, + "grad_norm": 1.589670907747918, + "language_loss": 0.6808368, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70192671, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43554688, + "step": 6472, + "time_per_iteration": 2.413602113723755 + }, + { + "auxiliary_loss_clip": 0.0106575, + "auxiliary_loss_mlp": 0.01054168, + "balance_loss_clip": 1.02169502, + "balance_loss_mlp": 1.02150154, + "epoch": 0.3891778145197655, + "flos": 25446340771200.0, + "grad_norm": 1.6883131244629637, + "language_loss": 0.77341211, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.79461133, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44140625, + "step": 6473, + "time_per_iteration": 2.4130804538726807 + }, + { + "auxiliary_loss_clip": 0.01066656, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.01737142, + "balance_loss_mlp": 1.02219963, + "epoch": 0.3892379377724335, + "flos": 14027088654720.0, + "grad_norm": 2.038268362633952, + "language_loss": 0.73478693, + "learning_rate": 2.792225755635257e-06, + "loss": 0.75595123, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4453125, + "step": 6474, + "time_per_iteration": 2.371166706085205 + }, + { + "auxiliary_loss_clip": 0.01065157, + "auxiliary_loss_mlp": 0.01048632, + "balance_loss_clip": 1.01592112, + "balance_loss_mlp": 1.02054715, + "epoch": 0.38929806102510145, + "flos": 20156405633280.0, + "grad_norm": 1.9418069518315269, + "language_loss": 0.69615209, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71728992, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4453125, + "step": 6475, + "time_per_iteration": 2.383227586746216 + }, + { + "auxiliary_loss_clip": 0.01070534, + "auxiliary_loss_mlp": 0.01054867, + "balance_loss_clip": 1.01958108, + "balance_loss_mlp": 1.02297306, + "epoch": 0.3893581842777694, + "flos": 22162864135680.0, + "grad_norm": 1.981392822593474, + "language_loss": 0.77203846, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.79329252, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.4765625, + "step": 6476, + "time_per_iteration": 2.4091298580169678 + }, + { + "auxiliary_loss_clip": 0.01017403, + "auxiliary_loss_mlp": 0.01026375, + "balance_loss_clip": 1.02236938, + "balance_loss_mlp": 1.00620866, + "epoch": 0.3894183075304374, + "flos": 67298367548160.0, + "grad_norm": 0.7978657656102623, + "language_loss": 0.58288419, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60332203, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.11181641, + "step": 6477, + "time_per_iteration": 3.0182688236236572 + }, + { + "auxiliary_loss_clip": 0.01065399, + "auxiliary_loss_mlp": 0.01052489, + "balance_loss_clip": 1.01806128, + "balance_loss_mlp": 1.02070808, + "epoch": 0.38947843078310534, + "flos": 18546630531840.0, + "grad_norm": 1.9136510163879519, + "language_loss": 0.79007065, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.81124949, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.44726562, + "step": 6478, + "time_per_iteration": 2.3737242221832275 + }, + { + "auxiliary_loss_clip": 0.01065334, + "auxiliary_loss_mlp": 0.01045453, + "balance_loss_clip": 1.01445854, + "balance_loss_mlp": 1.02079141, + "epoch": 0.3895385540357733, + "flos": 14605145902080.0, + "grad_norm": 2.122791233193536, + "language_loss": 0.8431468, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.86425465, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 6479, + "time_per_iteration": 2.3437395095825195 + }, + { + "auxiliary_loss_clip": 0.01063692, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_clip": 1.0154022, + "balance_loss_mlp": 1.02081919, + "epoch": 0.38959867728844133, + "flos": 19974159002880.0, + "grad_norm": 9.338544083907466, + "language_loss": 0.81787723, + "learning_rate": 2.790079588824617e-06, + "loss": 0.8389529, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 6480, + "time_per_iteration": 2.3887434005737305 + }, + { + "auxiliary_loss_clip": 0.01064235, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.0175066, + "balance_loss_mlp": 1.02198362, + "epoch": 0.3896588005411093, + "flos": 22671094930560.0, + "grad_norm": 1.8766277634015556, + "language_loss": 0.84878016, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.8698895, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 6481, + "time_per_iteration": 2.403028964996338 + }, + { + "auxiliary_loss_clip": 0.01063689, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_clip": 1.01719522, + "balance_loss_mlp": 1.02153039, + "epoch": 0.38971892379377726, + "flos": 20994984512640.0, + "grad_norm": 1.7776804059305822, + "language_loss": 0.76262754, + "learning_rate": 2.789363960063863e-06, + "loss": 0.78371763, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 6482, + "time_per_iteration": 2.3985133171081543 + }, + { + "auxiliary_loss_clip": 0.01065119, + "auxiliary_loss_mlp": 0.01057507, + "balance_loss_clip": 1.02763295, + "balance_loss_mlp": 1.02162635, + "epoch": 0.3897790470464452, + "flos": 22527392307840.0, + "grad_norm": 2.1898915449714624, + "language_loss": 0.80150783, + "learning_rate": 2.78900610077756e-06, + "loss": 0.82273412, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43554688, + "step": 6483, + "time_per_iteration": 2.40327525138855 + }, + { + "auxiliary_loss_clip": 0.01063722, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_clip": 1.01692903, + "balance_loss_mlp": 1.02003539, + "epoch": 0.3898391702991132, + "flos": 26208809153280.0, + "grad_norm": 1.4963497281179574, + "language_loss": 0.80808854, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82922006, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4375, + "step": 6484, + "time_per_iteration": 2.411647319793701 + }, + { + "auxiliary_loss_clip": 0.01064651, + "auxiliary_loss_mlp": 0.01051736, + "balance_loss_clip": 1.01702237, + "balance_loss_mlp": 1.02113569, + "epoch": 0.38989929355178116, + "flos": 21064601496960.0, + "grad_norm": 1.6161260121067504, + "language_loss": 0.78641796, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80758184, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.43554688, + "step": 6485, + "time_per_iteration": 2.405679941177368 + }, + { + "auxiliary_loss_clip": 0.01066839, + "auxiliary_loss_mlp": 0.01053611, + "balance_loss_clip": 1.02130556, + "balance_loss_mlp": 1.02238488, + "epoch": 0.3899594168044491, + "flos": 25482929742720.0, + "grad_norm": 2.4859184371205116, + "language_loss": 0.87100697, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.89221144, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4453125, + "step": 6486, + "time_per_iteration": 2.413532257080078 + }, + { + "auxiliary_loss_clip": 0.01067162, + "auxiliary_loss_mlp": 0.01059892, + "balance_loss_clip": 1.02749085, + "balance_loss_mlp": 1.02110052, + "epoch": 0.3900195400571171, + "flos": 31138021457280.0, + "grad_norm": 2.1206154685395116, + "language_loss": 0.87121594, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.89248651, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4609375, + "step": 6487, + "time_per_iteration": 2.480565071105957 + }, + { + "auxiliary_loss_clip": 0.010635, + "auxiliary_loss_mlp": 0.01059299, + "balance_loss_clip": 1.02539551, + "balance_loss_mlp": 1.02053165, + "epoch": 0.39007966330978505, + "flos": 20228885349120.0, + "grad_norm": 1.5151998717978257, + "language_loss": 0.74351346, + "learning_rate": 2.787216355829633e-06, + "loss": 0.76474148, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4296875, + "step": 6488, + "time_per_iteration": 2.3809475898742676 + }, + { + "auxiliary_loss_clip": 0.01065706, + "auxiliary_loss_mlp": 0.01062761, + "balance_loss_clip": 1.03010917, + "balance_loss_mlp": 1.02093172, + "epoch": 0.390139786562453, + "flos": 22527636687360.0, + "grad_norm": 1.9876581268588358, + "language_loss": 0.69809145, + "learning_rate": 2.786858317231779e-06, + "loss": 0.71937609, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44726562, + "step": 6489, + "time_per_iteration": 2.4261162281036377 + }, + { + "auxiliary_loss_clip": 0.01060831, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_clip": 1.0225693, + "balance_loss_mlp": 1.01978946, + "epoch": 0.390199909815121, + "flos": 26431694472960.0, + "grad_norm": 1.7758799186442942, + "language_loss": 0.81930304, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.84043193, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41015625, + "step": 6490, + "time_per_iteration": 2.421757221221924 + }, + { + "auxiliary_loss_clip": 0.01063689, + "auxiliary_loss_mlp": 0.01056076, + "balance_loss_clip": 1.02543926, + "balance_loss_mlp": 1.02028477, + "epoch": 0.39026003306778895, + "flos": 17273627205120.0, + "grad_norm": 1.7921643596368593, + "language_loss": 0.90527201, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.92646968, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43359375, + "step": 6491, + "time_per_iteration": 2.3892433643341064 + }, + { + "auxiliary_loss_clip": 0.01064391, + "auxiliary_loss_mlp": 0.0106275, + "balance_loss_clip": 1.03011012, + "balance_loss_mlp": 1.01965189, + "epoch": 0.3903201563204569, + "flos": 24531756128640.0, + "grad_norm": 4.035657799360741, + "language_loss": 0.79826295, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.81953442, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44726562, + "step": 6492, + "time_per_iteration": 2.408308267593384 + }, + { + "auxiliary_loss_clip": 0.01065573, + "auxiliary_loss_mlp": 0.01052015, + "balance_loss_clip": 1.02094889, + "balance_loss_mlp": 1.02097201, + "epoch": 0.39038027957312493, + "flos": 23766843951360.0, + "grad_norm": 1.905768494451364, + "language_loss": 0.7530489, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.77422476, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 6493, + "time_per_iteration": 2.4645166397094727 + }, + { + "auxiliary_loss_clip": 0.01067379, + "auxiliary_loss_mlp": 0.0105772, + "balance_loss_clip": 1.02126575, + "balance_loss_mlp": 1.02089214, + "epoch": 0.3904404028257929, + "flos": 14099742927360.0, + "grad_norm": 1.9813766373597879, + "language_loss": 0.78167391, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.80292487, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.46484375, + "step": 6494, + "time_per_iteration": 2.350229501724243 + }, + { + "auxiliary_loss_clip": 0.01069138, + "auxiliary_loss_mlp": 0.01061735, + "balance_loss_clip": 1.02530468, + "balance_loss_mlp": 1.02042484, + "epoch": 0.39050052607846086, + "flos": 16909099032960.0, + "grad_norm": 3.77379751661921, + "language_loss": 0.76744872, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.78875744, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.48632812, + "step": 6495, + "time_per_iteration": 2.400339126586914 + }, + { + "auxiliary_loss_clip": 0.01065756, + "auxiliary_loss_mlp": 0.01059378, + "balance_loss_clip": 1.02568924, + "balance_loss_mlp": 1.02132487, + "epoch": 0.39056064933112883, + "flos": 25914735838080.0, + "grad_norm": 1.6599506049462327, + "language_loss": 0.69175953, + "learning_rate": 2.784351212350352e-06, + "loss": 0.71301091, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4453125, + "step": 6496, + "time_per_iteration": 2.427006483078003 + }, + { + "auxiliary_loss_clip": 0.01018877, + "auxiliary_loss_mlp": 0.01011377, + "balance_loss_clip": 1.00720513, + "balance_loss_mlp": 1.00844216, + "epoch": 0.3906207725837968, + "flos": 60025471119360.0, + "grad_norm": 0.6649028229824571, + "language_loss": 0.54045177, + "learning_rate": 2.783992935430775e-06, + "loss": 0.5607543, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.10449219, + "step": 6497, + "time_per_iteration": 3.1214606761932373 + }, + { + "auxiliary_loss_clip": 0.01066919, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_clip": 1.01629138, + "balance_loss_mlp": 1.02242422, + "epoch": 0.39068089583646476, + "flos": 21067638785280.0, + "grad_norm": 2.339866419286431, + "language_loss": 0.70322049, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.72437704, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4453125, + "step": 6498, + "time_per_iteration": 2.381258487701416 + }, + { + "auxiliary_loss_clip": 0.0102036, + "auxiliary_loss_mlp": 0.01017649, + "balance_loss_clip": 1.01414406, + "balance_loss_mlp": 1.00987601, + "epoch": 0.3907410190891327, + "flos": 70441911987840.0, + "grad_norm": 0.745130211315678, + "language_loss": 0.51837468, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5387547, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.03515625, + "router_z_loss_mlp": 0.10498047, + "step": 6499, + "time_per_iteration": 3.0878586769104004 + }, + { + "auxiliary_loss_clip": 0.01068982, + "auxiliary_loss_mlp": 0.01054842, + "balance_loss_clip": 1.01931739, + "balance_loss_mlp": 1.02294576, + "epoch": 0.3908011423418007, + "flos": 27961274448000.0, + "grad_norm": 1.847627525839178, + "language_loss": 0.74523389, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76647216, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.4609375, + "step": 6500, + "time_per_iteration": 2.4462313652038574 + }, + { + "auxiliary_loss_clip": 0.01068146, + "auxiliary_loss_mlp": 0.01045851, + "balance_loss_clip": 1.01430869, + "balance_loss_mlp": 1.02328134, + "epoch": 0.39086126559446865, + "flos": 24460952158080.0, + "grad_norm": 3.2205083920803794, + "language_loss": 0.7015481, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.72268808, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 6501, + "time_per_iteration": 2.444197177886963 + }, + { + "auxiliary_loss_clip": 0.01068597, + "auxiliary_loss_mlp": 0.01052196, + "balance_loss_clip": 1.01757717, + "balance_loss_mlp": 1.02382815, + "epoch": 0.3909213888471366, + "flos": 16940730591360.0, + "grad_norm": 1.713059225427031, + "language_loss": 0.79409224, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81530023, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.44921875, + "step": 6502, + "time_per_iteration": 2.3609795570373535 + }, + { + "auxiliary_loss_clip": 0.01064731, + "auxiliary_loss_mlp": 0.01048644, + "balance_loss_clip": 1.02046239, + "balance_loss_mlp": 1.02351642, + "epoch": 0.3909815120998046, + "flos": 29277115879680.0, + "grad_norm": 2.4703372250825386, + "language_loss": 0.81788051, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.83901429, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 6503, + "time_per_iteration": 2.454406976699829 + }, + { + "auxiliary_loss_clip": 0.01065397, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.01773596, + "balance_loss_mlp": 1.0238595, + "epoch": 0.39104163535247255, + "flos": 18950296204800.0, + "grad_norm": 1.8273875549905592, + "language_loss": 0.73159319, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.75271332, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41601562, + "step": 6504, + "time_per_iteration": 2.3722448348999023 + }, + { + "auxiliary_loss_clip": 0.01065497, + "auxiliary_loss_mlp": 0.01057317, + "balance_loss_clip": 1.02648997, + "balance_loss_mlp": 1.02201724, + "epoch": 0.3911017586051405, + "flos": 26322137026560.0, + "grad_norm": 1.4651605436365642, + "language_loss": 0.8394556, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.86068368, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43359375, + "step": 6505, + "time_per_iteration": 5.270334482192993 + }, + { + "auxiliary_loss_clip": 0.01065294, + "auxiliary_loss_mlp": 0.01057143, + "balance_loss_clip": 1.02316856, + "balance_loss_mlp": 1.02294075, + "epoch": 0.3911618818578085, + "flos": 21834680555520.0, + "grad_norm": 1.8705428989368649, + "language_loss": 0.73101103, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.75223541, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.421875, + "step": 6506, + "time_per_iteration": 2.430527925491333 + }, + { + "auxiliary_loss_clip": 0.01062762, + "auxiliary_loss_mlp": 0.01056336, + "balance_loss_clip": 1.0260334, + "balance_loss_mlp": 1.02117515, + "epoch": 0.3912220051104765, + "flos": 16358833094400.0, + "grad_norm": 1.9569521034959418, + "language_loss": 0.76305038, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.78424138, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4140625, + "step": 6507, + "time_per_iteration": 2.365361213684082 + }, + { + "auxiliary_loss_clip": 0.01018651, + "auxiliary_loss_mlp": 0.01015946, + "balance_loss_clip": 1.01227391, + "balance_loss_mlp": 1.00903916, + "epoch": 0.39128212836314447, + "flos": 71047620898560.0, + "grad_norm": 0.7633167338955782, + "language_loss": 0.56603694, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58638293, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.09619141, + "step": 6508, + "time_per_iteration": 4.528662919998169 + }, + { + "auxiliary_loss_clip": 0.0106536, + "auxiliary_loss_mlp": 0.01058723, + "balance_loss_clip": 1.02817011, + "balance_loss_mlp": 1.02144194, + "epoch": 0.39134225161581243, + "flos": 20331146321280.0, + "grad_norm": 1.9468153009481834, + "language_loss": 0.78137577, + "learning_rate": 2.779691297413471e-06, + "loss": 0.80261666, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43945312, + "step": 6509, + "time_per_iteration": 3.87225604057312 + }, + { + "auxiliary_loss_clip": 0.01065372, + "auxiliary_loss_mlp": 0.01058082, + "balance_loss_clip": 1.02308249, + "balance_loss_mlp": 1.01997685, + "epoch": 0.3914023748684804, + "flos": 17017469493120.0, + "grad_norm": 2.680423175968471, + "language_loss": 0.84814018, + "learning_rate": 2.779332635075825e-06, + "loss": 0.86937475, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.45507812, + "step": 6510, + "time_per_iteration": 2.371619462966919 + }, + { + "auxiliary_loss_clip": 0.01065645, + "auxiliary_loss_mlp": 0.01061922, + "balance_loss_clip": 1.02861524, + "balance_loss_mlp": 1.02097833, + "epoch": 0.39146249812114836, + "flos": 18404254540800.0, + "grad_norm": 1.8569065734046586, + "language_loss": 0.78263903, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.80391473, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44726562, + "step": 6511, + "time_per_iteration": 2.3658549785614014 + }, + { + "auxiliary_loss_clip": 0.01015642, + "auxiliary_loss_mlp": 0.0101112, + "balance_loss_clip": 1.00740111, + "balance_loss_mlp": 1.00523329, + "epoch": 0.3915226213738163, + "flos": 67633638134400.0, + "grad_norm": 0.7250172145191213, + "language_loss": 0.57859558, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59886318, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.10400391, + "step": 6512, + "time_per_iteration": 3.135610580444336 + }, + { + "auxiliary_loss_clip": 0.01065327, + "auxiliary_loss_mlp": 0.01055265, + "balance_loss_clip": 1.02104032, + "balance_loss_mlp": 1.02036214, + "epoch": 0.3915827446264843, + "flos": 26358132504960.0, + "grad_norm": 1.7976585700534344, + "language_loss": 0.70892614, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.73013204, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.44921875, + "step": 6513, + "time_per_iteration": 2.4213638305664062 + }, + { + "auxiliary_loss_clip": 0.01068529, + "auxiliary_loss_mlp": 0.01057854, + "balance_loss_clip": 1.02149534, + "balance_loss_mlp": 1.02130175, + "epoch": 0.39164286787915226, + "flos": 21942841547520.0, + "grad_norm": 3.1550097343905494, + "language_loss": 0.77031869, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.79158252, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.36328125, + "router_z_loss_mlp": 0.47265625, + "step": 6514, + "time_per_iteration": 2.402256727218628 + }, + { + "auxiliary_loss_clip": 0.01065418, + "auxiliary_loss_mlp": 0.01046307, + "balance_loss_clip": 1.01512182, + "balance_loss_mlp": 1.02167869, + "epoch": 0.3917029911318202, + "flos": 16398878290560.0, + "grad_norm": 1.7655216203092243, + "language_loss": 0.79040438, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.81152165, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4375, + "step": 6515, + "time_per_iteration": 2.368178606033325 + }, + { + "auxiliary_loss_clip": 0.01063275, + "auxiliary_loss_mlp": 0.01052499, + "balance_loss_clip": 1.02411485, + "balance_loss_mlp": 1.020854, + "epoch": 0.3917631143844882, + "flos": 26210554721280.0, + "grad_norm": 1.4053218442219149, + "language_loss": 0.80476403, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82592177, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42578125, + "step": 6516, + "time_per_iteration": 2.464479446411133 + }, + { + "auxiliary_loss_clip": 0.01066978, + "auxiliary_loss_mlp": 0.01053551, + "balance_loss_clip": 1.02215147, + "balance_loss_mlp": 1.02210891, + "epoch": 0.39182323763715615, + "flos": 18547468404480.0, + "grad_norm": 2.259430313709165, + "language_loss": 0.72581738, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.74702269, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44921875, + "step": 6517, + "time_per_iteration": 2.378713369369507 + }, + { + "auxiliary_loss_clip": 0.01067495, + "auxiliary_loss_mlp": 0.01058022, + "balance_loss_clip": 1.02609801, + "balance_loss_mlp": 1.02266753, + "epoch": 0.3918833608898241, + "flos": 34312115203200.0, + "grad_norm": 1.5355293754854469, + "language_loss": 0.73323423, + "learning_rate": 2.776462273631956e-06, + "loss": 0.75448942, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44726562, + "step": 6518, + "time_per_iteration": 2.52559494972229 + }, + { + "auxiliary_loss_clip": 0.01073175, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_clip": 1.02411425, + "balance_loss_mlp": 1.02783775, + "epoch": 0.3919434841424921, + "flos": 36938107514880.0, + "grad_norm": 1.6846018961978952, + "language_loss": 0.62759113, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.648884, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.453125, + "step": 6519, + "time_per_iteration": 2.5262880325317383 + }, + { + "auxiliary_loss_clip": 0.01074142, + "auxiliary_loss_mlp": 0.01059864, + "balance_loss_clip": 1.0251981, + "balance_loss_mlp": 1.02731013, + "epoch": 0.3920036073951601, + "flos": 23507963153280.0, + "grad_norm": 2.1804448650297745, + "language_loss": 0.68827713, + "learning_rate": 2.775744388563563e-06, + "loss": 0.7096172, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.46875, + "step": 6520, + "time_per_iteration": 2.4251515865325928 + }, + { + "auxiliary_loss_clip": 0.01068122, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.01584673, + "balance_loss_mlp": 1.02508068, + "epoch": 0.39206373064782807, + "flos": 18405092413440.0, + "grad_norm": 1.7258386007102553, + "language_loss": 0.80427516, + "learning_rate": 2.775385401898104e-06, + "loss": 0.82542384, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4296875, + "step": 6521, + "time_per_iteration": 2.3862552642822266 + }, + { + "auxiliary_loss_clip": 0.01076154, + "auxiliary_loss_mlp": 0.01064924, + "balance_loss_clip": 1.02303374, + "balance_loss_mlp": 1.02695382, + "epoch": 0.39212385390049603, + "flos": 12312224760960.0, + "grad_norm": 2.048282458413658, + "language_loss": 0.71372569, + "learning_rate": 2.775026385829952e-06, + "loss": 0.73513651, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.4921875, + "step": 6522, + "time_per_iteration": 2.388674020767212 + }, + { + "auxiliary_loss_clip": 0.01076296, + "auxiliary_loss_mlp": 0.01048816, + "balance_loss_clip": 1.01666546, + "balance_loss_mlp": 1.02987278, + "epoch": 0.392183977153164, + "flos": 19718140936320.0, + "grad_norm": 1.836509255022637, + "language_loss": 0.78349835, + "learning_rate": 2.774667340372722e-06, + "loss": 0.80474949, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46484375, + "step": 6523, + "time_per_iteration": 2.409184694290161 + }, + { + "auxiliary_loss_clip": 0.01074029, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.01499748, + "balance_loss_mlp": 1.02878642, + "epoch": 0.39224410040583196, + "flos": 33143537352960.0, + "grad_norm": 2.543986864836236, + "language_loss": 0.63268036, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.6538837, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.453125, + "step": 6524, + "time_per_iteration": 2.518162727355957 + }, + { + "auxiliary_loss_clip": 0.01072112, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.01556361, + "balance_loss_mlp": 1.02774715, + "epoch": 0.39230422365849993, + "flos": 27781192321920.0, + "grad_norm": 1.6132919817899067, + "language_loss": 0.75229025, + "learning_rate": 2.773949161345489e-06, + "loss": 0.77349263, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4453125, + "step": 6525, + "time_per_iteration": 2.4731082916259766 + }, + { + "auxiliary_loss_clip": 0.01072116, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.01670611, + "balance_loss_mlp": 1.02754474, + "epoch": 0.3923643469111679, + "flos": 17930657681280.0, + "grad_norm": 3.891026016617497, + "language_loss": 0.82787216, + "learning_rate": 2.773590027802719e-06, + "loss": 0.84906304, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4453125, + "step": 6526, + "time_per_iteration": 2.3894612789154053 + }, + { + "auxiliary_loss_clip": 0.01069784, + "auxiliary_loss_mlp": 0.01050232, + "balance_loss_clip": 1.01997709, + "balance_loss_mlp": 1.02579427, + "epoch": 0.39242447016383586, + "flos": 24058438560000.0, + "grad_norm": 1.549040156973907, + "language_loss": 0.70617598, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72737616, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43945312, + "step": 6527, + "time_per_iteration": 2.4535739421844482 + }, + { + "auxiliary_loss_clip": 0.01069722, + "auxiliary_loss_mlp": 0.01052525, + "balance_loss_clip": 1.02091098, + "balance_loss_mlp": 1.02624989, + "epoch": 0.3924845934165038, + "flos": 10663486715520.0, + "grad_norm": 2.4071139762776514, + "language_loss": 0.84321284, + "learning_rate": 2.772871672726965e-06, + "loss": 0.86443532, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43359375, + "step": 6528, + "time_per_iteration": 2.3656504154205322 + }, + { + "auxiliary_loss_clip": 0.01066769, + "auxiliary_loss_mlp": 0.01049761, + "balance_loss_clip": 1.01670432, + "balance_loss_mlp": 1.02447867, + "epoch": 0.3925447166691718, + "flos": 31244646349440.0, + "grad_norm": 1.5835867669188777, + "language_loss": 0.69740188, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.71856719, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.421875, + "step": 6529, + "time_per_iteration": 2.514498710632324 + }, + { + "auxiliary_loss_clip": 0.01068873, + "auxiliary_loss_mlp": 0.01053961, + "balance_loss_clip": 1.02089214, + "balance_loss_mlp": 1.02411973, + "epoch": 0.39260483992183975, + "flos": 29414010787200.0, + "grad_norm": 2.404158504139493, + "language_loss": 0.82459617, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.84582448, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44726562, + "step": 6530, + "time_per_iteration": 2.468825578689575 + }, + { + "auxiliary_loss_clip": 0.01065382, + "auxiliary_loss_mlp": 0.01042137, + "balance_loss_clip": 1.01369369, + "balance_loss_mlp": 1.02207088, + "epoch": 0.3926649631745077, + "flos": 22856658140160.0, + "grad_norm": 1.555331128386134, + "language_loss": 0.77287424, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.79394937, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 6531, + "time_per_iteration": 2.4373130798339844 + }, + { + "auxiliary_loss_clip": 0.01029265, + "auxiliary_loss_mlp": 0.01005532, + "balance_loss_clip": 1.00154996, + "balance_loss_mlp": 1.01824152, + "epoch": 0.3927250864271757, + "flos": 63890880163200.0, + "grad_norm": 0.8132591450334073, + "language_loss": 0.60378397, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62413192, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.11035156, + "step": 6532, + "time_per_iteration": 2.9106557369232178 + }, + { + "auxiliary_loss_clip": 0.01023298, + "auxiliary_loss_mlp": 0.01004377, + "balance_loss_clip": 1.00034738, + "balance_loss_mlp": 1.0128324, + "epoch": 0.3927852096798437, + "flos": 68906117790720.0, + "grad_norm": 0.7811912494360661, + "language_loss": 0.5556798, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57595646, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.10449219, + "step": 6533, + "time_per_iteration": 3.127985715866089 + }, + { + "auxiliary_loss_clip": 0.01068973, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.03348517, + "balance_loss_mlp": 1.0219028, + "epoch": 0.39284533293251167, + "flos": 29714682349440.0, + "grad_norm": 2.0820089297517947, + "language_loss": 0.77479672, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.79615492, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.47070312, + "step": 6534, + "time_per_iteration": 2.509692430496216 + }, + { + "auxiliary_loss_clip": 0.01065511, + "auxiliary_loss_mlp": 0.01060534, + "balance_loss_clip": 1.02872872, + "balance_loss_mlp": 1.01934063, + "epoch": 0.39290545618517964, + "flos": 18551029363200.0, + "grad_norm": 2.2317081209241865, + "language_loss": 0.79783463, + "learning_rate": 2.770356507494851e-06, + "loss": 0.81909513, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4609375, + "step": 6535, + "time_per_iteration": 2.365178108215332 + }, + { + "auxiliary_loss_clip": 0.01062861, + "auxiliary_loss_mlp": 0.01071195, + "balance_loss_clip": 1.04034352, + "balance_loss_mlp": 1.0204258, + "epoch": 0.3929655794378476, + "flos": 26248295767680.0, + "grad_norm": 2.8957527935117597, + "language_loss": 0.69908768, + "learning_rate": 2.769997081218978e-06, + "loss": 0.72042823, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.42578125, + "step": 6536, + "time_per_iteration": 2.447791337966919 + }, + { + "auxiliary_loss_clip": 0.01061467, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04147291, + "balance_loss_mlp": 1.01985812, + "epoch": 0.39302570269051557, + "flos": 29276662032000.0, + "grad_norm": 1.8313046263949275, + "language_loss": 0.69843125, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71974826, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41601562, + "step": 6537, + "time_per_iteration": 2.4622740745544434 + }, + { + "auxiliary_loss_clip": 0.01066783, + "auxiliary_loss_mlp": 0.01066824, + "balance_loss_clip": 1.03509057, + "balance_loss_mlp": 1.02134025, + "epoch": 0.39308582594318353, + "flos": 17346490945920.0, + "grad_norm": 1.9421015415431033, + "language_loss": 0.80258363, + "learning_rate": 2.769278141085763e-06, + "loss": 0.82391971, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45507812, + "step": 6538, + "time_per_iteration": 2.409656286239624 + }, + { + "auxiliary_loss_clip": 0.01018697, + "auxiliary_loss_mlp": 0.01094811, + "balance_loss_clip": 1.09032917, + "balance_loss_mlp": 1.00879931, + "epoch": 0.3931459491958515, + "flos": 61004296396800.0, + "grad_norm": 0.8654711586514772, + "language_loss": 0.61946416, + "learning_rate": 2.768918627255683e-06, + "loss": 0.64059925, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.09863281, + "step": 6539, + "time_per_iteration": 2.8049373626708984 + }, + { + "auxiliary_loss_clip": 0.01071715, + "auxiliary_loss_mlp": 0.01064878, + "balance_loss_clip": 1.02892494, + "balance_loss_mlp": 1.02645707, + "epoch": 0.39320607244851946, + "flos": 39014567026560.0, + "grad_norm": 2.5241478292576383, + "language_loss": 0.70182884, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.72319472, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.45117188, + "step": 6540, + "time_per_iteration": 2.592240810394287 + }, + { + "auxiliary_loss_clip": 0.01072617, + "auxiliary_loss_mlp": 0.01058964, + "balance_loss_clip": 1.02622914, + "balance_loss_mlp": 1.02794695, + "epoch": 0.3932661957011874, + "flos": 24678635685120.0, + "grad_norm": 1.659816541787921, + "language_loss": 0.73444378, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.75575966, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4453125, + "step": 6541, + "time_per_iteration": 2.4225590229034424 + }, + { + "auxiliary_loss_clip": 0.01036949, + "auxiliary_loss_mlp": 0.01037312, + "balance_loss_clip": 1.03340161, + "balance_loss_mlp": 1.02602625, + "epoch": 0.3933263189538554, + "flos": 70093375084800.0, + "grad_norm": 0.8324111915141185, + "language_loss": 0.60442382, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62516642, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.0390625, + "router_z_loss_mlp": 0.109375, + "step": 6542, + "time_per_iteration": 2.930344820022583 + }, + { + "auxiliary_loss_clip": 0.01076687, + "auxiliary_loss_mlp": 0.01055091, + "balance_loss_clip": 1.02156878, + "balance_loss_mlp": 1.0320996, + "epoch": 0.39338644220652336, + "flos": 22927985781120.0, + "grad_norm": 1.8288865705365573, + "language_loss": 0.83388746, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.85520518, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.44726562, + "step": 6543, + "time_per_iteration": 2.428105354309082 + }, + { + "auxiliary_loss_clip": 0.01079313, + "auxiliary_loss_mlp": 0.01051058, + "balance_loss_clip": 1.01860917, + "balance_loss_mlp": 1.03357482, + "epoch": 0.3934465654591913, + "flos": 30846810873600.0, + "grad_norm": 1.7394342392752344, + "language_loss": 0.70243013, + "learning_rate": 2.767120621015908e-06, + "loss": 0.72373384, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45703125, + "step": 6544, + "time_per_iteration": 3.9572207927703857 + }, + { + "auxiliary_loss_clip": 0.01085088, + "auxiliary_loss_mlp": 0.01065572, + "balance_loss_clip": 1.02940428, + "balance_loss_mlp": 1.03706145, + "epoch": 0.3935066887118593, + "flos": 29235394938240.0, + "grad_norm": 1.8536981061983677, + "language_loss": 0.77226883, + "learning_rate": 2.76676093244553e-06, + "loss": 0.7937755, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 0.48046875, + "step": 6545, + "time_per_iteration": 3.8932409286499023 + }, + { + "auxiliary_loss_clip": 0.01079897, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_clip": 1.0324235, + "balance_loss_mlp": 1.03780901, + "epoch": 0.3935668119645273, + "flos": 19134288403200.0, + "grad_norm": 1.4925852760802736, + "language_loss": 0.75923312, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.78064024, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 6546, + "time_per_iteration": 2.419640302658081 + }, + { + "auxiliary_loss_clip": 0.01088462, + "auxiliary_loss_mlp": 0.01069903, + "balance_loss_clip": 1.03547525, + "balance_loss_mlp": 1.03926778, + "epoch": 0.3936269352171953, + "flos": 18515103707520.0, + "grad_norm": 1.6171623248792848, + "language_loss": 0.82558358, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.84716725, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.4921875, + "step": 6547, + "time_per_iteration": 2.447960376739502 + }, + { + "auxiliary_loss_clip": 0.01082476, + "auxiliary_loss_mlp": 0.0107131, + "balance_loss_clip": 1.04048276, + "balance_loss_mlp": 1.0359056, + "epoch": 0.39368705846986324, + "flos": 15631906343040.0, + "grad_norm": 1.6847975719523272, + "language_loss": 0.85392725, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.87546515, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.46484375, + "step": 6548, + "time_per_iteration": 3.834223508834839 + }, + { + "auxiliary_loss_clip": 0.01082718, + "auxiliary_loss_mlp": 0.01065795, + "balance_loss_clip": 1.03683925, + "balance_loss_mlp": 1.03807116, + "epoch": 0.3937471817225312, + "flos": 21324739104000.0, + "grad_norm": 1.697264349363501, + "language_loss": 0.73799264, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.75947785, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4453125, + "step": 6549, + "time_per_iteration": 3.9004571437835693 + }, + { + "auxiliary_loss_clip": 0.01082471, + "auxiliary_loss_mlp": 0.01081933, + "balance_loss_clip": 1.04819679, + "balance_loss_mlp": 1.03641891, + "epoch": 0.39380730497519917, + "flos": 20775660151680.0, + "grad_norm": 1.5486394088261988, + "language_loss": 0.78909743, + "learning_rate": 2.764962053731699e-06, + "loss": 0.81074148, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.4609375, + "step": 6550, + "time_per_iteration": 2.5636725425720215 + }, + { + "auxiliary_loss_clip": 0.01078609, + "auxiliary_loss_mlp": 0.01075102, + "balance_loss_clip": 1.04358304, + "balance_loss_mlp": 1.03345394, + "epoch": 0.39386742822786713, + "flos": 21608897592960.0, + "grad_norm": 1.632783132423678, + "language_loss": 0.82528687, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.84682393, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45117188, + "step": 6551, + "time_per_iteration": 2.4190211296081543 + }, + { + "auxiliary_loss_clip": 0.01076897, + "auxiliary_loss_mlp": 0.0107367, + "balance_loss_clip": 1.04231787, + "balance_loss_mlp": 1.03125238, + "epoch": 0.3939275514805351, + "flos": 12414031885440.0, + "grad_norm": 2.7831094535263365, + "language_loss": 0.81305736, + "learning_rate": 2.764242299098596e-06, + "loss": 0.83456302, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45703125, + "step": 6552, + "time_per_iteration": 2.4468345642089844 + }, + { + "auxiliary_loss_clip": 0.01076632, + "auxiliary_loss_mlp": 0.01074589, + "balance_loss_clip": 1.04368973, + "balance_loss_mlp": 1.03079081, + "epoch": 0.39398767473320306, + "flos": 18551029363200.0, + "grad_norm": 1.6505830511648856, + "language_loss": 0.72442591, + "learning_rate": 2.763882378305003e-06, + "loss": 0.74593806, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 6553, + "time_per_iteration": 2.4525063037872314 + }, + { + "auxiliary_loss_clip": 0.01071137, + "auxiliary_loss_mlp": 0.01067366, + "balance_loss_clip": 1.03780234, + "balance_loss_mlp": 1.02853072, + "epoch": 0.39404779798587103, + "flos": 29307769920000.0, + "grad_norm": 1.7805703118469085, + "language_loss": 0.6557681, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.67715317, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42578125, + "step": 6554, + "time_per_iteration": 2.5265610218048096 + }, + { + "auxiliary_loss_clip": 0.01071209, + "auxiliary_loss_mlp": 0.01062809, + "balance_loss_clip": 1.03285122, + "balance_loss_mlp": 1.02812088, + "epoch": 0.394107921238539, + "flos": 34895618622720.0, + "grad_norm": 2.9321606981703665, + "language_loss": 0.80692804, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.82826823, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43164062, + "step": 6555, + "time_per_iteration": 2.5256388187408447 + }, + { + "auxiliary_loss_clip": 0.01070059, + "auxiliary_loss_mlp": 0.01057643, + "balance_loss_clip": 1.02455068, + "balance_loss_mlp": 1.02589285, + "epoch": 0.39416804449120696, + "flos": 25080276499200.0, + "grad_norm": 2.0033417116030408, + "language_loss": 0.73507702, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.75635409, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44140625, + "step": 6556, + "time_per_iteration": 2.467241048812866 + }, + { + "auxiliary_loss_clip": 0.01067864, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_clip": 1.01735902, + "balance_loss_mlp": 1.02381575, + "epoch": 0.3942281677438749, + "flos": 32305272675840.0, + "grad_norm": 2.065561051975645, + "language_loss": 0.8508631, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.87201798, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44140625, + "step": 6557, + "time_per_iteration": 2.4944376945495605 + }, + { + "auxiliary_loss_clip": 0.01065514, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.02199924, + "balance_loss_mlp": 1.02270913, + "epoch": 0.3942882909965429, + "flos": 24935456712960.0, + "grad_norm": 2.7669281501085212, + "language_loss": 0.81785136, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.83902991, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4296875, + "step": 6558, + "time_per_iteration": 2.451183319091797 + }, + { + "auxiliary_loss_clip": 0.01064383, + "auxiliary_loss_mlp": 0.01044569, + "balance_loss_clip": 1.01592278, + "balance_loss_mlp": 1.02202511, + "epoch": 0.39434841424921085, + "flos": 11873994975360.0, + "grad_norm": 1.7227336889277227, + "language_loss": 0.72334582, + "learning_rate": 2.761722245724792e-06, + "loss": 0.74443531, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42382812, + "step": 6559, + "time_per_iteration": 2.3705270290374756 + }, + { + "auxiliary_loss_clip": 0.01068841, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.01732707, + "balance_loss_mlp": 1.02288556, + "epoch": 0.3944085375018789, + "flos": 16360718307840.0, + "grad_norm": 1.9595917626263695, + "language_loss": 0.82373452, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.84491777, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45898438, + "step": 6560, + "time_per_iteration": 2.3939342498779297 + }, + { + "auxiliary_loss_clip": 0.0106892, + "auxiliary_loss_mlp": 0.01047847, + "balance_loss_clip": 1.01717424, + "balance_loss_mlp": 1.02552021, + "epoch": 0.39446866075454684, + "flos": 10632623207040.0, + "grad_norm": 2.4484280641843235, + "language_loss": 0.8492763, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.870444, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43359375, + "step": 6561, + "time_per_iteration": 2.373145818710327 + }, + { + "auxiliary_loss_clip": 0.01066775, + "auxiliary_loss_mlp": 0.01045742, + "balance_loss_clip": 1.01601124, + "balance_loss_mlp": 1.02282631, + "epoch": 0.3945287840072148, + "flos": 18186501191040.0, + "grad_norm": 2.3351495841689793, + "language_loss": 0.80979526, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.83092046, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43945312, + "step": 6562, + "time_per_iteration": 2.4006145000457764 + }, + { + "auxiliary_loss_clip": 0.01063859, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_clip": 1.01783442, + "balance_loss_mlp": 1.02289224, + "epoch": 0.39458890725988277, + "flos": 23038765125120.0, + "grad_norm": 1.5439078273832556, + "language_loss": 0.8278231, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.84893703, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41015625, + "step": 6563, + "time_per_iteration": 2.418553352355957 + }, + { + "auxiliary_loss_clip": 0.01066371, + "auxiliary_loss_mlp": 0.01050327, + "balance_loss_clip": 1.01998782, + "balance_loss_mlp": 1.02343273, + "epoch": 0.39464903051255074, + "flos": 17158274472960.0, + "grad_norm": 2.9893835273915235, + "language_loss": 0.71003211, + "learning_rate": 2.759921340790127e-06, + "loss": 0.73119909, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4296875, + "step": 6564, + "time_per_iteration": 2.454163074493408 + }, + { + "auxiliary_loss_clip": 0.01070352, + "auxiliary_loss_mlp": 0.01062002, + "balance_loss_clip": 1.02867138, + "balance_loss_mlp": 1.0244534, + "epoch": 0.3947091537652187, + "flos": 15888064055040.0, + "grad_norm": 2.1402695405660563, + "language_loss": 0.84875494, + "learning_rate": 2.759561073299676e-06, + "loss": 0.87007856, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.45898438, + "step": 6565, + "time_per_iteration": 2.3947956562042236 + }, + { + "auxiliary_loss_clip": 0.01070209, + "auxiliary_loss_mlp": 0.0104877, + "balance_loss_clip": 1.01971853, + "balance_loss_mlp": 1.02660036, + "epoch": 0.39476927701788667, + "flos": 18544675495680.0, + "grad_norm": 1.817875420176955, + "language_loss": 0.84947002, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.87065983, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4375, + "step": 6566, + "time_per_iteration": 2.40604829788208 + }, + { + "auxiliary_loss_clip": 0.01072871, + "auxiliary_loss_mlp": 0.01061948, + "balance_loss_clip": 1.02922487, + "balance_loss_mlp": 1.02579725, + "epoch": 0.39482940027055463, + "flos": 22274551175040.0, + "grad_norm": 1.792203061019, + "language_loss": 0.78539824, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.80674648, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.47070312, + "step": 6567, + "time_per_iteration": 2.4227511882781982 + }, + { + "auxiliary_loss_clip": 0.01067169, + "auxiliary_loss_mlp": 0.01043774, + "balance_loss_clip": 1.01516354, + "balance_loss_mlp": 1.0261308, + "epoch": 0.3948895235232226, + "flos": 14756738492160.0, + "grad_norm": 1.6791064890871699, + "language_loss": 0.80573606, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82684547, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 6568, + "time_per_iteration": 2.4492266178131104 + }, + { + "auxiliary_loss_clip": 0.01068932, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_clip": 1.01575983, + "balance_loss_mlp": 1.02648163, + "epoch": 0.39494964677589056, + "flos": 22564644595200.0, + "grad_norm": 1.6943293293798425, + "language_loss": 0.85734987, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.87849295, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42578125, + "step": 6569, + "time_per_iteration": 2.4201362133026123 + }, + { + "auxiliary_loss_clip": 0.01069003, + "auxiliary_loss_mlp": 0.01043649, + "balance_loss_clip": 1.01658821, + "balance_loss_mlp": 1.02787292, + "epoch": 0.3950097700285585, + "flos": 22962165868800.0, + "grad_norm": 2.083954393445318, + "language_loss": 0.75638384, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.7775104, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 6570, + "time_per_iteration": 2.4737815856933594 + }, + { + "auxiliary_loss_clip": 0.01069484, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.01543772, + "balance_loss_mlp": 1.02606809, + "epoch": 0.3950698932812265, + "flos": 20594181571200.0, + "grad_norm": 1.7439517526373394, + "language_loss": 0.81426352, + "learning_rate": 2.757398863979922e-06, + "loss": 0.83540505, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 6571, + "time_per_iteration": 2.3850979804992676 + }, + { + "auxiliary_loss_clip": 0.0106694, + "auxiliary_loss_mlp": 0.01049727, + "balance_loss_clip": 1.02085412, + "balance_loss_mlp": 1.02534926, + "epoch": 0.39513001653389446, + "flos": 20374752476160.0, + "grad_norm": 1.6006710131683901, + "language_loss": 0.79383641, + "learning_rate": 2.757038395157997e-06, + "loss": 0.8150031, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41601562, + "step": 6572, + "time_per_iteration": 2.456850528717041 + }, + { + "auxiliary_loss_clip": 0.01068801, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.01894605, + "balance_loss_mlp": 1.02446175, + "epoch": 0.3951901397865625, + "flos": 26462592892800.0, + "grad_norm": 1.5860161399962769, + "language_loss": 0.75712323, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77830535, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44335938, + "step": 6573, + "time_per_iteration": 2.4480621814727783 + }, + { + "auxiliary_loss_clip": 0.01065861, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_clip": 1.01769054, + "balance_loss_mlp": 1.02328563, + "epoch": 0.39525026303923044, + "flos": 43836595856640.0, + "grad_norm": 1.4404098466532091, + "language_loss": 0.68860579, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70970833, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.42578125, + "step": 6574, + "time_per_iteration": 2.6452977657318115 + }, + { + "auxiliary_loss_clip": 0.01067, + "auxiliary_loss_mlp": 0.01046943, + "balance_loss_clip": 1.01740265, + "balance_loss_mlp": 1.0232861, + "epoch": 0.3953103862918984, + "flos": 18039831102720.0, + "grad_norm": 2.6816395077370307, + "language_loss": 0.72924733, + "learning_rate": 2.755956816505072e-06, + "loss": 0.75038677, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4375, + "step": 6575, + "time_per_iteration": 2.3808021545410156 + }, + { + "auxiliary_loss_clip": 0.01065391, + "auxiliary_loss_mlp": 0.01053307, + "balance_loss_clip": 1.0223608, + "balance_loss_mlp": 1.02096105, + "epoch": 0.3953705095445664, + "flos": 16975259792640.0, + "grad_norm": 1.9528032543563638, + "language_loss": 0.74665582, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.76784283, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44335938, + "step": 6576, + "time_per_iteration": 2.379528522491455 + }, + { + "auxiliary_loss_clip": 0.01063702, + "auxiliary_loss_mlp": 0.01046341, + "balance_loss_clip": 1.01767135, + "balance_loss_mlp": 1.02088308, + "epoch": 0.39543063279723434, + "flos": 17410452289920.0, + "grad_norm": 2.7240335072642594, + "language_loss": 0.85033059, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.87143099, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4296875, + "step": 6577, + "time_per_iteration": 2.3428261280059814 + }, + { + "auxiliary_loss_clip": 0.01065288, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_clip": 1.01478398, + "balance_loss_mlp": 1.02270341, + "epoch": 0.3954907560499023, + "flos": 22783096172160.0, + "grad_norm": 2.5255267154004595, + "language_loss": 0.92764366, + "learning_rate": 2.75487497985853e-06, + "loss": 0.94874835, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42578125, + "step": 6578, + "time_per_iteration": 2.409799337387085 + }, + { + "auxiliary_loss_clip": 0.01067617, + "auxiliary_loss_mlp": 0.01050018, + "balance_loss_clip": 1.01625824, + "balance_loss_mlp": 1.02160263, + "epoch": 0.39555087930257027, + "flos": 21943330306560.0, + "grad_norm": 2.1172474247901523, + "language_loss": 0.79952013, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.82069647, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.4609375, + "step": 6579, + "time_per_iteration": 2.406332492828369 + }, + { + "auxiliary_loss_clip": 0.01064794, + "auxiliary_loss_mlp": 0.01046765, + "balance_loss_clip": 1.01609206, + "balance_loss_mlp": 1.02042818, + "epoch": 0.39561100255523823, + "flos": 20403800593920.0, + "grad_norm": 2.3676734356794813, + "language_loss": 0.69703901, + "learning_rate": 2.754153612280037e-06, + "loss": 0.71815467, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44335938, + "step": 6580, + "time_per_iteration": 2.3831260204315186 + }, + { + "auxiliary_loss_clip": 0.0106254, + "auxiliary_loss_mlp": 0.01045415, + "balance_loss_clip": 1.01406264, + "balance_loss_mlp": 1.01950717, + "epoch": 0.3956711258079062, + "flos": 27963334218240.0, + "grad_norm": 1.7621189519290987, + "language_loss": 0.59903175, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.62011129, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4296875, + "step": 6581, + "time_per_iteration": 2.4207732677459717 + }, + { + "auxiliary_loss_clip": 0.01065278, + "auxiliary_loss_mlp": 0.01044462, + "balance_loss_clip": 1.01322925, + "balance_loss_mlp": 1.02157676, + "epoch": 0.39573124906057416, + "flos": 14427437748480.0, + "grad_norm": 1.7655546588486442, + "language_loss": 0.71159625, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.73269367, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4375, + "step": 6582, + "time_per_iteration": 2.3809916973114014 + }, + { + "auxiliary_loss_clip": 0.01063624, + "auxiliary_loss_mlp": 0.01050858, + "balance_loss_clip": 1.01811123, + "balance_loss_mlp": 1.01963735, + "epoch": 0.39579137231324213, + "flos": 18732717411840.0, + "grad_norm": 1.8816742438369298, + "language_loss": 0.78480732, + "learning_rate": 2.753071346464642e-06, + "loss": 0.80595219, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43945312, + "step": 6583, + "time_per_iteration": 3.7931673526763916 + }, + { + "auxiliary_loss_clip": 0.01061317, + "auxiliary_loss_mlp": 0.01051703, + "balance_loss_clip": 1.02194881, + "balance_loss_mlp": 1.01921272, + "epoch": 0.3958514955659101, + "flos": 17675442576000.0, + "grad_norm": 1.9834482023032796, + "language_loss": 0.67008197, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.69121218, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 6584, + "time_per_iteration": 3.7370786666870117 + }, + { + "auxiliary_loss_clip": 0.01065446, + "auxiliary_loss_mlp": 0.01052741, + "balance_loss_clip": 1.01855195, + "balance_loss_mlp": 1.01965845, + "epoch": 0.39591161881857806, + "flos": 29307979388160.0, + "grad_norm": 2.1315734869060146, + "language_loss": 0.73751235, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.75869423, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.45898438, + "step": 6585, + "time_per_iteration": 2.417539596557617 + }, + { + "auxiliary_loss_clip": 0.01063896, + "auxiliary_loss_mlp": 0.01048317, + "balance_loss_clip": 1.01722777, + "balance_loss_mlp": 1.01979494, + "epoch": 0.3959717420712461, + "flos": 25770753924480.0, + "grad_norm": 1.745163099221485, + "language_loss": 0.73861855, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75974071, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44140625, + "step": 6586, + "time_per_iteration": 2.4298925399780273 + }, + { + "auxiliary_loss_clip": 0.01063657, + "auxiliary_loss_mlp": 0.01051219, + "balance_loss_clip": 1.01836538, + "balance_loss_mlp": 1.01952624, + "epoch": 0.39603186532391405, + "flos": 20922714264960.0, + "grad_norm": 1.6030298734055297, + "language_loss": 0.72658145, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.74773026, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44140625, + "step": 6587, + "time_per_iteration": 3.851867914199829 + }, + { + "auxiliary_loss_clip": 0.01020052, + "auxiliary_loss_mlp": 0.01015401, + "balance_loss_clip": 1.01125228, + "balance_loss_mlp": 1.00839615, + "epoch": 0.396091988576582, + "flos": 54878261086080.0, + "grad_norm": 0.8640067631226273, + "language_loss": 0.61373407, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63408858, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.04150391, + "router_z_loss_mlp": 0.11621094, + "step": 6588, + "time_per_iteration": 2.842393636703491 + }, + { + "auxiliary_loss_clip": 0.0106439, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_clip": 1.01215315, + "balance_loss_mlp": 1.02043581, + "epoch": 0.39615211182925, + "flos": 20701888715520.0, + "grad_norm": 1.6630146949221762, + "language_loss": 0.82335514, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.84444296, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.43945312, + "step": 6589, + "time_per_iteration": 3.83420729637146 + }, + { + "auxiliary_loss_clip": 0.01063081, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.01344395, + "balance_loss_mlp": 1.01952791, + "epoch": 0.39621223508191794, + "flos": 20993308767360.0, + "grad_norm": 3.4851546388631713, + "language_loss": 0.72603256, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.74712014, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.43554688, + "step": 6590, + "time_per_iteration": 2.3977856636047363 + }, + { + "auxiliary_loss_clip": 0.01064059, + "auxiliary_loss_mlp": 0.01056332, + "balance_loss_clip": 1.0261246, + "balance_loss_mlp": 1.02020049, + "epoch": 0.3962723583345859, + "flos": 23367681843840.0, + "grad_norm": 1.7348302769586301, + "language_loss": 0.76967812, + "learning_rate": 2.750184048805956e-06, + "loss": 0.79088199, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4375, + "step": 6591, + "time_per_iteration": 2.412048816680908 + }, + { + "auxiliary_loss_clip": 0.01064102, + "auxiliary_loss_mlp": 0.01057956, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.02006912, + "epoch": 0.39633248158725387, + "flos": 25114526409600.0, + "grad_norm": 1.6818799987233088, + "language_loss": 0.7962997, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81752032, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43945312, + "step": 6592, + "time_per_iteration": 2.4393246173858643 + }, + { + "auxiliary_loss_clip": 0.0105936, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.01377559, + "balance_loss_mlp": 1.01853168, + "epoch": 0.39639260483992184, + "flos": 39786007628160.0, + "grad_norm": 1.6995618594594704, + "language_loss": 0.69781876, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71881998, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 6593, + "time_per_iteration": 2.5917296409606934 + }, + { + "auxiliary_loss_clip": 0.01064913, + "auxiliary_loss_mlp": 0.01055266, + "balance_loss_clip": 1.02265, + "balance_loss_mlp": 1.0190289, + "epoch": 0.3964527280925898, + "flos": 17346106920960.0, + "grad_norm": 1.6058404737130039, + "language_loss": 0.79207981, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.81328166, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.45898438, + "step": 6594, + "time_per_iteration": 2.403973340988159 + }, + { + "auxiliary_loss_clip": 0.01018565, + "auxiliary_loss_mlp": 0.01003331, + "balance_loss_clip": 0.99937326, + "balance_loss_mlp": 1.00725555, + "epoch": 0.39651285134525777, + "flos": 71714182176000.0, + "grad_norm": 0.9521354805012647, + "language_loss": 0.63132745, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65154636, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.11328125, + "step": 6595, + "time_per_iteration": 3.067514181137085 + }, + { + "auxiliary_loss_clip": 0.0106516, + "auxiliary_loss_mlp": 0.01055747, + "balance_loss_clip": 1.02105713, + "balance_loss_mlp": 1.0193845, + "epoch": 0.39657297459792573, + "flos": 25774524351360.0, + "grad_norm": 2.888005343230197, + "language_loss": 0.65235054, + "learning_rate": 2.748378562795223e-06, + "loss": 0.67355967, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.45703125, + "step": 6596, + "time_per_iteration": 2.483163595199585 + }, + { + "auxiliary_loss_clip": 0.01061756, + "auxiliary_loss_mlp": 0.01059854, + "balance_loss_clip": 1.02800155, + "balance_loss_mlp": 1.01951718, + "epoch": 0.3966330978505937, + "flos": 20265090295680.0, + "grad_norm": 2.1416483215073843, + "language_loss": 0.80095077, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.82216686, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.421875, + "step": 6597, + "time_per_iteration": 2.368825674057007 + }, + { + "auxiliary_loss_clip": 0.01065532, + "auxiliary_loss_mlp": 0.01050178, + "balance_loss_clip": 1.01689446, + "balance_loss_mlp": 1.02022827, + "epoch": 0.39669322110326166, + "flos": 20630142138240.0, + "grad_norm": 2.0175675053984925, + "language_loss": 0.69873852, + "learning_rate": 2.747656169644941e-06, + "loss": 0.71989566, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.453125, + "step": 6598, + "time_per_iteration": 2.398622512817383 + }, + { + "auxiliary_loss_clip": 0.01062887, + "auxiliary_loss_mlp": 0.01049057, + "balance_loss_clip": 1.02097154, + "balance_loss_mlp": 1.01974797, + "epoch": 0.3967533443559297, + "flos": 21724983463680.0, + "grad_norm": 1.6927891508059618, + "language_loss": 0.80700386, + "learning_rate": 2.747294930536157e-06, + "loss": 0.82812333, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.43164062, + "step": 6599, + "time_per_iteration": 2.4118435382843018 + }, + { + "auxiliary_loss_clip": 0.01062633, + "auxiliary_loss_mlp": 0.01045232, + "balance_loss_clip": 1.01156747, + "balance_loss_mlp": 1.01963603, + "epoch": 0.39681346760859765, + "flos": 25482964654080.0, + "grad_norm": 2.403904920255821, + "language_loss": 0.73786384, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75894248, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4296875, + "step": 6600, + "time_per_iteration": 2.445113182067871 + }, + { + "auxiliary_loss_clip": 0.01061558, + "auxiliary_loss_mlp": 0.01044575, + "balance_loss_clip": 1.0151782, + "balance_loss_mlp": 1.01867366, + "epoch": 0.3968735908612656, + "flos": 20958535186560.0, + "grad_norm": 2.522297849904298, + "language_loss": 0.87663019, + "learning_rate": 2.746572367319791e-06, + "loss": 0.89769149, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 6601, + "time_per_iteration": 2.397015333175659 + }, + { + "auxiliary_loss_clip": 0.01067934, + "auxiliary_loss_mlp": 0.01052573, + "balance_loss_clip": 1.01683438, + "balance_loss_mlp": 1.0208385, + "epoch": 0.3969337141139336, + "flos": 10706324820480.0, + "grad_norm": 2.05385878219445, + "language_loss": 0.72069091, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.74189597, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.47070312, + "step": 6602, + "time_per_iteration": 2.3832638263702393 + }, + { + "auxiliary_loss_clip": 0.01061466, + "auxiliary_loss_mlp": 0.01055742, + "balance_loss_clip": 1.02222013, + "balance_loss_mlp": 1.01904726, + "epoch": 0.39699383736660154, + "flos": 17593013122560.0, + "grad_norm": 2.529332073398103, + "language_loss": 0.8586241, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.87979615, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.42382812, + "step": 6603, + "time_per_iteration": 2.3587827682495117 + }, + { + "auxiliary_loss_clip": 0.01058196, + "auxiliary_loss_mlp": 0.01044211, + "balance_loss_clip": 1.0159111, + "balance_loss_mlp": 1.01753426, + "epoch": 0.3970539606192695, + "flos": 17784965111040.0, + "grad_norm": 1.5095053369022962, + "language_loss": 0.74195206, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.76297605, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 6604, + "time_per_iteration": 2.4264731407165527 + }, + { + "auxiliary_loss_clip": 0.01057908, + "auxiliary_loss_mlp": 0.01041742, + "balance_loss_clip": 1.01292944, + "balance_loss_mlp": 1.01868033, + "epoch": 0.3971140838719375, + "flos": 24788367688320.0, + "grad_norm": 1.5797461932060621, + "language_loss": 0.83840752, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85940397, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.39257812, + "step": 6605, + "time_per_iteration": 2.448962926864624 + }, + { + "auxiliary_loss_clip": 0.01058024, + "auxiliary_loss_mlp": 0.01042564, + "balance_loss_clip": 1.01537263, + "balance_loss_mlp": 1.01738191, + "epoch": 0.39717420712460544, + "flos": 24242430758400.0, + "grad_norm": 1.539654948755483, + "language_loss": 0.75507736, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.77608323, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 6606, + "time_per_iteration": 2.4119200706481934 + }, + { + "auxiliary_loss_clip": 0.01062614, + "auxiliary_loss_mlp": 0.01045223, + "balance_loss_clip": 1.01517034, + "balance_loss_mlp": 1.01998138, + "epoch": 0.3972343303772734, + "flos": 25883523216000.0, + "grad_norm": 1.6809790421586417, + "language_loss": 0.74940026, + "learning_rate": 2.744403998666805e-06, + "loss": 0.77047861, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42578125, + "step": 6607, + "time_per_iteration": 2.412935733795166 + }, + { + "auxiliary_loss_clip": 0.01062384, + "auxiliary_loss_mlp": 0.01050068, + "balance_loss_clip": 1.02052832, + "balance_loss_mlp": 1.01872778, + "epoch": 0.39729445362994137, + "flos": 45621984430080.0, + "grad_norm": 1.5057463649613718, + "language_loss": 0.69050252, + "learning_rate": 2.744042505013797e-06, + "loss": 0.71162701, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4375, + "step": 6608, + "time_per_iteration": 2.622328758239746 + }, + { + "auxiliary_loss_clip": 0.01065055, + "auxiliary_loss_mlp": 0.0105457, + "balance_loss_clip": 1.02041674, + "balance_loss_mlp": 1.01983213, + "epoch": 0.39735457688260933, + "flos": 20192924782080.0, + "grad_norm": 1.8150786087737145, + "language_loss": 0.75552428, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.77672052, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.453125, + "step": 6609, + "time_per_iteration": 2.3879308700561523 + }, + { + "auxiliary_loss_clip": 0.01061251, + "auxiliary_loss_mlp": 0.0104455, + "balance_loss_clip": 1.01540375, + "balance_loss_mlp": 1.01859832, + "epoch": 0.3974147001352773, + "flos": 23330045531520.0, + "grad_norm": 1.487290648028129, + "language_loss": 0.73015839, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.75121641, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 6610, + "time_per_iteration": 2.405268907546997 + }, + { + "auxiliary_loss_clip": 0.01057412, + "auxiliary_loss_mlp": 0.0103986, + "balance_loss_clip": 1.01324058, + "balance_loss_mlp": 1.01736987, + "epoch": 0.39747482338794526, + "flos": 21687591530880.0, + "grad_norm": 1.6675355648339696, + "language_loss": 0.79952741, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.82050014, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40039062, + "step": 6611, + "time_per_iteration": 2.4055910110473633 + }, + { + "auxiliary_loss_clip": 0.0106243, + "auxiliary_loss_mlp": 0.01047336, + "balance_loss_clip": 1.01843989, + "balance_loss_mlp": 1.01948881, + "epoch": 0.3975349466406133, + "flos": 30987511119360.0, + "grad_norm": 1.7960885725342326, + "language_loss": 0.80192322, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.82302088, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 6612, + "time_per_iteration": 2.470108985900879 + }, + { + "auxiliary_loss_clip": 0.01016214, + "auxiliary_loss_mlp": 0.0100397, + "balance_loss_clip": 1.00025082, + "balance_loss_mlp": 1.00533175, + "epoch": 0.39759506989328125, + "flos": 63680702578560.0, + "grad_norm": 0.8485069416470725, + "language_loss": 0.65178192, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67198372, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.10888672, + "step": 6613, + "time_per_iteration": 2.9407365322113037 + }, + { + "auxiliary_loss_clip": 0.01060653, + "auxiliary_loss_mlp": 0.01047536, + "balance_loss_clip": 1.01867509, + "balance_loss_mlp": 1.01868594, + "epoch": 0.3976551931459492, + "flos": 23694713349120.0, + "grad_norm": 2.2888343184325515, + "language_loss": 0.73217654, + "learning_rate": 2.741872951078109e-06, + "loss": 0.75325841, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 6614, + "time_per_iteration": 2.409714698791504 + }, + { + "auxiliary_loss_clip": 0.01061139, + "auxiliary_loss_mlp": 0.01045096, + "balance_loss_clip": 1.01623535, + "balance_loss_mlp": 1.01927805, + "epoch": 0.3977153163986172, + "flos": 15668739694080.0, + "grad_norm": 1.649331351457274, + "language_loss": 0.82666487, + "learning_rate": 2.741511260213862e-06, + "loss": 0.84772718, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 6615, + "time_per_iteration": 2.3759772777557373 + }, + { + "auxiliary_loss_clip": 0.01058492, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.01095414, + "balance_loss_mlp": 1.01707745, + "epoch": 0.39777543965128515, + "flos": 14063817271680.0, + "grad_norm": 2.112959454538708, + "language_loss": 0.69551677, + "learning_rate": 2.741149541231434e-06, + "loss": 0.71649456, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41210938, + "step": 6616, + "time_per_iteration": 2.347687244415283 + }, + { + "auxiliary_loss_clip": 0.01063002, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.01765847, + "balance_loss_mlp": 1.01954603, + "epoch": 0.3978355629039531, + "flos": 23366355212160.0, + "grad_norm": 2.155867540555182, + "language_loss": 0.86156642, + "learning_rate": 2.740787794144541e-06, + "loss": 0.88268393, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43554688, + "step": 6617, + "time_per_iteration": 2.421766996383667 + }, + { + "auxiliary_loss_clip": 0.01057197, + "auxiliary_loss_mlp": 0.01048302, + "balance_loss_clip": 1.02226651, + "balance_loss_mlp": 1.01773536, + "epoch": 0.3978956861566211, + "flos": 19061773776000.0, + "grad_norm": 1.7062232940647544, + "language_loss": 0.73496479, + "learning_rate": 2.7404260189669e-06, + "loss": 0.75601977, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 6618, + "time_per_iteration": 2.3934054374694824 + }, + { + "auxiliary_loss_clip": 0.01060914, + "auxiliary_loss_mlp": 0.01042497, + "balance_loss_clip": 1.01257575, + "balance_loss_mlp": 1.01908362, + "epoch": 0.39795580940928904, + "flos": 30226369368960.0, + "grad_norm": 1.6438761070863968, + "language_loss": 0.67033458, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6913687, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41796875, + "step": 6619, + "time_per_iteration": 2.481961250305176 + }, + { + "auxiliary_loss_clip": 0.01014121, + "auxiliary_loss_mlp": 0.01016175, + "balance_loss_clip": 1.01269388, + "balance_loss_mlp": 1.00349927, + "epoch": 0.398015932661957, + "flos": 69843885442560.0, + "grad_norm": 0.7744232988815002, + "language_loss": 0.58320642, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60350931, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.03491211, + "router_z_loss_mlp": 0.10644531, + "step": 6620, + "time_per_iteration": 2.9675285816192627 + }, + { + "auxiliary_loss_clip": 0.01059901, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.01565564, + "balance_loss_mlp": 1.01853514, + "epoch": 0.39807605591462497, + "flos": 20156719835520.0, + "grad_norm": 1.5353362550086451, + "language_loss": 0.80468434, + "learning_rate": 2.739340525026686e-06, + "loss": 0.82569987, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.4140625, + "step": 6621, + "time_per_iteration": 2.3846840858459473 + }, + { + "auxiliary_loss_clip": 0.01060415, + "auxiliary_loss_mlp": 0.01042116, + "balance_loss_clip": 1.01430392, + "balance_loss_mlp": 1.01901197, + "epoch": 0.39813617916729294, + "flos": 21140711994240.0, + "grad_norm": 1.8014180249784413, + "language_loss": 0.8001368, + "learning_rate": 2.738978637623252e-06, + "loss": 0.8211621, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 6622, + "time_per_iteration": 2.3750078678131104 + }, + { + "auxiliary_loss_clip": 0.01060166, + "auxiliary_loss_mlp": 0.01044838, + "balance_loss_clip": 1.01501179, + "balance_loss_mlp": 1.0180825, + "epoch": 0.3981963024199609, + "flos": 18987513580800.0, + "grad_norm": 1.7673162268155533, + "language_loss": 0.76781547, + "learning_rate": 2.738616722197674e-06, + "loss": 0.78886551, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.421875, + "step": 6623, + "time_per_iteration": 3.832249402999878 + }, + { + "auxiliary_loss_clip": 0.01060062, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.01799536, + "balance_loss_mlp": 1.01857662, + "epoch": 0.39825642567262887, + "flos": 16574352117120.0, + "grad_norm": 1.7729362276481357, + "language_loss": 0.81874955, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.83982778, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41601562, + "step": 6624, + "time_per_iteration": 3.7764759063720703 + }, + { + "auxiliary_loss_clip": 0.01064795, + "auxiliary_loss_mlp": 0.01054688, + "balance_loss_clip": 1.02195334, + "balance_loss_mlp": 1.02065337, + "epoch": 0.39831654892529683, + "flos": 22198754880000.0, + "grad_norm": 2.372502480895591, + "language_loss": 0.8499375, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.87113237, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44140625, + "step": 6625, + "time_per_iteration": 2.3927953243255615 + }, + { + "auxiliary_loss_clip": 0.01060421, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.01719284, + "balance_loss_mlp": 1.01901603, + "epoch": 0.39837667217796485, + "flos": 10487209927680.0, + "grad_norm": 2.0360359494721583, + "language_loss": 0.88454735, + "learning_rate": 2.737530807925321e-06, + "loss": 0.90560693, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4140625, + "step": 6626, + "time_per_iteration": 3.7187352180480957 + }, + { + "auxiliary_loss_clip": 0.01062255, + "auxiliary_loss_mlp": 0.01043573, + "balance_loss_clip": 1.01181602, + "balance_loss_mlp": 1.01987875, + "epoch": 0.3984367954306328, + "flos": 17964383921280.0, + "grad_norm": 2.33475797686895, + "language_loss": 0.84539515, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86645347, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42382812, + "step": 6627, + "time_per_iteration": 2.368659257888794 + }, + { + "auxiliary_loss_clip": 0.01059556, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_clip": 1.01838541, + "balance_loss_mlp": 1.01868248, + "epoch": 0.3984969186833008, + "flos": 22709953140480.0, + "grad_norm": 1.5788651830278866, + "language_loss": 0.84025139, + "learning_rate": 2.736806725217998e-06, + "loss": 0.86131024, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41015625, + "step": 6628, + "time_per_iteration": 2.3978657722473145 + }, + { + "auxiliary_loss_clip": 0.01061941, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.01574898, + "balance_loss_mlp": 1.01930296, + "epoch": 0.39855704193596875, + "flos": 23404619928960.0, + "grad_norm": 1.69412254065451, + "language_loss": 0.7223711, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.74343109, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42578125, + "step": 6629, + "time_per_iteration": 3.824693441390991 + }, + { + "auxiliary_loss_clip": 0.0105923, + "auxiliary_loss_mlp": 0.01045725, + "balance_loss_clip": 1.01630449, + "balance_loss_mlp": 1.01898694, + "epoch": 0.3986171651886367, + "flos": 21250862933760.0, + "grad_norm": 1.6238591893026357, + "language_loss": 0.81588125, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.83693081, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40234375, + "step": 6630, + "time_per_iteration": 2.385049343109131 + }, + { + "auxiliary_loss_clip": 0.01062793, + "auxiliary_loss_mlp": 0.01042597, + "balance_loss_clip": 1.0134145, + "balance_loss_mlp": 1.02047205, + "epoch": 0.3986772884413047, + "flos": 12457882419840.0, + "grad_norm": 1.8881750720265338, + "language_loss": 0.75819683, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77925068, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42382812, + "step": 6631, + "time_per_iteration": 2.4107179641723633 + }, + { + "auxiliary_loss_clip": 0.01061633, + "auxiliary_loss_mlp": 0.01046131, + "balance_loss_clip": 1.01609027, + "balance_loss_mlp": 1.01885915, + "epoch": 0.39873741169397264, + "flos": 19645102638720.0, + "grad_norm": 1.7429534159278208, + "language_loss": 0.72554779, + "learning_rate": 2.735358224635783e-06, + "loss": 0.74662542, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42773438, + "step": 6632, + "time_per_iteration": 2.3726062774658203 + }, + { + "auxiliary_loss_clip": 0.01059155, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_clip": 1.01495862, + "balance_loss_mlp": 1.01751578, + "epoch": 0.3987975349466406, + "flos": 21683821104000.0, + "grad_norm": 1.8342556236667038, + "language_loss": 0.76040411, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.78142458, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41601562, + "step": 6633, + "time_per_iteration": 2.389531135559082 + }, + { + "auxiliary_loss_clip": 0.01061003, + "auxiliary_loss_mlp": 0.01041923, + "balance_loss_clip": 1.01368248, + "balance_loss_mlp": 1.01840782, + "epoch": 0.3988576581993086, + "flos": 23912955457920.0, + "grad_norm": 1.8908097974129379, + "language_loss": 0.82414973, + "learning_rate": 2.7346338069806e-06, + "loss": 0.84517902, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42578125, + "step": 6634, + "time_per_iteration": 2.4019882678985596 + }, + { + "auxiliary_loss_clip": 0.01062658, + "auxiliary_loss_mlp": 0.01049663, + "balance_loss_clip": 1.0190506, + "balance_loss_mlp": 1.02001548, + "epoch": 0.39891778145197654, + "flos": 18148934701440.0, + "grad_norm": 1.856461988518296, + "language_loss": 0.76246297, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.78358614, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.42578125, + "step": 6635, + "time_per_iteration": 2.4229767322540283 + }, + { + "auxiliary_loss_clip": 0.01064688, + "auxiliary_loss_mlp": 0.01051002, + "balance_loss_clip": 1.01852894, + "balance_loss_mlp": 1.01884282, + "epoch": 0.3989779047046445, + "flos": 22594356028800.0, + "grad_norm": 2.14017694516777, + "language_loss": 0.67673868, + "learning_rate": 2.733909277895868e-06, + "loss": 0.69789559, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45703125, + "step": 6636, + "time_per_iteration": 2.3729288578033447 + }, + { + "auxiliary_loss_clip": 0.01059258, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.01669466, + "balance_loss_mlp": 1.01782525, + "epoch": 0.39903802795731247, + "flos": 18076245517440.0, + "grad_norm": 1.8355359461641265, + "language_loss": 0.82903039, + "learning_rate": 2.733546971601763e-06, + "loss": 0.8501001, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4140625, + "step": 6637, + "time_per_iteration": 2.40867280960083 + }, + { + "auxiliary_loss_clip": 0.01012921, + "auxiliary_loss_mlp": 0.01004277, + "balance_loss_clip": 1.0010581, + "balance_loss_mlp": 1.00210822, + "epoch": 0.39909815120998043, + "flos": 70437722624640.0, + "grad_norm": 0.7187733083083574, + "language_loss": 0.53270769, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55287969, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.03222656, + "router_z_loss_mlp": 0.10839844, + "step": 6638, + "time_per_iteration": 3.065455198287964 + }, + { + "auxiliary_loss_clip": 0.01063395, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_clip": 1.01495266, + "balance_loss_mlp": 1.0190227, + "epoch": 0.39915827446264845, + "flos": 18548341188480.0, + "grad_norm": 1.4403640887065177, + "language_loss": 0.76329911, + "learning_rate": 2.732822275578769e-06, + "loss": 0.78439879, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4453125, + "step": 6639, + "time_per_iteration": 2.4108400344848633 + }, + { + "auxiliary_loss_clip": 0.01057899, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_clip": 1.01874816, + "balance_loss_mlp": 1.01728892, + "epoch": 0.3992183977153164, + "flos": 29895986373120.0, + "grad_norm": 1.5952850712210533, + "language_loss": 0.77374399, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.79478383, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 6640, + "time_per_iteration": 2.443411350250244 + }, + { + "auxiliary_loss_clip": 0.01061827, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_clip": 1.01482987, + "balance_loss_mlp": 1.01845527, + "epoch": 0.3992785209679844, + "flos": 22563981279360.0, + "grad_norm": 2.3523172322104444, + "language_loss": 0.83485568, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.85592121, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43359375, + "step": 6641, + "time_per_iteration": 2.4087867736816406 + }, + { + "auxiliary_loss_clip": 0.01063632, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.01686502, + "balance_loss_mlp": 1.01995063, + "epoch": 0.39933864422065235, + "flos": 19681656698880.0, + "grad_norm": 2.083821120885025, + "language_loss": 0.78257334, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.80369318, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43554688, + "step": 6642, + "time_per_iteration": 2.361539125442505 + }, + { + "auxiliary_loss_clip": 0.01061425, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_clip": 1.01912725, + "balance_loss_mlp": 1.01841474, + "epoch": 0.3993987674733203, + "flos": 23037403582080.0, + "grad_norm": 2.794269615223449, + "language_loss": 0.74068701, + "learning_rate": 2.731372550178393e-06, + "loss": 0.76179624, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4296875, + "step": 6643, + "time_per_iteration": 2.420633554458618 + }, + { + "auxiliary_loss_clip": 0.01061618, + "auxiliary_loss_mlp": 0.01041254, + "balance_loss_clip": 1.01056993, + "balance_loss_mlp": 1.01819181, + "epoch": 0.3994588907259883, + "flos": 19389817710720.0, + "grad_norm": 1.501398667070195, + "language_loss": 0.67748117, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.69850993, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43359375, + "step": 6644, + "time_per_iteration": 2.3755807876586914 + }, + { + "auxiliary_loss_clip": 0.01059236, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_clip": 1.01526237, + "balance_loss_mlp": 1.01709998, + "epoch": 0.39951901397865625, + "flos": 13733573921280.0, + "grad_norm": 2.129494547185246, + "language_loss": 0.79627711, + "learning_rate": 2.730647521020907e-06, + "loss": 0.81734145, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.41992188, + "step": 6645, + "time_per_iteration": 2.3878259658813477 + }, + { + "auxiliary_loss_clip": 0.0106142, + "auxiliary_loss_mlp": 0.01050201, + "balance_loss_clip": 1.01918268, + "balance_loss_mlp": 1.01861644, + "epoch": 0.3995791372313242, + "flos": 23585330459520.0, + "grad_norm": 1.4852694359003202, + "language_loss": 0.71291232, + "learning_rate": 2.73028496487595e-06, + "loss": 0.73402858, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4296875, + "step": 6646, + "time_per_iteration": 2.408247470855713 + }, + { + "auxiliary_loss_clip": 0.01061675, + "auxiliary_loss_mlp": 0.01045586, + "balance_loss_clip": 1.015414, + "balance_loss_mlp": 1.01782763, + "epoch": 0.3996392604839922, + "flos": 21354974208000.0, + "grad_norm": 1.9324502662986602, + "language_loss": 0.73321855, + "learning_rate": 2.729922381038513e-06, + "loss": 0.75429112, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4375, + "step": 6647, + "time_per_iteration": 2.3924667835235596 + }, + { + "auxiliary_loss_clip": 0.01058236, + "auxiliary_loss_mlp": 0.01044029, + "balance_loss_clip": 1.01408339, + "balance_loss_mlp": 1.01758623, + "epoch": 0.39969938373666014, + "flos": 26031031176960.0, + "grad_norm": 1.4220041875021372, + "language_loss": 0.75240505, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.77342772, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.40625, + "step": 6648, + "time_per_iteration": 2.4563000202178955 + }, + { + "auxiliary_loss_clip": 0.01063399, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.0144316, + "balance_loss_mlp": 1.01935792, + "epoch": 0.3997595069893281, + "flos": 20115452741760.0, + "grad_norm": 1.7164734913746131, + "language_loss": 0.67341286, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.6944961, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 6649, + "time_per_iteration": 2.3850913047790527 + }, + { + "auxiliary_loss_clip": 0.01064868, + "auxiliary_loss_mlp": 0.01051622, + "balance_loss_clip": 1.02123582, + "balance_loss_mlp": 1.02200627, + "epoch": 0.39981963024199607, + "flos": 27782134928640.0, + "grad_norm": 1.6936213033411966, + "language_loss": 0.76815593, + "learning_rate": 2.728834463508826e-06, + "loss": 0.78932077, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4296875, + "step": 6650, + "time_per_iteration": 2.447040557861328 + }, + { + "auxiliary_loss_clip": 0.0106214, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_clip": 1.0142678, + "balance_loss_mlp": 1.01903677, + "epoch": 0.39987975349466404, + "flos": 21943365217920.0, + "grad_norm": 1.4696525256074258, + "language_loss": 0.72997165, + "learning_rate": 2.728471769038975e-06, + "loss": 0.75102168, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 6651, + "time_per_iteration": 2.4140660762786865 + }, + { + "auxiliary_loss_clip": 0.01063015, + "auxiliary_loss_mlp": 0.01047976, + "balance_loss_clip": 1.01818514, + "balance_loss_mlp": 1.01865208, + "epoch": 0.39993987674733206, + "flos": 20703354992640.0, + "grad_norm": 1.9810109388719117, + "language_loss": 0.75106704, + "learning_rate": 2.728109046945403e-06, + "loss": 0.77217692, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4453125, + "step": 6652, + "time_per_iteration": 2.3832647800445557 + }, + { + "auxiliary_loss_clip": 0.01015664, + "auxiliary_loss_mlp": 0.01013048, + "balance_loss_clip": 1.00944829, + "balance_loss_mlp": 1.00527608, + "epoch": 0.4, + "flos": 61522407106560.0, + "grad_norm": 0.8477509527979804, + "language_loss": 0.60769397, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62798107, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.03588867, + "router_z_loss_mlp": 0.10351562, + "step": 6653, + "time_per_iteration": 2.9542505741119385 + }, + { + "auxiliary_loss_clip": 0.01058531, + "auxiliary_loss_mlp": 0.01043904, + "balance_loss_clip": 1.01627088, + "balance_loss_mlp": 1.01855612, + "epoch": 0.400060123252668, + "flos": 14501418652800.0, + "grad_norm": 1.9650202821663447, + "language_loss": 0.67988634, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.70091069, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40039062, + "step": 6654, + "time_per_iteration": 2.363396406173706 + }, + { + "auxiliary_loss_clip": 0.01061348, + "auxiliary_loss_mlp": 0.01050447, + "balance_loss_clip": 1.02137232, + "balance_loss_mlp": 1.01789975, + "epoch": 0.40012024650533595, + "flos": 19092462727680.0, + "grad_norm": 1.9434368748815518, + "language_loss": 0.91249776, + "learning_rate": 2.7270207150599e-06, + "loss": 0.93361568, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 6655, + "time_per_iteration": 2.4396133422851562 + }, + { + "auxiliary_loss_clip": 0.0105877, + "auxiliary_loss_mlp": 0.01042609, + "balance_loss_clip": 1.01736104, + "balance_loss_mlp": 1.01855183, + "epoch": 0.4001803697580039, + "flos": 29349735240960.0, + "grad_norm": 1.6275160134194933, + "language_loss": 0.74300581, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.76401961, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40234375, + "step": 6656, + "time_per_iteration": 2.469726324081421 + }, + { + "auxiliary_loss_clip": 0.01062456, + "auxiliary_loss_mlp": 0.01046747, + "balance_loss_clip": 1.01775527, + "balance_loss_mlp": 1.01910734, + "epoch": 0.4002404930106719, + "flos": 20919083483520.0, + "grad_norm": 1.5268899967431677, + "language_loss": 0.74756992, + "learning_rate": 2.726295022603144e-06, + "loss": 0.76866192, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 6657, + "time_per_iteration": 2.536707878112793 + }, + { + "auxiliary_loss_clip": 0.01061028, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_clip": 1.01984262, + "balance_loss_mlp": 1.01825309, + "epoch": 0.40030061626333985, + "flos": 28404391824000.0, + "grad_norm": 1.4589387601450203, + "language_loss": 0.80331975, + "learning_rate": 2.725932135056117e-06, + "loss": 0.82446545, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.42773438, + "step": 6658, + "time_per_iteration": 2.4572863578796387 + }, + { + "auxiliary_loss_clip": 0.01061116, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.01504517, + "balance_loss_mlp": 1.01810861, + "epoch": 0.4003607395160078, + "flos": 25920426389760.0, + "grad_norm": 1.758528267437464, + "language_loss": 0.78267652, + "learning_rate": 2.72556921998167e-06, + "loss": 0.80373019, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 6659, + "time_per_iteration": 2.4424755573272705 + }, + { + "auxiliary_loss_clip": 0.0105584, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.0117228, + "balance_loss_mlp": 1.01660311, + "epoch": 0.4004208627686758, + "flos": 20767840007040.0, + "grad_norm": 1.6280929567521922, + "language_loss": 0.73679483, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.75771976, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39257812, + "step": 6660, + "time_per_iteration": 2.379701852798462 + }, + { + "auxiliary_loss_clip": 0.01059942, + "auxiliary_loss_mlp": 0.01045612, + "balance_loss_clip": 1.01831353, + "balance_loss_mlp": 1.01852059, + "epoch": 0.40048098602134374, + "flos": 24680067050880.0, + "grad_norm": 1.854562849094713, + "language_loss": 0.7245481, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.74560368, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 6661, + "time_per_iteration": 2.4408788681030273 + }, + { + "auxiliary_loss_clip": 0.01062383, + "auxiliary_loss_mlp": 0.01049114, + "balance_loss_clip": 1.01805949, + "balance_loss_mlp": 1.01861656, + "epoch": 0.4005411092740117, + "flos": 23184562429440.0, + "grad_norm": 1.7400620002660434, + "language_loss": 0.77217388, + "learning_rate": 2.724480309731437e-06, + "loss": 0.79328889, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4375, + "step": 6662, + "time_per_iteration": 2.410400867462158 + }, + { + "auxiliary_loss_clip": 0.01062741, + "auxiliary_loss_mlp": 0.01049083, + "balance_loss_clip": 1.01894653, + "balance_loss_mlp": 1.01894116, + "epoch": 0.4006012325266797, + "flos": 17521580747520.0, + "grad_norm": 1.9669718289794274, + "language_loss": 0.68117326, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.70229149, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4375, + "step": 6663, + "time_per_iteration": 4.121229648590088 + }, + { + "auxiliary_loss_clip": 0.01060141, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_clip": 1.01894307, + "balance_loss_mlp": 1.01758814, + "epoch": 0.40066135577934764, + "flos": 19856397386880.0, + "grad_norm": 2.422715111093911, + "language_loss": 0.87837064, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.89944905, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42578125, + "step": 6664, + "time_per_iteration": 2.378819704055786 + }, + { + "auxiliary_loss_clip": 0.01061525, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.01673639, + "balance_loss_mlp": 1.01947939, + "epoch": 0.40072147903201566, + "flos": 18149039435520.0, + "grad_norm": 2.0512983477917177, + "language_loss": 0.86302537, + "learning_rate": 2.723391152229917e-06, + "loss": 0.88409507, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 6665, + "time_per_iteration": 2.3858020305633545 + }, + { + "auxiliary_loss_clip": 0.01064611, + "auxiliary_loss_mlp": 0.01048257, + "balance_loss_clip": 1.01580858, + "balance_loss_mlp": 1.02013707, + "epoch": 0.4007816022846836, + "flos": 18660272607360.0, + "grad_norm": 1.532348077686855, + "language_loss": 0.79039949, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.81152815, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4453125, + "step": 6666, + "time_per_iteration": 3.910855770111084 + }, + { + "auxiliary_loss_clip": 0.01063841, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_clip": 1.01571417, + "balance_loss_mlp": 1.02116966, + "epoch": 0.4008417255373516, + "flos": 25701974812800.0, + "grad_norm": 2.224874818192285, + "language_loss": 0.75637043, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.77746898, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42773438, + "step": 6667, + "time_per_iteration": 2.454721689224243 + }, + { + "auxiliary_loss_clip": 0.01065316, + "auxiliary_loss_mlp": 0.01052418, + "balance_loss_clip": 1.01953983, + "balance_loss_mlp": 1.02074289, + "epoch": 0.40090184879001955, + "flos": 22857461101440.0, + "grad_norm": 1.4613830830131478, + "language_loss": 0.76566958, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78684688, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4453125, + "step": 6668, + "time_per_iteration": 3.9102590084075928 + }, + { + "auxiliary_loss_clip": 0.01062541, + "auxiliary_loss_mlp": 0.0104991, + "balance_loss_clip": 1.01972628, + "balance_loss_mlp": 1.02057731, + "epoch": 0.4009619720426875, + "flos": 29058559568640.0, + "grad_norm": 1.769861034234263, + "language_loss": 0.83535975, + "learning_rate": 2.721938558257248e-06, + "loss": 0.85648429, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41992188, + "step": 6669, + "time_per_iteration": 2.4540603160858154 + }, + { + "auxiliary_loss_clip": 0.01012869, + "auxiliary_loss_mlp": 0.01003825, + "balance_loss_clip": 0.99979627, + "balance_loss_mlp": 1.00334835, + "epoch": 0.4010220952953555, + "flos": 66056332464000.0, + "grad_norm": 0.7023188949266852, + "language_loss": 0.53446221, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55462909, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.09570312, + "step": 6670, + "time_per_iteration": 3.3570563793182373 + }, + { + "auxiliary_loss_clip": 0.01061437, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_clip": 1.01527977, + "balance_loss_mlp": 1.01935291, + "epoch": 0.40108221854802345, + "flos": 29641539317760.0, + "grad_norm": 1.6111001709736401, + "language_loss": 0.89767575, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.91872787, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41992188, + "step": 6671, + "time_per_iteration": 2.476142644882202 + }, + { + "auxiliary_loss_clip": 0.01062571, + "auxiliary_loss_mlp": 0.0104744, + "balance_loss_clip": 1.01710141, + "balance_loss_mlp": 1.01998568, + "epoch": 0.4011423418006914, + "flos": 19928772368640.0, + "grad_norm": 1.737198903830294, + "language_loss": 0.79913384, + "learning_rate": 2.720848825281736e-06, + "loss": 0.820234, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42578125, + "step": 6672, + "time_per_iteration": 2.35787034034729 + }, + { + "auxiliary_loss_clip": 0.01059416, + "auxiliary_loss_mlp": 0.01041914, + "balance_loss_clip": 1.01397097, + "balance_loss_mlp": 1.01866913, + "epoch": 0.4012024650533594, + "flos": 20083262601600.0, + "grad_norm": 2.0319766543297653, + "language_loss": 0.65071714, + "learning_rate": 2.72048552626888e-06, + "loss": 0.6717304, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40820312, + "step": 6673, + "time_per_iteration": 2.4052648544311523 + }, + { + "auxiliary_loss_clip": 0.01061146, + "auxiliary_loss_mlp": 0.01048163, + "balance_loss_clip": 1.01816988, + "balance_loss_mlp": 1.01838088, + "epoch": 0.40126258830602735, + "flos": 21694469068800.0, + "grad_norm": 1.6722654332631037, + "language_loss": 0.80894881, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.83004189, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42773438, + "step": 6674, + "time_per_iteration": 2.433069944381714 + }, + { + "auxiliary_loss_clip": 0.01062327, + "auxiliary_loss_mlp": 0.01046322, + "balance_loss_clip": 1.01600671, + "balance_loss_mlp": 1.01843572, + "epoch": 0.4013227115586953, + "flos": 12019582811520.0, + "grad_norm": 3.7031268573787806, + "language_loss": 0.84638011, + "learning_rate": 2.719758846294294e-06, + "loss": 0.86746663, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43945312, + "step": 6675, + "time_per_iteration": 2.3563990592956543 + }, + { + "auxiliary_loss_clip": 0.01058879, + "auxiliary_loss_mlp": 0.01047005, + "balance_loss_clip": 1.017012, + "balance_loss_mlp": 1.01753294, + "epoch": 0.4013828348113633, + "flos": 25446340771200.0, + "grad_norm": 1.6749227630775092, + "language_loss": 0.9465239, + "learning_rate": 2.71939546536012e-06, + "loss": 0.96758276, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4140625, + "step": 6676, + "time_per_iteration": 2.435750961303711 + }, + { + "auxiliary_loss_clip": 0.01067185, + "auxiliary_loss_mlp": 0.01048868, + "balance_loss_clip": 1.01501238, + "balance_loss_mlp": 1.02103448, + "epoch": 0.40144295806403124, + "flos": 18582102339840.0, + "grad_norm": 1.8065313939949859, + "language_loss": 0.80436206, + "learning_rate": 2.719032057146399e-06, + "loss": 0.82552266, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.4609375, + "step": 6677, + "time_per_iteration": 2.355731964111328 + }, + { + "auxiliary_loss_clip": 0.01061757, + "auxiliary_loss_mlp": 0.01046663, + "balance_loss_clip": 1.01910138, + "balance_loss_mlp": 1.01956284, + "epoch": 0.4015030813166992, + "flos": 22929102944640.0, + "grad_norm": 1.849750971415185, + "language_loss": 0.85047531, + "learning_rate": 2.71866862166691e-06, + "loss": 0.8715595, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.421875, + "step": 6678, + "time_per_iteration": 2.422074317932129 + }, + { + "auxiliary_loss_clip": 0.01061434, + "auxiliary_loss_mlp": 0.01049981, + "balance_loss_clip": 1.02079833, + "balance_loss_mlp": 1.0200417, + "epoch": 0.4015632045693672, + "flos": 20594007014400.0, + "grad_norm": 2.616815365461533, + "language_loss": 0.64778596, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66890013, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4140625, + "step": 6679, + "time_per_iteration": 2.3820815086364746 + }, + { + "auxiliary_loss_clip": 0.01058955, + "auxiliary_loss_mlp": 0.01042188, + "balance_loss_clip": 1.01391125, + "balance_loss_mlp": 1.01791775, + "epoch": 0.4016233278220352, + "flos": 23437857409920.0, + "grad_norm": 1.4673847173390842, + "language_loss": 0.79642493, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81743634, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41015625, + "step": 6680, + "time_per_iteration": 2.4193241596221924 + }, + { + "auxiliary_loss_clip": 0.0106556, + "auxiliary_loss_mlp": 0.01058581, + "balance_loss_clip": 1.02777719, + "balance_loss_mlp": 1.01996183, + "epoch": 0.40168345107470316, + "flos": 21430072275840.0, + "grad_norm": 1.6660204425943363, + "language_loss": 0.77281785, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.79405922, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 6681, + "time_per_iteration": 2.43605637550354 + }, + { + "auxiliary_loss_clip": 0.01059974, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.01661801, + "balance_loss_mlp": 1.01739991, + "epoch": 0.4017435743273711, + "flos": 22856099558400.0, + "grad_norm": 1.7524689865711462, + "language_loss": 0.65529764, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.67635268, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 6682, + "time_per_iteration": 2.4062209129333496 + }, + { + "auxiliary_loss_clip": 0.01063755, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.02036881, + "balance_loss_mlp": 1.01944768, + "epoch": 0.4018036975800391, + "flos": 28621028010240.0, + "grad_norm": 1.8342176293817907, + "language_loss": 0.74788964, + "learning_rate": 2.716851035765337e-06, + "loss": 0.76903838, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44335938, + "step": 6683, + "time_per_iteration": 2.4276769161224365 + }, + { + "auxiliary_loss_clip": 0.01060575, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.01619673, + "balance_loss_mlp": 1.01759434, + "epoch": 0.40186382083270705, + "flos": 26650006404480.0, + "grad_norm": 1.5599264722987427, + "language_loss": 0.74250078, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.76356006, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4296875, + "step": 6684, + "time_per_iteration": 2.5580732822418213 + }, + { + "auxiliary_loss_clip": 0.01012966, + "auxiliary_loss_mlp": 0.01008228, + "balance_loss_clip": 1.00469911, + "balance_loss_mlp": 1.00378013, + "epoch": 0.401923944085375, + "flos": 59257102717440.0, + "grad_norm": 0.8103456544754818, + "language_loss": 0.60447168, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62468362, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.09179688, + "step": 6685, + "time_per_iteration": 3.1436448097229004 + }, + { + "auxiliary_loss_clip": 0.01062068, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.01565158, + "balance_loss_mlp": 1.01835144, + "epoch": 0.401984067338043, + "flos": 16981858039680.0, + "grad_norm": 1.600130865015991, + "language_loss": 0.71402645, + "learning_rate": 2.715760157917357e-06, + "loss": 0.73511505, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4375, + "step": 6686, + "time_per_iteration": 2.3819172382354736 + }, + { + "auxiliary_loss_clip": 0.01060265, + "auxiliary_loss_mlp": 0.01050836, + "balance_loss_clip": 1.02104568, + "balance_loss_mlp": 1.018471, + "epoch": 0.40204419059071095, + "flos": 24971347457280.0, + "grad_norm": 1.3590088221860444, + "language_loss": 0.75248116, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77359217, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41796875, + "step": 6687, + "time_per_iteration": 2.4135968685150146 + }, + { + "auxiliary_loss_clip": 0.01063911, + "auxiliary_loss_mlp": 0.01046681, + "balance_loss_clip": 1.01628256, + "balance_loss_mlp": 1.01979017, + "epoch": 0.4021043138433789, + "flos": 23476331594880.0, + "grad_norm": 1.7668070233092341, + "language_loss": 0.71786571, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73897159, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 6688, + "time_per_iteration": 2.420823574066162 + }, + { + "auxiliary_loss_clip": 0.0106573, + "auxiliary_loss_mlp": 0.01049107, + "balance_loss_clip": 1.01670575, + "balance_loss_mlp": 1.02037179, + "epoch": 0.4021644370960469, + "flos": 25994581850880.0, + "grad_norm": 1.8283297126128062, + "language_loss": 0.65821147, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.67935979, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.453125, + "step": 6689, + "time_per_iteration": 2.427699565887451 + }, + { + "auxiliary_loss_clip": 0.01062261, + "auxiliary_loss_mlp": 0.01040541, + "balance_loss_clip": 1.0101788, + "balance_loss_mlp": 1.0184983, + "epoch": 0.40222456034871484, + "flos": 13587183123840.0, + "grad_norm": 2.2101844232240353, + "language_loss": 0.75618124, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.77720922, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4375, + "step": 6690, + "time_per_iteration": 2.3774843215942383 + }, + { + "auxiliary_loss_clip": 0.01058973, + "auxiliary_loss_mlp": 0.01048189, + "balance_loss_clip": 1.0186491, + "balance_loss_mlp": 1.01700807, + "epoch": 0.4022846836013828, + "flos": 24276925048320.0, + "grad_norm": 1.6139898094195189, + "language_loss": 0.75693083, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.7780025, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41992188, + "step": 6691, + "time_per_iteration": 2.4119131565093994 + }, + { + "auxiliary_loss_clip": 0.01063984, + "auxiliary_loss_mlp": 0.01048119, + "balance_loss_clip": 1.01613498, + "balance_loss_mlp": 1.01987576, + "epoch": 0.40234480685405083, + "flos": 20150715081600.0, + "grad_norm": 1.6454589150618726, + "language_loss": 0.74889487, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.77001584, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44140625, + "step": 6692, + "time_per_iteration": 2.3972322940826416 + }, + { + "auxiliary_loss_clip": 0.0106072, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_clip": 1.0152638, + "balance_loss_mlp": 1.01783752, + "epoch": 0.4024049301067188, + "flos": 22929102944640.0, + "grad_norm": 1.7758819025610326, + "language_loss": 0.84984052, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.87089705, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 6693, + "time_per_iteration": 2.3834784030914307 + }, + { + "auxiliary_loss_clip": 0.01063207, + "auxiliary_loss_mlp": 0.01048585, + "balance_loss_clip": 1.01997447, + "balance_loss_mlp": 1.02016997, + "epoch": 0.40246505335938676, + "flos": 36026944185600.0, + "grad_norm": 1.6776854000515748, + "language_loss": 0.71443915, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73555708, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4296875, + "step": 6694, + "time_per_iteration": 2.752451181411743 + }, + { + "auxiliary_loss_clip": 0.01060607, + "auxiliary_loss_mlp": 0.01049091, + "balance_loss_clip": 1.01887155, + "balance_loss_mlp": 1.01810813, + "epoch": 0.4025251766120547, + "flos": 20593273875840.0, + "grad_norm": 2.1437230271760286, + "language_loss": 0.69517159, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.7162686, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42382812, + "step": 6695, + "time_per_iteration": 2.3652024269104004 + }, + { + "auxiliary_loss_clip": 0.01061684, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.01503897, + "balance_loss_mlp": 1.01857162, + "epoch": 0.4025852998647227, + "flos": 64521657296640.0, + "grad_norm": 1.8443421129552275, + "language_loss": 0.81325698, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.83431894, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43164062, + "step": 6696, + "time_per_iteration": 2.7932941913604736 + }, + { + "auxiliary_loss_clip": 0.0106314, + "auxiliary_loss_mlp": 0.01052118, + "balance_loss_clip": 1.02071881, + "balance_loss_mlp": 1.01995003, + "epoch": 0.40264542311739066, + "flos": 20885252509440.0, + "grad_norm": 1.7468942609580878, + "language_loss": 0.72382659, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.7449792, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43164062, + "step": 6697, + "time_per_iteration": 2.4722185134887695 + }, + { + "auxiliary_loss_clip": 0.01060841, + "auxiliary_loss_mlp": 0.01051341, + "balance_loss_clip": 1.02156281, + "balance_loss_mlp": 1.01876044, + "epoch": 0.4027055463700586, + "flos": 26248993994880.0, + "grad_norm": 1.8903056939822591, + "language_loss": 0.63284361, + "learning_rate": 2.711394207496984e-06, + "loss": 0.65396547, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 6698, + "time_per_iteration": 2.522446870803833 + }, + { + "auxiliary_loss_clip": 0.01061489, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.01431179, + "balance_loss_mlp": 1.01772869, + "epoch": 0.4027656696227266, + "flos": 20630351606400.0, + "grad_norm": 1.9319210419453947, + "language_loss": 0.7880013, + "learning_rate": 2.711030202621491e-06, + "loss": 0.80908155, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4375, + "step": 6699, + "time_per_iteration": 2.3892745971679688 + }, + { + "auxiliary_loss_clip": 0.01058093, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.01462543, + "balance_loss_mlp": 1.01688492, + "epoch": 0.40282579287539455, + "flos": 22345180588800.0, + "grad_norm": 1.578395550803258, + "language_loss": 0.82228863, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.84329766, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 6700, + "time_per_iteration": 2.390028715133667 + }, + { + "auxiliary_loss_clip": 0.01062217, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.01857686, + "balance_loss_mlp": 1.01672196, + "epoch": 0.4028859161280625, + "flos": 29273799300480.0, + "grad_norm": 1.7721704359051866, + "language_loss": 0.75794613, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77910054, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.453125, + "step": 6701, + "time_per_iteration": 2.4299838542938232 + }, + { + "auxiliary_loss_clip": 0.01060404, + "auxiliary_loss_mlp": 0.01045315, + "balance_loss_clip": 1.01608467, + "balance_loss_mlp": 1.0181638, + "epoch": 0.4029460393807305, + "flos": 28621028010240.0, + "grad_norm": 1.8557372655085609, + "language_loss": 0.67097676, + "learning_rate": 2.709938026276208e-06, + "loss": 0.69203395, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 6702, + "time_per_iteration": 3.937166929244995 + }, + { + "auxiliary_loss_clip": 0.01061606, + "auxiliary_loss_mlp": 0.01050927, + "balance_loss_clip": 1.01854944, + "balance_loss_mlp": 1.01784968, + "epoch": 0.40300616263339845, + "flos": 22600814630400.0, + "grad_norm": 3.109393997213965, + "language_loss": 0.67323166, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.69435698, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4375, + "step": 6703, + "time_per_iteration": 3.834017515182495 + }, + { + "auxiliary_loss_clip": 0.01063482, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.01977408, + "balance_loss_mlp": 1.01997495, + "epoch": 0.4030662858860664, + "flos": 25519134689280.0, + "grad_norm": 1.7722878102181687, + "language_loss": 0.83611345, + "learning_rate": 2.709209774085071e-06, + "loss": 0.8572461, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43359375, + "step": 6704, + "time_per_iteration": 2.409428119659424 + }, + { + "auxiliary_loss_clip": 0.01064339, + "auxiliary_loss_mlp": 0.0105334, + "balance_loss_clip": 1.0230608, + "balance_loss_mlp": 1.01950598, + "epoch": 0.40312640913873443, + "flos": 23585574839040.0, + "grad_norm": 1.7398558872793886, + "language_loss": 0.75493693, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.77611375, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44726562, + "step": 6705, + "time_per_iteration": 3.710819721221924 + }, + { + "auxiliary_loss_clip": 0.01058289, + "auxiliary_loss_mlp": 0.01050146, + "balance_loss_clip": 1.02146459, + "balance_loss_mlp": 1.01717186, + "epoch": 0.4031865323914024, + "flos": 20010014835840.0, + "grad_norm": 1.6180606022751252, + "language_loss": 0.6816681, + "learning_rate": 2.708481414320713e-06, + "loss": 0.70275247, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41210938, + "step": 6706, + "time_per_iteration": 2.406973123550415 + }, + { + "auxiliary_loss_clip": 0.01060699, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.01728725, + "balance_loss_mlp": 1.01852119, + "epoch": 0.40324665564407036, + "flos": 21870361831680.0, + "grad_norm": 1.2991496258271822, + "language_loss": 0.71952873, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.74059069, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 6707, + "time_per_iteration": 2.4201419353485107 + }, + { + "auxiliary_loss_clip": 0.01055321, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.01081824, + "balance_loss_mlp": 1.0168705, + "epoch": 0.4033067788967383, + "flos": 23877588384000.0, + "grad_norm": 1.5051597102784322, + "language_loss": 0.81151378, + "learning_rate": 2.707752947093611e-06, + "loss": 0.8324362, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38476562, + "step": 6708, + "time_per_iteration": 3.8970463275909424 + }, + { + "auxiliary_loss_clip": 0.01062213, + "auxiliary_loss_mlp": 0.01053932, + "balance_loss_clip": 1.02452302, + "balance_loss_mlp": 1.01734126, + "epoch": 0.4033669021494063, + "flos": 17418970661760.0, + "grad_norm": 2.032306469775412, + "language_loss": 0.84709668, + "learning_rate": 2.70738867321606e-06, + "loss": 0.86825812, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44921875, + "step": 6709, + "time_per_iteration": 2.4405786991119385 + }, + { + "auxiliary_loss_clip": 0.01062191, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_clip": 1.01829171, + "balance_loss_mlp": 1.01900291, + "epoch": 0.40342702540207426, + "flos": 29599434351360.0, + "grad_norm": 1.4977425945501732, + "language_loss": 0.7200681, + "learning_rate": 2.70702437251426e-06, + "loss": 0.74118578, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43164062, + "step": 6710, + "time_per_iteration": 2.4417598247528076 + }, + { + "auxiliary_loss_clip": 0.01058866, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_clip": 1.01594961, + "balance_loss_mlp": 1.01725817, + "epoch": 0.4034871486547422, + "flos": 11283998042880.0, + "grad_norm": 2.270819229349762, + "language_loss": 0.86221033, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.88325119, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41601562, + "step": 6711, + "time_per_iteration": 2.344531536102295 + }, + { + "auxiliary_loss_clip": 0.01061794, + "auxiliary_loss_mlp": 0.01053103, + "balance_loss_clip": 1.02146494, + "balance_loss_mlp": 1.01878953, + "epoch": 0.4035472719074102, + "flos": 15552130152960.0, + "grad_norm": 3.22884150947739, + "language_loss": 0.79094589, + "learning_rate": 2.706295690693168e-06, + "loss": 0.81209487, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4296875, + "step": 6712, + "time_per_iteration": 2.3570449352264404 + }, + { + "auxiliary_loss_clip": 0.01060291, + "auxiliary_loss_mlp": 0.01047087, + "balance_loss_clip": 1.01796365, + "balance_loss_mlp": 1.01825631, + "epoch": 0.40360739516007815, + "flos": 24673398981120.0, + "grad_norm": 1.8306544944107408, + "language_loss": 0.80552226, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.82659608, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41992188, + "step": 6713, + "time_per_iteration": 2.3951334953308105 + }, + { + "auxiliary_loss_clip": 0.0106136, + "auxiliary_loss_mlp": 0.01049404, + "balance_loss_clip": 1.01869571, + "balance_loss_mlp": 1.01819503, + "epoch": 0.4036675184127461, + "flos": 17303338638720.0, + "grad_norm": 1.6365174335071342, + "language_loss": 0.89647961, + "learning_rate": 2.705566901740865e-06, + "loss": 0.91758728, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43164062, + "step": 6714, + "time_per_iteration": 2.368803024291992 + }, + { + "auxiliary_loss_clip": 0.0105921, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.01868796, + "balance_loss_mlp": 1.01810348, + "epoch": 0.4037276416654141, + "flos": 19863030545280.0, + "grad_norm": 1.7477482617658588, + "language_loss": 0.70868188, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.72973502, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41210938, + "step": 6715, + "time_per_iteration": 2.394578218460083 + }, + { + "auxiliary_loss_clip": 0.01061869, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.01651692, + "balance_loss_mlp": 1.01832211, + "epoch": 0.40378776491808205, + "flos": 18295290587520.0, + "grad_norm": 1.9710862158104563, + "language_loss": 0.79374021, + "learning_rate": 2.704838005767892e-06, + "loss": 0.81482339, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43554688, + "step": 6716, + "time_per_iteration": 2.378110408782959 + }, + { + "auxiliary_loss_clip": 0.01059086, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_clip": 1.01419687, + "balance_loss_mlp": 1.01771152, + "epoch": 0.40384788817075, + "flos": 15048472746240.0, + "grad_norm": 1.847326448961894, + "language_loss": 0.77407616, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.79508102, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 6717, + "time_per_iteration": 2.3410379886627197 + }, + { + "auxiliary_loss_clip": 0.01011772, + "auxiliary_loss_mlp": 0.01004774, + "balance_loss_clip": 1.00100696, + "balance_loss_mlp": 1.00273681, + "epoch": 0.40390801142341803, + "flos": 61926805918080.0, + "grad_norm": 0.9285321132764148, + "language_loss": 0.6090163, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62918174, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.03759766, + "router_z_loss_mlp": 0.09033203, + "step": 6718, + "time_per_iteration": 2.8976173400878906 + }, + { + "auxiliary_loss_clip": 0.01063473, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.01457119, + "balance_loss_mlp": 1.0184443, + "epoch": 0.403968134676086, + "flos": 22737919006080.0, + "grad_norm": 7.192007474702263, + "language_loss": 0.75924873, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.78035414, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45117188, + "step": 6719, + "time_per_iteration": 2.3622887134552 + }, + { + "auxiliary_loss_clip": 0.01061629, + "auxiliary_loss_mlp": 0.01051115, + "balance_loss_clip": 1.02205181, + "balance_loss_mlp": 1.01902914, + "epoch": 0.40402825792875396, + "flos": 19783603468800.0, + "grad_norm": 2.0206800155719926, + "language_loss": 0.83143884, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.8525663, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 6720, + "time_per_iteration": 2.3720524311065674 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.01171672, + "balance_loss_mlp": 1.01796269, + "epoch": 0.40408838118142193, + "flos": 19608269287680.0, + "grad_norm": 1.9210080049731184, + "language_loss": 0.77692401, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79793191, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41796875, + "step": 6721, + "time_per_iteration": 2.354743719100952 + }, + { + "auxiliary_loss_clip": 0.01056744, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.01170027, + "balance_loss_mlp": 1.01782584, + "epoch": 0.4041485044340899, + "flos": 24424886856960.0, + "grad_norm": 1.7519820076202277, + "language_loss": 0.73954356, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.76046586, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38867188, + "step": 6722, + "time_per_iteration": 2.433443784713745 + }, + { + "auxiliary_loss_clip": 0.01058143, + "auxiliary_loss_mlp": 0.01037932, + "balance_loss_clip": 1.0118494, + "balance_loss_mlp": 1.01751614, + "epoch": 0.40420862768675786, + "flos": 16759356744960.0, + "grad_norm": 1.844964166386102, + "language_loss": 0.67557895, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.69653964, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 6723, + "time_per_iteration": 2.4324264526367188 + }, + { + "auxiliary_loss_clip": 0.01061719, + "auxiliary_loss_mlp": 0.01053155, + "balance_loss_clip": 1.02310228, + "balance_loss_mlp": 1.0191679, + "epoch": 0.4042687509394258, + "flos": 22490489134080.0, + "grad_norm": 1.5372515738737929, + "language_loss": 0.74636304, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76751179, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42578125, + "step": 6724, + "time_per_iteration": 2.397710084915161 + }, + { + "auxiliary_loss_clip": 0.01055316, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.01764846, + "balance_loss_mlp": 1.01704979, + "epoch": 0.4043288741920938, + "flos": 30334844563200.0, + "grad_norm": 1.773990244045234, + "language_loss": 0.76103783, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.78200769, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 6725, + "time_per_iteration": 2.434135675430298 + }, + { + "auxiliary_loss_clip": 0.01058596, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_clip": 1.01503479, + "balance_loss_mlp": 1.01705885, + "epoch": 0.40438899744476176, + "flos": 46346711765760.0, + "grad_norm": 1.4551007173092978, + "language_loss": 0.77629346, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79733133, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4140625, + "step": 6726, + "time_per_iteration": 2.6206133365631104 + }, + { + "auxiliary_loss_clip": 0.01061209, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.01616669, + "balance_loss_mlp": 1.01832843, + "epoch": 0.4044491206974297, + "flos": 13332701157120.0, + "grad_norm": 1.9969705625644383, + "language_loss": 0.83355892, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.85460997, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4296875, + "step": 6727, + "time_per_iteration": 2.346367359161377 + }, + { + "auxiliary_loss_clip": 0.01057767, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.01415539, + "balance_loss_mlp": 1.01656651, + "epoch": 0.4045092439500977, + "flos": 12092935311360.0, + "grad_norm": 2.185046239203533, + "language_loss": 0.87300706, + "learning_rate": 2.700462388688447e-06, + "loss": 0.89399576, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41210938, + "step": 6728, + "time_per_iteration": 2.379528045654297 + }, + { + "auxiliary_loss_clip": 0.0105822, + "auxiliary_loss_mlp": 0.01041539, + "balance_loss_clip": 1.01407361, + "balance_loss_mlp": 1.01813388, + "epoch": 0.40456936720276565, + "flos": 21178592686080.0, + "grad_norm": 1.931874867860244, + "language_loss": 0.8283723, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84936994, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 6729, + "time_per_iteration": 2.3803186416625977 + }, + { + "auxiliary_loss_clip": 0.01058593, + "auxiliary_loss_mlp": 0.01045891, + "balance_loss_clip": 1.01873517, + "balance_loss_mlp": 1.01837921, + "epoch": 0.4046294904554336, + "flos": 23914142444160.0, + "grad_norm": 2.1374502504587496, + "language_loss": 0.75254869, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.77359354, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 6730, + "time_per_iteration": 2.4174606800079346 + }, + { + "auxiliary_loss_clip": 0.01057879, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0158273, + "balance_loss_mlp": 1.01726818, + "epoch": 0.4046896137081016, + "flos": 38069712368640.0, + "grad_norm": 1.605361533243142, + "language_loss": 0.69351548, + "learning_rate": 2.699367885848985e-06, + "loss": 0.7145223, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 6731, + "time_per_iteration": 2.5135505199432373 + }, + { + "auxiliary_loss_clip": 0.01059303, + "auxiliary_loss_mlp": 0.01045186, + "balance_loss_clip": 1.01901984, + "balance_loss_mlp": 1.01863468, + "epoch": 0.4047497369607696, + "flos": 23616298702080.0, + "grad_norm": 1.595830247697664, + "language_loss": 0.75338024, + "learning_rate": 2.699002998510517e-06, + "loss": 0.77442515, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 6732, + "time_per_iteration": 2.4827959537506104 + }, + { + "auxiliary_loss_clip": 0.01057101, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.01219058, + "balance_loss_mlp": 1.01701641, + "epoch": 0.40480986021343757, + "flos": 12822759705600.0, + "grad_norm": 1.5899965227819073, + "language_loss": 0.78608429, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.80702746, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.40234375, + "step": 6733, + "time_per_iteration": 2.358292818069458 + }, + { + "auxiliary_loss_clip": 0.01061553, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_clip": 1.01635599, + "balance_loss_mlp": 1.01883173, + "epoch": 0.40486998346610553, + "flos": 23767646912640.0, + "grad_norm": 1.8900399297436081, + "language_loss": 0.77946436, + "learning_rate": 2.698273144328627e-06, + "loss": 0.8005513, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.42773438, + "step": 6734, + "time_per_iteration": 2.414019823074341 + }, + { + "auxiliary_loss_clip": 0.01061464, + "auxiliary_loss_mlp": 0.01046387, + "balance_loss_clip": 1.01753867, + "balance_loss_mlp": 1.01818848, + "epoch": 0.4049301067187735, + "flos": 22855715533440.0, + "grad_norm": 2.242730709232734, + "language_loss": 0.66661137, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.68768984, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 6735, + "time_per_iteration": 2.4102447032928467 + }, + { + "auxiliary_loss_clip": 0.01055584, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.0130465, + "balance_loss_mlp": 1.01661885, + "epoch": 0.40499022997144146, + "flos": 22782886704000.0, + "grad_norm": 1.6592205777843074, + "language_loss": 0.84601605, + "learning_rate": 2.697543184232387e-06, + "loss": 0.86695015, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38867188, + "step": 6736, + "time_per_iteration": 2.4836626052856445 + }, + { + "auxiliary_loss_clip": 0.0106144, + "auxiliary_loss_mlp": 0.01046602, + "balance_loss_clip": 1.0168829, + "balance_loss_mlp": 1.01845288, + "epoch": 0.4050503532241094, + "flos": 23038241454720.0, + "grad_norm": 1.6406257085431104, + "language_loss": 0.76354748, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.78462791, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 6737, + "time_per_iteration": 2.417987585067749 + }, + { + "auxiliary_loss_clip": 0.01058898, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.01391852, + "balance_loss_mlp": 1.01850486, + "epoch": 0.4051104764767774, + "flos": 16647006389760.0, + "grad_norm": 2.117880713577063, + "language_loss": 0.73722482, + "learning_rate": 2.696813118332519e-06, + "loss": 0.75822771, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40429688, + "step": 6738, + "time_per_iteration": 2.343068838119507 + }, + { + "auxiliary_loss_clip": 0.0105814, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.01244545, + "balance_loss_mlp": 1.0179857, + "epoch": 0.40517059972944536, + "flos": 16358134867200.0, + "grad_norm": 1.7585958932197852, + "language_loss": 0.75609535, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77705252, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40234375, + "step": 6739, + "time_per_iteration": 2.378783941268921 + }, + { + "auxiliary_loss_clip": 0.01060038, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.01084638, + "balance_loss_mlp": 1.01929736, + "epoch": 0.4052307229821133, + "flos": 28802122565760.0, + "grad_norm": 1.6793368597970977, + "language_loss": 0.75017583, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.77115285, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 6740, + "time_per_iteration": 2.432659149169922 + }, + { + "auxiliary_loss_clip": 0.01057639, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.01223946, + "balance_loss_mlp": 1.01797771, + "epoch": 0.4052908462347813, + "flos": 21396799883520.0, + "grad_norm": 1.4459962157539432, + "language_loss": 0.78558034, + "learning_rate": 2.695717821343153e-06, + "loss": 0.80654597, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3984375, + "step": 6741, + "time_per_iteration": 2.4358673095703125 + }, + { + "auxiliary_loss_clip": 0.01059625, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.012972, + "balance_loss_mlp": 1.01760733, + "epoch": 0.40535096948744925, + "flos": 22417974506880.0, + "grad_norm": 3.1775111946841745, + "language_loss": 0.72376901, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.74475515, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41992188, + "step": 6742, + "time_per_iteration": 3.801206588745117 + }, + { + "auxiliary_loss_clip": 0.01061003, + "auxiliary_loss_mlp": 0.01042532, + "balance_loss_clip": 1.01387429, + "balance_loss_mlp": 1.01931119, + "epoch": 0.4054110927401172, + "flos": 17010068284800.0, + "grad_norm": 2.2186683237312246, + "language_loss": 0.73817056, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.75920594, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41601562, + "step": 6743, + "time_per_iteration": 3.7643866539001465 + }, + { + "auxiliary_loss_clip": 0.01062294, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.01489472, + "balance_loss_mlp": 1.01859844, + "epoch": 0.4054712159927852, + "flos": 21613820094720.0, + "grad_norm": 2.5912123632401816, + "language_loss": 0.72802448, + "learning_rate": 2.694622286918588e-06, + "loss": 0.74909306, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4375, + "step": 6744, + "time_per_iteration": 2.3709847927093506 + }, + { + "auxiliary_loss_clip": 0.01058081, + "auxiliary_loss_mlp": 0.01037235, + "balance_loss_clip": 1.01155686, + "balance_loss_mlp": 1.01771688, + "epoch": 0.4055313392454532, + "flos": 25811357702400.0, + "grad_norm": 1.493238959990269, + "language_loss": 0.8127141, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.83366728, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40429688, + "step": 6745, + "time_per_iteration": 3.875493288040161 + }, + { + "auxiliary_loss_clip": 0.01058935, + "auxiliary_loss_mlp": 0.01040223, + "balance_loss_clip": 1.01422369, + "balance_loss_mlp": 1.01906276, + "epoch": 0.40559146249812117, + "flos": 14136227164800.0, + "grad_norm": 1.783338820930679, + "language_loss": 0.67705888, + "learning_rate": 2.693891798911731e-06, + "loss": 0.6980505, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3984375, + "step": 6746, + "time_per_iteration": 2.32987904548645 + }, + { + "auxiliary_loss_clip": 0.01057412, + "auxiliary_loss_mlp": 0.01037829, + "balance_loss_clip": 1.01225829, + "balance_loss_mlp": 1.01760125, + "epoch": 0.40565158575078913, + "flos": 41353852320000.0, + "grad_norm": 1.5094765456128498, + "language_loss": 0.58264172, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.60359418, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3984375, + "step": 6747, + "time_per_iteration": 3.9971392154693604 + }, + { + "auxiliary_loss_clip": 0.01059427, + "auxiliary_loss_mlp": 0.01043957, + "balance_loss_clip": 1.01830363, + "balance_loss_mlp": 1.01860774, + "epoch": 0.4057117090034571, + "flos": 28543381413120.0, + "grad_norm": 1.9353398198231206, + "language_loss": 0.8605926, + "learning_rate": 2.693161205655089e-06, + "loss": 0.88162649, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40820312, + "step": 6748, + "time_per_iteration": 2.5066874027252197 + }, + { + "auxiliary_loss_clip": 0.01060312, + "auxiliary_loss_mlp": 0.01043636, + "balance_loss_clip": 1.01636124, + "balance_loss_mlp": 1.01883328, + "epoch": 0.40577183225612506, + "flos": 18003102485760.0, + "grad_norm": 1.7668454995603449, + "language_loss": 0.83229339, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.85333288, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 6749, + "time_per_iteration": 2.3854219913482666 + }, + { + "auxiliary_loss_clip": 0.01060183, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.0170809, + "balance_loss_mlp": 1.01904726, + "epoch": 0.40583195550879303, + "flos": 19535719749120.0, + "grad_norm": 1.496016886610126, + "language_loss": 0.76384306, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.78487337, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41210938, + "step": 6750, + "time_per_iteration": 2.3812036514282227 + }, + { + "auxiliary_loss_clip": 0.01061521, + "auxiliary_loss_mlp": 0.01044882, + "balance_loss_clip": 1.01631904, + "balance_loss_mlp": 1.01812327, + "epoch": 0.405892078761461, + "flos": 22308382149120.0, + "grad_norm": 2.222324231890562, + "language_loss": 0.75204474, + "learning_rate": 2.692065118669195e-06, + "loss": 0.77310872, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 6751, + "time_per_iteration": 2.4254021644592285 + }, + { + "auxiliary_loss_clip": 0.01059473, + "auxiliary_loss_mlp": 0.01046835, + "balance_loss_clip": 1.02028704, + "balance_loss_mlp": 1.01861429, + "epoch": 0.40595220201412896, + "flos": 25483209033600.0, + "grad_norm": 1.588514380329064, + "language_loss": 0.69151771, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.7125808, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40820312, + "step": 6752, + "time_per_iteration": 2.4194719791412354 + }, + { + "auxiliary_loss_clip": 0.01061338, + "auxiliary_loss_mlp": 0.01046255, + "balance_loss_clip": 1.01771617, + "balance_loss_mlp": 1.01822042, + "epoch": 0.4060123252667969, + "flos": 49854155973120.0, + "grad_norm": 2.079540196368654, + "language_loss": 0.7243607, + "learning_rate": 2.691334262772948e-06, + "loss": 0.74543667, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43164062, + "step": 6753, + "time_per_iteration": 2.6267988681793213 + }, + { + "auxiliary_loss_clip": 0.01061425, + "auxiliary_loss_mlp": 0.01045332, + "balance_loss_clip": 1.01595902, + "balance_loss_mlp": 1.01819479, + "epoch": 0.4060724485194649, + "flos": 21134602506240.0, + "grad_norm": 1.809914920535624, + "language_loss": 0.73310083, + "learning_rate": 2.690968795494699e-06, + "loss": 0.75416839, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 6754, + "time_per_iteration": 2.356799602508545 + }, + { + "auxiliary_loss_clip": 0.01060532, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.01595795, + "balance_loss_mlp": 1.01785851, + "epoch": 0.40613257177213286, + "flos": 21757103781120.0, + "grad_norm": 1.7232179734965538, + "language_loss": 0.84392726, + "learning_rate": 2.690603302014844e-06, + "loss": 0.86497962, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42578125, + "step": 6755, + "time_per_iteration": 2.392775535583496 + }, + { + "auxiliary_loss_clip": 0.01062632, + "auxiliary_loss_mlp": 0.01049896, + "balance_loss_clip": 1.02016509, + "balance_loss_mlp": 1.01894057, + "epoch": 0.4061926950248008, + "flos": 25553943181440.0, + "grad_norm": 1.5802576290960288, + "language_loss": 0.72240341, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.74352872, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4375, + "step": 6756, + "time_per_iteration": 2.413966178894043 + }, + { + "auxiliary_loss_clip": 0.01062418, + "auxiliary_loss_mlp": 0.01048207, + "balance_loss_clip": 1.0202527, + "balance_loss_mlp": 1.01859176, + "epoch": 0.4062528182774688, + "flos": 23694678437760.0, + "grad_norm": 1.7831995292535898, + "language_loss": 0.80722737, + "learning_rate": 2.689872236505755e-06, + "loss": 0.82833362, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.4375, + "step": 6757, + "time_per_iteration": 2.42838716506958 + }, + { + "auxiliary_loss_clip": 0.01061884, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.01152921, + "balance_loss_mlp": 1.01967764, + "epoch": 0.4063129415301368, + "flos": 21724948552320.0, + "grad_norm": 2.55780635764178, + "language_loss": 0.79599732, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.81701803, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 6758, + "time_per_iteration": 2.379997968673706 + }, + { + "auxiliary_loss_clip": 0.01059067, + "auxiliary_loss_mlp": 0.01041799, + "balance_loss_clip": 1.01420176, + "balance_loss_mlp": 1.01745605, + "epoch": 0.40637306478280477, + "flos": 12786729315840.0, + "grad_norm": 1.8381279469803526, + "language_loss": 0.91089523, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.9319039, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41601562, + "step": 6759, + "time_per_iteration": 2.3989579677581787 + }, + { + "auxiliary_loss_clip": 0.01061198, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.01931882, + "balance_loss_mlp": 1.01946926, + "epoch": 0.40643318803547274, + "flos": 24023350776960.0, + "grad_norm": 1.8285783698118152, + "language_loss": 0.66315007, + "learning_rate": 2.688775442076598e-06, + "loss": 0.68424547, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41796875, + "step": 6760, + "time_per_iteration": 2.3963887691497803 + }, + { + "auxiliary_loss_clip": 0.01061744, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.01644397, + "balance_loss_mlp": 1.01862264, + "epoch": 0.4064933112881407, + "flos": 25591265291520.0, + "grad_norm": 2.029493864358895, + "language_loss": 0.76041901, + "learning_rate": 2.688409791678193e-06, + "loss": 0.78149199, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43164062, + "step": 6761, + "time_per_iteration": 2.5706796646118164 + }, + { + "auxiliary_loss_clip": 0.01056238, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.01294374, + "balance_loss_mlp": 1.01724434, + "epoch": 0.40655343454080867, + "flos": 22053236866560.0, + "grad_norm": 1.5425735839443133, + "language_loss": 0.70640212, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72734976, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 6762, + "time_per_iteration": 2.381429433822632 + }, + { + "auxiliary_loss_clip": 0.01061012, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.01488817, + "balance_loss_mlp": 1.01871347, + "epoch": 0.40661355779347663, + "flos": 26467689951360.0, + "grad_norm": 1.3990169506038654, + "language_loss": 0.74575359, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.76680833, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42382812, + "step": 6763, + "time_per_iteration": 2.452731132507324 + }, + { + "auxiliary_loss_clip": 0.01064226, + "auxiliary_loss_mlp": 0.01042303, + "balance_loss_clip": 1.0117023, + "balance_loss_mlp": 1.01987922, + "epoch": 0.4066736810461446, + "flos": 13260291264000.0, + "grad_norm": 1.7421632316578726, + "language_loss": 0.70188379, + "learning_rate": 2.687312683911033e-06, + "loss": 0.72294903, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44335938, + "step": 6764, + "time_per_iteration": 2.3399744033813477 + }, + { + "auxiliary_loss_clip": 0.01065199, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02039683, + "balance_loss_mlp": 1.02046573, + "epoch": 0.40673380429881256, + "flos": 28802366945280.0, + "grad_norm": 2.196932128319159, + "language_loss": 0.92828751, + "learning_rate": 2.686946929177557e-06, + "loss": 0.94945693, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.44726562, + "step": 6765, + "time_per_iteration": 2.4781062602996826 + }, + { + "auxiliary_loss_clip": 0.0106436, + "auxiliary_loss_mlp": 0.01052963, + "balance_loss_clip": 1.02056217, + "balance_loss_mlp": 1.01881289, + "epoch": 0.4067939275514805, + "flos": 12494506302720.0, + "grad_norm": 2.248035761685446, + "language_loss": 0.80729765, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.82847083, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45703125, + "step": 6766, + "time_per_iteration": 2.3476390838623047 + }, + { + "auxiliary_loss_clip": 0.01062233, + "auxiliary_loss_mlp": 0.01044296, + "balance_loss_clip": 1.01418424, + "balance_loss_mlp": 1.01811528, + "epoch": 0.4068540508041485, + "flos": 18769515851520.0, + "grad_norm": 1.791998091660492, + "language_loss": 0.77996147, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.8010267, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44140625, + "step": 6767, + "time_per_iteration": 2.3910014629364014 + }, + { + "auxiliary_loss_clip": 0.01061193, + "auxiliary_loss_mlp": 0.01050294, + "balance_loss_clip": 1.02045548, + "balance_loss_mlp": 1.01970029, + "epoch": 0.40691417405681646, + "flos": 28511540386560.0, + "grad_norm": 1.6876588172892986, + "language_loss": 0.78399885, + "learning_rate": 2.685849508738034e-06, + "loss": 0.80511367, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4140625, + "step": 6768, + "time_per_iteration": 2.4395647048950195 + }, + { + "auxiliary_loss_clip": 0.01061894, + "auxiliary_loss_mlp": 0.01044099, + "balance_loss_clip": 1.01448774, + "balance_loss_mlp": 1.01981759, + "epoch": 0.4069742973094844, + "flos": 20812982261760.0, + "grad_norm": 1.8659744906486369, + "language_loss": 0.88509721, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.90615714, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.421875, + "step": 6769, + "time_per_iteration": 2.3833093643188477 + }, + { + "auxiliary_loss_clip": 0.0106052, + "auxiliary_loss_mlp": 0.01046393, + "balance_loss_clip": 1.02051234, + "balance_loss_mlp": 1.01997995, + "epoch": 0.4070344205621524, + "flos": 21469209776640.0, + "grad_norm": 1.6828611398483073, + "language_loss": 0.81921172, + "learning_rate": 2.685117765051156e-06, + "loss": 0.84028083, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40429688, + "step": 6770, + "time_per_iteration": 2.380537509918213 + }, + { + "auxiliary_loss_clip": 0.01066695, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.01449966, + "balance_loss_mlp": 1.02081633, + "epoch": 0.4070945438148204, + "flos": 26828936455680.0, + "grad_norm": 1.5709361951049785, + "language_loss": 0.81246626, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.83359611, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.45703125, + "step": 6771, + "time_per_iteration": 2.437577724456787 + }, + { + "auxiliary_loss_clip": 0.01060716, + "auxiliary_loss_mlp": 0.01046405, + "balance_loss_clip": 1.0181402, + "balance_loss_mlp": 1.01875019, + "epoch": 0.4071546670674884, + "flos": 26353105269120.0, + "grad_norm": 1.4520322368434146, + "language_loss": 0.77761745, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.79868865, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41992188, + "step": 6772, + "time_per_iteration": 2.4378464221954346 + }, + { + "auxiliary_loss_clip": 0.01060908, + "auxiliary_loss_mlp": 0.01047238, + "balance_loss_clip": 1.018628, + "balance_loss_mlp": 1.01747561, + "epoch": 0.40721479032015634, + "flos": 17894417823360.0, + "grad_norm": 1.6737339253995835, + "language_loss": 0.82739937, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.84848088, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 6773, + "time_per_iteration": 2.373967170715332 + }, + { + "auxiliary_loss_clip": 0.01012123, + "auxiliary_loss_mlp": 0.01005771, + "balance_loss_clip": 1.00252843, + "balance_loss_mlp": 1.00315297, + "epoch": 0.4072749135728243, + "flos": 49851745223040.0, + "grad_norm": 0.8191802058667056, + "language_loss": 0.64458442, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66476333, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.08984375, + "step": 6774, + "time_per_iteration": 2.9694011211395264 + }, + { + "auxiliary_loss_clip": 0.0106338, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.01851737, + "balance_loss_mlp": 1.01855028, + "epoch": 0.40733503682549227, + "flos": 27562391631360.0, + "grad_norm": 1.755950378974841, + "language_loss": 0.74283135, + "learning_rate": 2.683287951431446e-06, + "loss": 0.7639643, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44921875, + "step": 6775, + "time_per_iteration": 2.5026276111602783 + }, + { + "auxiliary_loss_clip": 0.01062374, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_clip": 1.02236652, + "balance_loss_mlp": 1.0178808, + "epoch": 0.40739516007816023, + "flos": 22125891139200.0, + "grad_norm": 1.3569417248987956, + "language_loss": 0.79103756, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.81217599, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4453125, + "step": 6776, + "time_per_iteration": 2.389362335205078 + }, + { + "auxiliary_loss_clip": 0.01064488, + "auxiliary_loss_mlp": 0.01041583, + "balance_loss_clip": 1.01106513, + "balance_loss_mlp": 1.01958525, + "epoch": 0.4074552833308282, + "flos": 23841104146560.0, + "grad_norm": 2.3353930905530644, + "language_loss": 0.81172508, + "learning_rate": 2.682555844513981e-06, + "loss": 0.83278579, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44921875, + "step": 6777, + "time_per_iteration": 2.4420011043548584 + }, + { + "auxiliary_loss_clip": 0.01011575, + "auxiliary_loss_mlp": 0.01005488, + "balance_loss_clip": 1.00229347, + "balance_loss_mlp": 1.00300074, + "epoch": 0.40751540658349616, + "flos": 57996702391680.0, + "grad_norm": 0.6848869639511033, + "language_loss": 0.53337395, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55354458, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.03198242, + "router_z_loss_mlp": 0.0859375, + "step": 6778, + "time_per_iteration": 3.062209367752075 + }, + { + "auxiliary_loss_clip": 0.01060975, + "auxiliary_loss_mlp": 0.01048254, + "balance_loss_clip": 1.01728368, + "balance_loss_mlp": 1.01796782, + "epoch": 0.40757552983616413, + "flos": 21213610646400.0, + "grad_norm": 1.8422266861439953, + "language_loss": 0.8412841, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.86237645, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4296875, + "step": 6779, + "time_per_iteration": 2.453190565109253 + }, + { + "auxiliary_loss_clip": 0.01060089, + "auxiliary_loss_mlp": 0.01045478, + "balance_loss_clip": 1.01461482, + "balance_loss_mlp": 1.01803613, + "epoch": 0.4076356530888321, + "flos": 26832322857600.0, + "grad_norm": 1.471276927691935, + "language_loss": 0.77408624, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.79514194, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41992188, + "step": 6780, + "time_per_iteration": 2.4125490188598633 + }, + { + "auxiliary_loss_clip": 0.01058805, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.0175004, + "balance_loss_mlp": 1.01780283, + "epoch": 0.40769577634150006, + "flos": 12202213466880.0, + "grad_norm": 2.475739249448689, + "language_loss": 0.68168634, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.70271736, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 6781, + "time_per_iteration": 3.7963500022888184 + }, + { + "auxiliary_loss_clip": 0.01061173, + "auxiliary_loss_mlp": 0.01050508, + "balance_loss_clip": 1.02130198, + "balance_loss_mlp": 1.01860213, + "epoch": 0.407755899594168, + "flos": 33653897740800.0, + "grad_norm": 1.5829730633435086, + "language_loss": 0.71932757, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.74044436, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 6782, + "time_per_iteration": 2.476768732070923 + }, + { + "auxiliary_loss_clip": 0.01063209, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.01390469, + "balance_loss_mlp": 1.01948833, + "epoch": 0.407816022846836, + "flos": 20156300899200.0, + "grad_norm": 1.58075326422288, + "language_loss": 0.83490396, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.85597414, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4375, + "step": 6783, + "time_per_iteration": 3.797112464904785 + }, + { + "auxiliary_loss_clip": 0.01061679, + "auxiliary_loss_mlp": 0.01048575, + "balance_loss_clip": 1.01746106, + "balance_loss_mlp": 1.01863551, + "epoch": 0.40787614609950396, + "flos": 21177754813440.0, + "grad_norm": 1.5323948204903912, + "language_loss": 0.82029247, + "learning_rate": 2.679992655730283e-06, + "loss": 0.84139496, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4296875, + "step": 6784, + "time_per_iteration": 3.7212843894958496 + }, + { + "auxiliary_loss_clip": 0.01066226, + "auxiliary_loss_mlp": 0.01050761, + "balance_loss_clip": 1.01633346, + "balance_loss_mlp": 1.01932633, + "epoch": 0.407936269352172, + "flos": 20519642085120.0, + "grad_norm": 2.3011985904092724, + "language_loss": 0.67729902, + "learning_rate": 2.679626382651386e-06, + "loss": 0.69846892, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.46875, + "step": 6785, + "time_per_iteration": 2.381941556930542 + }, + { + "auxiliary_loss_clip": 0.01061151, + "auxiliary_loss_mlp": 0.01043017, + "balance_loss_clip": 1.0140729, + "balance_loss_mlp": 1.0191462, + "epoch": 0.40799639260483994, + "flos": 20117826714240.0, + "grad_norm": 1.9588399885314052, + "language_loss": 0.80781126, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82885295, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 6786, + "time_per_iteration": 3.8912861347198486 + }, + { + "auxiliary_loss_clip": 0.01061866, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_clip": 1.02228785, + "balance_loss_mlp": 1.01898587, + "epoch": 0.4080565158575079, + "flos": 20996241321600.0, + "grad_norm": 1.6008802814860668, + "language_loss": 0.82139266, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84250957, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.42773438, + "step": 6787, + "time_per_iteration": 2.3949897289276123 + }, + { + "auxiliary_loss_clip": 0.01059558, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.01135063, + "balance_loss_mlp": 1.01853299, + "epoch": 0.40811663911017587, + "flos": 19316709590400.0, + "grad_norm": 1.6537731180825717, + "language_loss": 0.69187158, + "learning_rate": 2.678527408841255e-06, + "loss": 0.71287793, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41015625, + "step": 6788, + "time_per_iteration": 2.363983154296875 + }, + { + "auxiliary_loss_clip": 0.01060173, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_clip": 1.01690733, + "balance_loss_mlp": 1.0187062, + "epoch": 0.40817676236284384, + "flos": 40623783546240.0, + "grad_norm": 1.8608151108829818, + "language_loss": 0.67489529, + "learning_rate": 2.678161032759701e-06, + "loss": 0.69596851, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4140625, + "step": 6789, + "time_per_iteration": 2.5769643783569336 + }, + { + "auxiliary_loss_clip": 0.01062392, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.01206899, + "balance_loss_mlp": 1.01908708, + "epoch": 0.4082368856155118, + "flos": 20521038539520.0, + "grad_norm": 1.7397700748324245, + "language_loss": 0.62699461, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.64803678, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 6790, + "time_per_iteration": 2.3711435794830322 + }, + { + "auxiliary_loss_clip": 0.01063778, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_clip": 1.01813483, + "balance_loss_mlp": 1.02082491, + "epoch": 0.40829700886817977, + "flos": 11427211906560.0, + "grad_norm": 2.5702063709109515, + "language_loss": 0.71760309, + "learning_rate": 2.677428203462683e-06, + "loss": 0.73873228, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4296875, + "step": 6791, + "time_per_iteration": 2.3717784881591797 + }, + { + "auxiliary_loss_clip": 0.01011338, + "auxiliary_loss_mlp": 0.01013647, + "balance_loss_clip": 1.01042819, + "balance_loss_mlp": 1.00255942, + "epoch": 0.40835713212084773, + "flos": 67327380754560.0, + "grad_norm": 0.75727990237952, + "language_loss": 0.59818858, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61843842, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.03222656, + "router_z_loss_mlp": 0.08789062, + "step": 6792, + "time_per_iteration": 2.9931931495666504 + }, + { + "auxiliary_loss_clip": 0.01063951, + "auxiliary_loss_mlp": 0.01053575, + "balance_loss_clip": 1.02262855, + "balance_loss_mlp": 1.02037716, + "epoch": 0.4084172553735157, + "flos": 21760944030720.0, + "grad_norm": 1.6174485141118269, + "language_loss": 0.81983435, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.84100962, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43554688, + "step": 6793, + "time_per_iteration": 2.4188411235809326 + }, + { + "auxiliary_loss_clip": 0.01064137, + "auxiliary_loss_mlp": 0.01052692, + "balance_loss_clip": 1.02104151, + "balance_loss_mlp": 1.01925719, + "epoch": 0.40847737862618366, + "flos": 27416035745280.0, + "grad_norm": 2.0875358288474337, + "language_loss": 0.86006355, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.8812319, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 6794, + "time_per_iteration": 2.428314685821533 + }, + { + "auxiliary_loss_clip": 0.01062777, + "auxiliary_loss_mlp": 0.01050147, + "balance_loss_clip": 1.01847315, + "balance_loss_mlp": 1.01918232, + "epoch": 0.4085375018788516, + "flos": 18586291703040.0, + "grad_norm": 1.5421340811220356, + "language_loss": 0.80392683, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82505608, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43554688, + "step": 6795, + "time_per_iteration": 2.4042773246765137 + }, + { + "auxiliary_loss_clip": 0.0106684, + "auxiliary_loss_mlp": 0.01056083, + "balance_loss_clip": 1.02172685, + "balance_loss_mlp": 1.01975918, + "epoch": 0.4085976251315196, + "flos": 15410941148160.0, + "grad_norm": 2.3042246371347272, + "language_loss": 0.71610636, + "learning_rate": 2.675595680920792e-06, + "loss": 0.73733556, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.47070312, + "step": 6796, + "time_per_iteration": 2.3477187156677246 + }, + { + "auxiliary_loss_clip": 0.01061364, + "auxiliary_loss_mlp": 0.01048321, + "balance_loss_clip": 1.01885295, + "balance_loss_mlp": 1.01893032, + "epoch": 0.40865774838418756, + "flos": 21251142224640.0, + "grad_norm": 1.6162502838956696, + "language_loss": 0.79360318, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.81470007, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42382812, + "step": 6797, + "time_per_iteration": 2.428269863128662 + }, + { + "auxiliary_loss_clip": 0.01062424, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.01964998, + "balance_loss_mlp": 1.01885986, + "epoch": 0.4087178716368556, + "flos": 13771384790400.0, + "grad_norm": 1.9984674687327404, + "language_loss": 0.87323153, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.89436388, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43554688, + "step": 6798, + "time_per_iteration": 2.345200777053833 + }, + { + "auxiliary_loss_clip": 0.01059976, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.01924264, + "balance_loss_mlp": 1.01932037, + "epoch": 0.40877799488952354, + "flos": 23620662622080.0, + "grad_norm": 1.4749691008674646, + "language_loss": 0.85057884, + "learning_rate": 2.674495859860601e-06, + "loss": 0.87165236, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 6799, + "time_per_iteration": 2.570819139480591 + }, + { + "auxiliary_loss_clip": 0.01064471, + "auxiliary_loss_mlp": 0.0105396, + "balance_loss_clip": 1.02148783, + "balance_loss_mlp": 1.01979423, + "epoch": 0.4088381181421915, + "flos": 20917861585920.0, + "grad_norm": 4.89926270316154, + "language_loss": 0.85365546, + "learning_rate": 2.6741292016681e-06, + "loss": 0.87483972, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44726562, + "step": 6800, + "time_per_iteration": 2.391497850418091 + }, + { + "auxiliary_loss_clip": 0.0106414, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_clip": 1.01658499, + "balance_loss_mlp": 1.01982999, + "epoch": 0.4088982413948595, + "flos": 13296740590080.0, + "grad_norm": 1.9672785846341585, + "language_loss": 0.76572549, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.78686202, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44335938, + "step": 6801, + "time_per_iteration": 2.415107011795044 + }, + { + "auxiliary_loss_clip": 0.01064618, + "auxiliary_loss_mlp": 0.01052902, + "balance_loss_clip": 1.02176428, + "balance_loss_mlp": 1.02022004, + "epoch": 0.40895836464752744, + "flos": 15266784677760.0, + "grad_norm": 1.8483921193768464, + "language_loss": 0.81740606, + "learning_rate": 2.673395808607861e-06, + "loss": 0.83858132, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4453125, + "step": 6802, + "time_per_iteration": 2.333857774734497 + }, + { + "auxiliary_loss_clip": 0.01062396, + "auxiliary_loss_mlp": 0.01051105, + "balance_loss_clip": 1.01872742, + "balance_loss_mlp": 1.01839364, + "epoch": 0.4090184879001954, + "flos": 14500545868800.0, + "grad_norm": 2.0448293928706414, + "language_loss": 0.77267683, + "learning_rate": 2.673029073767934e-06, + "loss": 0.79381186, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44140625, + "step": 6803, + "time_per_iteration": 2.3563039302825928 + }, + { + "auxiliary_loss_clip": 0.01063839, + "auxiliary_loss_mlp": 0.01050495, + "balance_loss_clip": 1.02038276, + "balance_loss_mlp": 1.01959276, + "epoch": 0.40907861115286337, + "flos": 13880732768640.0, + "grad_norm": 2.0057673584024487, + "language_loss": 0.8003239, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.82146722, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44140625, + "step": 6804, + "time_per_iteration": 2.398557424545288 + }, + { + "auxiliary_loss_clip": 0.01065624, + "auxiliary_loss_mlp": 0.01045788, + "balance_loss_clip": 1.01407814, + "balance_loss_mlp": 1.01924384, + "epoch": 0.40913873440553133, + "flos": 28036372515840.0, + "grad_norm": 1.7826767445094973, + "language_loss": 0.77320707, + "learning_rate": 2.672295527537998e-06, + "loss": 0.79432118, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.46484375, + "step": 6805, + "time_per_iteration": 2.4431025981903076 + }, + { + "auxiliary_loss_clip": 0.01064172, + "auxiliary_loss_mlp": 0.01047192, + "balance_loss_clip": 1.01660275, + "balance_loss_mlp": 1.02039647, + "epoch": 0.4091988576581993, + "flos": 21617066851200.0, + "grad_norm": 1.595091393049744, + "language_loss": 0.80356628, + "learning_rate": 2.671928716175804e-06, + "loss": 0.82467985, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4375, + "step": 6806, + "time_per_iteration": 2.4364919662475586 + }, + { + "auxiliary_loss_clip": 0.01064306, + "auxiliary_loss_mlp": 0.01045408, + "balance_loss_clip": 1.0114336, + "balance_loss_mlp": 1.01963019, + "epoch": 0.40925898091086726, + "flos": 25223036515200.0, + "grad_norm": 2.1469923815840883, + "language_loss": 0.73694289, + "learning_rate": 2.671561879334007e-06, + "loss": 0.75804001, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4453125, + "step": 6807, + "time_per_iteration": 2.420684814453125 + }, + { + "auxiliary_loss_clip": 0.01012141, + "auxiliary_loss_mlp": 0.0100519, + "balance_loss_clip": 1.00199521, + "balance_loss_mlp": 1.0032872, + "epoch": 0.40931910416353523, + "flos": 68927380675200.0, + "grad_norm": 0.8225381655621209, + "language_loss": 0.58833492, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60850823, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.03198242, + "router_z_loss_mlp": 0.08886719, + "step": 6808, + "time_per_iteration": 3.1178836822509766 + }, + { + "auxiliary_loss_clip": 0.01061368, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.01860046, + "balance_loss_mlp": 1.01904786, + "epoch": 0.4093792274162032, + "flos": 20188630684800.0, + "grad_norm": 1.5203462041899707, + "language_loss": 0.55954403, + "learning_rate": 2.670828129267242e-06, + "loss": 0.58063149, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 6809, + "time_per_iteration": 2.4253292083740234 + }, + { + "auxiliary_loss_clip": 0.01064688, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.01074505, + "balance_loss_mlp": 1.02055764, + "epoch": 0.40943935066887116, + "flos": 25227574992000.0, + "grad_norm": 2.0639836128379043, + "language_loss": 0.84996992, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.87101746, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 6810, + "time_per_iteration": 2.410400390625 + }, + { + "auxiliary_loss_clip": 0.01066675, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_clip": 1.01934898, + "balance_loss_mlp": 1.01993322, + "epoch": 0.4094994739215392, + "flos": 23254284147840.0, + "grad_norm": 2.0931004247645477, + "language_loss": 0.79787683, + "learning_rate": 2.670094277448999e-06, + "loss": 0.81908751, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.46679688, + "step": 6811, + "time_per_iteration": 2.4300496578216553 + }, + { + "auxiliary_loss_clip": 0.01062557, + "auxiliary_loss_mlp": 0.01046694, + "balance_loss_clip": 1.01426935, + "balance_loss_mlp": 1.01850212, + "epoch": 0.40955959717420715, + "flos": 17381264526720.0, + "grad_norm": 1.4944164024388253, + "language_loss": 0.71396232, + "learning_rate": 2.669727313417857e-06, + "loss": 0.73505485, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44140625, + "step": 6812, + "time_per_iteration": 2.4540069103240967 + }, + { + "auxiliary_loss_clip": 0.0106329, + "auxiliary_loss_mlp": 0.01052775, + "balance_loss_clip": 1.02099347, + "balance_loss_mlp": 1.01920903, + "epoch": 0.4096197204268751, + "flos": 25081254017280.0, + "grad_norm": 1.5278273305104828, + "language_loss": 0.68133152, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.70249218, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44140625, + "step": 6813, + "time_per_iteration": 2.434086561203003 + }, + { + "auxiliary_loss_clip": 0.0106371, + "auxiliary_loss_mlp": 0.01046132, + "balance_loss_clip": 1.01435065, + "balance_loss_mlp": 1.01951337, + "epoch": 0.4096798436795431, + "flos": 30585591014400.0, + "grad_norm": 1.983238248587552, + "language_loss": 0.75298792, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.77408636, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44140625, + "step": 6814, + "time_per_iteration": 2.474334955215454 + }, + { + "auxiliary_loss_clip": 0.01064273, + "auxiliary_loss_mlp": 0.01051539, + "balance_loss_clip": 1.01806498, + "balance_loss_mlp": 1.01907182, + "epoch": 0.40973996693221104, + "flos": 24132489287040.0, + "grad_norm": 2.052459265585176, + "language_loss": 0.67934561, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.70050371, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.453125, + "step": 6815, + "time_per_iteration": 2.44216251373291 + }, + { + "auxiliary_loss_clip": 0.01063058, + "auxiliary_loss_mlp": 0.01057203, + "balance_loss_clip": 1.02623296, + "balance_loss_mlp": 1.0199666, + "epoch": 0.409800090184879, + "flos": 23987809146240.0, + "grad_norm": 1.6110630095695289, + "language_loss": 0.77602434, + "learning_rate": 2.668259203471188e-06, + "loss": 0.79722691, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4296875, + "step": 6816, + "time_per_iteration": 2.4137892723083496 + }, + { + "auxiliary_loss_clip": 0.01061921, + "auxiliary_loss_mlp": 0.01056473, + "balance_loss_clip": 1.02531195, + "balance_loss_mlp": 1.01910782, + "epoch": 0.40986021343754697, + "flos": 16142755489920.0, + "grad_norm": 2.0466049100470824, + "language_loss": 0.82873017, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.84991407, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4296875, + "step": 6817, + "time_per_iteration": 2.3611483573913574 + }, + { + "auxiliary_loss_clip": 0.0106434, + "auxiliary_loss_mlp": 0.01049856, + "balance_loss_clip": 1.01454592, + "balance_loss_mlp": 1.01865613, + "epoch": 0.40992033669021494, + "flos": 24789659408640.0, + "grad_norm": 1.584356061447555, + "language_loss": 0.81667244, + "learning_rate": 2.667524996399444e-06, + "loss": 0.83781439, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.45703125, + "step": 6818, + "time_per_iteration": 2.435990810394287 + }, + { + "auxiliary_loss_clip": 0.01062159, + "auxiliary_loss_mlp": 0.01046037, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.01797116, + "epoch": 0.4099804599428829, + "flos": 29640631622400.0, + "grad_norm": 1.4712255719436849, + "language_loss": 0.67170054, + "learning_rate": 2.66715785488769e-06, + "loss": 0.69278252, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44140625, + "step": 6819, + "time_per_iteration": 2.472405195236206 + }, + { + "auxiliary_loss_clip": 0.01066955, + "auxiliary_loss_mlp": 0.01058595, + "balance_loss_clip": 1.02414358, + "balance_loss_mlp": 1.0199244, + "epoch": 0.41004058319555087, + "flos": 24825445418880.0, + "grad_norm": 1.6015480178874426, + "language_loss": 0.86720759, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.88846314, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.47070312, + "step": 6820, + "time_per_iteration": 2.441683292388916 + }, + { + "auxiliary_loss_clip": 0.01060374, + "auxiliary_loss_mlp": 0.01046067, + "balance_loss_clip": 1.01757574, + "balance_loss_mlp": 1.01861548, + "epoch": 0.41010070644821883, + "flos": 25736329457280.0, + "grad_norm": 1.5714177748038813, + "language_loss": 0.72728515, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.74834955, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 6821, + "time_per_iteration": 3.8891398906707764 + }, + { + "auxiliary_loss_clip": 0.01062548, + "auxiliary_loss_mlp": 0.01050078, + "balance_loss_clip": 1.02115834, + "balance_loss_mlp": 1.01884604, + "epoch": 0.4101608297008868, + "flos": 22344971120640.0, + "grad_norm": 1.8565807483905061, + "language_loss": 0.75654542, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.77767169, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4375, + "step": 6822, + "time_per_iteration": 3.7820310592651367 + }, + { + "auxiliary_loss_clip": 0.01062723, + "auxiliary_loss_mlp": 0.01050796, + "balance_loss_clip": 1.02155375, + "balance_loss_mlp": 1.01891339, + "epoch": 0.41022095295355476, + "flos": 21943993622400.0, + "grad_norm": 2.017679989095914, + "language_loss": 0.77601057, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.79714584, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 6823, + "time_per_iteration": 2.428248882293701 + }, + { + "auxiliary_loss_clip": 0.01066619, + "auxiliary_loss_mlp": 0.01052481, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01959169, + "epoch": 0.4102810762062228, + "flos": 27449377960320.0, + "grad_norm": 1.6429081943805641, + "language_loss": 0.74115437, + "learning_rate": 2.665321768127001e-06, + "loss": 0.76234537, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.46875, + "step": 6824, + "time_per_iteration": 3.8870837688446045 + }, + { + "auxiliary_loss_clip": 0.01065438, + "auxiliary_loss_mlp": 0.01048741, + "balance_loss_clip": 1.01672125, + "balance_loss_mlp": 1.01903057, + "epoch": 0.41034119945889075, + "flos": 24498099711360.0, + "grad_norm": 2.167302937304581, + "language_loss": 0.73626065, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.75740242, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46484375, + "step": 6825, + "time_per_iteration": 2.4103164672851562 + }, + { + "auxiliary_loss_clip": 0.01062169, + "auxiliary_loss_mlp": 0.01042948, + "balance_loss_clip": 1.01547003, + "balance_loss_mlp": 1.01952124, + "epoch": 0.4104013227115587, + "flos": 24351499445760.0, + "grad_norm": 1.9675360724555868, + "language_loss": 0.85874903, + "learning_rate": 2.664587156721768e-06, + "loss": 0.8798002, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42578125, + "step": 6826, + "time_per_iteration": 3.886806011199951 + }, + { + "auxiliary_loss_clip": 0.01059774, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.0138607, + "balance_loss_mlp": 1.01924527, + "epoch": 0.4104614459642267, + "flos": 23728299943680.0, + "grad_norm": 1.7119857450170477, + "language_loss": 0.67204154, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.69307452, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40625, + "step": 6827, + "time_per_iteration": 2.3925068378448486 + }, + { + "auxiliary_loss_clip": 0.01062307, + "auxiliary_loss_mlp": 0.01047136, + "balance_loss_clip": 1.01833487, + "balance_loss_mlp": 1.02006698, + "epoch": 0.41052156921689464, + "flos": 22126868657280.0, + "grad_norm": 1.4479917564285445, + "language_loss": 0.73789746, + "learning_rate": 2.663852444511689e-06, + "loss": 0.75899184, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 6828, + "time_per_iteration": 2.4186601638793945 + }, + { + "auxiliary_loss_clip": 0.01064876, + "auxiliary_loss_mlp": 0.01051173, + "balance_loss_clip": 1.02082193, + "balance_loss_mlp": 1.02009714, + "epoch": 0.4105816924695626, + "flos": 20083332424320.0, + "grad_norm": 6.643626070223331, + "language_loss": 0.85293156, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.87409204, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44921875, + "step": 6829, + "time_per_iteration": 2.3661324977874756 + }, + { + "auxiliary_loss_clip": 0.01060786, + "auxiliary_loss_mlp": 0.01044687, + "balance_loss_clip": 1.01698279, + "balance_loss_mlp": 1.01943994, + "epoch": 0.4106418157222306, + "flos": 18075826581120.0, + "grad_norm": 1.539160306468367, + "language_loss": 0.90236974, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92342454, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 6830, + "time_per_iteration": 2.387516498565674 + }, + { + "auxiliary_loss_clip": 0.01062403, + "auxiliary_loss_mlp": 0.0104541, + "balance_loss_clip": 1.01671624, + "balance_loss_mlp": 1.01948524, + "epoch": 0.41070193897489854, + "flos": 21646917930240.0, + "grad_norm": 1.6699164100247337, + "language_loss": 0.6651032, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68618137, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4296875, + "step": 6831, + "time_per_iteration": 2.3889496326446533 + }, + { + "auxiliary_loss_clip": 0.01061345, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.01092923, + "balance_loss_mlp": 1.019346, + "epoch": 0.4107620622275665, + "flos": 26647073850240.0, + "grad_norm": 1.7606193525450733, + "language_loss": 0.70623219, + "learning_rate": 2.662382718122776e-06, + "loss": 0.72723389, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41992188, + "step": 6832, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01061171, + "auxiliary_loss_mlp": 0.01040806, + "balance_loss_clip": 1.01477075, + "balance_loss_mlp": 1.01991701, + "epoch": 0.41082218548023447, + "flos": 18733310904960.0, + "grad_norm": 1.8457383348601888, + "language_loss": 0.75205815, + "learning_rate": 2.662015223696666e-06, + "loss": 0.77307796, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41210938, + "step": 6833, + "time_per_iteration": 2.394274950027466 + }, + { + "auxiliary_loss_clip": 0.01065306, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_clip": 1.01572216, + "balance_loss_mlp": 1.02032351, + "epoch": 0.41088230873290243, + "flos": 22892653618560.0, + "grad_norm": 1.6200315330802235, + "language_loss": 0.74133486, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.76243818, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44921875, + "step": 6834, + "time_per_iteration": 2.414973258972168 + }, + { + "auxiliary_loss_clip": 0.01062809, + "auxiliary_loss_mlp": 0.01046306, + "balance_loss_clip": 1.01743388, + "balance_loss_mlp": 1.01877093, + "epoch": 0.4109424319855704, + "flos": 24275912618880.0, + "grad_norm": 2.5396849588679644, + "language_loss": 0.72563553, + "learning_rate": 2.661280159547329e-06, + "loss": 0.74672675, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43945312, + "step": 6835, + "time_per_iteration": 2.432098627090454 + }, + { + "auxiliary_loss_clip": 0.01064125, + "auxiliary_loss_mlp": 0.01045808, + "balance_loss_clip": 1.01471841, + "balance_loss_mlp": 1.02053297, + "epoch": 0.41100255523823837, + "flos": 12968312630400.0, + "grad_norm": 1.8967255257128466, + "language_loss": 0.89060462, + "learning_rate": 2.660912589851978e-06, + "loss": 0.91170394, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43554688, + "step": 6836, + "time_per_iteration": 2.409686803817749 + }, + { + "auxiliary_loss_clip": 0.01061826, + "auxiliary_loss_mlp": 0.01045498, + "balance_loss_clip": 1.0179013, + "balance_loss_mlp": 1.01968765, + "epoch": 0.4110626784909064, + "flos": 23144621967360.0, + "grad_norm": 2.010308768167035, + "language_loss": 0.70250452, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.72357768, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.421875, + "step": 6837, + "time_per_iteration": 2.4810755252838135 + }, + { + "auxiliary_loss_clip": 0.01064152, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.01535249, + "balance_loss_mlp": 1.0195961, + "epoch": 0.41112280174357435, + "flos": 22746297732480.0, + "grad_norm": 1.679286883657385, + "language_loss": 0.7641747, + "learning_rate": 2.660177375289599e-06, + "loss": 0.785285, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 6838, + "time_per_iteration": 2.4240612983703613 + }, + { + "auxiliary_loss_clip": 0.01061588, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.01749766, + "balance_loss_mlp": 1.01894641, + "epoch": 0.4111829249962423, + "flos": 21101434848000.0, + "grad_norm": 1.8298889752443506, + "language_loss": 0.83098042, + "learning_rate": 2.659809730450451e-06, + "loss": 0.85207283, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42578125, + "step": 6839, + "time_per_iteration": 2.3772270679473877 + }, + { + "auxiliary_loss_clip": 0.01060564, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_clip": 1.02423477, + "balance_loss_mlp": 1.01865697, + "epoch": 0.4112430482489103, + "flos": 21504751407360.0, + "grad_norm": 1.7161224014367062, + "language_loss": 0.82021427, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.84133017, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41992188, + "step": 6840, + "time_per_iteration": 2.3982417583465576 + }, + { + "auxiliary_loss_clip": 0.01060007, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.01641786, + "balance_loss_mlp": 1.01840067, + "epoch": 0.41130317150157825, + "flos": 19569096875520.0, + "grad_norm": 1.6912461456717376, + "language_loss": 0.67917228, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.70020294, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41601562, + "step": 6841, + "time_per_iteration": 2.3581559658050537 + }, + { + "auxiliary_loss_clip": 0.01011919, + "auxiliary_loss_mlp": 0.01006241, + "balance_loss_clip": 1.00242615, + "balance_loss_mlp": 1.00345182, + "epoch": 0.4113632947542462, + "flos": 62379593740800.0, + "grad_norm": 0.7784957605423053, + "language_loss": 0.59733307, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61751473, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.03808594, + "router_z_loss_mlp": 0.08496094, + "step": 6842, + "time_per_iteration": 3.1188442707061768 + }, + { + "auxiliary_loss_clip": 0.01058778, + "auxiliary_loss_mlp": 0.01045651, + "balance_loss_clip": 1.01956832, + "balance_loss_mlp": 1.01891458, + "epoch": 0.4114234180069142, + "flos": 13917740676480.0, + "grad_norm": 1.8507635090734271, + "language_loss": 0.71458292, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.73562717, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3984375, + "step": 6843, + "time_per_iteration": 2.3552639484405518 + }, + { + "auxiliary_loss_clip": 0.01011148, + "auxiliary_loss_mlp": 0.01003666, + "balance_loss_clip": 1.00030434, + "balance_loss_mlp": 1.00311923, + "epoch": 0.41148354125958214, + "flos": 64925111635200.0, + "grad_norm": 0.7091461734915483, + "language_loss": 0.53678071, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55692887, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.08007812, + "step": 6844, + "time_per_iteration": 3.0310726165771484 + }, + { + "auxiliary_loss_clip": 0.01060167, + "auxiliary_loss_mlp": 0.01046832, + "balance_loss_clip": 1.02122617, + "balance_loss_mlp": 1.01837373, + "epoch": 0.4115436645122501, + "flos": 18727934555520.0, + "grad_norm": 1.7246607406836638, + "language_loss": 0.6764735, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.69754356, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.41796875, + "step": 6845, + "time_per_iteration": 2.355976104736328 + }, + { + "auxiliary_loss_clip": 0.01060547, + "auxiliary_loss_mlp": 0.01048549, + "balance_loss_clip": 1.02122593, + "balance_loss_mlp": 1.01975322, + "epoch": 0.41160378776491807, + "flos": 16251998734080.0, + "grad_norm": 1.829155277457834, + "language_loss": 0.71735072, + "learning_rate": 2.657235516795808e-06, + "loss": 0.73844165, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40820312, + "step": 6846, + "time_per_iteration": 2.376189947128296 + }, + { + "auxiliary_loss_clip": 0.01059713, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.02450812, + "balance_loss_mlp": 1.01814067, + "epoch": 0.41166391101758604, + "flos": 27968640744960.0, + "grad_norm": 1.5441236701952108, + "language_loss": 0.66539121, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.6865254, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41601562, + "step": 6847, + "time_per_iteration": 2.474001169204712 + }, + { + "auxiliary_loss_clip": 0.01060394, + "auxiliary_loss_mlp": 0.01050942, + "balance_loss_clip": 1.02351189, + "balance_loss_mlp": 1.01850748, + "epoch": 0.411724034270254, + "flos": 34129868572800.0, + "grad_norm": 1.3692571601459331, + "language_loss": 0.71301413, + "learning_rate": 2.656499802669069e-06, + "loss": 0.73412746, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41992188, + "step": 6848, + "time_per_iteration": 2.493251323699951 + }, + { + "auxiliary_loss_clip": 0.01010018, + "auxiliary_loss_mlp": 0.0100748, + "balance_loss_clip": 1.00364113, + "balance_loss_mlp": 1.00187349, + "epoch": 0.41178415752292197, + "flos": 67920100773120.0, + "grad_norm": 0.8957345610649452, + "language_loss": 0.56340384, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58357882, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.03833008, + "router_z_loss_mlp": 0.08105469, + "step": 6849, + "time_per_iteration": 3.1800320148468018 + }, + { + "auxiliary_loss_clip": 0.01062069, + "auxiliary_loss_mlp": 0.01048689, + "balance_loss_clip": 1.0206871, + "balance_loss_mlp": 1.0203588, + "epoch": 0.41184428077558993, + "flos": 34312499228160.0, + "grad_norm": 1.5415271242590203, + "language_loss": 0.76932454, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.7904321, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 6850, + "time_per_iteration": 2.5250630378723145 + }, + { + "auxiliary_loss_clip": 0.01058719, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.01835608, + "balance_loss_mlp": 1.01854467, + "epoch": 0.41190440402825795, + "flos": 35442672716160.0, + "grad_norm": 1.730016362075321, + "language_loss": 0.68961453, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.71064764, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 6851, + "time_per_iteration": 2.5043740272521973 + }, + { + "auxiliary_loss_clip": 0.01064678, + "auxiliary_loss_mlp": 0.01060292, + "balance_loss_clip": 1.0268178, + "balance_loss_mlp": 1.0198673, + "epoch": 0.4119645272809259, + "flos": 20848838094720.0, + "grad_norm": 3.386491720986252, + "language_loss": 0.82797015, + "learning_rate": 2.655028075792743e-06, + "loss": 0.8492198, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.44921875, + "step": 6852, + "time_per_iteration": 2.394551992416382 + }, + { + "auxiliary_loss_clip": 0.01065077, + "auxiliary_loss_mlp": 0.01054153, + "balance_loss_clip": 1.02182341, + "balance_loss_mlp": 1.01997864, + "epoch": 0.4120246505335939, + "flos": 27560855531520.0, + "grad_norm": 2.1019546168811405, + "language_loss": 0.7939347, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.81512702, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.45117188, + "step": 6853, + "time_per_iteration": 2.4236204624176025 + }, + { + "auxiliary_loss_clip": 0.01066493, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_clip": 1.02039576, + "balance_loss_mlp": 1.02066755, + "epoch": 0.41208477378626185, + "flos": 37813938681600.0, + "grad_norm": 1.661838399317321, + "language_loss": 0.67861915, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.69981921, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45898438, + "step": 6854, + "time_per_iteration": 2.5552010536193848 + }, + { + "auxiliary_loss_clip": 0.01062176, + "auxiliary_loss_mlp": 0.01044732, + "balance_loss_clip": 1.01687288, + "balance_loss_mlp": 1.02025592, + "epoch": 0.4121448970389298, + "flos": 23439637889280.0, + "grad_norm": 1.7829995866128785, + "language_loss": 0.8520242, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.87309331, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 6855, + "time_per_iteration": 2.3911221027374268 + }, + { + "auxiliary_loss_clip": 0.01059323, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_clip": 1.01729822, + "balance_loss_mlp": 1.01837885, + "epoch": 0.4122050202915978, + "flos": 21324215433600.0, + "grad_norm": 1.6422147421281972, + "language_loss": 0.8034156, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.82445645, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 6856, + "time_per_iteration": 2.4088330268859863 + }, + { + "auxiliary_loss_clip": 0.01064637, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.01588178, + "balance_loss_mlp": 1.0209043, + "epoch": 0.41226514354426574, + "flos": 17305468231680.0, + "grad_norm": 2.61216854830912, + "language_loss": 0.8142494, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.83534193, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4375, + "step": 6857, + "time_per_iteration": 2.340552568435669 + }, + { + "auxiliary_loss_clip": 0.01064536, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.01333523, + "balance_loss_mlp": 1.01994634, + "epoch": 0.4123252667969337, + "flos": 17637910997760.0, + "grad_norm": 1.7699545814321902, + "language_loss": 0.72143382, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.74250853, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4453125, + "step": 6858, + "time_per_iteration": 2.3817138671875 + }, + { + "auxiliary_loss_clip": 0.01060955, + "auxiliary_loss_mlp": 0.01045346, + "balance_loss_clip": 1.01733184, + "balance_loss_mlp": 1.01953876, + "epoch": 0.4123853900496017, + "flos": 46423101553920.0, + "grad_norm": 1.5299996464339505, + "language_loss": 0.60564846, + "learning_rate": 2.652451598005391e-06, + "loss": 0.62671149, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 6859, + "time_per_iteration": 2.5990231037139893 + }, + { + "auxiliary_loss_clip": 0.01062274, + "auxiliary_loss_mlp": 0.01045282, + "balance_loss_clip": 1.01698184, + "balance_loss_mlp": 1.01925504, + "epoch": 0.41244551330226964, + "flos": 17674220678400.0, + "grad_norm": 4.9951363418552965, + "language_loss": 0.75393903, + "learning_rate": 2.652083430674264e-06, + "loss": 0.77501458, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4296875, + "step": 6860, + "time_per_iteration": 2.3921658992767334 + }, + { + "auxiliary_loss_clip": 0.01061158, + "auxiliary_loss_mlp": 0.01041714, + "balance_loss_clip": 1.01479626, + "balance_loss_mlp": 1.01976669, + "epoch": 0.4125056365549376, + "flos": 18692846772480.0, + "grad_norm": 2.033921765860662, + "language_loss": 0.75122398, + "learning_rate": 2.651715238616068e-06, + "loss": 0.7722528, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 6861, + "time_per_iteration": 3.80245304107666 + }, + { + "auxiliary_loss_clip": 0.01062095, + "auxiliary_loss_mlp": 0.01043019, + "balance_loss_clip": 1.01537478, + "balance_loss_mlp": 1.02097607, + "epoch": 0.41256575980760557, + "flos": 17894313089280.0, + "grad_norm": 13.05125102721179, + "language_loss": 0.80844444, + "learning_rate": 2.651347021844765e-06, + "loss": 0.82949555, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 6862, + "time_per_iteration": 3.8166825771331787 + }, + { + "auxiliary_loss_clip": 0.01061694, + "auxiliary_loss_mlp": 0.01048624, + "balance_loss_clip": 1.02077699, + "balance_loss_mlp": 1.01967061, + "epoch": 0.41262588306027354, + "flos": 21980233480320.0, + "grad_norm": 1.6160959060446645, + "language_loss": 0.77764046, + "learning_rate": 2.650978780374318e-06, + "loss": 0.79874361, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.421875, + "step": 6863, + "time_per_iteration": 3.7784807682037354 + }, + { + "auxiliary_loss_clip": 0.01011823, + "auxiliary_loss_mlp": 0.01009009, + "balance_loss_clip": 1.00571907, + "balance_loss_mlp": 1.00329733, + "epoch": 0.41268600631294156, + "flos": 53347284656640.0, + "grad_norm": 0.7012403835506678, + "language_loss": 0.52745116, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54765952, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.03295898, + "router_z_loss_mlp": 0.08496094, + "step": 6864, + "time_per_iteration": 2.975092649459839 + }, + { + "auxiliary_loss_clip": 0.01064617, + "auxiliary_loss_mlp": 0.01052666, + "balance_loss_clip": 1.02113497, + "balance_loss_mlp": 1.01954734, + "epoch": 0.4127461295656095, + "flos": 24384317990400.0, + "grad_norm": 1.665530575754607, + "language_loss": 0.73737884, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.7585516, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45117188, + "step": 6865, + "time_per_iteration": 2.3937106132507324 + }, + { + "auxiliary_loss_clip": 0.0101096, + "auxiliary_loss_mlp": 0.01012989, + "balance_loss_clip": 1.00905526, + "balance_loss_mlp": 1.00220299, + "epoch": 0.4128062528182775, + "flos": 71701928288640.0, + "grad_norm": 0.9293256228967738, + "language_loss": 0.66686225, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68710172, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.08789062, + "step": 6866, + "time_per_iteration": 4.365203619003296 + }, + { + "auxiliary_loss_clip": 0.01059642, + "auxiliary_loss_mlp": 0.01047732, + "balance_loss_clip": 1.01944351, + "balance_loss_mlp": 1.01756561, + "epoch": 0.41286637607094545, + "flos": 17848402784640.0, + "grad_norm": 2.0524904868874434, + "language_loss": 0.82679236, + "learning_rate": 2.649505567780375e-06, + "loss": 0.84786606, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 6867, + "time_per_iteration": 2.342517137527466 + }, + { + "auxiliary_loss_clip": 0.01063498, + "auxiliary_loss_mlp": 0.01053104, + "balance_loss_clip": 1.0235045, + "balance_loss_mlp": 1.02028799, + "epoch": 0.4129264993236134, + "flos": 25548566832000.0, + "grad_norm": 2.1466294765926963, + "language_loss": 0.79783857, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.81900465, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43164062, + "step": 6868, + "time_per_iteration": 2.4033260345458984 + }, + { + "auxiliary_loss_clip": 0.01012483, + "auxiliary_loss_mlp": 0.01020829, + "balance_loss_clip": 1.01746738, + "balance_loss_mlp": 1.00373089, + "epoch": 0.4129866225762814, + "flos": 65411732298240.0, + "grad_norm": 0.8640014064247609, + "language_loss": 0.57907015, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59940326, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.08740234, + "step": 6869, + "time_per_iteration": 2.8236522674560547 + }, + { + "auxiliary_loss_clip": 0.01058701, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.01256418, + "balance_loss_mlp": 1.01801646, + "epoch": 0.41304674582894935, + "flos": 28875719445120.0, + "grad_norm": 1.6817207501873805, + "language_loss": 0.76479036, + "learning_rate": 2.64840039967822e-06, + "loss": 0.78577358, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 6870, + "time_per_iteration": 2.447242498397827 + }, + { + "auxiliary_loss_clip": 0.01064739, + "auxiliary_loss_mlp": 0.01050048, + "balance_loss_clip": 1.02085376, + "balance_loss_mlp": 1.02142859, + "epoch": 0.4131068690816173, + "flos": 22890908050560.0, + "grad_norm": 1.4539915026127357, + "language_loss": 0.84408987, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.86523777, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 6871, + "time_per_iteration": 2.412222385406494 + }, + { + "auxiliary_loss_clip": 0.01066632, + "auxiliary_loss_mlp": 0.01052859, + "balance_loss_clip": 1.02459431, + "balance_loss_mlp": 1.02307129, + "epoch": 0.4131669923342853, + "flos": 26064059189760.0, + "grad_norm": 1.9435468515144003, + "language_loss": 0.70145893, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.72265387, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.43554688, + "step": 6872, + "time_per_iteration": 2.4217400550842285 + }, + { + "auxiliary_loss_clip": 0.01064255, + "auxiliary_loss_mlp": 0.01047931, + "balance_loss_clip": 1.020751, + "balance_loss_mlp": 1.02192116, + "epoch": 0.41322711558695324, + "flos": 19243566558720.0, + "grad_norm": 1.8016787641662964, + "language_loss": 0.77036691, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.79148877, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.42382812, + "step": 6873, + "time_per_iteration": 2.435338020324707 + }, + { + "auxiliary_loss_clip": 0.01066304, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_clip": 1.01691985, + "balance_loss_mlp": 1.02318895, + "epoch": 0.4132872388396212, + "flos": 22673364168960.0, + "grad_norm": 1.8082373268695555, + "language_loss": 0.84685111, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.86796248, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.43164062, + "step": 6874, + "time_per_iteration": 2.434215545654297 + }, + { + "auxiliary_loss_clip": 0.0106552, + "auxiliary_loss_mlp": 0.01050264, + "balance_loss_clip": 1.02029467, + "balance_loss_mlp": 1.02233219, + "epoch": 0.4133473620922892, + "flos": 20149353538560.0, + "grad_norm": 1.587686525395317, + "language_loss": 0.73360229, + "learning_rate": 2.646557961279436e-06, + "loss": 0.75476015, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43164062, + "step": 6875, + "time_per_iteration": 2.375777006149292 + }, + { + "auxiliary_loss_clip": 0.01062955, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02248788, + "balance_loss_mlp": 1.02296412, + "epoch": 0.41340748534495714, + "flos": 24241627797120.0, + "grad_norm": 1.4436590747716789, + "language_loss": 0.83021325, + "learning_rate": 2.646189399991154e-06, + "loss": 0.8513211, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 6876, + "time_per_iteration": 2.4261913299560547 + }, + { + "auxiliary_loss_clip": 0.01065909, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.02628827, + "balance_loss_mlp": 1.02136803, + "epoch": 0.41346760859762516, + "flos": 14391302624640.0, + "grad_norm": 2.1708243837018077, + "language_loss": 0.6633414, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.68456447, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4453125, + "step": 6877, + "time_per_iteration": 2.3520143032073975 + }, + { + "auxiliary_loss_clip": 0.01063247, + "auxiliary_loss_mlp": 0.01047475, + "balance_loss_clip": 1.02017605, + "balance_loss_mlp": 1.02178693, + "epoch": 0.4135277318502931, + "flos": 22490908070400.0, + "grad_norm": 1.7996539414602961, + "language_loss": 0.77676094, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.79786813, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 6878, + "time_per_iteration": 2.4239308834075928 + }, + { + "auxiliary_loss_clip": 0.01063991, + "auxiliary_loss_mlp": 0.01048841, + "balance_loss_clip": 1.02063584, + "balance_loss_mlp": 1.02185726, + "epoch": 0.4135878551029611, + "flos": 22417660304640.0, + "grad_norm": 1.839806438391506, + "language_loss": 0.81721663, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.83834493, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 6879, + "time_per_iteration": 2.3978376388549805 + }, + { + "auxiliary_loss_clip": 0.0106166, + "auxiliary_loss_mlp": 0.01046011, + "balance_loss_clip": 1.01846135, + "balance_loss_mlp": 1.02045822, + "epoch": 0.41364797835562905, + "flos": 27051996332160.0, + "grad_norm": 1.5923888476707455, + "language_loss": 0.85499871, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.87607551, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 6880, + "time_per_iteration": 2.4505865573883057 + }, + { + "auxiliary_loss_clip": 0.01065058, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.02089691, + "balance_loss_mlp": 1.02089071, + "epoch": 0.413708101608297, + "flos": 22966459966080.0, + "grad_norm": 1.6314582594679947, + "language_loss": 0.71768993, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.73883986, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44140625, + "step": 6881, + "time_per_iteration": 2.398024082183838 + }, + { + "auxiliary_loss_clip": 0.01060537, + "auxiliary_loss_mlp": 0.01050141, + "balance_loss_clip": 1.02422464, + "balance_loss_mlp": 1.02015412, + "epoch": 0.413768224860965, + "flos": 13333155004800.0, + "grad_norm": 1.7587442927129842, + "language_loss": 0.82967865, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.85078537, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40234375, + "step": 6882, + "time_per_iteration": 2.4494967460632324 + }, + { + "auxiliary_loss_clip": 0.01064799, + "auxiliary_loss_mlp": 0.01055403, + "balance_loss_clip": 1.02222729, + "balance_loss_mlp": 1.02079749, + "epoch": 0.41382834811363295, + "flos": 20812912439040.0, + "grad_norm": 2.5343687883453825, + "language_loss": 0.71863604, + "learning_rate": 2.643608785656077e-06, + "loss": 0.739838, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.43945312, + "step": 6883, + "time_per_iteration": 2.466644763946533 + }, + { + "auxiliary_loss_clip": 0.0106151, + "auxiliary_loss_mlp": 0.01056083, + "balance_loss_clip": 1.02682853, + "balance_loss_mlp": 1.01956558, + "epoch": 0.4138884713663009, + "flos": 20666102705280.0, + "grad_norm": 1.7736203822176544, + "language_loss": 0.77130461, + "learning_rate": 2.643240028730663e-06, + "loss": 0.79248053, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41992188, + "step": 6884, + "time_per_iteration": 2.405782461166382 + }, + { + "auxiliary_loss_clip": 0.01062786, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.01527083, + "balance_loss_mlp": 1.01913142, + "epoch": 0.4139485946189689, + "flos": 29055417546240.0, + "grad_norm": 1.5626390475427796, + "language_loss": 0.76859319, + "learning_rate": 2.642871247413523e-06, + "loss": 0.78967673, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4375, + "step": 6885, + "time_per_iteration": 2.4410901069641113 + }, + { + "auxiliary_loss_clip": 0.01062931, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.01955366, + "epoch": 0.41400871787163684, + "flos": 24424572654720.0, + "grad_norm": 2.130401022305573, + "language_loss": 0.71113849, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.73231751, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 6886, + "time_per_iteration": 2.4358346462249756 + }, + { + "auxiliary_loss_clip": 0.0106454, + "auxiliary_loss_mlp": 0.01053808, + "balance_loss_clip": 1.02202702, + "balance_loss_mlp": 1.02086961, + "epoch": 0.4140688411243048, + "flos": 19463030565120.0, + "grad_norm": 1.7176309536534093, + "language_loss": 0.76261556, + "learning_rate": 2.642133611660002e-06, + "loss": 0.78379905, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4375, + "step": 6887, + "time_per_iteration": 2.454051971435547 + }, + { + "auxiliary_loss_clip": 0.01063071, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02041149, + "balance_loss_mlp": 1.02013755, + "epoch": 0.4141289643769728, + "flos": 19312764606720.0, + "grad_norm": 1.9058903326208385, + "language_loss": 0.73120916, + "learning_rate": 2.641764757251592e-06, + "loss": 0.75232404, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.4296875, + "step": 6888, + "time_per_iteration": 2.393204689025879 + }, + { + "auxiliary_loss_clip": 0.01061387, + "auxiliary_loss_mlp": 0.01046198, + "balance_loss_clip": 1.01727819, + "balance_loss_mlp": 1.01945519, + "epoch": 0.41418908762964074, + "flos": 16725979618560.0, + "grad_norm": 1.9878321060371102, + "language_loss": 0.77252376, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.79359961, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 6889, + "time_per_iteration": 2.3531088829040527 + }, + { + "auxiliary_loss_clip": 0.01062163, + "auxiliary_loss_mlp": 0.01047318, + "balance_loss_clip": 1.01714659, + "balance_loss_mlp": 1.0203712, + "epoch": 0.41424921088230876, + "flos": 25295795521920.0, + "grad_norm": 3.593537180988896, + "language_loss": 0.80528754, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82638234, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41796875, + "step": 6890, + "time_per_iteration": 2.451902151107788 + }, + { + "auxiliary_loss_clip": 0.01063141, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.01693201, + "balance_loss_mlp": 1.02172852, + "epoch": 0.4143093341349767, + "flos": 20959442881920.0, + "grad_norm": 2.0975203473686324, + "language_loss": 0.75383955, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.77494615, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4140625, + "step": 6891, + "time_per_iteration": 2.3856749534606934 + }, + { + "auxiliary_loss_clip": 0.01066628, + "auxiliary_loss_mlp": 0.01055267, + "balance_loss_clip": 1.02464175, + "balance_loss_mlp": 1.02186, + "epoch": 0.4143694573876447, + "flos": 22016612983680.0, + "grad_norm": 1.8553218036592682, + "language_loss": 0.85299969, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.87421858, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44726562, + "step": 6892, + "time_per_iteration": 2.4220194816589355 + }, + { + "auxiliary_loss_clip": 0.01063601, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.01368725, + "balance_loss_mlp": 1.02252388, + "epoch": 0.41442958064031266, + "flos": 35696002608000.0, + "grad_norm": 1.4790262044469062, + "language_loss": 0.71161705, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.7326709, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 6893, + "time_per_iteration": 2.5084493160247803 + }, + { + "auxiliary_loss_clip": 0.0106662, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_clip": 1.01521873, + "balance_loss_mlp": 1.02327943, + "epoch": 0.4144897038929806, + "flos": 28292495316480.0, + "grad_norm": 1.3831329945201825, + "language_loss": 0.73698288, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75810438, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43359375, + "step": 6894, + "time_per_iteration": 2.4806461334228516 + }, + { + "auxiliary_loss_clip": 0.01065729, + "auxiliary_loss_mlp": 0.01046596, + "balance_loss_clip": 1.01762772, + "balance_loss_mlp": 1.02226341, + "epoch": 0.4145498271456486, + "flos": 11647513785600.0, + "grad_norm": 2.839650379137775, + "language_loss": 0.64590549, + "learning_rate": 2.63918209577416e-06, + "loss": 0.66702873, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43554688, + "step": 6895, + "time_per_iteration": 2.341264009475708 + }, + { + "auxiliary_loss_clip": 0.01064825, + "auxiliary_loss_mlp": 0.01046264, + "balance_loss_clip": 1.01692605, + "balance_loss_mlp": 1.02164841, + "epoch": 0.41460995039831655, + "flos": 27234382608000.0, + "grad_norm": 1.4937804998986777, + "language_loss": 0.71696943, + "learning_rate": 2.638813047071192e-06, + "loss": 0.73808032, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43164062, + "step": 6896, + "time_per_iteration": 2.4765000343322754 + }, + { + "auxiliary_loss_clip": 0.01064947, + "auxiliary_loss_mlp": 0.0105638, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.0217123, + "epoch": 0.4146700736509845, + "flos": 25921159528320.0, + "grad_norm": 1.7229300115172874, + "language_loss": 0.74450654, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.76571977, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.43164062, + "step": 6897, + "time_per_iteration": 2.437869071960449 + }, + { + "auxiliary_loss_clip": 0.01064097, + "auxiliary_loss_mlp": 0.01056546, + "balance_loss_clip": 1.02778089, + "balance_loss_mlp": 1.02117944, + "epoch": 0.4147301969036525, + "flos": 26832043566720.0, + "grad_norm": 1.4476523972964352, + "language_loss": 0.85262531, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.87383175, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4296875, + "step": 6898, + "time_per_iteration": 2.4773736000061035 + }, + { + "auxiliary_loss_clip": 0.01064409, + "auxiliary_loss_mlp": 0.01049593, + "balance_loss_clip": 1.02010036, + "balance_loss_mlp": 1.02067161, + "epoch": 0.41479032015632045, + "flos": 20297385169920.0, + "grad_norm": 1.6688845313277039, + "language_loss": 0.76346159, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.78460163, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4375, + "step": 6899, + "time_per_iteration": 2.3939030170440674 + }, + { + "auxiliary_loss_clip": 0.01068182, + "auxiliary_loss_mlp": 0.01050495, + "balance_loss_clip": 1.01914322, + "balance_loss_mlp": 1.02233624, + "epoch": 0.4148504434089884, + "flos": 25263814849920.0, + "grad_norm": 1.6575461117754335, + "language_loss": 0.77418244, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.79536927, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.45898438, + "step": 6900, + "time_per_iteration": 3.9573097229003906 + }, + { + "auxiliary_loss_clip": 0.01065814, + "auxiliary_loss_mlp": 0.01053394, + "balance_loss_clip": 1.02145743, + "balance_loss_mlp": 1.0222249, + "epoch": 0.4149105666616564, + "flos": 12821502896640.0, + "grad_norm": 1.967445832138476, + "language_loss": 0.81893873, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.84013075, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.43554688, + "step": 6901, + "time_per_iteration": 3.776214122772217 + }, + { + "auxiliary_loss_clip": 0.01061042, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_clip": 1.01549554, + "balance_loss_mlp": 1.01982188, + "epoch": 0.41497068991432434, + "flos": 16762952615040.0, + "grad_norm": 1.9030436346250224, + "language_loss": 0.71318185, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.73422414, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 6902, + "time_per_iteration": 3.7447848320007324 + }, + { + "auxiliary_loss_clip": 0.01060343, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_clip": 1.02240729, + "balance_loss_mlp": 1.01966166, + "epoch": 0.4150308131669923, + "flos": 18000030286080.0, + "grad_norm": 1.6150654586690232, + "language_loss": 0.8517645, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.87285936, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 6903, + "time_per_iteration": 2.3958587646484375 + }, + { + "auxiliary_loss_clip": 0.01065899, + "auxiliary_loss_mlp": 0.01055967, + "balance_loss_clip": 1.02244508, + "balance_loss_mlp": 1.02011108, + "epoch": 0.41509093641966033, + "flos": 30043459422720.0, + "grad_norm": 1.7973931327707913, + "language_loss": 0.69822544, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.71944404, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.45703125, + "step": 6904, + "time_per_iteration": 2.4511280059814453 + }, + { + "auxiliary_loss_clip": 0.01063123, + "auxiliary_loss_mlp": 0.01047007, + "balance_loss_clip": 1.0169543, + "balance_loss_mlp": 1.01959109, + "epoch": 0.4151510596723283, + "flos": 24278845173120.0, + "grad_norm": 1.5892736519392996, + "language_loss": 0.79032946, + "learning_rate": 2.635490520350643e-06, + "loss": 0.81143069, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43554688, + "step": 6905, + "time_per_iteration": 3.8604774475097656 + }, + { + "auxiliary_loss_clip": 0.01061465, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.01404142, + "balance_loss_mlp": 1.01880229, + "epoch": 0.41521118292499626, + "flos": 23475109697280.0, + "grad_norm": 1.5300259607081832, + "language_loss": 0.70135194, + "learning_rate": 2.635121230039025e-06, + "loss": 0.72241127, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42773438, + "step": 6906, + "time_per_iteration": 2.4253618717193604 + }, + { + "auxiliary_loss_clip": 0.01059864, + "auxiliary_loss_mlp": 0.01041654, + "balance_loss_clip": 1.01489186, + "balance_loss_mlp": 1.01840711, + "epoch": 0.4152713061776642, + "flos": 22124459773440.0, + "grad_norm": 2.0702398993043536, + "language_loss": 0.68991339, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.71092856, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.4140625, + "step": 6907, + "time_per_iteration": 2.376469135284424 + }, + { + "auxiliary_loss_clip": 0.01062518, + "auxiliary_loss_mlp": 0.01047907, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.019719, + "epoch": 0.4153314294303322, + "flos": 21250339263360.0, + "grad_norm": 1.8350866762654374, + "language_loss": 0.78569794, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.80680221, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42773438, + "step": 6908, + "time_per_iteration": 2.4065663814544678 + }, + { + "auxiliary_loss_clip": 0.01018276, + "auxiliary_loss_mlp": 0.01005428, + "balance_loss_clip": 1.00182796, + "balance_loss_mlp": 1.00955594, + "epoch": 0.41539155268300015, + "flos": 57917554606080.0, + "grad_norm": 0.7566920961874242, + "language_loss": 0.64952564, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66976261, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.03588867, + "router_z_loss_mlp": 0.08691406, + "step": 6909, + "time_per_iteration": 2.9775915145874023 + }, + { + "auxiliary_loss_clip": 0.0106231, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_clip": 1.01938128, + "balance_loss_mlp": 1.0195744, + "epoch": 0.4154516759356681, + "flos": 21902726528640.0, + "grad_norm": 1.420516524311748, + "language_loss": 0.87988496, + "learning_rate": 2.633643828093996e-06, + "loss": 0.90100306, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42773438, + "step": 6910, + "time_per_iteration": 2.4264187812805176 + }, + { + "auxiliary_loss_clip": 0.01019923, + "auxiliary_loss_mlp": 0.01006107, + "balance_loss_clip": 1.00243545, + "balance_loss_mlp": 1.01099062, + "epoch": 0.4155117991883361, + "flos": 67830584313600.0, + "grad_norm": 0.7976033667776528, + "language_loss": 0.62121093, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64147127, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.08935547, + "step": 6911, + "time_per_iteration": 3.013174533843994 + }, + { + "auxiliary_loss_clip": 0.01067989, + "auxiliary_loss_mlp": 0.01049597, + "balance_loss_clip": 1.01707649, + "balance_loss_mlp": 1.02172911, + "epoch": 0.41557192244100405, + "flos": 14281815000960.0, + "grad_norm": 2.241779656589657, + "language_loss": 0.89365095, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.91482687, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4609375, + "step": 6912, + "time_per_iteration": 2.3594298362731934 + }, + { + "auxiliary_loss_clip": 0.01063133, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.01747584, + "balance_loss_mlp": 1.02122688, + "epoch": 0.415632045693672, + "flos": 24460812512640.0, + "grad_norm": 2.237092431997439, + "language_loss": 0.64928246, + "learning_rate": 2.632535524293914e-06, + "loss": 0.6703546, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41992188, + "step": 6913, + "time_per_iteration": 2.402801752090454 + }, + { + "auxiliary_loss_clip": 0.01061316, + "auxiliary_loss_mlp": 0.01050074, + "balance_loss_clip": 1.02358592, + "balance_loss_mlp": 1.02000427, + "epoch": 0.41569216894634, + "flos": 20114405400960.0, + "grad_norm": 1.7335303559870685, + "language_loss": 0.75827253, + "learning_rate": 2.632166041703586e-06, + "loss": 0.7793864, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41210938, + "step": 6914, + "time_per_iteration": 2.404083490371704 + }, + { + "auxiliary_loss_clip": 0.01063047, + "auxiliary_loss_mlp": 0.01049494, + "balance_loss_clip": 1.01549566, + "balance_loss_mlp": 1.01926279, + "epoch": 0.41575229219900794, + "flos": 23797882016640.0, + "grad_norm": 1.7162487386587615, + "language_loss": 0.89432567, + "learning_rate": 2.631796535141458e-06, + "loss": 0.91545105, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.4375, + "step": 6915, + "time_per_iteration": 2.3877437114715576 + }, + { + "auxiliary_loss_clip": 0.01063824, + "auxiliary_loss_mlp": 0.01045794, + "balance_loss_clip": 1.01736271, + "balance_loss_mlp": 1.02136683, + "epoch": 0.4158124154516759, + "flos": 23107230034560.0, + "grad_norm": 2.538602610590582, + "language_loss": 0.72488058, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.74597669, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42382812, + "step": 6916, + "time_per_iteration": 2.423398971557617 + }, + { + "auxiliary_loss_clip": 0.0106489, + "auxiliary_loss_mlp": 0.01050041, + "balance_loss_clip": 1.01854563, + "balance_loss_mlp": 1.02012575, + "epoch": 0.41587253870434393, + "flos": 24241837265280.0, + "grad_norm": 1.3628526943838877, + "language_loss": 0.7326014, + "learning_rate": 2.631057450157852e-06, + "loss": 0.75375074, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44726562, + "step": 6917, + "time_per_iteration": 2.4208035469055176 + }, + { + "auxiliary_loss_clip": 0.01061702, + "auxiliary_loss_mlp": 0.01045546, + "balance_loss_clip": 1.01682878, + "balance_loss_mlp": 1.01900792, + "epoch": 0.4159326619570119, + "flos": 23880381292800.0, + "grad_norm": 1.7303363505996443, + "language_loss": 0.82210159, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.8431741, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42578125, + "step": 6918, + "time_per_iteration": 2.453056812286377 + }, + { + "auxiliary_loss_clip": 0.01066038, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_clip": 1.01729238, + "balance_loss_mlp": 1.02257776, + "epoch": 0.41599278520967986, + "flos": 40624900709760.0, + "grad_norm": 1.3783153946509965, + "language_loss": 0.71293807, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.73408639, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43359375, + "step": 6919, + "time_per_iteration": 2.562023401260376 + }, + { + "auxiliary_loss_clip": 0.0106526, + "auxiliary_loss_mlp": 0.01045984, + "balance_loss_clip": 1.01322508, + "balance_loss_mlp": 1.0214045, + "epoch": 0.4160529084623478, + "flos": 18221972999040.0, + "grad_norm": 1.8095487424691885, + "language_loss": 0.83189595, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.85300839, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4375, + "step": 6920, + "time_per_iteration": 2.388762950897217 + }, + { + "auxiliary_loss_clip": 0.01063993, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.01546121, + "balance_loss_mlp": 1.01981688, + "epoch": 0.4161130317150158, + "flos": 13661129116800.0, + "grad_norm": 1.9646291772722262, + "language_loss": 0.67244309, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.69356287, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44140625, + "step": 6921, + "time_per_iteration": 2.352733612060547 + }, + { + "auxiliary_loss_clip": 0.01063074, + "auxiliary_loss_mlp": 0.01048716, + "balance_loss_clip": 1.01940227, + "balance_loss_mlp": 1.01996231, + "epoch": 0.41617315496768376, + "flos": 16177633804800.0, + "grad_norm": 1.8839636030783153, + "language_loss": 0.82285655, + "learning_rate": 2.629209319173274e-06, + "loss": 0.84397447, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43164062, + "step": 6922, + "time_per_iteration": 2.3805439472198486 + }, + { + "auxiliary_loss_clip": 0.01063574, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_clip": 1.01491857, + "balance_loss_mlp": 1.01946306, + "epoch": 0.4162332782203517, + "flos": 26212125732480.0, + "grad_norm": 1.4953594480890096, + "language_loss": 0.68678361, + "learning_rate": 2.628839621341247e-06, + "loss": 0.70788705, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44140625, + "step": 6923, + "time_per_iteration": 2.428374767303467 + }, + { + "auxiliary_loss_clip": 0.01064066, + "auxiliary_loss_mlp": 0.0105478, + "balance_loss_clip": 1.0230701, + "balance_loss_mlp": 1.02075958, + "epoch": 0.4162934014730197, + "flos": 28182728401920.0, + "grad_norm": 2.0125392768408967, + "language_loss": 0.77214724, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.79333568, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43359375, + "step": 6924, + "time_per_iteration": 2.5279664993286133 + }, + { + "auxiliary_loss_clip": 0.01063304, + "auxiliary_loss_mlp": 0.01046227, + "balance_loss_clip": 1.01766431, + "balance_loss_mlp": 1.02008808, + "epoch": 0.41635352472568765, + "flos": 19864287354240.0, + "grad_norm": 1.8552226924705366, + "language_loss": 0.75004172, + "learning_rate": 2.62810015415423e-06, + "loss": 0.77113712, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 6925, + "time_per_iteration": 2.4691524505615234 + }, + { + "auxiliary_loss_clip": 0.01060736, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.01579046, + "balance_loss_mlp": 1.01885724, + "epoch": 0.4164136479783556, + "flos": 14934586291200.0, + "grad_norm": 1.8266046498928117, + "language_loss": 0.85557997, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.87662703, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 6926, + "time_per_iteration": 2.422821044921875 + }, + { + "auxiliary_loss_clip": 0.0106049, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.01394486, + "balance_loss_mlp": 1.01951051, + "epoch": 0.4164737712310236, + "flos": 21756649933440.0, + "grad_norm": 1.7440688893500638, + "language_loss": 0.87908185, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.90008211, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.41015625, + "step": 6927, + "time_per_iteration": 2.374929189682007 + }, + { + "auxiliary_loss_clip": 0.01061158, + "auxiliary_loss_mlp": 0.01044341, + "balance_loss_clip": 1.01509917, + "balance_loss_mlp": 1.01925635, + "epoch": 0.41653389448369155, + "flos": 20739106091520.0, + "grad_norm": 2.2338879109268674, + "language_loss": 0.74933928, + "learning_rate": 2.626990774776604e-06, + "loss": 0.77039433, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41992188, + "step": 6928, + "time_per_iteration": 2.399108648300171 + }, + { + "auxiliary_loss_clip": 0.01058716, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.01656818, + "balance_loss_mlp": 1.01741755, + "epoch": 0.4165940177363595, + "flos": 24971731482240.0, + "grad_norm": 1.7947413159961685, + "language_loss": 0.79321742, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.81424642, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 6929, + "time_per_iteration": 2.397709846496582 + }, + { + "auxiliary_loss_clip": 0.01061636, + "auxiliary_loss_mlp": 0.01045714, + "balance_loss_clip": 1.01659131, + "balance_loss_mlp": 1.01907539, + "epoch": 0.41665414098902753, + "flos": 20520689425920.0, + "grad_norm": 1.8611203523020974, + "language_loss": 0.72217715, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.74325061, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 6930, + "time_per_iteration": 2.4103097915649414 + }, + { + "auxiliary_loss_clip": 0.01061718, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.01878905, + "balance_loss_mlp": 1.01916623, + "epoch": 0.4167142642416955, + "flos": 19681901078400.0, + "grad_norm": 1.5894622593943415, + "language_loss": 0.82529783, + "learning_rate": 2.625881181419007e-06, + "loss": 0.84637982, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42578125, + "step": 6931, + "time_per_iteration": 2.4070539474487305 + }, + { + "auxiliary_loss_clip": 0.01059731, + "auxiliary_loss_mlp": 0.01045924, + "balance_loss_clip": 1.01737344, + "balance_loss_mlp": 1.0183028, + "epoch": 0.41677438749436346, + "flos": 23762759322240.0, + "grad_norm": 1.6383696271321837, + "language_loss": 0.80206114, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.82311767, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 6932, + "time_per_iteration": 2.3880603313446045 + }, + { + "auxiliary_loss_clip": 0.01062431, + "auxiliary_loss_mlp": 0.01046766, + "balance_loss_clip": 1.01959801, + "balance_loss_mlp": 1.02058983, + "epoch": 0.41683451074703143, + "flos": 30408720733440.0, + "grad_norm": 1.7604895218220933, + "language_loss": 0.84176856, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.8628605, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41796875, + "step": 6933, + "time_per_iteration": 2.470787525177002 + }, + { + "auxiliary_loss_clip": 0.0106387, + "auxiliary_loss_mlp": 0.01045791, + "balance_loss_clip": 1.01476097, + "balance_loss_mlp": 1.02015567, + "epoch": 0.4168946339996994, + "flos": 21505694014080.0, + "grad_norm": 1.8222445909321046, + "language_loss": 0.78148651, + "learning_rate": 2.624771374460121e-06, + "loss": 0.8025831, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4375, + "step": 6934, + "time_per_iteration": 2.412191867828369 + }, + { + "auxiliary_loss_clip": 0.01063345, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.01409721, + "balance_loss_mlp": 1.02156067, + "epoch": 0.41695475725236736, + "flos": 17637736440960.0, + "grad_norm": 1.7425597720308887, + "language_loss": 0.68579435, + "learning_rate": 2.624401391405668e-06, + "loss": 0.70685267, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 6935, + "time_per_iteration": 2.4219818115234375 + }, + { + "auxiliary_loss_clip": 0.0106253, + "auxiliary_loss_mlp": 0.01045529, + "balance_loss_clip": 1.01794374, + "balance_loss_mlp": 1.02041507, + "epoch": 0.4170148805050353, + "flos": 15668006555520.0, + "grad_norm": 2.1053048073310716, + "language_loss": 0.76210892, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.78318954, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.421875, + "step": 6936, + "time_per_iteration": 2.3394296169281006 + }, + { + "auxiliary_loss_clip": 0.01061098, + "auxiliary_loss_mlp": 0.01041147, + "balance_loss_clip": 1.01426506, + "balance_loss_mlp": 1.01969647, + "epoch": 0.4170750037577033, + "flos": 15158239660800.0, + "grad_norm": 1.8511482196411164, + "language_loss": 0.76561791, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.78664041, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 6937, + "time_per_iteration": 2.426276206970215 + }, + { + "auxiliary_loss_clip": 0.01061107, + "auxiliary_loss_mlp": 0.01045111, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.01999855, + "epoch": 0.41713512701037125, + "flos": 28766999871360.0, + "grad_norm": 1.4437855912180177, + "language_loss": 0.85099089, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.87205309, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 6938, + "time_per_iteration": 2.5360188484191895 + }, + { + "auxiliary_loss_clip": 0.01063655, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.01615691, + "balance_loss_mlp": 1.02024555, + "epoch": 0.4171952502630392, + "flos": 28255731788160.0, + "grad_norm": 1.7864956405659764, + "language_loss": 0.75179386, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.77286744, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.43554688, + "step": 6939, + "time_per_iteration": 2.5022788047790527 + }, + { + "auxiliary_loss_clip": 0.01062433, + "auxiliary_loss_mlp": 0.01046449, + "balance_loss_clip": 1.0188756, + "balance_loss_mlp": 1.01933146, + "epoch": 0.4172553735157072, + "flos": 24570544515840.0, + "grad_norm": 1.4663211926028332, + "language_loss": 0.7575112, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77859998, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.43164062, + "step": 6940, + "time_per_iteration": 3.8669564723968506 + }, + { + "auxiliary_loss_clip": 0.01061931, + "auxiliary_loss_mlp": 0.01047962, + "balance_loss_clip": 1.01991189, + "balance_loss_mlp": 1.02009702, + "epoch": 0.41731549676837515, + "flos": 27044769680640.0, + "grad_norm": 1.7656908593088827, + "language_loss": 0.72513461, + "learning_rate": 2.622180996345424e-06, + "loss": 0.74623358, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 6941, + "time_per_iteration": 5.289955377578735 + }, + { + "auxiliary_loss_clip": 0.01065504, + "auxiliary_loss_mlp": 0.01052722, + "balance_loss_clip": 1.02200174, + "balance_loss_mlp": 1.02100301, + "epoch": 0.4173756200210431, + "flos": 28393045632000.0, + "grad_norm": 1.8598881220408374, + "language_loss": 0.74919146, + "learning_rate": 2.621810847844104e-06, + "loss": 0.77037382, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4453125, + "step": 6942, + "time_per_iteration": 2.423283815383911 + }, + { + "auxiliary_loss_clip": 0.01067122, + "auxiliary_loss_mlp": 0.01053558, + "balance_loss_clip": 1.02301657, + "balance_loss_mlp": 1.02170777, + "epoch": 0.41743574327371114, + "flos": 22520654415360.0, + "grad_norm": 2.021604697133654, + "language_loss": 0.73830318, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.75950998, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.453125, + "step": 6943, + "time_per_iteration": 2.4139246940612793 + }, + { + "auxiliary_loss_clip": 0.01065325, + "auxiliary_loss_mlp": 0.01051217, + "balance_loss_clip": 1.0206635, + "balance_loss_mlp": 1.020648, + "epoch": 0.4174958665263791, + "flos": 30112238534400.0, + "grad_norm": 1.7742853564056418, + "language_loss": 0.6469385, + "learning_rate": 2.621070480118111e-06, + "loss": 0.66810393, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44726562, + "step": 6944, + "time_per_iteration": 3.8718364238739014 + }, + { + "auxiliary_loss_clip": 0.01064069, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.01595855, + "balance_loss_mlp": 1.0206126, + "epoch": 0.41755598977904707, + "flos": 25262313661440.0, + "grad_norm": 1.5217779472690336, + "language_loss": 0.71582866, + "learning_rate": 2.620700260921513e-06, + "loss": 0.73691112, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.43359375, + "step": 6945, + "time_per_iteration": 2.458113670349121 + }, + { + "auxiliary_loss_clip": 0.01060709, + "auxiliary_loss_mlp": 0.01051669, + "balance_loss_clip": 1.02232003, + "balance_loss_mlp": 1.01874399, + "epoch": 0.41761611303171503, + "flos": 19827558737280.0, + "grad_norm": 1.705515089135438, + "language_loss": 0.82492703, + "learning_rate": 2.620330018187899e-06, + "loss": 0.8460508, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41796875, + "step": 6946, + "time_per_iteration": 2.3652594089508057 + }, + { + "auxiliary_loss_clip": 0.01062765, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.01797891, + "balance_loss_mlp": 1.01946449, + "epoch": 0.417676236284383, + "flos": 15522348896640.0, + "grad_norm": 2.0339594832603334, + "language_loss": 0.78808093, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.80916607, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.43359375, + "step": 6947, + "time_per_iteration": 2.3816962242126465 + }, + { + "auxiliary_loss_clip": 0.01064254, + "auxiliary_loss_mlp": 0.01047978, + "balance_loss_clip": 1.01877129, + "balance_loss_mlp": 1.01970959, + "epoch": 0.41773635953705096, + "flos": 32523130759680.0, + "grad_norm": 1.5640097634199697, + "language_loss": 0.72925097, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.75037324, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4453125, + "step": 6948, + "time_per_iteration": 2.466956615447998 + }, + { + "auxiliary_loss_clip": 0.01059232, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.0160203, + "balance_loss_mlp": 1.01813805, + "epoch": 0.4177964827897189, + "flos": 23439812446080.0, + "grad_norm": 1.5117055401685704, + "language_loss": 0.7749927, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79602957, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 6949, + "time_per_iteration": 2.481131076812744 + }, + { + "auxiliary_loss_clip": 0.01065132, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.02165782, + "balance_loss_mlp": 1.02054036, + "epoch": 0.4178566060423869, + "flos": 22747764009600.0, + "grad_norm": 1.547902210265985, + "language_loss": 0.82839572, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84957051, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4453125, + "step": 6950, + "time_per_iteration": 2.3883023262023926 + }, + { + "auxiliary_loss_clip": 0.01059505, + "auxiliary_loss_mlp": 0.01041451, + "balance_loss_clip": 1.01529646, + "balance_loss_mlp": 1.0192802, + "epoch": 0.41791672929505486, + "flos": 26031554847360.0, + "grad_norm": 1.3379075853303293, + "language_loss": 0.77186656, + "learning_rate": 2.618478451956007e-06, + "loss": 0.79287612, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 6951, + "time_per_iteration": 2.4552152156829834 + }, + { + "auxiliary_loss_clip": 0.01064416, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.01685572, + "balance_loss_mlp": 1.01920116, + "epoch": 0.4179768525477228, + "flos": 19567805155200.0, + "grad_norm": 1.6856666494322161, + "language_loss": 0.74500126, + "learning_rate": 2.61810806829516e-06, + "loss": 0.76611274, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.45117188, + "step": 6952, + "time_per_iteration": 2.3558571338653564 + }, + { + "auxiliary_loss_clip": 0.01063492, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.01934099, + "balance_loss_mlp": 1.02009487, + "epoch": 0.4180369758003908, + "flos": 17782905340800.0, + "grad_norm": 2.2522734516358227, + "language_loss": 0.741467, + "learning_rate": 2.617737661195593e-06, + "loss": 0.76258636, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 6953, + "time_per_iteration": 2.3688621520996094 + }, + { + "auxiliary_loss_clip": 0.01061203, + "auxiliary_loss_mlp": 0.01045158, + "balance_loss_clip": 1.0147357, + "balance_loss_mlp": 1.02012384, + "epoch": 0.41809709905305875, + "flos": 20959582527360.0, + "grad_norm": 1.6108852155326008, + "language_loss": 0.77560413, + "learning_rate": 2.617367230671353e-06, + "loss": 0.7966677, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41015625, + "step": 6954, + "time_per_iteration": 2.3741347789764404 + }, + { + "auxiliary_loss_clip": 0.01062678, + "auxiliary_loss_mlp": 0.0105148, + "balance_loss_clip": 1.02145123, + "balance_loss_mlp": 1.02016246, + "epoch": 0.4181572223057267, + "flos": 22016543160960.0, + "grad_norm": 2.0901331358541184, + "language_loss": 0.85908985, + "learning_rate": 2.616996776736485e-06, + "loss": 0.8802315, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42382812, + "step": 6955, + "time_per_iteration": 2.4504058361053467 + }, + { + "auxiliary_loss_clip": 0.01060306, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.01175404, + "balance_loss_mlp": 1.01913953, + "epoch": 0.4182173455583947, + "flos": 26244455518080.0, + "grad_norm": 1.5504858464025972, + "language_loss": 0.84279883, + "learning_rate": 2.616626299405037e-06, + "loss": 0.8637923, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41210938, + "step": 6956, + "time_per_iteration": 2.4231624603271484 + }, + { + "auxiliary_loss_clip": 0.0106526, + "auxiliary_loss_mlp": 0.01052808, + "balance_loss_clip": 1.02305305, + "balance_loss_mlp": 1.02148998, + "epoch": 0.4182774688110627, + "flos": 14790778934400.0, + "grad_norm": 2.109289019397376, + "language_loss": 0.7336092, + "learning_rate": 2.616255798691059e-06, + "loss": 0.75478989, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4375, + "step": 6957, + "time_per_iteration": 2.4056179523468018 + }, + { + "auxiliary_loss_clip": 0.01064112, + "auxiliary_loss_mlp": 0.01049293, + "balance_loss_clip": 1.02107596, + "balance_loss_mlp": 1.02120018, + "epoch": 0.41833759206373067, + "flos": 20410992334080.0, + "grad_norm": 2.100268717896933, + "language_loss": 0.77243382, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.79356784, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4296875, + "step": 6958, + "time_per_iteration": 2.3812100887298584 + }, + { + "auxiliary_loss_clip": 0.01063659, + "auxiliary_loss_mlp": 0.01039916, + "balance_loss_clip": 1.01243806, + "balance_loss_mlp": 1.02061176, + "epoch": 0.41839771531639863, + "flos": 23655296557440.0, + "grad_norm": 1.855385230747985, + "language_loss": 0.78230143, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.80333722, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.4296875, + "step": 6959, + "time_per_iteration": 2.4208006858825684 + }, + { + "auxiliary_loss_clip": 0.01063782, + "auxiliary_loss_mlp": 0.01047854, + "balance_loss_clip": 1.017169, + "balance_loss_mlp": 1.0199157, + "epoch": 0.4184578385690666, + "flos": 19753158896640.0, + "grad_norm": 1.8059121675539462, + "language_loss": 0.77932799, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.80044436, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43945312, + "step": 6960, + "time_per_iteration": 2.370515823364258 + }, + { + "auxiliary_loss_clip": 0.01060812, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.00939608, + "balance_loss_mlp": 1.02130902, + "epoch": 0.41851796182173456, + "flos": 20192366200320.0, + "grad_norm": 2.0444204351759425, + "language_loss": 0.77315831, + "learning_rate": 2.614773562290835e-06, + "loss": 0.79410684, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39453125, + "step": 6961, + "time_per_iteration": 2.5369179248809814 + }, + { + "auxiliary_loss_clip": 0.01024771, + "auxiliary_loss_mlp": 0.01001799, + "balance_loss_clip": 0.99841332, + "balance_loss_mlp": 1.01558852, + "epoch": 0.41857808507440253, + "flos": 59015537953920.0, + "grad_norm": 0.772597990464078, + "language_loss": 0.54675245, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56701815, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.09179688, + "step": 6962, + "time_per_iteration": 2.9491982460021973 + }, + { + "auxiliary_loss_clip": 0.01066381, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.01752651, + "balance_loss_mlp": 1.02145314, + "epoch": 0.4186382083270705, + "flos": 18477816508800.0, + "grad_norm": 1.8946214904360785, + "language_loss": 0.87171221, + "learning_rate": 2.614032304160864e-06, + "loss": 0.89286244, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44921875, + "step": 6963, + "time_per_iteration": 2.4910085201263428 + }, + { + "auxiliary_loss_clip": 0.01064487, + "auxiliary_loss_mlp": 0.010464, + "balance_loss_clip": 1.01912534, + "balance_loss_mlp": 1.02204406, + "epoch": 0.41869833157973846, + "flos": 21577719882240.0, + "grad_norm": 1.5286151617973818, + "language_loss": 0.71507204, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.73618096, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42382812, + "step": 6964, + "time_per_iteration": 2.3960442543029785 + }, + { + "auxiliary_loss_clip": 0.01060647, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.01941478, + "balance_loss_mlp": 1.02002192, + "epoch": 0.4187584548324064, + "flos": 35515955393280.0, + "grad_norm": 1.8059326409105234, + "language_loss": 0.72195196, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.74302751, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 6965, + "time_per_iteration": 2.568942070007324 + }, + { + "auxiliary_loss_clip": 0.010587, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.01791954, + "balance_loss_mlp": 1.01772296, + "epoch": 0.4188185780850744, + "flos": 18655035903360.0, + "grad_norm": 1.494466307953777, + "language_loss": 0.72715104, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74817991, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41015625, + "step": 6966, + "time_per_iteration": 2.4081456661224365 + }, + { + "auxiliary_loss_clip": 0.01064488, + "auxiliary_loss_mlp": 0.01050314, + "balance_loss_clip": 1.02070272, + "balance_loss_mlp": 1.01992154, + "epoch": 0.41887870133774235, + "flos": 40331839824000.0, + "grad_norm": 2.3078952410206, + "language_loss": 0.72610259, + "learning_rate": 2.612549508603375e-06, + "loss": 0.74725062, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4453125, + "step": 6967, + "time_per_iteration": 2.591635227203369 + }, + { + "auxiliary_loss_clip": 0.01011459, + "auxiliary_loss_mlp": 0.01008652, + "balance_loss_clip": 1.00526643, + "balance_loss_mlp": 1.00294423, + "epoch": 0.4189388245904103, + "flos": 61368545278080.0, + "grad_norm": 0.6851648707951838, + "language_loss": 0.46343893, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48364002, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.08496094, + "step": 6968, + "time_per_iteration": 3.0140902996063232 + }, + { + "auxiliary_loss_clip": 0.01063515, + "auxiliary_loss_mlp": 0.01048976, + "balance_loss_clip": 1.01797009, + "balance_loss_mlp": 1.01885319, + "epoch": 0.4189989478430783, + "flos": 28214499605760.0, + "grad_norm": 1.5796098643700078, + "language_loss": 0.76588833, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.78701317, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44726562, + "step": 6969, + "time_per_iteration": 2.450751543045044 + }, + { + "auxiliary_loss_clip": 0.01060708, + "auxiliary_loss_mlp": 0.01051136, + "balance_loss_clip": 1.02245462, + "balance_loss_mlp": 1.01856804, + "epoch": 0.4190590710957463, + "flos": 24564888875520.0, + "grad_norm": 1.4778441490816057, + "language_loss": 0.81217384, + "learning_rate": 2.611437167992705e-06, + "loss": 0.83329225, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 6970, + "time_per_iteration": 2.401111602783203 + }, + { + "auxiliary_loss_clip": 0.01060219, + "auxiliary_loss_mlp": 0.0105032, + "balance_loss_clip": 1.02134085, + "balance_loss_mlp": 1.01853585, + "epoch": 0.41911919434841427, + "flos": 21724948552320.0, + "grad_norm": 2.3290168556379123, + "language_loss": 0.84543395, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.8665393, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41601562, + "step": 6971, + "time_per_iteration": 2.409578800201416 + }, + { + "auxiliary_loss_clip": 0.01062331, + "auxiliary_loss_mlp": 0.01046496, + "balance_loss_clip": 1.01779032, + "balance_loss_mlp": 1.0206356, + "epoch": 0.41917931760108224, + "flos": 17600623799040.0, + "grad_norm": 2.1112468824618604, + "language_loss": 0.76417398, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.78526223, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 6972, + "time_per_iteration": 2.3574726581573486 + }, + { + "auxiliary_loss_clip": 0.01061725, + "auxiliary_loss_mlp": 0.01047896, + "balance_loss_clip": 1.02001309, + "balance_loss_mlp": 1.01988935, + "epoch": 0.4192394408537502, + "flos": 37815160579200.0, + "grad_norm": 1.5729468666016286, + "language_loss": 0.7393887, + "learning_rate": 2.610324618710212e-06, + "loss": 0.76048493, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 6973, + "time_per_iteration": 2.5703933238983154 + }, + { + "auxiliary_loss_clip": 0.01068677, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.01607049, + "balance_loss_mlp": 1.0227493, + "epoch": 0.41929956410641817, + "flos": 23106741275520.0, + "grad_norm": 2.2760616564474137, + "language_loss": 0.75783718, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77897882, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.45898438, + "step": 6974, + "time_per_iteration": 2.411020040512085 + }, + { + "auxiliary_loss_clip": 0.01063611, + "auxiliary_loss_mlp": 0.01045151, + "balance_loss_clip": 1.01532531, + "balance_loss_mlp": 1.02172327, + "epoch": 0.41935968735908613, + "flos": 22523552058240.0, + "grad_norm": 1.6799829352751552, + "language_loss": 0.7412554, + "learning_rate": 2.609582803447259e-06, + "loss": 0.76234305, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41796875, + "step": 6975, + "time_per_iteration": 2.4225268363952637 + }, + { + "auxiliary_loss_clip": 0.01063206, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_clip": 1.02002764, + "balance_loss_mlp": 1.02099538, + "epoch": 0.4194198106117541, + "flos": 26869226031360.0, + "grad_norm": 1.4556015215150344, + "language_loss": 0.81380975, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83493316, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.421875, + "step": 6976, + "time_per_iteration": 2.5013701915740967 + }, + { + "auxiliary_loss_clip": 0.01063651, + "auxiliary_loss_mlp": 0.01045915, + "balance_loss_clip": 1.01649404, + "balance_loss_mlp": 1.02036476, + "epoch": 0.41947993386442206, + "flos": 19901365084800.0, + "grad_norm": 1.8857980186826795, + "language_loss": 0.70160383, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.72269952, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 6977, + "time_per_iteration": 2.405846357345581 + }, + { + "auxiliary_loss_clip": 0.01065397, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.01544857, + "balance_loss_mlp": 1.02267611, + "epoch": 0.41954005711709, + "flos": 17382940272000.0, + "grad_norm": 2.319037137216687, + "language_loss": 0.82640183, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.84750301, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 6978, + "time_per_iteration": 2.386373996734619 + }, + { + "auxiliary_loss_clip": 0.01064446, + "auxiliary_loss_mlp": 0.01047184, + "balance_loss_clip": 1.0162847, + "balance_loss_mlp": 1.01992953, + "epoch": 0.419600180369758, + "flos": 25002315699840.0, + "grad_norm": 1.6225564972976276, + "language_loss": 0.8387785, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.85989475, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 6979, + "time_per_iteration": 3.8772037029266357 + }, + { + "auxiliary_loss_clip": 0.01063812, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.01182783, + "balance_loss_mlp": 1.02083588, + "epoch": 0.41966030362242596, + "flos": 17382835537920.0, + "grad_norm": 1.7133137237807547, + "language_loss": 0.84760332, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.86866152, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 6980, + "time_per_iteration": 2.370277166366577 + }, + { + "auxiliary_loss_clip": 0.01066021, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_clip": 1.01755476, + "balance_loss_mlp": 1.02182448, + "epoch": 0.4197204268750939, + "flos": 22155288370560.0, + "grad_norm": 2.2909717757559287, + "language_loss": 0.80730355, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.828426, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.44140625, + "step": 6981, + "time_per_iteration": 3.872154474258423 + }, + { + "auxiliary_loss_clip": 0.01060206, + "auxiliary_loss_mlp": 0.01038021, + "balance_loss_clip": 1.0104003, + "balance_loss_mlp": 1.01860726, + "epoch": 0.4197805501277619, + "flos": 22083227591040.0, + "grad_norm": 1.6185662692359881, + "language_loss": 0.85261506, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.87359726, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41601562, + "step": 6982, + "time_per_iteration": 2.438020944595337 + }, + { + "auxiliary_loss_clip": 0.01065383, + "auxiliary_loss_mlp": 0.01052645, + "balance_loss_clip": 1.02177012, + "balance_loss_mlp": 1.02082002, + "epoch": 0.4198406733804299, + "flos": 26430996245760.0, + "grad_norm": 1.8423761448485036, + "language_loss": 0.57318807, + "learning_rate": 2.606614618903214e-06, + "loss": 0.59436834, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 6983, + "time_per_iteration": 2.422132730484009 + }, + { + "auxiliary_loss_clip": 0.01062134, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_clip": 1.01680636, + "balance_loss_mlp": 1.02054477, + "epoch": 0.4199007966330979, + "flos": 12530222490240.0, + "grad_norm": 1.7195865543058393, + "language_loss": 0.84566033, + "learning_rate": 2.606243492174471e-06, + "loss": 0.86673582, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41601562, + "step": 6984, + "time_per_iteration": 3.8606343269348145 + }, + { + "auxiliary_loss_clip": 0.01061905, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_clip": 1.01556802, + "balance_loss_mlp": 1.01999664, + "epoch": 0.41996091988576584, + "flos": 21761851726080.0, + "grad_norm": 1.7707844950745832, + "language_loss": 0.80654454, + "learning_rate": 2.605872342456914e-06, + "loss": 0.82760429, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 6985, + "time_per_iteration": 2.4222891330718994 + }, + { + "auxiliary_loss_clip": 0.0106457, + "auxiliary_loss_mlp": 0.01049479, + "balance_loss_clip": 1.01738811, + "balance_loss_mlp": 1.0194484, + "epoch": 0.4200210431384338, + "flos": 26540728248960.0, + "grad_norm": 1.553942388199177, + "language_loss": 0.79432082, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.81546134, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.45117188, + "step": 6986, + "time_per_iteration": 2.461432695388794 + }, + { + "auxiliary_loss_clip": 0.01058067, + "auxiliary_loss_mlp": 0.01041596, + "balance_loss_clip": 1.01537049, + "balance_loss_mlp": 1.01814258, + "epoch": 0.42008116639110177, + "flos": 26794651633920.0, + "grad_norm": 1.657994269881226, + "language_loss": 0.73325145, + "learning_rate": 2.605129974111655e-06, + "loss": 0.75424814, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 6987, + "time_per_iteration": 2.429535150527954 + }, + { + "auxiliary_loss_clip": 0.01063268, + "auxiliary_loss_mlp": 0.01045899, + "balance_loss_clip": 1.01644289, + "balance_loss_mlp": 1.02033055, + "epoch": 0.42014128964376973, + "flos": 32085983226240.0, + "grad_norm": 1.5497007706244421, + "language_loss": 0.76079369, + "learning_rate": 2.604758755512104e-06, + "loss": 0.78188539, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 6988, + "time_per_iteration": 2.6145107746124268 + }, + { + "auxiliary_loss_clip": 0.01063363, + "auxiliary_loss_mlp": 0.01050654, + "balance_loss_clip": 1.02030313, + "balance_loss_mlp": 1.01929331, + "epoch": 0.4202014128964377, + "flos": 26465979294720.0, + "grad_norm": 1.487907087922204, + "language_loss": 0.75290453, + "learning_rate": 2.60438751398004e-06, + "loss": 0.77404475, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44140625, + "step": 6989, + "time_per_iteration": 2.44108510017395 + }, + { + "auxiliary_loss_clip": 0.01060713, + "auxiliary_loss_mlp": 0.01047814, + "balance_loss_clip": 1.01856017, + "balance_loss_mlp": 1.01807451, + "epoch": 0.42026153614910566, + "flos": 13400537662080.0, + "grad_norm": 1.9392142728509727, + "language_loss": 0.72382712, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.74491239, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 6990, + "time_per_iteration": 2.399506092071533 + }, + { + "auxiliary_loss_clip": 0.01012237, + "auxiliary_loss_mlp": 0.01003781, + "balance_loss_clip": 1.00056255, + "balance_loss_mlp": 1.00379598, + "epoch": 0.42032165940177363, + "flos": 60247413832320.0, + "grad_norm": 0.8271413923703486, + "language_loss": 0.6055088, + "learning_rate": 2.603644962174685e-06, + "loss": 0.625669, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.03222656, + "router_z_loss_mlp": 0.08447266, + "step": 6991, + "time_per_iteration": 2.904433012008667 + }, + { + "auxiliary_loss_clip": 0.01064613, + "auxiliary_loss_mlp": 0.01046616, + "balance_loss_clip": 1.01690865, + "balance_loss_mlp": 1.02047431, + "epoch": 0.4203817826544416, + "flos": 24534060278400.0, + "grad_norm": 1.4900120713827931, + "language_loss": 0.8417905, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.86290276, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44140625, + "step": 6992, + "time_per_iteration": 2.4334983825683594 + }, + { + "auxiliary_loss_clip": 0.01011422, + "auxiliary_loss_mlp": 0.01002517, + "balance_loss_clip": 0.99936998, + "balance_loss_mlp": 1.00279546, + "epoch": 0.42044190590710956, + "flos": 58817965236480.0, + "grad_norm": 0.8160422846959353, + "language_loss": 0.65585411, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67599344, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.08642578, + "step": 6993, + "time_per_iteration": 3.0211689472198486 + }, + { + "auxiliary_loss_clip": 0.01065149, + "auxiliary_loss_mlp": 0.01047797, + "balance_loss_clip": 1.01475227, + "balance_loss_mlp": 1.01917362, + "epoch": 0.4205020291597775, + "flos": 16435118148480.0, + "grad_norm": 2.284467228835431, + "language_loss": 0.84618032, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.86730981, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.4609375, + "step": 6994, + "time_per_iteration": 2.350649118423462 + }, + { + "auxiliary_loss_clip": 0.01062557, + "auxiliary_loss_mlp": 0.01044291, + "balance_loss_clip": 1.01552534, + "balance_loss_mlp": 1.02152348, + "epoch": 0.4205621524124455, + "flos": 18404673477120.0, + "grad_norm": 1.513573976131421, + "language_loss": 0.80076993, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.82183838, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41015625, + "step": 6995, + "time_per_iteration": 2.388667345046997 + }, + { + "auxiliary_loss_clip": 0.0105826, + "auxiliary_loss_mlp": 0.01045336, + "balance_loss_clip": 1.01906216, + "balance_loss_mlp": 1.01777363, + "epoch": 0.4206222756651135, + "flos": 25518925221120.0, + "grad_norm": 1.3689152432876912, + "language_loss": 0.81123638, + "learning_rate": 2.60178818232786e-06, + "loss": 0.83227229, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40429688, + "step": 6996, + "time_per_iteration": 2.398460626602173 + }, + { + "auxiliary_loss_clip": 0.01062082, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.01214528, + "balance_loss_mlp": 1.01966763, + "epoch": 0.4206823989177815, + "flos": 15303443472000.0, + "grad_norm": 1.8568914992380874, + "language_loss": 0.76873147, + "learning_rate": 2.601416757842559e-06, + "loss": 0.78975832, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42578125, + "step": 6997, + "time_per_iteration": 2.381293535232544 + }, + { + "auxiliary_loss_clip": 0.01062774, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.01965857, + "balance_loss_mlp": 1.02018356, + "epoch": 0.42074252217044944, + "flos": 15553352050560.0, + "grad_norm": 1.7529189448676497, + "language_loss": 0.75935477, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78047031, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 6998, + "time_per_iteration": 2.371706485748291 + }, + { + "auxiliary_loss_clip": 0.01064084, + "auxiliary_loss_mlp": 0.01047084, + "balance_loss_clip": 1.01682878, + "balance_loss_mlp": 1.02066755, + "epoch": 0.4208026454231174, + "flos": 26144533607040.0, + "grad_norm": 1.8321725656563574, + "language_loss": 0.77930397, + "learning_rate": 2.60067384046869e-06, + "loss": 0.80041564, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43359375, + "step": 6999, + "time_per_iteration": 2.4788832664489746 + }, + { + "auxiliary_loss_clip": 0.01061759, + "auxiliary_loss_mlp": 0.01046425, + "balance_loss_clip": 1.01819611, + "balance_loss_mlp": 1.02035689, + "epoch": 0.42086276867578537, + "flos": 23548985867520.0, + "grad_norm": 1.7182368435645439, + "language_loss": 0.65083271, + "learning_rate": 2.600302347608295e-06, + "loss": 0.67191452, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4140625, + "step": 7000, + "time_per_iteration": 2.42244029045105 + }, + { + "auxiliary_loss_clip": 0.01062435, + "auxiliary_loss_mlp": 0.01044243, + "balance_loss_clip": 1.01539421, + "balance_loss_mlp": 1.02013206, + "epoch": 0.42092289192845334, + "flos": 18112450464000.0, + "grad_norm": 1.575725068459106, + "language_loss": 0.77526605, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.79633284, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 7001, + "time_per_iteration": 2.4829678535461426 + }, + { + "auxiliary_loss_clip": 0.01063289, + "auxiliary_loss_mlp": 0.0103973, + "balance_loss_clip": 1.01326549, + "balance_loss_mlp": 1.02142596, + "epoch": 0.4209830151811213, + "flos": 20005685827200.0, + "grad_norm": 1.5829303879582668, + "language_loss": 0.87409461, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.89512479, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41796875, + "step": 7002, + "time_per_iteration": 2.427133321762085 + }, + { + "auxiliary_loss_clip": 0.01061203, + "auxiliary_loss_mlp": 0.01041203, + "balance_loss_clip": 1.01378441, + "balance_loss_mlp": 1.01958513, + "epoch": 0.42104313843378927, + "flos": 21977929330560.0, + "grad_norm": 1.9552841722307295, + "language_loss": 0.68808049, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.7091046, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41601562, + "step": 7003, + "time_per_iteration": 2.377445936203003 + }, + { + "auxiliary_loss_clip": 0.01065063, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.01662493, + "balance_loss_mlp": 1.02161098, + "epoch": 0.42110326168645723, + "flos": 25442884546560.0, + "grad_norm": 1.887311179039325, + "language_loss": 0.7955147, + "learning_rate": 2.598816148672344e-06, + "loss": 0.81663656, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 7004, + "time_per_iteration": 2.449510335922241 + }, + { + "auxiliary_loss_clip": 0.01060524, + "auxiliary_loss_mlp": 0.01049462, + "balance_loss_clip": 1.02211523, + "balance_loss_mlp": 1.02017879, + "epoch": 0.4211633849391252, + "flos": 17821588993920.0, + "grad_norm": 1.557279867399258, + "language_loss": 0.69982749, + "learning_rate": 2.59844454213521e-06, + "loss": 0.72092736, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 7005, + "time_per_iteration": 2.365572214126587 + }, + { + "auxiliary_loss_clip": 0.01063077, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.01674843, + "balance_loss_mlp": 1.02084613, + "epoch": 0.42122350819179316, + "flos": 16281710167680.0, + "grad_norm": 1.7585008069990156, + "language_loss": 0.74249858, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.7635479, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.421875, + "step": 7006, + "time_per_iteration": 2.3916828632354736 + }, + { + "auxiliary_loss_clip": 0.01062326, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02002716, + "balance_loss_mlp": 1.02000809, + "epoch": 0.4212836314444611, + "flos": 19644858259200.0, + "grad_norm": 2.281167554569502, + "language_loss": 0.72638774, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.74747598, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 7007, + "time_per_iteration": 2.360018253326416 + }, + { + "auxiliary_loss_clip": 0.01062487, + "auxiliary_loss_mlp": 0.01046625, + "balance_loss_clip": 1.02061307, + "balance_loss_mlp": 1.01995039, + "epoch": 0.4213437546971291, + "flos": 18368049594240.0, + "grad_norm": 1.8228566319821353, + "language_loss": 0.83360696, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.85469806, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.42382812, + "step": 7008, + "time_per_iteration": 2.393033742904663 + }, + { + "auxiliary_loss_clip": 0.01062047, + "auxiliary_loss_mlp": 0.01047116, + "balance_loss_clip": 1.01891077, + "balance_loss_mlp": 1.01977515, + "epoch": 0.42140387794979706, + "flos": 27703406079360.0, + "grad_norm": 1.8363189849385069, + "language_loss": 0.72701329, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74810487, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 7009, + "time_per_iteration": 2.417668581008911 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.01045258, + "balance_loss_clip": 1.0193541, + "balance_loss_mlp": 1.01754022, + "epoch": 0.4214640012024651, + "flos": 28145825228160.0, + "grad_norm": 2.2468253968349585, + "language_loss": 0.67614365, + "learning_rate": 2.596586169335243e-06, + "loss": 0.69718593, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.4140625, + "step": 7010, + "time_per_iteration": 2.4362387657165527 + }, + { + "auxiliary_loss_clip": 0.0105871, + "auxiliary_loss_mlp": 0.01052448, + "balance_loss_clip": 1.02531552, + "balance_loss_mlp": 1.01784563, + "epoch": 0.42152412445513304, + "flos": 22996311045120.0, + "grad_norm": 1.402197107947976, + "language_loss": 0.7320298, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.7531414, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40820312, + "step": 7011, + "time_per_iteration": 2.3694143295288086 + }, + { + "auxiliary_loss_clip": 0.01010296, + "auxiliary_loss_mlp": 0.01020274, + "balance_loss_clip": 1.01710355, + "balance_loss_mlp": 1.0018754, + "epoch": 0.421584247707801, + "flos": 63746549136000.0, + "grad_norm": 0.8115876951864184, + "language_loss": 0.54451984, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56482559, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.03173828, + "router_z_loss_mlp": 0.08398438, + "step": 7012, + "time_per_iteration": 2.9188554286956787 + }, + { + "auxiliary_loss_clip": 0.01060346, + "auxiliary_loss_mlp": 0.01046754, + "balance_loss_clip": 1.01997972, + "balance_loss_mlp": 1.01871061, + "epoch": 0.421644370960469, + "flos": 24313514019840.0, + "grad_norm": 1.3181305507484915, + "language_loss": 0.79576981, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.81684077, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41601562, + "step": 7013, + "time_per_iteration": 2.404327630996704 + }, + { + "auxiliary_loss_clip": 0.01061422, + "auxiliary_loss_mlp": 0.01042726, + "balance_loss_clip": 1.0139246, + "balance_loss_mlp": 1.01845253, + "epoch": 0.42170449421313694, + "flos": 23439568066560.0, + "grad_norm": 1.6364213695517416, + "language_loss": 0.82028139, + "learning_rate": 2.595099063803787e-06, + "loss": 0.8413229, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 7014, + "time_per_iteration": 2.4892189502716064 + }, + { + "auxiliary_loss_clip": 0.01061255, + "auxiliary_loss_mlp": 0.01049952, + "balance_loss_clip": 1.02153254, + "balance_loss_mlp": 1.01925647, + "epoch": 0.4217646174658049, + "flos": 23694364235520.0, + "grad_norm": 1.4861353046465025, + "language_loss": 0.79063839, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.81175047, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41992188, + "step": 7015, + "time_per_iteration": 2.423894166946411 + }, + { + "auxiliary_loss_clip": 0.01064278, + "auxiliary_loss_mlp": 0.0105011, + "balance_loss_clip": 1.01987886, + "balance_loss_mlp": 1.02072239, + "epoch": 0.42182474071847287, + "flos": 24970439761920.0, + "grad_norm": 1.3070260407391678, + "language_loss": 0.83131444, + "learning_rate": 2.594355375584368e-06, + "loss": 0.85245836, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43554688, + "step": 7016, + "time_per_iteration": 2.4258899688720703 + }, + { + "auxiliary_loss_clip": 0.01065702, + "auxiliary_loss_mlp": 0.01050262, + "balance_loss_clip": 1.01786137, + "balance_loss_mlp": 1.0220964, + "epoch": 0.42188486397114083, + "flos": 22855540976640.0, + "grad_norm": 1.8213109615779581, + "language_loss": 0.6962707, + "learning_rate": 2.593983497660586e-06, + "loss": 0.71743035, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.43554688, + "step": 7017, + "time_per_iteration": 2.377586841583252 + }, + { + "auxiliary_loss_clip": 0.01019591, + "auxiliary_loss_mlp": 0.01007234, + "balance_loss_clip": 1.00401568, + "balance_loss_mlp": 1.01088893, + "epoch": 0.4219449872238088, + "flos": 66972139004160.0, + "grad_norm": 0.6872751907513095, + "language_loss": 0.59599411, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61626244, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.03222656, + "router_z_loss_mlp": 0.08691406, + "step": 7018, + "time_per_iteration": 4.4762938022613525 + }, + { + "auxiliary_loss_clip": 0.01065003, + "auxiliary_loss_mlp": 0.01042177, + "balance_loss_clip": 1.01438904, + "balance_loss_mlp": 1.02191567, + "epoch": 0.42200511047647676, + "flos": 13114528871040.0, + "grad_norm": 1.8959677751689596, + "language_loss": 0.77679139, + "learning_rate": 2.593239674255382e-06, + "loss": 0.79786319, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.43164062, + "step": 7019, + "time_per_iteration": 2.403743028640747 + }, + { + "auxiliary_loss_clip": 0.0106316, + "auxiliary_loss_mlp": 0.01047833, + "balance_loss_clip": 1.01892447, + "balance_loss_mlp": 1.02111983, + "epoch": 0.42206523372914473, + "flos": 13990325126400.0, + "grad_norm": 1.946900206351706, + "language_loss": 0.70781988, + "learning_rate": 2.592867728802166e-06, + "loss": 0.72892982, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 7020, + "time_per_iteration": 5.127791166305542 + }, + { + "auxiliary_loss_clip": 0.01062347, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_clip": 1.02203631, + "balance_loss_mlp": 1.02077115, + "epoch": 0.4221253569818127, + "flos": 21941305447680.0, + "grad_norm": 1.6199512194630543, + "language_loss": 0.82108384, + "learning_rate": 2.592495760867347e-06, + "loss": 0.84220821, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41601562, + "step": 7021, + "time_per_iteration": 2.388517379760742 + }, + { + "auxiliary_loss_clip": 0.01064728, + "auxiliary_loss_mlp": 0.01053881, + "balance_loss_clip": 1.02425718, + "balance_loss_mlp": 1.02158654, + "epoch": 0.42218548023448066, + "flos": 32191351309440.0, + "grad_norm": 1.4524928738282323, + "language_loss": 0.70925677, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.73044288, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 7022, + "time_per_iteration": 2.499687671661377 + }, + { + "auxiliary_loss_clip": 0.01061413, + "auxiliary_loss_mlp": 0.01048484, + "balance_loss_clip": 1.02370083, + "balance_loss_mlp": 1.02220476, + "epoch": 0.4222456034871487, + "flos": 30117614883840.0, + "grad_norm": 1.6354897982622338, + "language_loss": 0.68709046, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.70818943, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 7023, + "time_per_iteration": 2.4555563926696777 + }, + { + "auxiliary_loss_clip": 0.01064604, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_clip": 1.03083456, + "balance_loss_mlp": 1.02218473, + "epoch": 0.42230572673981664, + "flos": 22126798834560.0, + "grad_norm": 1.5692864532023039, + "language_loss": 0.70869625, + "learning_rate": 2.591379722314322e-06, + "loss": 0.72996753, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42382812, + "step": 7024, + "time_per_iteration": 3.8536741733551025 + }, + { + "auxiliary_loss_clip": 0.01064715, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_clip": 1.03173828, + "balance_loss_mlp": 1.02153623, + "epoch": 0.4223658499924846, + "flos": 22053970005120.0, + "grad_norm": 1.4715617742923865, + "language_loss": 0.77560329, + "learning_rate": 2.591007664594147e-06, + "loss": 0.79685432, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.43164062, + "step": 7025, + "time_per_iteration": 2.4837124347686768 + }, + { + "auxiliary_loss_clip": 0.01059983, + "auxiliary_loss_mlp": 0.01058261, + "balance_loss_clip": 1.02886426, + "balance_loss_mlp": 1.01972938, + "epoch": 0.4224259732451526, + "flos": 20409735525120.0, + "grad_norm": 1.8295413582628264, + "language_loss": 0.81112993, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.83231235, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40234375, + "step": 7026, + "time_per_iteration": 2.4410085678100586 + }, + { + "auxiliary_loss_clip": 0.01018295, + "auxiliary_loss_mlp": 0.01026169, + "balance_loss_clip": 1.02249777, + "balance_loss_mlp": 1.00927055, + "epoch": 0.42248609649782054, + "flos": 62843380508160.0, + "grad_norm": 0.7631552107929441, + "language_loss": 0.62097669, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64142132, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.09033203, + "step": 7027, + "time_per_iteration": 3.111438751220703 + }, + { + "auxiliary_loss_clip": 0.01060924, + "auxiliary_loss_mlp": 0.01053044, + "balance_loss_clip": 1.02520859, + "balance_loss_mlp": 1.01867223, + "epoch": 0.4225462197504885, + "flos": 26248749615360.0, + "grad_norm": 2.077638704611507, + "language_loss": 0.72302473, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.74416441, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.421875, + "step": 7028, + "time_per_iteration": 2.433840274810791 + }, + { + "auxiliary_loss_clip": 0.01063489, + "auxiliary_loss_mlp": 0.0105842, + "balance_loss_clip": 1.02685344, + "balance_loss_mlp": 1.02018905, + "epoch": 0.42260634300315647, + "flos": 20520898894080.0, + "grad_norm": 1.9173418893414949, + "language_loss": 0.83507717, + "learning_rate": 2.589519209743846e-06, + "loss": 0.8562963, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43359375, + "step": 7029, + "time_per_iteration": 2.409543514251709 + }, + { + "auxiliary_loss_clip": 0.01064928, + "auxiliary_loss_mlp": 0.01051178, + "balance_loss_clip": 1.02071965, + "balance_loss_mlp": 1.02011323, + "epoch": 0.42266646625582444, + "flos": 24315573790080.0, + "grad_norm": 2.0046448697178505, + "language_loss": 0.76663566, + "learning_rate": 2.589147040109424e-06, + "loss": 0.78779674, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44726562, + "step": 7030, + "time_per_iteration": 2.4128429889678955 + }, + { + "auxiliary_loss_clip": 0.010608, + "auxiliary_loss_mlp": 0.01052915, + "balance_loss_clip": 1.02158689, + "balance_loss_mlp": 1.01831996, + "epoch": 0.4227265895084924, + "flos": 24203083789440.0, + "grad_norm": 2.6018484797626518, + "language_loss": 0.87779951, + "learning_rate": 2.588774848134486e-06, + "loss": 0.89893675, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.42382812, + "step": 7031, + "time_per_iteration": 2.404933452606201 + }, + { + "auxiliary_loss_clip": 0.01063034, + "auxiliary_loss_mlp": 0.01050742, + "balance_loss_clip": 1.01841271, + "balance_loss_mlp": 1.02001095, + "epoch": 0.42278671276116037, + "flos": 16908819742080.0, + "grad_norm": 2.006021257473933, + "language_loss": 0.75040734, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.77154511, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4296875, + "step": 7032, + "time_per_iteration": 2.3725264072418213 + }, + { + "auxiliary_loss_clip": 0.01063727, + "auxiliary_loss_mlp": 0.01047932, + "balance_loss_clip": 1.01988173, + "balance_loss_mlp": 1.01990557, + "epoch": 0.42284683601382833, + "flos": 25409891445120.0, + "grad_norm": 1.4706723715567316, + "language_loss": 0.71996534, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.74108195, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4375, + "step": 7033, + "time_per_iteration": 2.4294841289520264 + }, + { + "auxiliary_loss_clip": 0.0106351, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.01541352, + "balance_loss_mlp": 1.02075684, + "epoch": 0.4229069592664963, + "flos": 23039184061440.0, + "grad_norm": 1.7790354904096468, + "language_loss": 0.91906285, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.94014537, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42773438, + "step": 7034, + "time_per_iteration": 2.489312171936035 + }, + { + "auxiliary_loss_clip": 0.01064055, + "auxiliary_loss_mlp": 0.01041555, + "balance_loss_clip": 1.01462579, + "balance_loss_mlp": 1.02187049, + "epoch": 0.42296708251916426, + "flos": 26066258605440.0, + "grad_norm": 1.5861115438622075, + "language_loss": 0.78666127, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.80771732, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.421875, + "step": 7035, + "time_per_iteration": 2.4245872497558594 + }, + { + "auxiliary_loss_clip": 0.01066887, + "auxiliary_loss_mlp": 0.01052887, + "balance_loss_clip": 1.02208376, + "balance_loss_mlp": 1.02399528, + "epoch": 0.4230272057718323, + "flos": 19457514570240.0, + "grad_norm": 1.8551081219813903, + "language_loss": 0.83682704, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.85802484, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4296875, + "step": 7036, + "time_per_iteration": 2.3810019493103027 + }, + { + "auxiliary_loss_clip": 0.01066751, + "auxiliary_loss_mlp": 0.01042755, + "balance_loss_clip": 1.014467, + "balance_loss_mlp": 1.02627826, + "epoch": 0.42308732902450025, + "flos": 22382188496640.0, + "grad_norm": 1.6054841089416787, + "language_loss": 0.71532083, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73641586, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40429688, + "step": 7037, + "time_per_iteration": 2.426670789718628 + }, + { + "auxiliary_loss_clip": 0.01068135, + "auxiliary_loss_mlp": 0.0104905, + "balance_loss_clip": 1.01803136, + "balance_loss_mlp": 1.02519238, + "epoch": 0.4231474522771682, + "flos": 20994391019520.0, + "grad_norm": 1.5717391903505562, + "language_loss": 0.79313028, + "learning_rate": 2.586168879961155e-06, + "loss": 0.81430209, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4296875, + "step": 7038, + "time_per_iteration": 2.51074481010437 + }, + { + "auxiliary_loss_clip": 0.01072304, + "auxiliary_loss_mlp": 0.01063183, + "balance_loss_clip": 1.02734828, + "balance_loss_mlp": 1.02565825, + "epoch": 0.4232075755298362, + "flos": 14974980600960.0, + "grad_norm": 2.1605150038157994, + "language_loss": 0.69722795, + "learning_rate": 2.585796509770259e-06, + "loss": 0.71858281, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 0.46679688, + "step": 7039, + "time_per_iteration": 2.361018180847168 + }, + { + "auxiliary_loss_clip": 0.01070883, + "auxiliary_loss_mlp": 0.01050383, + "balance_loss_clip": 1.01947236, + "balance_loss_mlp": 1.02539158, + "epoch": 0.42326769878250414, + "flos": 24531581571840.0, + "grad_norm": 1.8570197629135705, + "language_loss": 0.76564413, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78685677, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45507812, + "step": 7040, + "time_per_iteration": 2.4396746158599854 + }, + { + "auxiliary_loss_clip": 0.01071144, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.0138824, + "balance_loss_mlp": 1.02726865, + "epoch": 0.4233278220351721, + "flos": 26869086385920.0, + "grad_norm": 1.764787542755636, + "language_loss": 0.66907573, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.69023144, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43945312, + "step": 7041, + "time_per_iteration": 2.436676025390625 + }, + { + "auxiliary_loss_clip": 0.01068226, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.01509142, + "balance_loss_mlp": 1.02408457, + "epoch": 0.4233879452878401, + "flos": 42813256728960.0, + "grad_norm": 1.7470933615250035, + "language_loss": 0.7536785, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.77481425, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 7042, + "time_per_iteration": 2.588362693786621 + }, + { + "auxiliary_loss_clip": 0.01064891, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.02335656, + "epoch": 0.42344806854050804, + "flos": 25227819371520.0, + "grad_norm": 1.2610364600599953, + "language_loss": 0.82831442, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84944767, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.4140625, + "step": 7043, + "time_per_iteration": 2.441255569458008 + }, + { + "auxiliary_loss_clip": 0.01068063, + "auxiliary_loss_mlp": 0.01053393, + "balance_loss_clip": 1.02189827, + "balance_loss_mlp": 1.02510273, + "epoch": 0.423508191793176, + "flos": 22777859468160.0, + "grad_norm": 2.2912522014569174, + "language_loss": 0.66761816, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.68883276, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4296875, + "step": 7044, + "time_per_iteration": 2.408329486846924 + }, + { + "auxiliary_loss_clip": 0.01067938, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.02048945, + "balance_loss_mlp": 1.02383447, + "epoch": 0.42356831504584397, + "flos": 34636179242880.0, + "grad_norm": 3.141586302742389, + "language_loss": 0.76606703, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.78727174, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44140625, + "step": 7045, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01066042, + "auxiliary_loss_mlp": 0.0104476, + "balance_loss_clip": 1.0176158, + "balance_loss_mlp": 1.02466428, + "epoch": 0.42362843829851193, + "flos": 17595980588160.0, + "grad_norm": 2.200874456923916, + "language_loss": 0.82526577, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.8463738, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 7046, + "time_per_iteration": 2.388711929321289 + }, + { + "auxiliary_loss_clip": 0.01065554, + "auxiliary_loss_mlp": 0.01049799, + "balance_loss_clip": 1.02011561, + "balance_loss_mlp": 1.02195263, + "epoch": 0.4236885615511799, + "flos": 22564574772480.0, + "grad_norm": 2.153288002744184, + "language_loss": 0.77982473, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.8009783, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4375, + "step": 7047, + "time_per_iteration": 2.393549680709839 + }, + { + "auxiliary_loss_clip": 0.01063904, + "auxiliary_loss_mlp": 0.01052025, + "balance_loss_clip": 1.02356982, + "balance_loss_mlp": 1.02246678, + "epoch": 0.42374868480384786, + "flos": 26468004153600.0, + "grad_norm": 1.676196671168699, + "language_loss": 0.69262099, + "learning_rate": 2.582444180141098e-06, + "loss": 0.71378028, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 7048, + "time_per_iteration": 2.466933250427246 + }, + { + "auxiliary_loss_clip": 0.01063365, + "auxiliary_loss_mlp": 0.01048911, + "balance_loss_clip": 1.01946676, + "balance_loss_mlp": 1.02057207, + "epoch": 0.4238088080565159, + "flos": 20369341215360.0, + "grad_norm": 1.731967014770645, + "language_loss": 0.79367673, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.81479955, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42773438, + "step": 7049, + "time_per_iteration": 2.375509738922119 + }, + { + "auxiliary_loss_clip": 0.01063665, + "auxiliary_loss_mlp": 0.0105995, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.02015054, + "epoch": 0.42386893130918385, + "flos": 21171226389120.0, + "grad_norm": 1.7579478155730783, + "language_loss": 0.84000552, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.8612417, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.43359375, + "step": 7050, + "time_per_iteration": 2.4404566287994385 + }, + { + "auxiliary_loss_clip": 0.01060572, + "auxiliary_loss_mlp": 0.01052409, + "balance_loss_clip": 1.02305937, + "balance_loss_mlp": 1.01820457, + "epoch": 0.4239290545618518, + "flos": 17674674526080.0, + "grad_norm": 2.156134022157503, + "language_loss": 0.75179565, + "learning_rate": 2.581326338868687e-06, + "loss": 0.77292544, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42382812, + "step": 7051, + "time_per_iteration": 2.3480515480041504 + }, + { + "auxiliary_loss_clip": 0.01059738, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.01925421, + "balance_loss_mlp": 1.0189091, + "epoch": 0.4239891778145198, + "flos": 24313409285760.0, + "grad_norm": 1.3920812039497787, + "language_loss": 0.8712281, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.89231962, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.40820312, + "step": 7052, + "time_per_iteration": 2.528925895690918 + }, + { + "auxiliary_loss_clip": 0.01062917, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.02210057, + "balance_loss_mlp": 1.0198946, + "epoch": 0.42404930106718774, + "flos": 20557383131520.0, + "grad_norm": 1.4915131728520232, + "language_loss": 0.73435211, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.75549793, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 7053, + "time_per_iteration": 2.3708138465881348 + }, + { + "auxiliary_loss_clip": 0.01061124, + "auxiliary_loss_mlp": 0.01046855, + "balance_loss_clip": 1.01872194, + "balance_loss_mlp": 1.01989055, + "epoch": 0.4241094243198557, + "flos": 22307020606080.0, + "grad_norm": 2.1553594058614305, + "language_loss": 0.83547509, + "learning_rate": 2.580208299200704e-06, + "loss": 0.85655487, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 7054, + "time_per_iteration": 2.435666084289551 + }, + { + "auxiliary_loss_clip": 0.01013222, + "auxiliary_loss_mlp": 0.01007898, + "balance_loss_clip": 1.00420284, + "balance_loss_mlp": 1.00473595, + "epoch": 0.4241695475725237, + "flos": 70609111822080.0, + "grad_norm": 0.7823861302483855, + "language_loss": 0.60518193, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62539309, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.08496094, + "step": 7055, + "time_per_iteration": 2.9726390838623047 + }, + { + "auxiliary_loss_clip": 0.01062567, + "auxiliary_loss_mlp": 0.01051945, + "balance_loss_clip": 1.02043796, + "balance_loss_mlp": 1.0195874, + "epoch": 0.42422967082519164, + "flos": 14026599895680.0, + "grad_norm": 2.629165747488401, + "language_loss": 0.78747523, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.80862033, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4296875, + "step": 7056, + "time_per_iteration": 2.391479969024658 + }, + { + "auxiliary_loss_clip": 0.01066979, + "auxiliary_loss_mlp": 0.01043711, + "balance_loss_clip": 1.01200187, + "balance_loss_mlp": 1.0214833, + "epoch": 0.4242897940778596, + "flos": 22344447450240.0, + "grad_norm": 1.7773338764018567, + "language_loss": 0.8555789, + "learning_rate": 2.579090061518714e-06, + "loss": 0.87668586, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45507812, + "step": 7057, + "time_per_iteration": 2.3921215534210205 + }, + { + "auxiliary_loss_clip": 0.01066243, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_clip": 1.01477969, + "balance_loss_mlp": 1.02182913, + "epoch": 0.42434991733052757, + "flos": 22594914610560.0, + "grad_norm": 2.27188235587229, + "language_loss": 0.84127378, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.86241418, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.44335938, + "step": 7058, + "time_per_iteration": 3.8619606494903564 + }, + { + "auxiliary_loss_clip": 0.0106199, + "auxiliary_loss_mlp": 0.01044703, + "balance_loss_clip": 1.01811957, + "balance_loss_mlp": 1.02049279, + "epoch": 0.42441004058319554, + "flos": 20010398860800.0, + "grad_norm": 1.9325642564035053, + "language_loss": 0.81690085, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.83796775, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41601562, + "step": 7059, + "time_per_iteration": 3.8843259811401367 + }, + { + "auxiliary_loss_clip": 0.01065092, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_clip": 1.01741779, + "balance_loss_mlp": 1.02074051, + "epoch": 0.4244701638358635, + "flos": 11144205492480.0, + "grad_norm": 2.677350905218339, + "language_loss": 0.71509683, + "learning_rate": 2.57797162620435e-06, + "loss": 0.73624533, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.44335938, + "step": 7060, + "time_per_iteration": 3.8871965408325195 + }, + { + "auxiliary_loss_clip": 0.01064138, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.01487625, + "balance_loss_mlp": 1.02203369, + "epoch": 0.42453028708853147, + "flos": 23986622160000.0, + "grad_norm": 1.5612158493837427, + "language_loss": 0.76720655, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78828514, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 7061, + "time_per_iteration": 2.448136806488037 + }, + { + "auxiliary_loss_clip": 0.01065418, + "auxiliary_loss_mlp": 0.01046285, + "balance_loss_clip": 1.01601768, + "balance_loss_mlp": 1.02209723, + "epoch": 0.42459041034119943, + "flos": 18405336792960.0, + "grad_norm": 2.0189664063784742, + "language_loss": 0.74846333, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.76958036, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43359375, + "step": 7062, + "time_per_iteration": 2.4076755046844482 + }, + { + "auxiliary_loss_clip": 0.0106515, + "auxiliary_loss_mlp": 0.01056968, + "balance_loss_clip": 1.02690351, + "balance_loss_mlp": 1.02178645, + "epoch": 0.42465053359386745, + "flos": 20956999086720.0, + "grad_norm": 1.8102841021843734, + "language_loss": 0.67076635, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.69198751, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43359375, + "step": 7063, + "time_per_iteration": 2.3859846591949463 + }, + { + "auxiliary_loss_clip": 0.01060866, + "auxiliary_loss_mlp": 0.01040528, + "balance_loss_clip": 1.01345563, + "balance_loss_mlp": 1.02040422, + "epoch": 0.4247106568465354, + "flos": 33104888611200.0, + "grad_norm": 1.5007505672677646, + "language_loss": 0.79559779, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.81661177, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40429688, + "step": 7064, + "time_per_iteration": 3.8425300121307373 + }, + { + "auxiliary_loss_clip": 0.01062855, + "auxiliary_loss_mlp": 0.01045189, + "balance_loss_clip": 1.01545787, + "balance_loss_mlp": 1.02071512, + "epoch": 0.4247707800992034, + "flos": 20045905580160.0, + "grad_norm": 1.8425283830935937, + "language_loss": 0.76477951, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.78586, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 7065, + "time_per_iteration": 2.4308419227600098 + }, + { + "auxiliary_loss_clip": 0.0106381, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_clip": 1.0203644, + "balance_loss_mlp": 1.02170467, + "epoch": 0.42483090335187135, + "flos": 22383968976000.0, + "grad_norm": 1.3603584040310257, + "language_loss": 0.73052561, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.75165868, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41992188, + "step": 7066, + "time_per_iteration": 2.4468142986297607 + }, + { + "auxiliary_loss_clip": 0.01065623, + "auxiliary_loss_mlp": 0.01053483, + "balance_loss_clip": 1.02238166, + "balance_loss_mlp": 1.02180481, + "epoch": 0.4248910266045393, + "flos": 21355881903360.0, + "grad_norm": 1.9969188210026327, + "language_loss": 0.81012046, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.83131158, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4375, + "step": 7067, + "time_per_iteration": 2.3843860626220703 + }, + { + "auxiliary_loss_clip": 0.01015624, + "auxiliary_loss_mlp": 0.0100753, + "balance_loss_clip": 1.00435925, + "balance_loss_mlp": 1.00684357, + "epoch": 0.4249511498572073, + "flos": 64004976086400.0, + "grad_norm": 0.9175460588093126, + "language_loss": 0.63614625, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65637779, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.03173828, + "router_z_loss_mlp": 0.08789062, + "step": 7068, + "time_per_iteration": 2.985718011856079 + }, + { + "auxiliary_loss_clip": 0.01062157, + "auxiliary_loss_mlp": 0.01045676, + "balance_loss_clip": 1.01546812, + "balance_loss_mlp": 1.01923656, + "epoch": 0.42501127310987524, + "flos": 19606104783360.0, + "grad_norm": 1.8443240715664695, + "language_loss": 0.74089432, + "learning_rate": 2.574615138284361e-06, + "loss": 0.76197267, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 7069, + "time_per_iteration": 2.3807029724121094 + }, + { + "auxiliary_loss_clip": 0.01063208, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.02065122, + "balance_loss_mlp": 1.02005816, + "epoch": 0.4250713963625432, + "flos": 19461354819840.0, + "grad_norm": 1.964308186991792, + "language_loss": 0.80826068, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.82941651, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43164062, + "step": 7070, + "time_per_iteration": 2.419203996658325 + }, + { + "auxiliary_loss_clip": 0.01060628, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_clip": 1.01490688, + "balance_loss_mlp": 1.01827979, + "epoch": 0.4251315196152112, + "flos": 25336538945280.0, + "grad_norm": 1.9562490555233887, + "language_loss": 0.71506941, + "learning_rate": 2.573869012032795e-06, + "loss": 0.73612207, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42382812, + "step": 7071, + "time_per_iteration": 2.421995162963867 + }, + { + "auxiliary_loss_clip": 0.01059944, + "auxiliary_loss_mlp": 0.01046108, + "balance_loss_clip": 1.01714027, + "balance_loss_mlp": 1.01812482, + "epoch": 0.42519164286787914, + "flos": 26357992859520.0, + "grad_norm": 2.4142923390016158, + "language_loss": 0.73412502, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.75518548, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7072, + "time_per_iteration": 2.451519727706909 + }, + { + "auxiliary_loss_clip": 0.01062465, + "auxiliary_loss_mlp": 0.01046591, + "balance_loss_clip": 1.01763546, + "balance_loss_mlp": 1.0194397, + "epoch": 0.4252517661205471, + "flos": 26029879102080.0, + "grad_norm": 1.6702541698407491, + "language_loss": 0.83087534, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.8519659, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 7073, + "time_per_iteration": 2.439826488494873 + }, + { + "auxiliary_loss_clip": 0.01058865, + "auxiliary_loss_mlp": 0.01042011, + "balance_loss_clip": 1.0133884, + "balance_loss_mlp": 1.01802742, + "epoch": 0.42531188937321507, + "flos": 12712818234240.0, + "grad_norm": 2.2250197823485287, + "language_loss": 0.93070495, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.95171368, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40820312, + "step": 7074, + "time_per_iteration": 2.375411033630371 + }, + { + "auxiliary_loss_clip": 0.0106205, + "auxiliary_loss_mlp": 0.01042703, + "balance_loss_clip": 1.01188707, + "balance_loss_mlp": 1.01801777, + "epoch": 0.42537201262588303, + "flos": 22090558976640.0, + "grad_norm": 1.694292793572092, + "language_loss": 0.64918876, + "learning_rate": 2.572376498508805e-06, + "loss": 0.67023629, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44140625, + "step": 7075, + "time_per_iteration": 2.396697998046875 + }, + { + "auxiliary_loss_clip": 0.01058887, + "auxiliary_loss_mlp": 0.01040959, + "balance_loss_clip": 1.01429224, + "balance_loss_mlp": 1.01906228, + "epoch": 0.42543213587855105, + "flos": 23002001596800.0, + "grad_norm": 1.598573831565425, + "language_loss": 0.75430012, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.7752986, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 7076, + "time_per_iteration": 2.4554989337921143 + }, + { + "auxiliary_loss_clip": 0.01063253, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.01400328, + "balance_loss_mlp": 1.01862991, + "epoch": 0.425492259131219, + "flos": 25081288928640.0, + "grad_norm": 1.7186593960114798, + "language_loss": 0.80787182, + "learning_rate": 2.571630111462766e-06, + "loss": 0.82895148, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4453125, + "step": 7077, + "time_per_iteration": 2.401122808456421 + }, + { + "auxiliary_loss_clip": 0.0105808, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.01870573, + "balance_loss_mlp": 1.01858425, + "epoch": 0.425552382383887, + "flos": 22815844894080.0, + "grad_norm": 1.5438464987871954, + "language_loss": 0.74048656, + "learning_rate": 2.571256885418265e-06, + "loss": 0.76150036, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39453125, + "step": 7078, + "time_per_iteration": 2.5226354598999023 + }, + { + "auxiliary_loss_clip": 0.01062239, + "auxiliary_loss_mlp": 0.01046344, + "balance_loss_clip": 1.01946187, + "balance_loss_mlp": 1.021384, + "epoch": 0.42561250563655495, + "flos": 13552723745280.0, + "grad_norm": 2.419177192250445, + "language_loss": 0.81715393, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.83823973, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 7079, + "time_per_iteration": 2.3789761066436768 + }, + { + "auxiliary_loss_clip": 0.0106202, + "auxiliary_loss_mlp": 0.01039922, + "balance_loss_clip": 1.0137198, + "balance_loss_mlp": 1.0214057, + "epoch": 0.4256726288892229, + "flos": 46976404780800.0, + "grad_norm": 1.3535396153281427, + "language_loss": 0.72908199, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.75010139, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 7080, + "time_per_iteration": 2.668379306793213 + }, + { + "auxiliary_loss_clip": 0.01061495, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02225852, + "balance_loss_mlp": 1.02045369, + "epoch": 0.4257327521418909, + "flos": 23585330459520.0, + "grad_norm": 2.135285399700532, + "language_loss": 0.81375611, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.83484983, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.41015625, + "step": 7081, + "time_per_iteration": 2.388793468475342 + }, + { + "auxiliary_loss_clip": 0.01059828, + "auxiliary_loss_mlp": 0.01042503, + "balance_loss_clip": 1.01662314, + "balance_loss_mlp": 1.02069521, + "epoch": 0.42579287539455885, + "flos": 18988979857920.0, + "grad_norm": 1.6297901112165836, + "language_loss": 0.82100552, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.84202886, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 7082, + "time_per_iteration": 2.396122932434082 + }, + { + "auxiliary_loss_clip": 0.01062068, + "auxiliary_loss_mlp": 0.01049833, + "balance_loss_clip": 1.02220047, + "balance_loss_mlp": 1.0212698, + "epoch": 0.4258529986472268, + "flos": 25190741640960.0, + "grad_norm": 2.0556936076843515, + "language_loss": 0.71151543, + "learning_rate": 2.569390430547065e-06, + "loss": 0.73263443, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 7083, + "time_per_iteration": 2.4003679752349854 + }, + { + "auxiliary_loss_clip": 0.01014954, + "auxiliary_loss_mlp": 0.01005378, + "balance_loss_clip": 1.00175393, + "balance_loss_mlp": 1.00656188, + "epoch": 0.4259131218998948, + "flos": 69964614524160.0, + "grad_norm": 0.8774692434108863, + "language_loss": 0.6715073, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69171059, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.03613281, + "router_z_loss_mlp": 0.08398438, + "step": 7084, + "time_per_iteration": 3.105396270751953 + }, + { + "auxiliary_loss_clip": 0.01063076, + "auxiliary_loss_mlp": 0.01047029, + "balance_loss_clip": 1.01853848, + "balance_loss_mlp": 1.02255225, + "epoch": 0.42597324515256274, + "flos": 18003975269760.0, + "grad_norm": 1.904409565292695, + "language_loss": 0.79889947, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.82000053, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40625, + "step": 7085, + "time_per_iteration": 2.5312464237213135 + }, + { + "auxiliary_loss_clip": 0.01066105, + "auxiliary_loss_mlp": 0.01050729, + "balance_loss_clip": 1.02029479, + "balance_loss_mlp": 1.02152443, + "epoch": 0.4260333684052307, + "flos": 15157890547200.0, + "grad_norm": 1.9391301518000674, + "language_loss": 0.77232021, + "learning_rate": 2.568270298414995e-06, + "loss": 0.79348856, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4453125, + "step": 7086, + "time_per_iteration": 2.4556844234466553 + }, + { + "auxiliary_loss_clip": 0.01064217, + "auxiliary_loss_mlp": 0.01046835, + "balance_loss_clip": 1.0195483, + "balance_loss_mlp": 1.02262855, + "epoch": 0.42609349165789867, + "flos": 14938461452160.0, + "grad_norm": 1.7527787026326156, + "language_loss": 0.81814086, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.8392514, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41601562, + "step": 7087, + "time_per_iteration": 2.350301742553711 + }, + { + "auxiliary_loss_clip": 0.01062127, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.01817369, + "balance_loss_mlp": 1.02112556, + "epoch": 0.42615361491056664, + "flos": 23730848472960.0, + "grad_norm": 2.0794903303860983, + "language_loss": 0.6773566, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.69842637, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41015625, + "step": 7088, + "time_per_iteration": 2.48347806930542 + }, + { + "auxiliary_loss_clip": 0.01061464, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.02366018, + "balance_loss_mlp": 1.01881576, + "epoch": 0.42621373816323466, + "flos": 24935282156160.0, + "grad_norm": 1.8877935459084403, + "language_loss": 0.69795668, + "learning_rate": 2.56714997234313e-06, + "loss": 0.71907592, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.42578125, + "step": 7089, + "time_per_iteration": 2.410597085952759 + }, + { + "auxiliary_loss_clip": 0.01061285, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.01757789, + "balance_loss_mlp": 1.01907182, + "epoch": 0.4262738614159026, + "flos": 13552130252160.0, + "grad_norm": 3.1986335662398906, + "language_loss": 0.76215214, + "learning_rate": 2.566776487287525e-06, + "loss": 0.78320682, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 7090, + "time_per_iteration": 2.3808205127716064 + }, + { + "auxiliary_loss_clip": 0.01063235, + "auxiliary_loss_mlp": 0.01049118, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.01976097, + "epoch": 0.4263339846685706, + "flos": 29747605628160.0, + "grad_norm": 1.9111605278408594, + "language_loss": 0.76090419, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.78202772, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.43359375, + "step": 7091, + "time_per_iteration": 2.426913022994995 + }, + { + "auxiliary_loss_clip": 0.01056609, + "auxiliary_loss_mlp": 0.01045436, + "balance_loss_clip": 1.02147496, + "balance_loss_mlp": 1.01806998, + "epoch": 0.42639410792123855, + "flos": 16833337649280.0, + "grad_norm": 1.6738369077042294, + "language_loss": 0.84012449, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.86114502, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38476562, + "step": 7092, + "time_per_iteration": 2.4408488273620605 + }, + { + "auxiliary_loss_clip": 0.01062806, + "auxiliary_loss_mlp": 0.01049811, + "balance_loss_clip": 1.02081919, + "balance_loss_mlp": 1.01854312, + "epoch": 0.4264542311739065, + "flos": 28761972635520.0, + "grad_norm": 1.5659470454292643, + "language_loss": 0.75080383, + "learning_rate": 2.565655903224038e-06, + "loss": 0.77192998, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44140625, + "step": 7093, + "time_per_iteration": 2.49782133102417 + }, + { + "auxiliary_loss_clip": 0.01060441, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.01746607, + "balance_loss_mlp": 1.01952362, + "epoch": 0.4265143544265745, + "flos": 24712571393280.0, + "grad_norm": 2.1937760679972853, + "language_loss": 0.71988213, + "learning_rate": 2.565282332284532e-06, + "loss": 0.74095297, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 7094, + "time_per_iteration": 2.462940216064453 + }, + { + "auxiliary_loss_clip": 0.01060849, + "auxiliary_loss_mlp": 0.01048355, + "balance_loss_clip": 1.02047205, + "balance_loss_mlp": 1.01934028, + "epoch": 0.42657447767924245, + "flos": 21865055304960.0, + "grad_norm": 2.2828912983415184, + "language_loss": 0.83650643, + "learning_rate": 2.564908739909464e-06, + "loss": 0.85759842, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41601562, + "step": 7095, + "time_per_iteration": 2.4193615913391113 + }, + { + "auxiliary_loss_clip": 0.01061816, + "auxiliary_loss_mlp": 0.01052013, + "balance_loss_clip": 1.02311683, + "balance_loss_mlp": 1.01983249, + "epoch": 0.4266346009319104, + "flos": 21469174865280.0, + "grad_norm": 1.722246782821041, + "language_loss": 0.82179749, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.8429358, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 7096, + "time_per_iteration": 2.4393861293792725 + }, + { + "auxiliary_loss_clip": 0.0106395, + "auxiliary_loss_mlp": 0.0104862, + "balance_loss_clip": 1.02098739, + "balance_loss_mlp": 1.02017319, + "epoch": 0.4266947241845784, + "flos": 25518226993920.0, + "grad_norm": 1.8654285385093228, + "language_loss": 0.66829956, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.68942529, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.4375, + "step": 7097, + "time_per_iteration": 4.726633310317993 + }, + { + "auxiliary_loss_clip": 0.01059198, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.01687574, + "balance_loss_mlp": 1.01845396, + "epoch": 0.42675484743724634, + "flos": 26540030021760.0, + "grad_norm": 1.6238429899422295, + "language_loss": 0.75274056, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.77376902, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40820312, + "step": 7098, + "time_per_iteration": 3.9148941040039062 + }, + { + "auxiliary_loss_clip": 0.01058527, + "auxiliary_loss_mlp": 0.01043573, + "balance_loss_clip": 1.01597643, + "balance_loss_mlp": 1.0183754, + "epoch": 0.4268149706899143, + "flos": 23111593954560.0, + "grad_norm": 1.6479332541172997, + "language_loss": 0.76729417, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.78831518, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40234375, + "step": 7099, + "time_per_iteration": 3.7686681747436523 + }, + { + "auxiliary_loss_clip": 0.01062218, + "auxiliary_loss_mlp": 0.01047503, + "balance_loss_clip": 1.01736653, + "balance_loss_mlp": 1.01984274, + "epoch": 0.4268750939425823, + "flos": 22705554309120.0, + "grad_norm": 2.0361918164121526, + "language_loss": 0.84480894, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.86590618, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42382812, + "step": 7100, + "time_per_iteration": 2.459355354309082 + }, + { + "auxiliary_loss_clip": 0.01061399, + "auxiliary_loss_mlp": 0.01041649, + "balance_loss_clip": 1.01424277, + "balance_loss_mlp": 1.01882601, + "epoch": 0.42693521719525024, + "flos": 25373686498560.0, + "grad_norm": 1.3548182291888482, + "language_loss": 0.82980454, + "learning_rate": 2.562666736305627e-06, + "loss": 0.85083508, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42578125, + "step": 7101, + "time_per_iteration": 2.4331889152526855 + }, + { + "auxiliary_loss_clip": 0.01064775, + "auxiliary_loss_mlp": 0.01046256, + "balance_loss_clip": 1.01701379, + "balance_loss_mlp": 1.0209738, + "epoch": 0.42699534044791826, + "flos": 18149702751360.0, + "grad_norm": 1.8121704532004443, + "language_loss": 0.74265373, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.76376402, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 7102, + "time_per_iteration": 2.423921585083008 + }, + { + "auxiliary_loss_clip": 0.01060683, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.01479173, + "balance_loss_mlp": 1.01936769, + "epoch": 0.4270554637005862, + "flos": 13697578442880.0, + "grad_norm": 1.7522531121775522, + "language_loss": 0.85040694, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.87145221, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4140625, + "step": 7103, + "time_per_iteration": 2.3918521404266357 + }, + { + "auxiliary_loss_clip": 0.01063273, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.01947224, + "epoch": 0.4271155869532542, + "flos": 17492637363840.0, + "grad_norm": 2.2472432002383935, + "language_loss": 0.74861735, + "learning_rate": 2.561545446271294e-06, + "loss": 0.76971316, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 7104, + "time_per_iteration": 3.892261028289795 + }, + { + "auxiliary_loss_clip": 0.01061929, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.00860524, + "balance_loss_mlp": 1.01997232, + "epoch": 0.42717571020592215, + "flos": 32450930334720.0, + "grad_norm": 2.1509854232229855, + "language_loss": 0.77646971, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.79742444, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.41992188, + "step": 7105, + "time_per_iteration": 2.52921199798584 + }, + { + "auxiliary_loss_clip": 0.01062473, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_clip": 1.01439786, + "balance_loss_mlp": 1.0203737, + "epoch": 0.4272358334585901, + "flos": 16252138379520.0, + "grad_norm": 1.8074749579327214, + "language_loss": 0.78500015, + "learning_rate": 2.560797813088819e-06, + "loss": 0.80604208, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.421875, + "step": 7106, + "time_per_iteration": 2.472628116607666 + }, + { + "auxiliary_loss_clip": 0.01061019, + "auxiliary_loss_mlp": 0.01047547, + "balance_loss_clip": 1.01897264, + "balance_loss_mlp": 1.01906157, + "epoch": 0.4272959567112581, + "flos": 24199138805760.0, + "grad_norm": 1.7775745604894322, + "language_loss": 0.82101858, + "learning_rate": 2.560423964592229e-06, + "loss": 0.8421042, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41992188, + "step": 7107, + "time_per_iteration": 2.4683425426483154 + }, + { + "auxiliary_loss_clip": 0.01061762, + "auxiliary_loss_mlp": 0.010461, + "balance_loss_clip": 1.01640475, + "balance_loss_mlp": 1.02129269, + "epoch": 0.42735607996392605, + "flos": 27962286877440.0, + "grad_norm": 1.4157054898646022, + "language_loss": 0.68688524, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70796388, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40429688, + "step": 7108, + "time_per_iteration": 2.5705130100250244 + }, + { + "auxiliary_loss_clip": 0.01061002, + "auxiliary_loss_mlp": 0.01048554, + "balance_loss_clip": 1.02077818, + "balance_loss_mlp": 1.01825678, + "epoch": 0.427416203216594, + "flos": 20294766817920.0, + "grad_norm": 1.6961118895184641, + "language_loss": 0.73211497, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.75321054, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42773438, + "step": 7109, + "time_per_iteration": 2.4466779232025146 + }, + { + "auxiliary_loss_clip": 0.01061423, + "auxiliary_loss_mlp": 0.01049083, + "balance_loss_clip": 1.01777923, + "balance_loss_mlp": 1.01841557, + "epoch": 0.427476326469262, + "flos": 26942718176640.0, + "grad_norm": 1.8632046963716977, + "language_loss": 0.65896255, + "learning_rate": 2.559302291651174e-06, + "loss": 0.68006766, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4296875, + "step": 7110, + "time_per_iteration": 2.489095449447632 + }, + { + "auxiliary_loss_clip": 0.01060198, + "auxiliary_loss_mlp": 0.01048181, + "balance_loss_clip": 1.0189991, + "balance_loss_mlp": 1.01820326, + "epoch": 0.42753644972192995, + "flos": 25701660610560.0, + "grad_norm": 1.7535767076756146, + "language_loss": 0.77772814, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.79881191, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41992188, + "step": 7111, + "time_per_iteration": 2.4730191230773926 + }, + { + "auxiliary_loss_clip": 0.01061006, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.01358175, + "balance_loss_mlp": 1.0191921, + "epoch": 0.4275965729745979, + "flos": 18766513474560.0, + "grad_norm": 1.9849111933941892, + "language_loss": 0.74744594, + "learning_rate": 2.558554403622845e-06, + "loss": 0.76846743, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 7112, + "time_per_iteration": 2.504270315170288 + }, + { + "auxiliary_loss_clip": 0.01058503, + "auxiliary_loss_mlp": 0.01044157, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.01816535, + "epoch": 0.4276566962272659, + "flos": 23763422638080.0, + "grad_norm": 1.5414804027061388, + "language_loss": 0.72324634, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.74427295, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 7113, + "time_per_iteration": 2.412998914718628 + }, + { + "auxiliary_loss_clip": 0.01062782, + "auxiliary_loss_mlp": 0.01056898, + "balance_loss_clip": 1.02692854, + "balance_loss_mlp": 1.02003765, + "epoch": 0.42771681947993384, + "flos": 22491396829440.0, + "grad_norm": 1.6227575222862194, + "language_loss": 0.6312803, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.65247715, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.42773438, + "step": 7114, + "time_per_iteration": 2.567538022994995 + }, + { + "auxiliary_loss_clip": 0.01063329, + "auxiliary_loss_mlp": 0.01051805, + "balance_loss_clip": 1.02016711, + "balance_loss_mlp": 1.0188458, + "epoch": 0.42777694273260186, + "flos": 25043582793600.0, + "grad_norm": 1.547263451445419, + "language_loss": 0.65540814, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.67655945, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4453125, + "step": 7115, + "time_per_iteration": 2.42500901222229 + }, + { + "auxiliary_loss_clip": 0.01058509, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_clip": 1.02034485, + "balance_loss_mlp": 1.0169549, + "epoch": 0.4278370659852698, + "flos": 18660516986880.0, + "grad_norm": 1.4779445658233834, + "language_loss": 0.74453264, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.7655825, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41601562, + "step": 7116, + "time_per_iteration": 2.434450387954712 + }, + { + "auxiliary_loss_clip": 0.01058403, + "auxiliary_loss_mlp": 0.01043095, + "balance_loss_clip": 1.0175606, + "balance_loss_mlp": 1.01825488, + "epoch": 0.4278971892379378, + "flos": 27307036880640.0, + "grad_norm": 1.592921566738255, + "language_loss": 0.7042945, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.72530949, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 7117, + "time_per_iteration": 2.4244508743286133 + }, + { + "auxiliary_loss_clip": 0.01061835, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.01706374, + "balance_loss_mlp": 1.01961851, + "epoch": 0.42795731249060576, + "flos": 12888082592640.0, + "grad_norm": 2.9072234490366187, + "language_loss": 0.71343875, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.73452562, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 7118, + "time_per_iteration": 2.3739075660705566 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.0104573, + "balance_loss_clip": 1.01666677, + "balance_loss_mlp": 1.01742363, + "epoch": 0.4280174357432737, + "flos": 33400044178560.0, + "grad_norm": 1.8259791501328313, + "language_loss": 0.75648296, + "learning_rate": 2.55593612908444e-06, + "loss": 0.77752799, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4140625, + "step": 7119, + "time_per_iteration": 2.484738826751709 + }, + { + "auxiliary_loss_clip": 0.01060439, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.01980233, + "balance_loss_mlp": 1.01857078, + "epoch": 0.4280775589959417, + "flos": 18258143034240.0, + "grad_norm": 1.7700519237003003, + "language_loss": 0.75620431, + "learning_rate": 2.555562005426573e-06, + "loss": 0.7772857, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 7120, + "time_per_iteration": 2.412067174911499 + }, + { + "auxiliary_loss_clip": 0.01059003, + "auxiliary_loss_mlp": 0.01044439, + "balance_loss_clip": 1.01774788, + "balance_loss_mlp": 1.01814771, + "epoch": 0.42813768224860965, + "flos": 21470187294720.0, + "grad_norm": 1.5302823349098866, + "language_loss": 0.77668488, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.79771924, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 7121, + "time_per_iteration": 2.3772034645080566 + }, + { + "auxiliary_loss_clip": 0.01060078, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.01489282, + "balance_loss_mlp": 1.01925576, + "epoch": 0.4281978055012776, + "flos": 15668355669120.0, + "grad_norm": 2.287238292921327, + "language_loss": 0.87033945, + "learning_rate": 2.554813694924126e-06, + "loss": 0.89132446, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.40820312, + "step": 7122, + "time_per_iteration": 2.3753767013549805 + }, + { + "auxiliary_loss_clip": 0.01058566, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.01590323, + "balance_loss_mlp": 1.01916027, + "epoch": 0.4282579287539456, + "flos": 17711054029440.0, + "grad_norm": 1.7620258611752178, + "language_loss": 0.82466829, + "learning_rate": 2.554439508107921e-06, + "loss": 0.84566092, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39453125, + "step": 7123, + "time_per_iteration": 2.3760831356048584 + }, + { + "auxiliary_loss_clip": 0.01058911, + "auxiliary_loss_mlp": 0.01045342, + "balance_loss_clip": 1.01821017, + "balance_loss_mlp": 1.01853704, + "epoch": 0.42831805200661355, + "flos": 19280155530240.0, + "grad_norm": 1.516239405562046, + "language_loss": 0.81649786, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83754039, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 7124, + "time_per_iteration": 2.3926174640655518 + }, + { + "auxiliary_loss_clip": 0.01060774, + "auxiliary_loss_mlp": 0.0105078, + "balance_loss_clip": 1.01986945, + "balance_loss_mlp": 1.0190258, + "epoch": 0.4283781752592815, + "flos": 19791598170240.0, + "grad_norm": 1.8247720408406791, + "language_loss": 0.81441242, + "learning_rate": 2.553691071416498e-06, + "loss": 0.8355279, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41796875, + "step": 7125, + "time_per_iteration": 2.377126932144165 + }, + { + "auxiliary_loss_clip": 0.01060287, + "auxiliary_loss_mlp": 0.01041036, + "balance_loss_clip": 1.01557243, + "balance_loss_mlp": 1.02008247, + "epoch": 0.4284382985119495, + "flos": 16507144016640.0, + "grad_norm": 1.823112887700415, + "language_loss": 0.75970936, + "learning_rate": 2.553316821569659e-06, + "loss": 0.7807225, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40234375, + "step": 7126, + "time_per_iteration": 2.3913590908050537 + }, + { + "auxiliary_loss_clip": 0.01059152, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.01784647, + "balance_loss_mlp": 1.01850414, + "epoch": 0.42849842176461744, + "flos": 23329661506560.0, + "grad_norm": 1.5858936275459181, + "language_loss": 0.82768387, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.84873116, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 7127, + "time_per_iteration": 2.4677815437316895 + }, + { + "auxiliary_loss_clip": 0.010599, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_clip": 1.01557362, + "balance_loss_mlp": 1.01836705, + "epoch": 0.4285585450172854, + "flos": 17273487559680.0, + "grad_norm": 2.105098437501118, + "language_loss": 0.76515102, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78617764, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 7128, + "time_per_iteration": 2.3807613849639893 + }, + { + "auxiliary_loss_clip": 0.01062259, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.01474667, + "balance_loss_mlp": 1.01887703, + "epoch": 0.42861866826995343, + "flos": 24278461148160.0, + "grad_norm": 2.040571365091949, + "language_loss": 0.74581587, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76687574, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 7129, + "time_per_iteration": 2.392146587371826 + }, + { + "auxiliary_loss_clip": 0.01062202, + "auxiliary_loss_mlp": 0.01041078, + "balance_loss_clip": 1.01305199, + "balance_loss_mlp": 1.02048707, + "epoch": 0.4286787915226214, + "flos": 24351953293440.0, + "grad_norm": 1.6331817519804848, + "language_loss": 0.79797363, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.81900644, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 7130, + "time_per_iteration": 2.4554696083068848 + }, + { + "auxiliary_loss_clip": 0.01064477, + "auxiliary_loss_mlp": 0.01044237, + "balance_loss_clip": 1.0155549, + "balance_loss_mlp": 1.02146149, + "epoch": 0.42873891477528936, + "flos": 15449101130880.0, + "grad_norm": 1.9166529364148959, + "language_loss": 0.7500149, + "learning_rate": 2.551445257891886e-06, + "loss": 0.77110207, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4296875, + "step": 7131, + "time_per_iteration": 2.364858627319336 + }, + { + "auxiliary_loss_clip": 0.0106088, + "auxiliary_loss_mlp": 0.01043958, + "balance_loss_clip": 1.01617062, + "balance_loss_mlp": 1.01890492, + "epoch": 0.4287990380279573, + "flos": 17638609224960.0, + "grad_norm": 2.435784632563269, + "language_loss": 0.7900629, + "learning_rate": 2.551070882366973e-06, + "loss": 0.81111127, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41992188, + "step": 7132, + "time_per_iteration": 2.4228222370147705 + }, + { + "auxiliary_loss_clip": 0.01062946, + "auxiliary_loss_mlp": 0.01045109, + "balance_loss_clip": 1.0167129, + "balance_loss_mlp": 1.02145684, + "epoch": 0.4288591612806253, + "flos": 27161099930880.0, + "grad_norm": 1.576375223862875, + "language_loss": 0.7978583, + "learning_rate": 2.550696485945397e-06, + "loss": 0.81893885, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 7133, + "time_per_iteration": 2.5782976150512695 + }, + { + "auxiliary_loss_clip": 0.01061538, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_clip": 1.01936507, + "balance_loss_mlp": 1.01898241, + "epoch": 0.42891928453329325, + "flos": 17162289279360.0, + "grad_norm": 1.9201273333221922, + "language_loss": 0.7650423, + "learning_rate": 2.550322068641355e-06, + "loss": 0.78612411, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42578125, + "step": 7134, + "time_per_iteration": 2.433131217956543 + }, + { + "auxiliary_loss_clip": 0.01056983, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.01631927, + "balance_loss_mlp": 1.01750576, + "epoch": 0.4289794077859612, + "flos": 18186047343360.0, + "grad_norm": 1.7525280747535679, + "language_loss": 0.85356522, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.87454945, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39648438, + "step": 7135, + "time_per_iteration": 2.4047629833221436 + }, + { + "auxiliary_loss_clip": 0.01056205, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.01255584, + "balance_loss_mlp": 1.01750565, + "epoch": 0.4290395310386292, + "flos": 28255627054080.0, + "grad_norm": 1.8399752202663024, + "language_loss": 0.77339071, + "learning_rate": 2.549573171442666e-06, + "loss": 0.79433608, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 7136, + "time_per_iteration": 2.4801571369171143 + }, + { + "auxiliary_loss_clip": 0.010579, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.01776242, + "balance_loss_mlp": 1.01652992, + "epoch": 0.42909965429129715, + "flos": 16215165383040.0, + "grad_norm": 1.9659011682676815, + "language_loss": 0.81303084, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.83405846, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 7137, + "time_per_iteration": 4.447194576263428 + }, + { + "auxiliary_loss_clip": 0.01061197, + "auxiliary_loss_mlp": 0.01043331, + "balance_loss_clip": 1.0155673, + "balance_loss_mlp": 1.01933408, + "epoch": 0.4291597775439651, + "flos": 23111733600000.0, + "grad_norm": 1.9829509210855931, + "language_loss": 0.77919817, + "learning_rate": 2.548824190884499e-06, + "loss": 0.8002435, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 7138, + "time_per_iteration": 3.839500904083252 + }, + { + "auxiliary_loss_clip": 0.01011458, + "auxiliary_loss_mlp": 0.01005674, + "balance_loss_clip": 1.00197804, + "balance_loss_mlp": 1.00279164, + "epoch": 0.4292199007966331, + "flos": 67543004511360.0, + "grad_norm": 0.7758363582394228, + "language_loss": 0.56287479, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58304608, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.08691406, + "step": 7139, + "time_per_iteration": 4.3483545780181885 + }, + { + "auxiliary_loss_clip": 0.01057991, + "auxiliary_loss_mlp": 0.01043574, + "balance_loss_clip": 1.02064979, + "balance_loss_mlp": 1.01921797, + "epoch": 0.42928002404930105, + "flos": 22998824663040.0, + "grad_norm": 1.6347169872899117, + "language_loss": 0.81658506, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83760071, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.38671875, + "step": 7140, + "time_per_iteration": 2.4201200008392334 + }, + { + "auxiliary_loss_clip": 0.0105867, + "auxiliary_loss_mlp": 0.01045357, + "balance_loss_clip": 1.01982188, + "balance_loss_mlp": 1.01835072, + "epoch": 0.429340147301969, + "flos": 11544170561280.0, + "grad_norm": 1.8816753064993963, + "language_loss": 0.83117628, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.85221654, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 7141, + "time_per_iteration": 2.374645948410034 + }, + { + "auxiliary_loss_clip": 0.01060505, + "auxiliary_loss_mlp": 0.01054649, + "balance_loss_clip": 1.02715933, + "balance_loss_mlp": 1.01884484, + "epoch": 0.42940027055463703, + "flos": 25263814849920.0, + "grad_norm": 1.7134811576447517, + "language_loss": 0.87975895, + "learning_rate": 2.547325980144166e-06, + "loss": 0.9009105, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 7142, + "time_per_iteration": 3.8696744441986084 + }, + { + "auxiliary_loss_clip": 0.01056735, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.01703024, + "balance_loss_mlp": 1.01845837, + "epoch": 0.429460393807305, + "flos": 23803886770560.0, + "grad_norm": 1.9709071273517011, + "language_loss": 0.79539752, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.81637251, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 7143, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.01057564, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_clip": 1.01648831, + "balance_loss_mlp": 1.01793182, + "epoch": 0.42952051705997296, + "flos": 13917426474240.0, + "grad_norm": 2.1595880893253883, + "language_loss": 0.78495872, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.80595851, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 7144, + "time_per_iteration": 2.383176803588867 + }, + { + "auxiliary_loss_clip": 0.01058242, + "auxiliary_loss_mlp": 0.01042123, + "balance_loss_clip": 1.01674318, + "balance_loss_mlp": 1.01807857, + "epoch": 0.4295806403126409, + "flos": 26759179825920.0, + "grad_norm": 2.2225051820734776, + "language_loss": 0.75637913, + "learning_rate": 2.54620210411532e-06, + "loss": 0.77738279, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40234375, + "step": 7145, + "time_per_iteration": 2.4220123291015625 + }, + { + "auxiliary_loss_clip": 0.01060058, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.01587045, + "balance_loss_mlp": 1.01918614, + "epoch": 0.4296407635653089, + "flos": 20951762382720.0, + "grad_norm": 1.9627279618844375, + "language_loss": 0.80693102, + "learning_rate": 2.545827437329352e-06, + "loss": 0.82794607, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40820312, + "step": 7146, + "time_per_iteration": 2.400641679763794 + }, + { + "auxiliary_loss_clip": 0.01057657, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_clip": 1.01565111, + "balance_loss_mlp": 1.01829696, + "epoch": 0.42970088681797686, + "flos": 15851405260800.0, + "grad_norm": 2.0803092586269076, + "language_loss": 0.84250683, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.8634963, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 7147, + "time_per_iteration": 2.353229522705078 + }, + { + "auxiliary_loss_clip": 0.01061551, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_clip": 1.01598752, + "balance_loss_mlp": 1.01940441, + "epoch": 0.4297610100706448, + "flos": 22381525180800.0, + "grad_norm": 1.8943487407854593, + "language_loss": 0.88104403, + "learning_rate": 2.545078041678131e-06, + "loss": 0.90208781, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.421875, + "step": 7148, + "time_per_iteration": 2.4030771255493164 + }, + { + "auxiliary_loss_clip": 0.01058267, + "auxiliary_loss_mlp": 0.01043513, + "balance_loss_clip": 1.01753712, + "balance_loss_mlp": 1.01820922, + "epoch": 0.4298211333233128, + "flos": 27924510919680.0, + "grad_norm": 1.7633007639421525, + "language_loss": 0.79594195, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.81695974, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40039062, + "step": 7149, + "time_per_iteration": 2.4749510288238525 + }, + { + "auxiliary_loss_clip": 0.01058168, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.0145936, + "balance_loss_mlp": 1.01839256, + "epoch": 0.42988125657598075, + "flos": 24424467920640.0, + "grad_norm": 1.6130407582481165, + "language_loss": 0.81004262, + "learning_rate": 2.544328563349256e-06, + "loss": 0.83102536, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3984375, + "step": 7150, + "time_per_iteration": 2.476376533508301 + }, + { + "auxiliary_loss_clip": 0.01062195, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_clip": 1.02001309, + "balance_loss_mlp": 1.01983654, + "epoch": 0.4299413798286487, + "flos": 15849310579200.0, + "grad_norm": 2.013932123516498, + "language_loss": 0.77224398, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.79334563, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 7151, + "time_per_iteration": 2.4122672080993652 + }, + { + "auxiliary_loss_clip": 0.01063376, + "auxiliary_loss_mlp": 0.01045113, + "balance_loss_clip": 1.01573992, + "balance_loss_mlp": 1.02054584, + "epoch": 0.4300015030813167, + "flos": 22308417060480.0, + "grad_norm": 1.9354616669465323, + "language_loss": 0.71799934, + "learning_rate": 2.543579002456406e-06, + "loss": 0.73908424, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42773438, + "step": 7152, + "time_per_iteration": 2.5281565189361572 + }, + { + "auxiliary_loss_clip": 0.01059217, + "auxiliary_loss_mlp": 0.01043961, + "balance_loss_clip": 1.01775932, + "balance_loss_mlp": 1.01833737, + "epoch": 0.43006162633398465, + "flos": 34896212115840.0, + "grad_norm": 1.5804167269229072, + "language_loss": 0.72615719, + "learning_rate": 2.54320419108402e-06, + "loss": 0.74718904, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41015625, + "step": 7153, + "time_per_iteration": 2.5258703231811523 + }, + { + "auxiliary_loss_clip": 0.0106078, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.01046896, + "balance_loss_mlp": 1.01917744, + "epoch": 0.4301217495866526, + "flos": 15960648504960.0, + "grad_norm": 2.458239342696999, + "language_loss": 0.80617976, + "learning_rate": 2.542829359113276e-06, + "loss": 0.8271693, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41601562, + "step": 7154, + "time_per_iteration": 2.4534530639648438 + }, + { + "auxiliary_loss_clip": 0.01058466, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.01629686, + "balance_loss_mlp": 1.01960647, + "epoch": 0.43018187283932063, + "flos": 18769376206080.0, + "grad_norm": 1.5948911335845621, + "language_loss": 0.801525, + "learning_rate": 2.542454506558389e-06, + "loss": 0.82251292, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38867188, + "step": 7155, + "time_per_iteration": 2.4419100284576416 + }, + { + "auxiliary_loss_clip": 0.01058429, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.01244593, + "balance_loss_mlp": 1.01834023, + "epoch": 0.4302419960919886, + "flos": 20150819815680.0, + "grad_norm": 1.7430846704680216, + "language_loss": 0.90161943, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.92256808, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.40039062, + "step": 7156, + "time_per_iteration": 2.592782497406006 + }, + { + "auxiliary_loss_clip": 0.01061063, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.01115537, + "balance_loss_mlp": 1.01872396, + "epoch": 0.43030211934465656, + "flos": 26431519916160.0, + "grad_norm": 1.676717078979573, + "language_loss": 0.84601462, + "learning_rate": 2.541704739753042e-06, + "loss": 0.86702681, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42382812, + "step": 7157, + "time_per_iteration": 2.5139517784118652 + }, + { + "auxiliary_loss_clip": 0.01062675, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.01649606, + "balance_loss_mlp": 1.01995885, + "epoch": 0.43036224259732453, + "flos": 24388088417280.0, + "grad_norm": 2.129035916271821, + "language_loss": 0.73637521, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.7574411, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42578125, + "step": 7158, + "time_per_iteration": 2.5387179851531982 + }, + { + "auxiliary_loss_clip": 0.01060926, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.01618516, + "balance_loss_mlp": 1.01961267, + "epoch": 0.4304223658499925, + "flos": 17200763464320.0, + "grad_norm": 1.8098031999358877, + "language_loss": 0.84843552, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.86946988, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.4140625, + "step": 7159, + "time_per_iteration": 2.433806896209717 + }, + { + "auxiliary_loss_clip": 0.01059515, + "auxiliary_loss_mlp": 0.01043443, + "balance_loss_clip": 1.01650155, + "balance_loss_mlp": 1.01845002, + "epoch": 0.43048248910266046, + "flos": 14902116860160.0, + "grad_norm": 2.210293602418018, + "language_loss": 0.85060197, + "learning_rate": 2.54057993551933e-06, + "loss": 0.87163156, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 7160, + "time_per_iteration": 2.4539196491241455 + }, + { + "auxiliary_loss_clip": 0.0106126, + "auxiliary_loss_mlp": 0.01054342, + "balance_loss_clip": 1.02227449, + "balance_loss_mlp": 1.01831985, + "epoch": 0.4305426123553284, + "flos": 21578767223040.0, + "grad_norm": 6.990140104024967, + "language_loss": 0.78853214, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.80968809, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4296875, + "step": 7161, + "time_per_iteration": 2.4627578258514404 + }, + { + "auxiliary_loss_clip": 0.01059881, + "auxiliary_loss_mlp": 0.01042902, + "balance_loss_clip": 1.01665235, + "balance_loss_mlp": 1.01854086, + "epoch": 0.4306027356079964, + "flos": 22600186225920.0, + "grad_norm": 2.634567561112199, + "language_loss": 0.74913383, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.77016163, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.4140625, + "step": 7162, + "time_per_iteration": 2.477551221847534 + }, + { + "auxiliary_loss_clip": 0.01011676, + "auxiliary_loss_mlp": 0.01006347, + "balance_loss_clip": 1.00315273, + "balance_loss_mlp": 1.00316954, + "epoch": 0.43066285886066435, + "flos": 70669128182400.0, + "grad_norm": 1.436855119596756, + "language_loss": 0.59181809, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61199832, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.03198242, + "router_z_loss_mlp": 0.08496094, + "step": 7163, + "time_per_iteration": 2.963192939758301 + }, + { + "auxiliary_loss_clip": 0.01058024, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.01543319, + "balance_loss_mlp": 1.01747406, + "epoch": 0.4307229821133323, + "flos": 26719483743360.0, + "grad_norm": 1.5604910230556561, + "language_loss": 0.79966348, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.82065165, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40625, + "step": 7164, + "time_per_iteration": 2.5151240825653076 + }, + { + "auxiliary_loss_clip": 0.0106107, + "auxiliary_loss_mlp": 0.01050916, + "balance_loss_clip": 1.02271104, + "balance_loss_mlp": 1.01814938, + "epoch": 0.4307831053660003, + "flos": 26175920785920.0, + "grad_norm": 1.7272694827222066, + "language_loss": 0.70325226, + "learning_rate": 2.538704852009177e-06, + "loss": 0.72437209, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4296875, + "step": 7165, + "time_per_iteration": 2.521815776824951 + }, + { + "auxiliary_loss_clip": 0.01060342, + "auxiliary_loss_mlp": 0.01054658, + "balance_loss_clip": 1.02672708, + "balance_loss_mlp": 1.0187974, + "epoch": 0.43084322861866825, + "flos": 18909517870080.0, + "grad_norm": 2.014244628065655, + "language_loss": 0.76137173, + "learning_rate": 2.538329773967034e-06, + "loss": 0.78252178, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.4140625, + "step": 7166, + "time_per_iteration": 2.5475032329559326 + }, + { + "auxiliary_loss_clip": 0.01059422, + "auxiliary_loss_mlp": 0.0104709, + "balance_loss_clip": 1.02097154, + "balance_loss_mlp": 1.01898384, + "epoch": 0.4309033518713362, + "flos": 26431694472960.0, + "grad_norm": 1.607737102827125, + "language_loss": 0.73255843, + "learning_rate": 2.537954675511372e-06, + "loss": 0.7536236, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 7167, + "time_per_iteration": 2.509458541870117 + }, + { + "auxiliary_loss_clip": 0.010574, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.01665807, + "balance_loss_mlp": 1.01848531, + "epoch": 0.43096347512400424, + "flos": 21212284014720.0, + "grad_norm": 1.5314434970669786, + "language_loss": 0.79610252, + "learning_rate": 2.537579556656414e-06, + "loss": 0.81708908, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.390625, + "step": 7168, + "time_per_iteration": 2.5042614936828613 + }, + { + "auxiliary_loss_clip": 0.01060783, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_clip": 1.02063274, + "balance_loss_mlp": 1.01871157, + "epoch": 0.4310235983766722, + "flos": 16539334156800.0, + "grad_norm": 2.675383762337884, + "language_loss": 0.83623576, + "learning_rate": 2.537204417416387e-06, + "loss": 0.85733193, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 7169, + "time_per_iteration": 2.4548728466033936 + }, + { + "auxiliary_loss_clip": 0.01011194, + "auxiliary_loss_mlp": 0.01010358, + "balance_loss_clip": 1.00738931, + "balance_loss_mlp": 1.00257242, + "epoch": 0.43108372162934017, + "flos": 64772506615680.0, + "grad_norm": 0.672636339054128, + "language_loss": 0.608257, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62847251, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.0859375, + "step": 7170, + "time_per_iteration": 3.2654716968536377 + }, + { + "auxiliary_loss_clip": 0.01060257, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.01619685, + "balance_loss_mlp": 1.0187881, + "epoch": 0.43114384488200813, + "flos": 13443236121600.0, + "grad_norm": 1.6245605107089953, + "language_loss": 0.77982366, + "learning_rate": 2.536454077838021e-06, + "loss": 0.80085814, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 7171, + "time_per_iteration": 2.433366537094116 + }, + { + "auxiliary_loss_clip": 0.01059686, + "auxiliary_loss_mlp": 0.01043754, + "balance_loss_clip": 1.01644325, + "balance_loss_mlp": 1.01877165, + "epoch": 0.4312039681346761, + "flos": 26285478232320.0, + "grad_norm": 1.889531474753538, + "language_loss": 0.78238022, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.8034147, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40820312, + "step": 7172, + "time_per_iteration": 2.523635149002075 + }, + { + "auxiliary_loss_clip": 0.01061639, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.01752543, + "balance_loss_mlp": 1.01831985, + "epoch": 0.43126409138734406, + "flos": 20375625260160.0, + "grad_norm": 1.90557236436243, + "language_loss": 0.78335935, + "learning_rate": 2.535703656890086e-06, + "loss": 0.80445278, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43359375, + "step": 7173, + "time_per_iteration": 2.4714770317077637 + }, + { + "auxiliary_loss_clip": 0.0106029, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.01747632, + "balance_loss_mlp": 1.01912737, + "epoch": 0.431324214640012, + "flos": 22122120712320.0, + "grad_norm": 1.536180315555645, + "language_loss": 0.77190536, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79296553, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 7174, + "time_per_iteration": 2.523775100708008 + }, + { + "auxiliary_loss_clip": 0.01061735, + "auxiliary_loss_mlp": 0.01043893, + "balance_loss_clip": 1.01380479, + "balance_loss_mlp": 1.01865101, + "epoch": 0.43138433789268, + "flos": 15230125883520.0, + "grad_norm": 1.4468362805532222, + "language_loss": 0.83705318, + "learning_rate": 2.534953154686407e-06, + "loss": 0.85810941, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 7175, + "time_per_iteration": 2.529812812805176 + }, + { + "auxiliary_loss_clip": 0.01063985, + "auxiliary_loss_mlp": 0.01052126, + "balance_loss_clip": 1.02134585, + "balance_loss_mlp": 1.02019048, + "epoch": 0.43144446114534796, + "flos": 18149318726400.0, + "grad_norm": 2.22916184166232, + "language_loss": 0.76551402, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.7866751, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4375, + "step": 7176, + "time_per_iteration": 2.432892322540283 + }, + { + "auxiliary_loss_clip": 0.01061544, + "auxiliary_loss_mlp": 0.01046379, + "balance_loss_clip": 1.01626635, + "balance_loss_mlp": 1.01913822, + "epoch": 0.4315045843980159, + "flos": 22928753831040.0, + "grad_norm": 4.330548958783723, + "language_loss": 0.75277901, + "learning_rate": 2.534202571340819e-06, + "loss": 0.77385825, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42382812, + "step": 7177, + "time_per_iteration": 5.944704055786133 + }, + { + "auxiliary_loss_clip": 0.01066685, + "auxiliary_loss_mlp": 0.01052038, + "balance_loss_clip": 1.01889825, + "balance_loss_mlp": 1.02030897, + "epoch": 0.4315647076506839, + "flos": 22125786405120.0, + "grad_norm": 1.928211966503039, + "language_loss": 0.82920355, + "learning_rate": 2.533827249275387e-06, + "loss": 0.85039079, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46484375, + "step": 7178, + "time_per_iteration": 3.848681688308716 + }, + { + "auxiliary_loss_clip": 0.01059171, + "auxiliary_loss_mlp": 0.01044689, + "balance_loss_clip": 1.01767659, + "balance_loss_mlp": 1.01929593, + "epoch": 0.43162483090335185, + "flos": 26869889347200.0, + "grad_norm": 1.47402342841166, + "language_loss": 0.84706205, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86810064, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 7179, + "time_per_iteration": 2.4304518699645996 + }, + { + "auxiliary_loss_clip": 0.01062299, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.01682091, + "balance_loss_mlp": 1.02073288, + "epoch": 0.4316849541560198, + "flos": 13912399238400.0, + "grad_norm": 1.6322716524806418, + "language_loss": 0.76931638, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.7903924, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41601562, + "step": 7180, + "time_per_iteration": 2.4406208992004395 + }, + { + "auxiliary_loss_clip": 0.0106167, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_clip": 1.0178076, + "balance_loss_mlp": 1.01847363, + "epoch": 0.4317450774086878, + "flos": 16434245364480.0, + "grad_norm": 1.682701197060865, + "language_loss": 0.83005786, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.851165, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43164062, + "step": 7181, + "time_per_iteration": 2.3584885597229004 + }, + { + "auxiliary_loss_clip": 0.01063739, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.01705837, + "balance_loss_mlp": 1.01964617, + "epoch": 0.4318052006613558, + "flos": 20554031640960.0, + "grad_norm": 1.5094432165405487, + "language_loss": 0.90043747, + "learning_rate": 2.532325758728165e-06, + "loss": 0.92156792, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.44140625, + "step": 7182, + "time_per_iteration": 3.885211229324341 + }, + { + "auxiliary_loss_clip": 0.01058701, + "auxiliary_loss_mlp": 0.01043735, + "balance_loss_clip": 1.01526737, + "balance_loss_mlp": 1.01796961, + "epoch": 0.43186532391402377, + "flos": 22818952005120.0, + "grad_norm": 1.6107760234146897, + "language_loss": 0.77142203, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.79244637, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 7183, + "time_per_iteration": 2.542673349380493 + }, + { + "auxiliary_loss_clip": 0.01060462, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.01785946, + "balance_loss_mlp": 1.0174005, + "epoch": 0.43192544716669173, + "flos": 25555409458560.0, + "grad_norm": 1.503850797554251, + "language_loss": 0.78890949, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.81000435, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4296875, + "step": 7184, + "time_per_iteration": 2.487062931060791 + }, + { + "auxiliary_loss_clip": 0.01057042, + "auxiliary_loss_mlp": 0.01044986, + "balance_loss_clip": 1.01860487, + "balance_loss_mlp": 1.01724219, + "epoch": 0.4319855704193597, + "flos": 30953400854400.0, + "grad_norm": 1.8917905370775512, + "language_loss": 0.74628729, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.76730758, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 7185, + "time_per_iteration": 2.51365065574646 + }, + { + "auxiliary_loss_clip": 0.01063694, + "auxiliary_loss_mlp": 0.01058742, + "balance_loss_clip": 1.02677011, + "balance_loss_mlp": 1.0186938, + "epoch": 0.43204569367202766, + "flos": 24237717724800.0, + "grad_norm": 2.2189323072832132, + "language_loss": 0.77298737, + "learning_rate": 2.530823945207421e-06, + "loss": 0.79421169, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44921875, + "step": 7186, + "time_per_iteration": 2.443582057952881 + }, + { + "auxiliary_loss_clip": 0.01061089, + "auxiliary_loss_mlp": 0.01045666, + "balance_loss_clip": 1.01635289, + "balance_loss_mlp": 1.01857829, + "epoch": 0.43210581692469563, + "flos": 18405930286080.0, + "grad_norm": 2.2955046663005354, + "language_loss": 0.7707836, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.79185116, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 7187, + "time_per_iteration": 2.4538910388946533 + }, + { + "auxiliary_loss_clip": 0.01009344, + "auxiliary_loss_mlp": 0.01004828, + "balance_loss_clip": 1.0013473, + "balance_loss_mlp": 1.00089359, + "epoch": 0.4321659401773636, + "flos": 49829155706880.0, + "grad_norm": 0.854052341629861, + "language_loss": 0.68349922, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70364094, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.03491211, + "router_z_loss_mlp": 0.08496094, + "step": 7188, + "time_per_iteration": 3.0745813846588135 + }, + { + "auxiliary_loss_clip": 0.01056901, + "auxiliary_loss_mlp": 0.01044129, + "balance_loss_clip": 1.01730692, + "balance_loss_mlp": 1.01699722, + "epoch": 0.43222606343003156, + "flos": 17127620432640.0, + "grad_norm": 1.7871946743936769, + "language_loss": 0.79783064, + "learning_rate": 2.529697373663614e-06, + "loss": 0.81884098, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3984375, + "step": 7189, + "time_per_iteration": 2.458501100540161 + }, + { + "auxiliary_loss_clip": 0.01062223, + "auxiliary_loss_mlp": 0.01047777, + "balance_loss_clip": 1.01699722, + "balance_loss_mlp": 1.01783729, + "epoch": 0.4322861866826995, + "flos": 22748776439040.0, + "grad_norm": 1.7844175418296926, + "language_loss": 0.73600197, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.75710195, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44335938, + "step": 7190, + "time_per_iteration": 2.4763271808624268 + }, + { + "auxiliary_loss_clip": 0.01056632, + "auxiliary_loss_mlp": 0.01046138, + "balance_loss_clip": 1.01718211, + "balance_loss_mlp": 1.01623988, + "epoch": 0.4323463099353675, + "flos": 27890679945600.0, + "grad_norm": 1.3473584153746445, + "language_loss": 0.80786061, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.8288883, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40429688, + "step": 7191, + "time_per_iteration": 2.481743335723877 + }, + { + "auxiliary_loss_clip": 0.0105859, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_clip": 1.01398897, + "balance_loss_mlp": 1.01802862, + "epoch": 0.43240643318803546, + "flos": 21613715360640.0, + "grad_norm": 1.6006910870569688, + "language_loss": 0.77242219, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.79343152, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 7192, + "time_per_iteration": 2.465409994125366 + }, + { + "auxiliary_loss_clip": 0.01060704, + "auxiliary_loss_mlp": 0.01049474, + "balance_loss_clip": 1.01900446, + "balance_loss_mlp": 1.01840234, + "epoch": 0.4324665564407034, + "flos": 17557646048640.0, + "grad_norm": 1.771776424319067, + "language_loss": 0.79882407, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81992579, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.421875, + "step": 7193, + "time_per_iteration": 2.4294402599334717 + }, + { + "auxiliary_loss_clip": 0.01060215, + "auxiliary_loss_mlp": 0.01050129, + "balance_loss_clip": 1.01928973, + "balance_loss_mlp": 1.01840568, + "epoch": 0.4325266796933714, + "flos": 18401531454720.0, + "grad_norm": 2.1180400700083273, + "language_loss": 0.76332742, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78443086, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41796875, + "step": 7194, + "time_per_iteration": 2.3969926834106445 + }, + { + "auxiliary_loss_clip": 0.01061712, + "auxiliary_loss_mlp": 0.01050459, + "balance_loss_clip": 1.01944077, + "balance_loss_mlp": 1.01951957, + "epoch": 0.4325868029460394, + "flos": 22563701988480.0, + "grad_norm": 1.823426272341479, + "language_loss": 0.61335123, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.63447291, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.421875, + "step": 7195, + "time_per_iteration": 2.4437122344970703 + }, + { + "auxiliary_loss_clip": 0.01063468, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.01265216, + "balance_loss_mlp": 1.02007484, + "epoch": 0.43264692619870737, + "flos": 14604796788480.0, + "grad_norm": 2.229463416741991, + "language_loss": 0.66996223, + "learning_rate": 2.527068004376515e-06, + "loss": 0.69102573, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43359375, + "step": 7196, + "time_per_iteration": 2.375047445297241 + }, + { + "auxiliary_loss_clip": 0.01065763, + "auxiliary_loss_mlp": 0.01053091, + "balance_loss_clip": 1.02194118, + "balance_loss_mlp": 1.01920664, + "epoch": 0.43270704945137534, + "flos": 21500736600960.0, + "grad_norm": 2.050825528987171, + "language_loss": 0.73649091, + "learning_rate": 2.526692300132797e-06, + "loss": 0.7576794, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.46679688, + "step": 7197, + "time_per_iteration": 2.4208762645721436 + }, + { + "auxiliary_loss_clip": 0.01059823, + "auxiliary_loss_mlp": 0.01045879, + "balance_loss_clip": 1.01847267, + "balance_loss_mlp": 1.01877964, + "epoch": 0.4327671727040433, + "flos": 25154711251200.0, + "grad_norm": 1.9107183219017396, + "language_loss": 0.73398054, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.75503755, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 7198, + "time_per_iteration": 2.4871480464935303 + }, + { + "auxiliary_loss_clip": 0.01059328, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.01021981, + "balance_loss_mlp": 1.01825786, + "epoch": 0.43282729595671127, + "flos": 25445991657600.0, + "grad_norm": 1.346488812381435, + "language_loss": 0.81792063, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83889127, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 7199, + "time_per_iteration": 2.5029296875 + }, + { + "auxiliary_loss_clip": 0.01061514, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_clip": 1.01557624, + "balance_loss_mlp": 1.01964283, + "epoch": 0.43288741920937923, + "flos": 24125192812800.0, + "grad_norm": 2.218615200353538, + "language_loss": 0.70411575, + "learning_rate": 2.525565067625286e-06, + "loss": 0.72517204, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41992188, + "step": 7200, + "time_per_iteration": 2.456690788269043 + }, + { + "auxiliary_loss_clip": 0.01061782, + "auxiliary_loss_mlp": 0.01054576, + "balance_loss_clip": 1.02274716, + "balance_loss_mlp": 1.01867318, + "epoch": 0.4329475424620472, + "flos": 19204045032960.0, + "grad_norm": 1.7914641704922911, + "language_loss": 0.89010668, + "learning_rate": 2.525189283578157e-06, + "loss": 0.91127026, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.43164062, + "step": 7201, + "time_per_iteration": 2.4084606170654297 + }, + { + "auxiliary_loss_clip": 0.0106546, + "auxiliary_loss_mlp": 0.01054522, + "balance_loss_clip": 1.02178681, + "balance_loss_mlp": 1.02121615, + "epoch": 0.43300766571471516, + "flos": 22637263956480.0, + "grad_norm": 2.381333491361938, + "language_loss": 0.66418135, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.68538117, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44140625, + "step": 7202, + "time_per_iteration": 2.447852849960327 + }, + { + "auxiliary_loss_clip": 0.01060691, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.01585269, + "balance_loss_mlp": 1.01925397, + "epoch": 0.4330677889673831, + "flos": 22120165676160.0, + "grad_norm": 1.7299329415608007, + "language_loss": 0.83219254, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.85322642, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.4140625, + "step": 7203, + "time_per_iteration": 2.415625810623169 + }, + { + "auxiliary_loss_clip": 0.01063301, + "auxiliary_loss_mlp": 0.01052236, + "balance_loss_clip": 1.02059793, + "balance_loss_mlp": 1.01806355, + "epoch": 0.4331279122200511, + "flos": 23220418262400.0, + "grad_norm": 2.0087591890587273, + "language_loss": 0.82338387, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.84453923, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.453125, + "step": 7204, + "time_per_iteration": 2.469393730163574 + }, + { + "auxiliary_loss_clip": 0.01061017, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_clip": 1.01473737, + "balance_loss_mlp": 1.01893735, + "epoch": 0.43318803547271906, + "flos": 18258771438720.0, + "grad_norm": 4.717251897415797, + "language_loss": 0.75741363, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.77845716, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.421875, + "step": 7205, + "time_per_iteration": 2.401212692260742 + }, + { + "auxiliary_loss_clip": 0.01059834, + "auxiliary_loss_mlp": 0.01043937, + "balance_loss_clip": 1.01620924, + "balance_loss_mlp": 1.01990962, + "epoch": 0.433248158725387, + "flos": 27417152908800.0, + "grad_norm": 1.8506772376123721, + "language_loss": 0.76048994, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.7815277, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 7206, + "time_per_iteration": 2.5068812370300293 + }, + { + "auxiliary_loss_clip": 0.01059857, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_clip": 1.01347733, + "balance_loss_mlp": 1.01804495, + "epoch": 0.433308281978055, + "flos": 23216996949120.0, + "grad_norm": 1.94484167270054, + "language_loss": 0.80228627, + "learning_rate": 2.522934161574342e-06, + "loss": 0.82330477, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 7207, + "time_per_iteration": 2.411655902862549 + }, + { + "auxiliary_loss_clip": 0.0106218, + "auxiliary_loss_mlp": 0.01049595, + "balance_loss_clip": 1.019292, + "balance_loss_mlp": 1.01832068, + "epoch": 0.433368405230723, + "flos": 15851475083520.0, + "grad_norm": 5.116659620132036, + "language_loss": 0.81459206, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83570981, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43945312, + "step": 7208, + "time_per_iteration": 2.4860317707061768 + }, + { + "auxiliary_loss_clip": 0.01059918, + "auxiliary_loss_mlp": 0.01043107, + "balance_loss_clip": 1.01553392, + "balance_loss_mlp": 1.01764214, + "epoch": 0.433428528483391, + "flos": 19025080070400.0, + "grad_norm": 2.6398422258695855, + "language_loss": 0.72500366, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.74603391, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.421875, + "step": 7209, + "time_per_iteration": 2.4088857173919678 + }, + { + "auxiliary_loss_clip": 0.01061194, + "auxiliary_loss_mlp": 0.01047985, + "balance_loss_clip": 1.01792073, + "balance_loss_mlp": 1.0191524, + "epoch": 0.43348865173605894, + "flos": 24717074958720.0, + "grad_norm": 1.532202535407639, + "language_loss": 0.82138592, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.84247768, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41992188, + "step": 7210, + "time_per_iteration": 2.464977264404297 + }, + { + "auxiliary_loss_clip": 0.01058841, + "auxiliary_loss_mlp": 0.01045724, + "balance_loss_clip": 1.01868773, + "balance_loss_mlp": 1.01768875, + "epoch": 0.4335487749887269, + "flos": 22089581458560.0, + "grad_norm": 1.8177881228333863, + "language_loss": 0.82796288, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84900856, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41210938, + "step": 7211, + "time_per_iteration": 2.4403293132781982 + }, + { + "auxiliary_loss_clip": 0.01057834, + "auxiliary_loss_mlp": 0.01041384, + "balance_loss_clip": 1.01460946, + "balance_loss_mlp": 1.01763844, + "epoch": 0.43360889824139487, + "flos": 22381839383040.0, + "grad_norm": 2.0611927925642344, + "language_loss": 0.76518184, + "learning_rate": 2.521054347790029e-06, + "loss": 0.786174, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40234375, + "step": 7212, + "time_per_iteration": 2.424225330352783 + }, + { + "auxiliary_loss_clip": 0.01060665, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.01682878, + "balance_loss_mlp": 1.01937795, + "epoch": 0.43366902149406283, + "flos": 17527376033280.0, + "grad_norm": 1.6365430360657236, + "language_loss": 0.77523905, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.79628056, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41210938, + "step": 7213, + "time_per_iteration": 2.386165142059326 + }, + { + "auxiliary_loss_clip": 0.01060344, + "auxiliary_loss_mlp": 0.01049196, + "balance_loss_clip": 1.02132452, + "balance_loss_mlp": 1.01850772, + "epoch": 0.4337291447467308, + "flos": 19021763491200.0, + "grad_norm": 1.5836433618532024, + "language_loss": 0.65649271, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67758811, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 7214, + "time_per_iteration": 2.4387238025665283 + }, + { + "auxiliary_loss_clip": 0.01056902, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.01601291, + "balance_loss_mlp": 1.01781058, + "epoch": 0.43378926799939876, + "flos": 27232846508160.0, + "grad_norm": 1.630806901420469, + "language_loss": 0.72219521, + "learning_rate": 2.519926222304191e-06, + "loss": 0.74319303, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 7215, + "time_per_iteration": 2.510660171508789 + }, + { + "auxiliary_loss_clip": 0.01058631, + "auxiliary_loss_mlp": 0.01045001, + "balance_loss_clip": 1.01840496, + "balance_loss_mlp": 1.01872492, + "epoch": 0.43384939125206673, + "flos": 15960194657280.0, + "grad_norm": 1.8139245939597084, + "language_loss": 0.76345122, + "learning_rate": 2.519550141025255e-06, + "loss": 0.78448755, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 7216, + "time_per_iteration": 6.073709964752197 + }, + { + "auxiliary_loss_clip": 0.01065188, + "auxiliary_loss_mlp": 0.01049183, + "balance_loss_clip": 1.0156858, + "balance_loss_mlp": 1.01917851, + "epoch": 0.4339095145047347, + "flos": 21792296298240.0, + "grad_norm": 3.157210619327897, + "language_loss": 0.779917, + "learning_rate": 2.519174040044927e-06, + "loss": 0.80106074, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4609375, + "step": 7217, + "time_per_iteration": 3.816547155380249 + }, + { + "auxiliary_loss_clip": 0.01061293, + "auxiliary_loss_mlp": 0.01048951, + "balance_loss_clip": 1.02066267, + "balance_loss_mlp": 1.01893473, + "epoch": 0.43396963775740266, + "flos": 14208986171520.0, + "grad_norm": 1.9355293790525843, + "language_loss": 0.75286162, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.77396405, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42382812, + "step": 7218, + "time_per_iteration": 2.3996498584747314 + }, + { + "auxiliary_loss_clip": 0.01061731, + "auxiliary_loss_mlp": 0.01047325, + "balance_loss_clip": 1.01702178, + "balance_loss_mlp": 1.01853693, + "epoch": 0.4340297610100706, + "flos": 19718036202240.0, + "grad_norm": 1.8558112183635016, + "language_loss": 0.70679045, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.72788101, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43164062, + "step": 7219, + "time_per_iteration": 2.4120004177093506 + }, + { + "auxiliary_loss_clip": 0.01060087, + "auxiliary_loss_mlp": 0.01046817, + "balance_loss_clip": 1.01834977, + "balance_loss_mlp": 1.01921701, + "epoch": 0.4340898842627386, + "flos": 18952495620480.0, + "grad_norm": 1.5914494276078401, + "language_loss": 0.78566885, + "learning_rate": 2.518045619038202e-06, + "loss": 0.8067379, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 7220, + "time_per_iteration": 2.4493658542633057 + }, + { + "auxiliary_loss_clip": 0.01059169, + "auxiliary_loss_mlp": 0.01044121, + "balance_loss_clip": 1.01667893, + "balance_loss_mlp": 1.01862669, + "epoch": 0.4341500075154066, + "flos": 22017206476800.0, + "grad_norm": 1.872346508453308, + "language_loss": 0.70273894, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.72377187, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 7221, + "time_per_iteration": 2.4338510036468506 + }, + { + "auxiliary_loss_clip": 0.01061318, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_clip": 1.01777935, + "balance_loss_mlp": 1.01930952, + "epoch": 0.4342101307680746, + "flos": 23581455298560.0, + "grad_norm": 1.7656287341704238, + "language_loss": 0.66270709, + "learning_rate": 2.51729324012157e-06, + "loss": 0.68377185, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41992188, + "step": 7222, + "time_per_iteration": 3.8299214839935303 + }, + { + "auxiliary_loss_clip": 0.01058185, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.01322496, + "balance_loss_mlp": 1.01776028, + "epoch": 0.43427025402074254, + "flos": 17967002273280.0, + "grad_norm": 3.1536880394021227, + "language_loss": 0.7477265, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.76871121, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40429688, + "step": 7223, + "time_per_iteration": 2.4220056533813477 + }, + { + "auxiliary_loss_clip": 0.01061551, + "auxiliary_loss_mlp": 0.01043474, + "balance_loss_clip": 1.0161159, + "balance_loss_mlp": 1.01880467, + "epoch": 0.4343303772734105, + "flos": 26285198941440.0, + "grad_norm": 1.9788595466805834, + "language_loss": 0.94854051, + "learning_rate": 2.516540782741694e-06, + "loss": 0.96959078, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42773438, + "step": 7224, + "time_per_iteration": 2.4639101028442383 + }, + { + "auxiliary_loss_clip": 0.01058807, + "auxiliary_loss_mlp": 0.01042647, + "balance_loss_clip": 1.01471651, + "balance_loss_mlp": 1.01823735, + "epoch": 0.43439050052607847, + "flos": 26832741793920.0, + "grad_norm": 1.7781925718811329, + "language_loss": 0.61970675, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.6407212, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 7225, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.01059499, + "auxiliary_loss_mlp": 0.01041309, + "balance_loss_clip": 1.01559544, + "balance_loss_mlp": 1.01895368, + "epoch": 0.43445062377874644, + "flos": 21396590415360.0, + "grad_norm": 2.000241194240365, + "language_loss": 0.79472637, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.81573451, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40625, + "step": 7226, + "time_per_iteration": 2.4356067180633545 + }, + { + "auxiliary_loss_clip": 0.01058314, + "auxiliary_loss_mlp": 0.01043053, + "balance_loss_clip": 1.01745832, + "balance_loss_mlp": 1.01889741, + "epoch": 0.4345107470314144, + "flos": 19900911237120.0, + "grad_norm": 1.7456541532261833, + "language_loss": 0.86092103, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8819347, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 7227, + "time_per_iteration": 2.405559778213501 + }, + { + "auxiliary_loss_clip": 0.01058849, + "auxiliary_loss_mlp": 0.01051053, + "balance_loss_clip": 1.02274096, + "balance_loss_mlp": 1.01833773, + "epoch": 0.43457087028408237, + "flos": 26431415182080.0, + "grad_norm": 1.9781709709021398, + "language_loss": 0.78416884, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.80526781, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40429688, + "step": 7228, + "time_per_iteration": 2.495698928833008 + }, + { + "auxiliary_loss_clip": 0.0105861, + "auxiliary_loss_mlp": 0.01046994, + "balance_loss_clip": 1.01845503, + "balance_loss_mlp": 1.01819026, + "epoch": 0.43463099353675033, + "flos": 31867461826560.0, + "grad_norm": 1.543915600521893, + "language_loss": 0.81778544, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.8388415, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40429688, + "step": 7229, + "time_per_iteration": 2.503937244415283 + }, + { + "auxiliary_loss_clip": 0.01060675, + "auxiliary_loss_mlp": 0.01043539, + "balance_loss_clip": 1.01407075, + "balance_loss_mlp": 1.01830864, + "epoch": 0.4346911167894183, + "flos": 24570125579520.0, + "grad_norm": 1.9287491933985998, + "language_loss": 0.82994664, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.85098881, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42382812, + "step": 7230, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.0106428, + "auxiliary_loss_mlp": 0.01050917, + "balance_loss_clip": 1.02177012, + "balance_loss_mlp": 1.0202167, + "epoch": 0.43475124004208626, + "flos": 17089774652160.0, + "grad_norm": 2.149355720167987, + "language_loss": 0.78819335, + "learning_rate": 2.513906565661973e-06, + "loss": 0.80934536, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44140625, + "step": 7231, + "time_per_iteration": 2.379403591156006 + }, + { + "auxiliary_loss_clip": 0.01057082, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.0178405, + "balance_loss_mlp": 1.01744628, + "epoch": 0.4348113632947542, + "flos": 26103406158720.0, + "grad_norm": 1.447883877632418, + "language_loss": 0.69807744, + "learning_rate": 2.513530170872575e-06, + "loss": 0.71908009, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 7232, + "time_per_iteration": 2.5072898864746094 + }, + { + "auxiliary_loss_clip": 0.01061449, + "auxiliary_loss_mlp": 0.0104434, + "balance_loss_clip": 1.0161109, + "balance_loss_mlp": 1.01860964, + "epoch": 0.4348714865474222, + "flos": 34199171354880.0, + "grad_norm": 1.8029140613846586, + "language_loss": 0.72855532, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74961323, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42773438, + "step": 7233, + "time_per_iteration": 2.5074596405029297 + }, + { + "auxiliary_loss_clip": 0.01062286, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.01865005, + "balance_loss_mlp": 1.01869917, + "epoch": 0.43493160980009016, + "flos": 31536206046720.0, + "grad_norm": 2.0916033044491527, + "language_loss": 0.75744772, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.77855593, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43554688, + "step": 7234, + "time_per_iteration": 2.56392240524292 + }, + { + "auxiliary_loss_clip": 0.01062475, + "auxiliary_loss_mlp": 0.01053943, + "balance_loss_clip": 1.02266216, + "balance_loss_mlp": 1.01934361, + "epoch": 0.4349917330527582, + "flos": 24060184128000.0, + "grad_norm": 2.374468395817092, + "language_loss": 0.60553217, + "learning_rate": 2.512400869722782e-06, + "loss": 0.62669629, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4296875, + "step": 7235, + "time_per_iteration": 2.4698903560638428 + }, + { + "auxiliary_loss_clip": 0.01061427, + "auxiliary_loss_mlp": 0.01045381, + "balance_loss_clip": 1.01491106, + "balance_loss_mlp": 1.01803446, + "epoch": 0.43505185630542614, + "flos": 30517998888960.0, + "grad_norm": 2.1403308154972596, + "language_loss": 0.78222287, + "learning_rate": 2.512024397126566e-06, + "loss": 0.80329096, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 7236, + "time_per_iteration": 2.5081276893615723 + }, + { + "auxiliary_loss_clip": 0.01059458, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.01593864, + "balance_loss_mlp": 1.01916075, + "epoch": 0.4351119795580941, + "flos": 15734446606080.0, + "grad_norm": 2.384284804746426, + "language_loss": 0.82004988, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.84107888, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40234375, + "step": 7237, + "time_per_iteration": 2.4114208221435547 + }, + { + "auxiliary_loss_clip": 0.01059296, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.01618862, + "balance_loss_mlp": 1.0182364, + "epoch": 0.4351721028107621, + "flos": 18730832198400.0, + "grad_norm": 1.670508583618334, + "language_loss": 0.64055276, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.66158187, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 7238, + "time_per_iteration": 2.456557512283325 + }, + { + "auxiliary_loss_clip": 0.01058131, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.01988912, + "balance_loss_mlp": 1.01758504, + "epoch": 0.43523222606343004, + "flos": 25225759601280.0, + "grad_norm": 1.6868226932697041, + "language_loss": 0.8666333, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88770473, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40625, + "step": 7239, + "time_per_iteration": 2.44221568107605 + }, + { + "auxiliary_loss_clip": 0.01061168, + "auxiliary_loss_mlp": 0.01039808, + "balance_loss_clip": 1.01049483, + "balance_loss_mlp": 1.01986575, + "epoch": 0.435292349316098, + "flos": 22708137749760.0, + "grad_norm": 1.6837704325941958, + "language_loss": 0.74158537, + "learning_rate": 2.510518312724309e-06, + "loss": 0.76259506, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4140625, + "step": 7240, + "time_per_iteration": 2.43851637840271 + }, + { + "auxiliary_loss_clip": 0.01063228, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.01425886, + "balance_loss_mlp": 1.02019656, + "epoch": 0.43535247256876597, + "flos": 25774698908160.0, + "grad_norm": 1.734747043323094, + "language_loss": 0.82662988, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.84772515, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4296875, + "step": 7241, + "time_per_iteration": 2.4432718753814697 + }, + { + "auxiliary_loss_clip": 0.0106462, + "auxiliary_loss_mlp": 0.01047445, + "balance_loss_clip": 1.01568794, + "balance_loss_mlp": 1.0196979, + "epoch": 0.43541259582143393, + "flos": 17527236387840.0, + "grad_norm": 2.54591015861134, + "language_loss": 0.80753809, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.8286587, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.44921875, + "step": 7242, + "time_per_iteration": 2.3900856971740723 + }, + { + "auxiliary_loss_clip": 0.0106271, + "auxiliary_loss_mlp": 0.01047184, + "balance_loss_clip": 1.01542628, + "balance_loss_mlp": 1.01829815, + "epoch": 0.4354727190741019, + "flos": 15194304961920.0, + "grad_norm": 2.0901314103983664, + "language_loss": 0.7074796, + "learning_rate": 2.509388546104138e-06, + "loss": 0.72857851, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4453125, + "step": 7243, + "time_per_iteration": 2.378183126449585 + }, + { + "auxiliary_loss_clip": 0.01058985, + "auxiliary_loss_mlp": 0.01040949, + "balance_loss_clip": 1.01494932, + "balance_loss_mlp": 1.01884246, + "epoch": 0.43553284232676986, + "flos": 16648472666880.0, + "grad_norm": 1.5931986632256858, + "language_loss": 0.82405639, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.8450557, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40234375, + "step": 7244, + "time_per_iteration": 2.404832124710083 + }, + { + "auxiliary_loss_clip": 0.01060492, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.01455522, + "balance_loss_mlp": 1.0183332, + "epoch": 0.43559296557943783, + "flos": 23399837072640.0, + "grad_norm": 1.6496200814887945, + "language_loss": 0.74305522, + "learning_rate": 2.508635271753234e-06, + "loss": 0.76408476, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.421875, + "step": 7245, + "time_per_iteration": 2.405592918395996 + }, + { + "auxiliary_loss_clip": 0.01062047, + "auxiliary_loss_mlp": 0.01051283, + "balance_loss_clip": 1.02190971, + "balance_loss_mlp": 1.01910305, + "epoch": 0.4356530888321058, + "flos": 22417974506880.0, + "grad_norm": 1.593495521317607, + "language_loss": 0.78758073, + "learning_rate": 2.508258605639389e-06, + "loss": 0.80871403, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 7246, + "time_per_iteration": 2.518554449081421 + }, + { + "auxiliary_loss_clip": 0.01061627, + "auxiliary_loss_mlp": 0.01050251, + "balance_loss_clip": 1.01975763, + "balance_loss_mlp": 1.01879787, + "epoch": 0.43571321208477376, + "flos": 21615076903680.0, + "grad_norm": 1.9461150110039138, + "language_loss": 0.87172788, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.89284664, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4296875, + "step": 7247, + "time_per_iteration": 2.4161739349365234 + }, + { + "auxiliary_loss_clip": 0.01062232, + "auxiliary_loss_mlp": 0.01056235, + "balance_loss_clip": 1.02555084, + "balance_loss_mlp": 1.01948786, + "epoch": 0.4357733353374418, + "flos": 23986238135040.0, + "grad_norm": 1.7127224920227593, + "language_loss": 0.73643243, + "learning_rate": 2.507505215606333e-06, + "loss": 0.75761712, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.42773438, + "step": 7248, + "time_per_iteration": 2.4894182682037354 + }, + { + "auxiliary_loss_clip": 0.01062292, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_clip": 1.01861167, + "balance_loss_mlp": 1.01942813, + "epoch": 0.43583345859010975, + "flos": 25263570470400.0, + "grad_norm": 1.4506900692625024, + "language_loss": 0.87911606, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.9002341, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.42773438, + "step": 7249, + "time_per_iteration": 2.4342713356018066 + }, + { + "auxiliary_loss_clip": 0.01063931, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_clip": 1.02362192, + "balance_loss_mlp": 1.02054429, + "epoch": 0.4358935818427777, + "flos": 23695167196800.0, + "grad_norm": 1.7341307479292052, + "language_loss": 0.82988262, + "learning_rate": 2.506751748594683e-06, + "loss": 0.85106421, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 7250, + "time_per_iteration": 2.468912124633789 + }, + { + "auxiliary_loss_clip": 0.0106233, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.01363957, + "balance_loss_mlp": 1.01948023, + "epoch": 0.4359537050954457, + "flos": 29531562935040.0, + "grad_norm": 1.668448177763121, + "language_loss": 0.86354482, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.88462949, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.42773438, + "step": 7251, + "time_per_iteration": 2.4663045406341553 + }, + { + "auxiliary_loss_clip": 0.01058821, + "auxiliary_loss_mlp": 0.01050543, + "balance_loss_clip": 1.01954865, + "balance_loss_mlp": 1.01794744, + "epoch": 0.43601382834811364, + "flos": 22710162608640.0, + "grad_norm": 1.4579019776635065, + "language_loss": 0.70383686, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.72493052, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.40820312, + "step": 7252, + "time_per_iteration": 2.455044746398926 + }, + { + "auxiliary_loss_clip": 0.0105955, + "auxiliary_loss_mlp": 0.01044216, + "balance_loss_clip": 1.01659513, + "balance_loss_mlp": 1.01825416, + "epoch": 0.4360739516007816, + "flos": 19097734343040.0, + "grad_norm": 1.6226356929420411, + "language_loss": 0.85019135, + "learning_rate": 2.505621403992348e-06, + "loss": 0.87122905, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 7253, + "time_per_iteration": 2.4109578132629395 + }, + { + "auxiliary_loss_clip": 0.01061716, + "auxiliary_loss_mlp": 0.0104483, + "balance_loss_clip": 1.01494384, + "balance_loss_mlp": 1.01878774, + "epoch": 0.43613407485344957, + "flos": 23403293297280.0, + "grad_norm": 1.5778030614518577, + "language_loss": 0.71406627, + "learning_rate": 2.505244584092757e-06, + "loss": 0.73513174, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4296875, + "step": 7254, + "time_per_iteration": 2.498340606689453 + }, + { + "auxiliary_loss_clip": 0.01059761, + "auxiliary_loss_mlp": 0.01047884, + "balance_loss_clip": 1.01626956, + "balance_loss_mlp": 1.01814997, + "epoch": 0.43619419810611754, + "flos": 22636705374720.0, + "grad_norm": 2.4086388282572457, + "language_loss": 0.82595444, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.84703088, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.41601562, + "step": 7255, + "time_per_iteration": 2.410710096359253 + }, + { + "auxiliary_loss_clip": 0.0105984, + "auxiliary_loss_mlp": 0.01045207, + "balance_loss_clip": 1.01540458, + "balance_loss_mlp": 1.01738286, + "epoch": 0.4362543213587855, + "flos": 20046918009600.0, + "grad_norm": 1.708505657555692, + "language_loss": 0.78488672, + "learning_rate": 2.504490886831089e-06, + "loss": 0.80593717, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42382812, + "step": 7256, + "time_per_iteration": 5.403707027435303 + }, + { + "auxiliary_loss_clip": 0.01060241, + "auxiliary_loss_mlp": 0.01051405, + "balance_loss_clip": 1.01883674, + "balance_loss_mlp": 1.01891041, + "epoch": 0.43631444461145347, + "flos": 21360245823360.0, + "grad_norm": 1.4857585139889533, + "language_loss": 0.77192998, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.79304641, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.4140625, + "step": 7257, + "time_per_iteration": 2.4434125423431396 + }, + { + "auxiliary_loss_clip": 0.01060308, + "auxiliary_loss_mlp": 0.01042871, + "balance_loss_clip": 1.01194823, + "balance_loss_mlp": 1.01795578, + "epoch": 0.43637456786412143, + "flos": 22417450836480.0, + "grad_norm": 1.6665422210036394, + "language_loss": 0.74079013, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.76182193, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.42382812, + "step": 7258, + "time_per_iteration": 2.4162564277648926 + }, + { + "auxiliary_loss_clip": 0.01060732, + "auxiliary_loss_mlp": 0.01041306, + "balance_loss_clip": 1.01242161, + "balance_loss_mlp": 1.01800013, + "epoch": 0.4364346911167894, + "flos": 28547570776320.0, + "grad_norm": 1.7953296279094395, + "language_loss": 0.78327072, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.80429107, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42773438, + "step": 7259, + "time_per_iteration": 2.5225110054016113 + }, + { + "auxiliary_loss_clip": 0.0101256, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.00095892, + "balance_loss_mlp": 1.00358367, + "epoch": 0.43649481436945736, + "flos": 62656211376000.0, + "grad_norm": 0.7489872958787197, + "language_loss": 0.57088745, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59105361, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.08984375, + "step": 7260, + "time_per_iteration": 2.994781494140625 + }, + { + "auxiliary_loss_clip": 0.01061481, + "auxiliary_loss_mlp": 0.01047373, + "balance_loss_clip": 1.01454258, + "balance_loss_mlp": 1.01811182, + "epoch": 0.4365549376221254, + "flos": 30590792807040.0, + "grad_norm": 1.9268872618588564, + "language_loss": 0.72241974, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.74350828, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43359375, + "step": 7261, + "time_per_iteration": 3.986769199371338 + }, + { + "auxiliary_loss_clip": 0.01060178, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_clip": 1.02013636, + "balance_loss_mlp": 1.0173018, + "epoch": 0.43661506087479335, + "flos": 17164907631360.0, + "grad_norm": 1.9553467228589583, + "language_loss": 0.70968747, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.73078763, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 7262, + "time_per_iteration": 2.4204537868499756 + }, + { + "auxiliary_loss_clip": 0.01058474, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_clip": 1.01591969, + "balance_loss_mlp": 1.01983654, + "epoch": 0.4366751841274613, + "flos": 22046603708160.0, + "grad_norm": 1.6113552833893432, + "language_loss": 0.80773258, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82873762, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 7263, + "time_per_iteration": 2.417114496231079 + }, + { + "auxiliary_loss_clip": 0.01060358, + "auxiliary_loss_mlp": 0.01045613, + "balance_loss_clip": 1.01722884, + "balance_loss_mlp": 1.01953745, + "epoch": 0.4367353073801293, + "flos": 15996399603840.0, + "grad_norm": 1.7059403925342342, + "language_loss": 0.76571941, + "learning_rate": 2.50147533371401e-06, + "loss": 0.78677905, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 7264, + "time_per_iteration": 2.389117956161499 + }, + { + "auxiliary_loss_clip": 0.01059457, + "auxiliary_loss_mlp": 0.01043522, + "balance_loss_clip": 1.01612723, + "balance_loss_mlp": 1.01800597, + "epoch": 0.43679543063279724, + "flos": 38215998432000.0, + "grad_norm": 1.8590905006112874, + "language_loss": 0.6354726, + "learning_rate": 2.501098303852298e-06, + "loss": 0.65650237, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7265, + "time_per_iteration": 2.5082695484161377 + }, + { + "auxiliary_loss_clip": 0.01057677, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.01130247, + "balance_loss_mlp": 1.01746821, + "epoch": 0.4368555538854652, + "flos": 15192384837120.0, + "grad_norm": 2.027301064648061, + "language_loss": 0.74146211, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.76243347, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 7266, + "time_per_iteration": 2.38924503326416 + }, + { + "auxiliary_loss_clip": 0.01061225, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.0153234, + "balance_loss_mlp": 1.01921248, + "epoch": 0.4369156771381332, + "flos": 23068162356480.0, + "grad_norm": 1.9705221364745482, + "language_loss": 0.83578223, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.85682905, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 7267, + "time_per_iteration": 2.3830809593200684 + }, + { + "auxiliary_loss_clip": 0.01059695, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.01157355, + "balance_loss_mlp": 1.01840496, + "epoch": 0.43697580039080114, + "flos": 23439952091520.0, + "grad_norm": 1.837530162600504, + "language_loss": 0.75864649, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.77963459, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41210938, + "step": 7268, + "time_per_iteration": 2.408071279525757 + }, + { + "auxiliary_loss_clip": 0.01062703, + "auxiliary_loss_mlp": 0.01051037, + "balance_loss_clip": 1.02147353, + "balance_loss_mlp": 1.01936841, + "epoch": 0.4370359236434691, + "flos": 18513707253120.0, + "grad_norm": 2.1361051460597342, + "language_loss": 0.81477088, + "learning_rate": 2.499589994531454e-06, + "loss": 0.83590829, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43359375, + "step": 7269, + "time_per_iteration": 2.3714592456817627 + }, + { + "auxiliary_loss_clip": 0.01058846, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.01654959, + "balance_loss_mlp": 1.01870382, + "epoch": 0.43709604689613707, + "flos": 23221360869120.0, + "grad_norm": 1.784599587098049, + "language_loss": 0.76114619, + "learning_rate": 2.499212869804237e-06, + "loss": 0.78217131, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 7270, + "time_per_iteration": 2.4248766899108887 + }, + { + "auxiliary_loss_clip": 0.01060923, + "auxiliary_loss_mlp": 0.01041744, + "balance_loss_clip": 1.01347947, + "balance_loss_mlp": 1.01905251, + "epoch": 0.43715617014880503, + "flos": 23802629961600.0, + "grad_norm": 1.716008088922179, + "language_loss": 0.80516469, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.82619131, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41992188, + "step": 7271, + "time_per_iteration": 2.415276288986206 + }, + { + "auxiliary_loss_clip": 0.01010171, + "auxiliary_loss_mlp": 0.01003401, + "balance_loss_clip": 0.99989665, + "balance_loss_mlp": 1.0016911, + "epoch": 0.437216293401473, + "flos": 61937768085120.0, + "grad_norm": 0.7031852598678011, + "language_loss": 0.54979455, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56993032, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.03515625, + "router_z_loss_mlp": 0.08496094, + "step": 7272, + "time_per_iteration": 3.0944581031799316 + }, + { + "auxiliary_loss_clip": 0.01061785, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.0203315, + "balance_loss_mlp": 1.01994061, + "epoch": 0.43727641665414096, + "flos": 21981141175680.0, + "grad_norm": 1.6543342685010045, + "language_loss": 0.71012402, + "learning_rate": 2.498081382098581e-06, + "loss": 0.73125005, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41796875, + "step": 7273, + "time_per_iteration": 2.4050798416137695 + }, + { + "auxiliary_loss_clip": 0.01061417, + "auxiliary_loss_mlp": 0.01047344, + "balance_loss_clip": 1.0182569, + "balance_loss_mlp": 1.01903784, + "epoch": 0.437336539906809, + "flos": 39529291334400.0, + "grad_norm": 1.7713239548930133, + "language_loss": 0.78059328, + "learning_rate": 2.497704181736367e-06, + "loss": 0.80168086, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42382812, + "step": 7274, + "time_per_iteration": 2.567021608352661 + }, + { + "auxiliary_loss_clip": 0.01057245, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.0117352, + "balance_loss_mlp": 1.01700246, + "epoch": 0.43739666315947695, + "flos": 17456188037760.0, + "grad_norm": 1.9197336378327285, + "language_loss": 0.81434011, + "learning_rate": 2.49732696250116e-06, + "loss": 0.83527815, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.40234375, + "step": 7275, + "time_per_iteration": 2.353828191757202 + }, + { + "auxiliary_loss_clip": 0.01060431, + "auxiliary_loss_mlp": 0.01047543, + "balance_loss_clip": 1.02011275, + "balance_loss_mlp": 1.01939559, + "epoch": 0.4374567864121449, + "flos": 16357925399040.0, + "grad_norm": 2.2118297283344703, + "language_loss": 0.81908619, + "learning_rate": 2.496949724407266e-06, + "loss": 0.84016591, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 7276, + "time_per_iteration": 2.3761255741119385 + }, + { + "auxiliary_loss_clip": 0.01063986, + "auxiliary_loss_mlp": 0.01050863, + "balance_loss_clip": 1.01895118, + "balance_loss_mlp": 1.019315, + "epoch": 0.4375169096648129, + "flos": 30586324152960.0, + "grad_norm": 2.411715766094461, + "language_loss": 0.74420345, + "learning_rate": 2.496572467468988e-06, + "loss": 0.76535201, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44726562, + "step": 7277, + "time_per_iteration": 2.4802350997924805 + }, + { + "auxiliary_loss_clip": 0.01058731, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_clip": 1.01830697, + "balance_loss_mlp": 1.01705575, + "epoch": 0.43757703291748085, + "flos": 30554273658240.0, + "grad_norm": 2.5585596810012765, + "language_loss": 0.74098837, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.76204681, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41601562, + "step": 7278, + "time_per_iteration": 2.4695401191711426 + }, + { + "auxiliary_loss_clip": 0.01057975, + "auxiliary_loss_mlp": 0.01049646, + "balance_loss_clip": 1.0235033, + "balance_loss_mlp": 1.01795959, + "epoch": 0.4376371561701488, + "flos": 21396311124480.0, + "grad_norm": 1.603053283743268, + "language_loss": 0.67597657, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.69705278, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 7279, + "time_per_iteration": 2.399232864379883 + }, + { + "auxiliary_loss_clip": 0.01063974, + "auxiliary_loss_mlp": 0.01055379, + "balance_loss_clip": 1.02582741, + "balance_loss_mlp": 1.02049923, + "epoch": 0.4376972794228168, + "flos": 23403258385920.0, + "grad_norm": 1.8943765127530712, + "language_loss": 0.83247763, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.85367119, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 7280, + "time_per_iteration": 2.4058315753936768 + }, + { + "auxiliary_loss_clip": 0.01055755, + "auxiliary_loss_mlp": 0.01045938, + "balance_loss_clip": 1.01939011, + "balance_loss_mlp": 1.01612926, + "epoch": 0.43775740267548474, + "flos": 22891850657280.0, + "grad_norm": 1.5736777842160419, + "language_loss": 0.78024936, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.80126625, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 7281, + "time_per_iteration": 2.3885397911071777 + }, + { + "auxiliary_loss_clip": 0.01058442, + "auxiliary_loss_mlp": 0.01041627, + "balance_loss_clip": 1.01521039, + "balance_loss_mlp": 1.01738262, + "epoch": 0.4378175259281527, + "flos": 23293282003200.0, + "grad_norm": 1.9251414261803406, + "language_loss": 0.77112889, + "learning_rate": 2.494685900612569e-06, + "loss": 0.79212958, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41015625, + "step": 7282, + "time_per_iteration": 2.379964828491211 + }, + { + "auxiliary_loss_clip": 0.01061244, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.01677895, + "balance_loss_mlp": 1.01918983, + "epoch": 0.43787764918082067, + "flos": 23875807904640.0, + "grad_norm": 1.7869918120781367, + "language_loss": 0.86286801, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.88394988, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.41992188, + "step": 7283, + "time_per_iteration": 2.4463982582092285 + }, + { + "auxiliary_loss_clip": 0.010624, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_clip": 1.01334596, + "balance_loss_mlp": 1.01871395, + "epoch": 0.43793777243348864, + "flos": 23987006184960.0, + "grad_norm": 1.7024830483412288, + "language_loss": 0.81518173, + "learning_rate": 2.49393114246007e-06, + "loss": 0.83624732, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43554688, + "step": 7284, + "time_per_iteration": 2.393005132675171 + }, + { + "auxiliary_loss_clip": 0.01059121, + "auxiliary_loss_mlp": 0.01050588, + "balance_loss_clip": 1.0230031, + "balance_loss_mlp": 1.01762199, + "epoch": 0.4379978956861566, + "flos": 18623090142720.0, + "grad_norm": 2.18000639063845, + "language_loss": 0.81613672, + "learning_rate": 2.493553735281787e-06, + "loss": 0.83723378, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.4140625, + "step": 7285, + "time_per_iteration": 2.4198246002197266 + }, + { + "auxiliary_loss_clip": 0.01059808, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_clip": 1.01298428, + "balance_loss_mlp": 1.01850593, + "epoch": 0.43805801893882457, + "flos": 21980303303040.0, + "grad_norm": 2.0600247713932007, + "language_loss": 0.75408256, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77510154, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41210938, + "step": 7286, + "time_per_iteration": 2.375105381011963 + }, + { + "auxiliary_loss_clip": 0.01060861, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.01768601, + "balance_loss_mlp": 1.01805401, + "epoch": 0.43811814219149253, + "flos": 26392207858560.0, + "grad_norm": 2.193043740158527, + "language_loss": 0.74713695, + "learning_rate": 2.492798864792712e-06, + "loss": 0.7682178, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 7287, + "time_per_iteration": 2.442220687866211 + }, + { + "auxiliary_loss_clip": 0.01062136, + "auxiliary_loss_mlp": 0.01053082, + "balance_loss_clip": 1.02330399, + "balance_loss_mlp": 1.01982665, + "epoch": 0.43817826544416055, + "flos": 17492358072960.0, + "grad_norm": 1.7552469550869914, + "language_loss": 0.84033799, + "learning_rate": 2.492421401510545e-06, + "loss": 0.86149019, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 7288, + "time_per_iteration": 2.370328903198242 + }, + { + "auxiliary_loss_clip": 0.01062616, + "auxiliary_loss_mlp": 0.01046688, + "balance_loss_clip": 1.01676702, + "balance_loss_mlp": 1.01872134, + "epoch": 0.4382383886968285, + "flos": 21579919297920.0, + "grad_norm": 1.3935986503864533, + "language_loss": 0.84869266, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86978567, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43945312, + "step": 7289, + "time_per_iteration": 2.4336354732513428 + }, + { + "auxiliary_loss_clip": 0.01063014, + "auxiliary_loss_mlp": 0.01048464, + "balance_loss_clip": 1.01870966, + "balance_loss_mlp": 1.01828504, + "epoch": 0.4382985119494965, + "flos": 27922625706240.0, + "grad_norm": 2.1227753501590816, + "language_loss": 0.79161209, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.81272686, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44726562, + "step": 7290, + "time_per_iteration": 2.4175989627838135 + }, + { + "auxiliary_loss_clip": 0.01058603, + "auxiliary_loss_mlp": 0.01044435, + "balance_loss_clip": 1.0162065, + "balance_loss_mlp": 1.01783335, + "epoch": 0.43835863520216445, + "flos": 24935666181120.0, + "grad_norm": 2.1057694776811986, + "language_loss": 0.79284644, + "learning_rate": 2.491288899685288e-06, + "loss": 0.81387681, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 7291, + "time_per_iteration": 2.4486818313598633 + }, + { + "auxiliary_loss_clip": 0.01060667, + "auxiliary_loss_mlp": 0.01042758, + "balance_loss_clip": 1.01494634, + "balance_loss_mlp": 1.01905727, + "epoch": 0.4384187584548324, + "flos": 33508903397760.0, + "grad_norm": 1.7019848218104956, + "language_loss": 0.66320062, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.68423486, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41601562, + "step": 7292, + "time_per_iteration": 2.4842960834503174 + }, + { + "auxiliary_loss_clip": 0.01060596, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.01150262, + "balance_loss_mlp": 1.01874924, + "epoch": 0.4384788817075004, + "flos": 23949928454400.0, + "grad_norm": 1.8268222825076799, + "language_loss": 0.75726479, + "learning_rate": 2.49053380529597e-06, + "loss": 0.77825999, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41796875, + "step": 7293, + "time_per_iteration": 2.435281276702881 + }, + { + "auxiliary_loss_clip": 0.01060472, + "auxiliary_loss_mlp": 0.01045556, + "balance_loss_clip": 1.01464498, + "balance_loss_mlp": 1.01798761, + "epoch": 0.43853900496016834, + "flos": 19097524874880.0, + "grad_norm": 1.8274360074500626, + "language_loss": 0.8059845, + "learning_rate": 2.490156230192516e-06, + "loss": 0.82704473, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.42578125, + "step": 7294, + "time_per_iteration": 2.3682472705841064 + }, + { + "auxiliary_loss_clip": 0.0106281, + "auxiliary_loss_mlp": 0.01046122, + "balance_loss_clip": 1.01798916, + "balance_loss_mlp": 1.02033865, + "epoch": 0.4385991282128363, + "flos": 13224505253760.0, + "grad_norm": 1.8557812731595573, + "language_loss": 0.74372649, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.76481581, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42382812, + "step": 7295, + "time_per_iteration": 3.766171455383301 + }, + { + "auxiliary_loss_clip": 0.0106277, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.01577628, + "balance_loss_mlp": 1.01941538, + "epoch": 0.4386592514655043, + "flos": 14318997465600.0, + "grad_norm": 1.6637092409755259, + "language_loss": 0.76261288, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.78371161, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43359375, + "step": 7296, + "time_per_iteration": 5.186948776245117 + }, + { + "auxiliary_loss_clip": 0.01060549, + "auxiliary_loss_mlp": 0.01046952, + "balance_loss_clip": 1.01765037, + "balance_loss_mlp": 1.01831114, + "epoch": 0.43871937471817224, + "flos": 22783305640320.0, + "grad_norm": 2.1633159758761944, + "language_loss": 0.70417559, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.7252506, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 7297, + "time_per_iteration": 2.4623565673828125 + }, + { + "auxiliary_loss_clip": 0.01059312, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.01174498, + "balance_loss_mlp": 1.01789522, + "epoch": 0.4387794979708402, + "flos": 28071111185280.0, + "grad_norm": 1.5128305863123008, + "language_loss": 0.71367806, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.73466176, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7298, + "time_per_iteration": 2.4492974281311035 + }, + { + "auxiliary_loss_clip": 0.01059407, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.01466656, + "balance_loss_mlp": 1.01866531, + "epoch": 0.43883962122350817, + "flos": 26248365590400.0, + "grad_norm": 2.157175793690281, + "language_loss": 0.73466778, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.75570631, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40820312, + "step": 7299, + "time_per_iteration": 2.44999098777771 + }, + { + "auxiliary_loss_clip": 0.01064173, + "auxiliary_loss_mlp": 0.01048258, + "balance_loss_clip": 1.01392543, + "balance_loss_mlp": 1.01941276, + "epoch": 0.43889974447617613, + "flos": 25882615520640.0, + "grad_norm": 1.9281355343772062, + "language_loss": 0.78611612, + "learning_rate": 2.487890389750719e-06, + "loss": 0.80724043, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.44726562, + "step": 7300, + "time_per_iteration": 2.4579555988311768 + }, + { + "auxiliary_loss_clip": 0.01061881, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.01779127, + "balance_loss_mlp": 1.01855087, + "epoch": 0.43895986772884416, + "flos": 25045433095680.0, + "grad_norm": 1.893186658732785, + "language_loss": 0.73207045, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.75318402, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43359375, + "step": 7301, + "time_per_iteration": 3.858041524887085 + }, + { + "auxiliary_loss_clip": 0.01064124, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.01441419, + "balance_loss_mlp": 1.01956987, + "epoch": 0.4390199909815121, + "flos": 25993394864640.0, + "grad_norm": 2.15830940212214, + "language_loss": 0.72258341, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.74369776, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4453125, + "step": 7302, + "time_per_iteration": 2.4330086708068848 + }, + { + "auxiliary_loss_clip": 0.01062367, + "auxiliary_loss_mlp": 0.01047285, + "balance_loss_clip": 1.0174228, + "balance_loss_mlp": 1.01902533, + "epoch": 0.4390801142341801, + "flos": 29020993079040.0, + "grad_norm": 1.777201791060803, + "language_loss": 0.83281636, + "learning_rate": 2.486757219574983e-06, + "loss": 0.85391283, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 7303, + "time_per_iteration": 2.476213216781616 + }, + { + "auxiliary_loss_clip": 0.01066237, + "auxiliary_loss_mlp": 0.01054685, + "balance_loss_clip": 1.02125835, + "balance_loss_mlp": 1.01949286, + "epoch": 0.43914023748684805, + "flos": 33437121909120.0, + "grad_norm": 1.8658270369586951, + "language_loss": 0.70212865, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.72333789, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46679688, + "step": 7304, + "time_per_iteration": 2.4966511726379395 + }, + { + "auxiliary_loss_clip": 0.01059228, + "auxiliary_loss_mlp": 0.01048494, + "balance_loss_clip": 1.02070618, + "balance_loss_mlp": 1.01787901, + "epoch": 0.439200360739516, + "flos": 34530427134720.0, + "grad_norm": 1.533887665136781, + "language_loss": 0.79193187, + "learning_rate": 2.486001680477873e-06, + "loss": 0.81300908, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 7305, + "time_per_iteration": 2.491532564163208 + }, + { + "auxiliary_loss_clip": 0.01061454, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.02385926, + "balance_loss_mlp": 1.01898372, + "epoch": 0.439260483992184, + "flos": 21906776246400.0, + "grad_norm": 1.674203335379242, + "language_loss": 0.70174617, + "learning_rate": 2.485623883278308e-06, + "loss": 0.72291613, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42578125, + "step": 7306, + "time_per_iteration": 2.386427640914917 + }, + { + "auxiliary_loss_clip": 0.01062823, + "auxiliary_loss_mlp": 0.01044375, + "balance_loss_clip": 1.01373804, + "balance_loss_mlp": 1.01957369, + "epoch": 0.43932060724485195, + "flos": 20995368537600.0, + "grad_norm": 1.652152664364772, + "language_loss": 0.64685053, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.66792256, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43359375, + "step": 7307, + "time_per_iteration": 2.4130983352661133 + }, + { + "auxiliary_loss_clip": 0.0106559, + "auxiliary_loss_mlp": 0.01044963, + "balance_loss_clip": 1.01438546, + "balance_loss_mlp": 1.02007794, + "epoch": 0.4393807304975199, + "flos": 17746141812480.0, + "grad_norm": 2.276553847959972, + "language_loss": 0.73295546, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.75406098, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.45507812, + "step": 7308, + "time_per_iteration": 2.341179609298706 + }, + { + "auxiliary_loss_clip": 0.01063698, + "auxiliary_loss_mlp": 0.01051046, + "balance_loss_clip": 1.01847792, + "balance_loss_mlp": 1.01827371, + "epoch": 0.4394408537501879, + "flos": 22527427219200.0, + "grad_norm": 8.952840129246773, + "language_loss": 0.77506065, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79620802, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.453125, + "step": 7309, + "time_per_iteration": 2.4719150066375732 + }, + { + "auxiliary_loss_clip": 0.01058909, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.01495266, + "balance_loss_mlp": 1.01835704, + "epoch": 0.43950097700285584, + "flos": 23439533155200.0, + "grad_norm": 1.6806576653859169, + "language_loss": 0.7269522, + "learning_rate": 2.484112510474251e-06, + "loss": 0.74795681, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40625, + "step": 7310, + "time_per_iteration": 2.400636911392212 + }, + { + "auxiliary_loss_clip": 0.01061952, + "auxiliary_loss_mlp": 0.01049514, + "balance_loss_clip": 1.01810265, + "balance_loss_mlp": 1.01891768, + "epoch": 0.4395611002555238, + "flos": 23179709750400.0, + "grad_norm": 1.8975296324059103, + "language_loss": 0.77321887, + "learning_rate": 2.483734621343429e-06, + "loss": 0.79433352, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4296875, + "step": 7311, + "time_per_iteration": 2.427537441253662 + }, + { + "auxiliary_loss_clip": 0.01062786, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.0146333, + "balance_loss_mlp": 1.01949108, + "epoch": 0.43962122350819177, + "flos": 22126275164160.0, + "grad_norm": 2.260130338078128, + "language_loss": 0.82983363, + "learning_rate": 2.483356713869341e-06, + "loss": 0.85091662, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43164062, + "step": 7312, + "time_per_iteration": 2.3986408710479736 + }, + { + "auxiliary_loss_clip": 0.01060939, + "auxiliary_loss_mlp": 0.01039506, + "balance_loss_clip": 1.0100373, + "balance_loss_mlp": 1.01815271, + "epoch": 0.43968134676085974, + "flos": 17419599066240.0, + "grad_norm": 1.8122897199524053, + "language_loss": 0.87158227, + "learning_rate": 2.482978788066318e-06, + "loss": 0.89258671, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42773438, + "step": 7313, + "time_per_iteration": 2.3987982273101807 + }, + { + "auxiliary_loss_clip": 0.0106187, + "auxiliary_loss_mlp": 0.01044383, + "balance_loss_clip": 1.01436567, + "balance_loss_mlp": 1.01843023, + "epoch": 0.43974147001352776, + "flos": 18951657747840.0, + "grad_norm": 1.9158231644092658, + "language_loss": 0.68759257, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.70865512, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43554688, + "step": 7314, + "time_per_iteration": 2.357686996459961 + }, + { + "auxiliary_loss_clip": 0.01064398, + "auxiliary_loss_mlp": 0.01052042, + "balance_loss_clip": 1.01999843, + "balance_loss_mlp": 1.02096105, + "epoch": 0.4398015932661957, + "flos": 18952495620480.0, + "grad_norm": 1.8207796307461122, + "language_loss": 0.77882272, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79998714, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.43359375, + "step": 7315, + "time_per_iteration": 2.3925325870513916 + }, + { + "auxiliary_loss_clip": 0.01060088, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.01384044, + "balance_loss_mlp": 1.01868296, + "epoch": 0.4398617165188637, + "flos": 24198964248960.0, + "grad_norm": 2.7186508942090275, + "language_loss": 0.76055741, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.78155667, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.4140625, + "step": 7316, + "time_per_iteration": 2.4064395427703857 + }, + { + "auxiliary_loss_clip": 0.01062092, + "auxiliary_loss_mlp": 0.01043289, + "balance_loss_clip": 1.01446414, + "balance_loss_mlp": 1.01988757, + "epoch": 0.43992183977153165, + "flos": 22235588231040.0, + "grad_norm": 2.784521056576409, + "language_loss": 0.66906399, + "learning_rate": 2.481466901851506e-06, + "loss": 0.69011784, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 7317, + "time_per_iteration": 2.4330952167510986 + }, + { + "auxiliary_loss_clip": 0.01064463, + "auxiliary_loss_mlp": 0.01047617, + "balance_loss_clip": 1.01823235, + "balance_loss_mlp": 1.02083564, + "epoch": 0.4399819630241996, + "flos": 18696477553920.0, + "grad_norm": 1.7363703215128456, + "language_loss": 0.80803013, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.82915097, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 7318, + "time_per_iteration": 2.3593099117279053 + }, + { + "auxiliary_loss_clip": 0.01064284, + "auxiliary_loss_mlp": 0.01047454, + "balance_loss_clip": 1.01796126, + "balance_loss_mlp": 1.01984119, + "epoch": 0.4400420862768676, + "flos": 23878216788480.0, + "grad_norm": 1.7009335364060372, + "language_loss": 0.81300569, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.83412302, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4453125, + "step": 7319, + "time_per_iteration": 2.43990421295166 + }, + { + "auxiliary_loss_clip": 0.01063863, + "auxiliary_loss_mlp": 0.01044079, + "balance_loss_clip": 1.01511168, + "balance_loss_mlp": 1.02011383, + "epoch": 0.44010220952953555, + "flos": 28036372515840.0, + "grad_norm": 1.6103783448450437, + "language_loss": 0.80287182, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82395124, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4375, + "step": 7320, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01062322, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_clip": 1.01686788, + "balance_loss_mlp": 1.02007961, + "epoch": 0.4401623327822035, + "flos": 23767856380800.0, + "grad_norm": 1.4526986808522624, + "language_loss": 0.71004057, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.73110551, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.421875, + "step": 7321, + "time_per_iteration": 2.4319217205047607 + }, + { + "auxiliary_loss_clip": 0.01014292, + "auxiliary_loss_mlp": 0.01005455, + "balance_loss_clip": 1.00233173, + "balance_loss_mlp": 1.00562501, + "epoch": 0.4402224560348715, + "flos": 70770793795200.0, + "grad_norm": 0.8794083617603583, + "language_loss": 0.56999892, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59019637, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.03125, + "router_z_loss_mlp": 0.08691406, + "step": 7322, + "time_per_iteration": 3.1158978939056396 + }, + { + "auxiliary_loss_clip": 0.01061648, + "auxiliary_loss_mlp": 0.01041649, + "balance_loss_clip": 1.01488709, + "balance_loss_mlp": 1.0194962, + "epoch": 0.44028257928753944, + "flos": 22890733493760.0, + "grad_norm": 1.4309498576008288, + "language_loss": 0.77373135, + "learning_rate": 2.479198525097822e-06, + "loss": 0.79476428, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.421875, + "step": 7323, + "time_per_iteration": 2.4308297634124756 + }, + { + "auxiliary_loss_clip": 0.01061381, + "auxiliary_loss_mlp": 0.0105159, + "balance_loss_clip": 1.02009487, + "balance_loss_mlp": 1.01864946, + "epoch": 0.4403427025402074, + "flos": 17894766936960.0, + "grad_norm": 2.171901754457414, + "language_loss": 0.81565857, + "learning_rate": 2.478820398622511e-06, + "loss": 0.83678836, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.42773438, + "step": 7324, + "time_per_iteration": 2.352919816970825 + }, + { + "auxiliary_loss_clip": 0.01013656, + "auxiliary_loss_mlp": 0.01014096, + "balance_loss_clip": 1.01094854, + "balance_loss_mlp": 1.00473452, + "epoch": 0.4404028257928754, + "flos": 69558993815040.0, + "grad_norm": 0.6764434934057291, + "language_loss": 0.54537606, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56565356, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.08935547, + "step": 7325, + "time_per_iteration": 3.0427229404449463 + }, + { + "auxiliary_loss_clip": 0.01060683, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.01380444, + "balance_loss_mlp": 1.01963806, + "epoch": 0.44046294904554334, + "flos": 20922609530880.0, + "grad_norm": 1.4879926269902644, + "language_loss": 0.70613664, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.72714567, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41015625, + "step": 7326, + "time_per_iteration": 2.3984360694885254 + }, + { + "auxiliary_loss_clip": 0.01059514, + "auxiliary_loss_mlp": 0.01043568, + "balance_loss_clip": 1.01558912, + "balance_loss_mlp": 1.01823974, + "epoch": 0.44052307229821136, + "flos": 23622338367360.0, + "grad_norm": 1.4620135726747976, + "language_loss": 0.77451193, + "learning_rate": 2.477685910312432e-06, + "loss": 0.79554272, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41210938, + "step": 7327, + "time_per_iteration": 2.4216556549072266 + }, + { + "auxiliary_loss_clip": 0.01059044, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.01297712, + "balance_loss_mlp": 1.01744676, + "epoch": 0.4405831955508793, + "flos": 17596853372160.0, + "grad_norm": 2.2507353585766054, + "language_loss": 0.85296571, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.87396693, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41601562, + "step": 7328, + "time_per_iteration": 2.3294596672058105 + }, + { + "auxiliary_loss_clip": 0.0105934, + "auxiliary_loss_mlp": 0.01044639, + "balance_loss_clip": 1.01760232, + "balance_loss_mlp": 1.01749432, + "epoch": 0.4406433188035473, + "flos": 21462506795520.0, + "grad_norm": 1.8601428665869066, + "language_loss": 0.78443682, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80547655, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 7329, + "time_per_iteration": 2.4241573810577393 + }, + { + "auxiliary_loss_clip": 0.01063714, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.01241195, + "balance_loss_mlp": 1.01930189, + "epoch": 0.44070344205621526, + "flos": 22672491384960.0, + "grad_norm": 1.4837961079016122, + "language_loss": 0.74657083, + "learning_rate": 2.476551258977278e-06, + "loss": 0.76763701, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4453125, + "step": 7330, + "time_per_iteration": 2.3878095149993896 + }, + { + "auxiliary_loss_clip": 0.01061405, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_clip": 1.01782179, + "balance_loss_mlp": 1.01933599, + "epoch": 0.4407635653088832, + "flos": 23440056825600.0, + "grad_norm": 1.8981351777463733, + "language_loss": 0.75422025, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77530235, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41992188, + "step": 7331, + "time_per_iteration": 2.447476863861084 + }, + { + "auxiliary_loss_clip": 0.0105809, + "auxiliary_loss_mlp": 0.01042159, + "balance_loss_clip": 1.01422882, + "balance_loss_mlp": 1.01781368, + "epoch": 0.4408236885615512, + "flos": 24020243665920.0, + "grad_norm": 1.359323039423722, + "language_loss": 0.77433515, + "learning_rate": 2.475794734375581e-06, + "loss": 0.79533762, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40234375, + "step": 7332, + "time_per_iteration": 2.4144577980041504 + }, + { + "auxiliary_loss_clip": 0.01060155, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02074742, + "balance_loss_mlp": 1.01889884, + "epoch": 0.44088381181421915, + "flos": 12676019794560.0, + "grad_norm": 1.6650089871565255, + "language_loss": 0.75312299, + "learning_rate": 2.475416445004285e-06, + "loss": 0.77421266, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 7333, + "time_per_iteration": 2.4148573875427246 + }, + { + "auxiliary_loss_clip": 0.01058481, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.01257634, + "balance_loss_mlp": 1.01890635, + "epoch": 0.4409439350668871, + "flos": 24568764036480.0, + "grad_norm": 1.7876222631169818, + "language_loss": 0.81414127, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.83514041, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.39453125, + "step": 7334, + "time_per_iteration": 3.8585569858551025 + }, + { + "auxiliary_loss_clip": 0.01067814, + "auxiliary_loss_mlp": 0.01053851, + "balance_loss_clip": 1.01796842, + "balance_loss_mlp": 1.02036905, + "epoch": 0.4410040583195551, + "flos": 22667638705920.0, + "grad_norm": 2.089981064346398, + "language_loss": 0.77407479, + "learning_rate": 2.47465981219252e-06, + "loss": 0.79529154, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.47460938, + "step": 7335, + "time_per_iteration": 3.870195150375366 + }, + { + "auxiliary_loss_clip": 0.01062123, + "auxiliary_loss_mlp": 0.01052869, + "balance_loss_clip": 1.02188611, + "balance_loss_mlp": 1.0193125, + "epoch": 0.44106418157222305, + "flos": 10851773011200.0, + "grad_norm": 1.9683759228334394, + "language_loss": 0.73479533, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.75594521, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.42773438, + "step": 7336, + "time_per_iteration": 3.7258658409118652 + }, + { + "auxiliary_loss_clip": 0.01063475, + "auxiliary_loss_mlp": 0.01057381, + "balance_loss_clip": 1.02465832, + "balance_loss_mlp": 1.0188024, + "epoch": 0.441124304824891, + "flos": 21725611868160.0, + "grad_norm": 2.486735629668177, + "language_loss": 0.65754402, + "learning_rate": 2.473903107384165e-06, + "loss": 0.67875254, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44726562, + "step": 7337, + "time_per_iteration": 2.3925702571868896 + }, + { + "auxiliary_loss_clip": 0.01015682, + "auxiliary_loss_mlp": 0.01010602, + "balance_loss_clip": 1.00750268, + "balance_loss_mlp": 1.00650883, + "epoch": 0.441184428077559, + "flos": 63216950722560.0, + "grad_norm": 0.7526140761932489, + "language_loss": 0.52748525, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54774809, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.03088379, + "router_z_loss_mlp": 0.09179688, + "step": 7338, + "time_per_iteration": 3.059663772583008 + }, + { + "auxiliary_loss_clip": 0.01064901, + "auxiliary_loss_mlp": 0.01055744, + "balance_loss_clip": 1.02117348, + "balance_loss_mlp": 1.01876521, + "epoch": 0.44124455133022694, + "flos": 21176916940800.0, + "grad_norm": 1.9110692821300688, + "language_loss": 0.72349161, + "learning_rate": 2.473146330693997e-06, + "loss": 0.74469805, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.4609375, + "step": 7339, + "time_per_iteration": 2.3849451541900635 + }, + { + "auxiliary_loss_clip": 0.01060784, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.0206939, + "balance_loss_mlp": 1.02078032, + "epoch": 0.4413046745828949, + "flos": 17456886264960.0, + "grad_norm": 1.4442040090961115, + "language_loss": 0.70734489, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72842336, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40039062, + "step": 7340, + "time_per_iteration": 2.422581195831299 + }, + { + "auxiliary_loss_clip": 0.0101631, + "auxiliary_loss_mlp": 0.01007701, + "balance_loss_clip": 1.00476873, + "balance_loss_mlp": 1.00728893, + "epoch": 0.4413647978355629, + "flos": 61583470807680.0, + "grad_norm": 0.911749609981512, + "language_loss": 0.64028013, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66052026, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.02929688, + "router_z_loss_mlp": 0.09033203, + "step": 7341, + "time_per_iteration": 4.282296657562256 + }, + { + "auxiliary_loss_clip": 0.01062079, + "auxiliary_loss_mlp": 0.01052875, + "balance_loss_clip": 1.02340662, + "balance_loss_mlp": 1.01968455, + "epoch": 0.4414249210882309, + "flos": 27525767748480.0, + "grad_norm": 2.0064868041555384, + "language_loss": 0.75096887, + "learning_rate": 2.47201103113145e-06, + "loss": 0.77211845, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42382812, + "step": 7342, + "time_per_iteration": 2.4320425987243652 + }, + { + "auxiliary_loss_clip": 0.01060122, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.01927066, + "balance_loss_mlp": 1.01759934, + "epoch": 0.44148504434089886, + "flos": 23512850743680.0, + "grad_norm": 1.6877123454569463, + "language_loss": 0.81236267, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.83346397, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.42578125, + "step": 7343, + "time_per_iteration": 2.4582061767578125 + }, + { + "auxiliary_loss_clip": 0.01059931, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.01637459, + "balance_loss_mlp": 1.01812673, + "epoch": 0.4415451675935668, + "flos": 21579500361600.0, + "grad_norm": 1.569953266894175, + "language_loss": 0.77560043, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.79664981, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7344, + "time_per_iteration": 2.406813621520996 + }, + { + "auxiliary_loss_clip": 0.01014775, + "auxiliary_loss_mlp": 0.01007858, + "balance_loss_clip": 1.00486541, + "balance_loss_mlp": 1.00593352, + "epoch": 0.4416052908462348, + "flos": 59003458623360.0, + "grad_norm": 0.8037914184343714, + "language_loss": 0.6386956, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65892196, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.02990723, + "router_z_loss_mlp": 0.08886719, + "step": 7345, + "time_per_iteration": 2.812398910522461 + }, + { + "auxiliary_loss_clip": 0.01062908, + "auxiliary_loss_mlp": 0.01051137, + "balance_loss_clip": 1.02035761, + "balance_loss_mlp": 1.01935863, + "epoch": 0.44166541409890275, + "flos": 26356491671040.0, + "grad_norm": 1.6660603291529221, + "language_loss": 0.86785954, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88899994, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43554688, + "step": 7346, + "time_per_iteration": 2.417098045349121 + }, + { + "auxiliary_loss_clip": 0.01062412, + "auxiliary_loss_mlp": 0.01061264, + "balance_loss_clip": 1.02950644, + "balance_loss_mlp": 1.01974368, + "epoch": 0.4417255373515707, + "flos": 20191667973120.0, + "grad_norm": 1.6509689029097487, + "language_loss": 0.81412232, + "learning_rate": 2.470118507411128e-06, + "loss": 0.8353591, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42578125, + "step": 7347, + "time_per_iteration": 2.465620517730713 + }, + { + "auxiliary_loss_clip": 0.01062283, + "auxiliary_loss_mlp": 0.01062751, + "balance_loss_clip": 1.02999198, + "balance_loss_mlp": 1.01926613, + "epoch": 0.4417856606042387, + "flos": 17887121349120.0, + "grad_norm": 1.9092763332397331, + "language_loss": 0.84725559, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.86850584, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4296875, + "step": 7348, + "time_per_iteration": 2.485146999359131 + }, + { + "auxiliary_loss_clip": 0.01062859, + "auxiliary_loss_mlp": 0.01051407, + "balance_loss_clip": 1.02211714, + "balance_loss_mlp": 1.01907039, + "epoch": 0.44184578385690665, + "flos": 27962810547840.0, + "grad_norm": 1.7556654308338602, + "language_loss": 0.7311331, + "learning_rate": 2.469361373033938e-06, + "loss": 0.75227571, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 7349, + "time_per_iteration": 2.525250196456909 + }, + { + "auxiliary_loss_clip": 0.01062412, + "auxiliary_loss_mlp": 0.01053862, + "balance_loss_clip": 1.02086473, + "balance_loss_mlp": 1.01888788, + "epoch": 0.4419059071095746, + "flos": 23366774148480.0, + "grad_norm": 1.7278907622273492, + "language_loss": 0.75466394, + "learning_rate": 2.468982779140819e-06, + "loss": 0.77582669, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.43554688, + "step": 7350, + "time_per_iteration": 2.3771934509277344 + }, + { + "auxiliary_loss_clip": 0.01061467, + "auxiliary_loss_mlp": 0.01059402, + "balance_loss_clip": 1.02847934, + "balance_loss_mlp": 1.01827776, + "epoch": 0.4419660303622426, + "flos": 15011290281600.0, + "grad_norm": 2.734310648942757, + "language_loss": 0.8342129, + "learning_rate": 2.468604167463827e-06, + "loss": 0.85542166, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43164062, + "step": 7351, + "time_per_iteration": 2.428382396697998 + }, + { + "auxiliary_loss_clip": 0.01058465, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02316618, + "balance_loss_mlp": 1.01837564, + "epoch": 0.44202615361491054, + "flos": 25370649210240.0, + "grad_norm": 1.437596158834641, + "language_loss": 0.7388941, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75996542, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40039062, + "step": 7352, + "time_per_iteration": 2.4401159286499023 + }, + { + "auxiliary_loss_clip": 0.01059659, + "auxiliary_loss_mlp": 0.01050322, + "balance_loss_clip": 1.02173555, + "balance_loss_mlp": 1.01816583, + "epoch": 0.4420862768675785, + "flos": 24679962316800.0, + "grad_norm": 1.6983210640906514, + "language_loss": 0.88690233, + "learning_rate": 2.467846890815649e-06, + "loss": 0.90800214, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 7353, + "time_per_iteration": 2.494453191757202 + }, + { + "auxiliary_loss_clip": 0.01064295, + "auxiliary_loss_mlp": 0.01050626, + "balance_loss_clip": 1.02201629, + "balance_loss_mlp": 1.02082181, + "epoch": 0.44214640012024653, + "flos": 19527655224960.0, + "grad_norm": 2.2693578461490787, + "language_loss": 0.77128625, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.79243541, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43554688, + "step": 7354, + "time_per_iteration": 2.380127191543579 + }, + { + "auxiliary_loss_clip": 0.01058582, + "auxiliary_loss_mlp": 0.01048298, + "balance_loss_clip": 1.0205816, + "balance_loss_mlp": 1.01829302, + "epoch": 0.4422065233729145, + "flos": 47555649014400.0, + "grad_norm": 2.1187763497857475, + "language_loss": 0.65882498, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67989373, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40234375, + "step": 7355, + "time_per_iteration": 2.6712300777435303 + }, + { + "auxiliary_loss_clip": 0.01065068, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.01973593, + "balance_loss_mlp": 1.01985526, + "epoch": 0.44226664662558246, + "flos": 19280050796160.0, + "grad_norm": 1.894133097203628, + "language_loss": 0.79750907, + "learning_rate": 2.466710842823274e-06, + "loss": 0.81866527, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.453125, + "step": 7356, + "time_per_iteration": 2.393824577331543 + }, + { + "auxiliary_loss_clip": 0.01064556, + "auxiliary_loss_mlp": 0.01052989, + "balance_loss_clip": 1.02347243, + "balance_loss_mlp": 1.02060246, + "epoch": 0.4423267698782504, + "flos": 17820855855360.0, + "grad_norm": 1.7472717161802904, + "language_loss": 0.78438509, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.80556059, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43945312, + "step": 7357, + "time_per_iteration": 2.4396588802337646 + }, + { + "auxiliary_loss_clip": 0.01061957, + "auxiliary_loss_mlp": 0.01040502, + "balance_loss_clip": 1.01239192, + "balance_loss_mlp": 1.02046371, + "epoch": 0.4423868931309184, + "flos": 29203169886720.0, + "grad_norm": 1.5635766792460084, + "language_loss": 0.74247235, + "learning_rate": 2.465953388982481e-06, + "loss": 0.76349694, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 7358, + "time_per_iteration": 2.459625244140625 + }, + { + "auxiliary_loss_clip": 0.01067013, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.01029003, + "balance_loss_mlp": 1.02352333, + "epoch": 0.44244701638358636, + "flos": 29711924352000.0, + "grad_norm": 1.8976403948468035, + "language_loss": 0.76914799, + "learning_rate": 2.465574635551405e-06, + "loss": 0.79020542, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 7359, + "time_per_iteration": 2.5101490020751953 + }, + { + "auxiliary_loss_clip": 0.01066191, + "auxiliary_loss_mlp": 0.01044259, + "balance_loss_clip": 1.01526725, + "balance_loss_mlp": 1.02272367, + "epoch": 0.4425071396362543, + "flos": 22928928387840.0, + "grad_norm": 1.723749922386189, + "language_loss": 0.71154416, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.73264867, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 7360, + "time_per_iteration": 2.4012649059295654 + }, + { + "auxiliary_loss_clip": 0.01065704, + "auxiliary_loss_mlp": 0.01044257, + "balance_loss_clip": 1.01569426, + "balance_loss_mlp": 1.02263474, + "epoch": 0.4425672628889223, + "flos": 19791318879360.0, + "grad_norm": 3.01416642953967, + "language_loss": 0.70705867, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.7281583, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 7361, + "time_per_iteration": 2.419052839279175 + }, + { + "auxiliary_loss_clip": 0.01065608, + "auxiliary_loss_mlp": 0.01049454, + "balance_loss_clip": 1.01642084, + "balance_loss_mlp": 1.02130497, + "epoch": 0.44262738614159025, + "flos": 13661373496320.0, + "grad_norm": 1.9578112824084006, + "language_loss": 0.83736676, + "learning_rate": 2.464438269387809e-06, + "loss": 0.85851741, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44335938, + "step": 7362, + "time_per_iteration": 2.36337947845459 + }, + { + "auxiliary_loss_clip": 0.01068803, + "auxiliary_loss_mlp": 0.01050443, + "balance_loss_clip": 1.01625323, + "balance_loss_mlp": 1.02229428, + "epoch": 0.4426875093942582, + "flos": 14209335285120.0, + "grad_norm": 1.7067309282926402, + "language_loss": 0.75399995, + "learning_rate": 2.464059445424366e-06, + "loss": 0.77519232, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.46484375, + "step": 7363, + "time_per_iteration": 2.3842103481292725 + }, + { + "auxiliary_loss_clip": 0.01015649, + "auxiliary_loss_mlp": 0.01015515, + "balance_loss_clip": 1.01227236, + "balance_loss_mlp": 1.00637484, + "epoch": 0.4427476326469262, + "flos": 70113763319040.0, + "grad_norm": 0.6836030913050111, + "language_loss": 0.55691677, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57722843, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.09277344, + "step": 7364, + "time_per_iteration": 3.1010994911193848 + }, + { + "auxiliary_loss_clip": 0.01060932, + "auxiliary_loss_mlp": 0.01044304, + "balance_loss_clip": 1.01544309, + "balance_loss_mlp": 1.0191493, + "epoch": 0.44280775589959415, + "flos": 25443966798720.0, + "grad_norm": 1.5942693731321327, + "language_loss": 0.75484025, + "learning_rate": 2.463301744720305e-06, + "loss": 0.77589262, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7365, + "time_per_iteration": 2.465895175933838 + }, + { + "auxiliary_loss_clip": 0.010624, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.02171493, + "balance_loss_mlp": 1.01994729, + "epoch": 0.4428678791522621, + "flos": 22856099558400.0, + "grad_norm": 1.6045562687198986, + "language_loss": 0.75788999, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.77900243, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.42382812, + "step": 7366, + "time_per_iteration": 2.3839659690856934 + }, + { + "auxiliary_loss_clip": 0.01064761, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.01478064, + "balance_loss_mlp": 1.02195454, + "epoch": 0.44292800240493013, + "flos": 25811252968320.0, + "grad_norm": 2.0302094076946586, + "language_loss": 0.74680662, + "learning_rate": 2.46254397374245e-06, + "loss": 0.76789612, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 7367, + "time_per_iteration": 2.4520552158355713 + }, + { + "auxiliary_loss_clip": 0.0106513, + "auxiliary_loss_mlp": 0.0104984, + "balance_loss_clip": 1.0191195, + "balance_loss_mlp": 1.02164972, + "epoch": 0.4429881256575981, + "flos": 32415493438080.0, + "grad_norm": 1.564151857456975, + "language_loss": 0.74756664, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76871634, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43359375, + "step": 7368, + "time_per_iteration": 2.4760656356811523 + }, + { + "auxiliary_loss_clip": 0.01059269, + "auxiliary_loss_mlp": 0.01046991, + "balance_loss_clip": 1.02015662, + "balance_loss_mlp": 1.01879716, + "epoch": 0.44304824891026606, + "flos": 22162619756160.0, + "grad_norm": 1.6375422527064407, + "language_loss": 0.80791867, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82898122, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 7369, + "time_per_iteration": 2.4084253311157227 + }, + { + "auxiliary_loss_clip": 0.01059266, + "auxiliary_loss_mlp": 0.01046954, + "balance_loss_clip": 1.02051306, + "balance_loss_mlp": 1.01949835, + "epoch": 0.443108372162934, + "flos": 25337376817920.0, + "grad_norm": 1.8887486311064874, + "language_loss": 0.73691005, + "learning_rate": 2.461407185763737e-06, + "loss": 0.75797224, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 7370, + "time_per_iteration": 2.4032037258148193 + }, + { + "auxiliary_loss_clip": 0.01062227, + "auxiliary_loss_mlp": 0.01050748, + "balance_loss_clip": 1.02018285, + "balance_loss_mlp": 1.01888037, + "epoch": 0.443168495415602, + "flos": 23329836063360.0, + "grad_norm": 2.00278626441293, + "language_loss": 0.72219491, + "learning_rate": 2.461028221425126e-06, + "loss": 0.7433247, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43359375, + "step": 7371, + "time_per_iteration": 2.4172191619873047 + }, + { + "auxiliary_loss_clip": 0.01060177, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.01880443, + "balance_loss_mlp": 1.01870549, + "epoch": 0.44322861866826996, + "flos": 21870431654400.0, + "grad_norm": 2.5219169774899943, + "language_loss": 0.70279497, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.72385633, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 7372, + "time_per_iteration": 2.371110677719116 + }, + { + "auxiliary_loss_clip": 0.01061333, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_clip": 1.02527189, + "balance_loss_mlp": 1.01839924, + "epoch": 0.4432887419209379, + "flos": 20083367335680.0, + "grad_norm": 1.6467661260459605, + "language_loss": 0.8382954, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85945189, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4296875, + "step": 7373, + "time_per_iteration": 3.6988110542297363 + }, + { + "auxiliary_loss_clip": 0.01013895, + "auxiliary_loss_mlp": 0.01006346, + "balance_loss_clip": 1.00265026, + "balance_loss_mlp": 1.00397801, + "epoch": 0.4433488651736059, + "flos": 70032032092800.0, + "grad_norm": 0.7628464334091587, + "language_loss": 0.55240899, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57261139, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.03686523, + "router_z_loss_mlp": 0.09912109, + "step": 7374, + "time_per_iteration": 3.049039363861084 + }, + { + "auxiliary_loss_clip": 0.01059385, + "auxiliary_loss_mlp": 0.01055692, + "balance_loss_clip": 1.02622366, + "balance_loss_mlp": 1.01790833, + "epoch": 0.44340898842627385, + "flos": 16281745079040.0, + "grad_norm": 2.258315707337408, + "language_loss": 0.84714651, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.86829728, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4140625, + "step": 7375, + "time_per_iteration": 5.263845682144165 + }, + { + "auxiliary_loss_clip": 0.01061826, + "auxiliary_loss_mlp": 0.0104829, + "balance_loss_clip": 1.01877403, + "balance_loss_mlp": 1.01886892, + "epoch": 0.4434691116789418, + "flos": 16611220379520.0, + "grad_norm": 1.753460545681197, + "language_loss": 0.84483445, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86593556, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 7376, + "time_per_iteration": 2.349104166030884 + }, + { + "auxiliary_loss_clip": 0.01060311, + "auxiliary_loss_mlp": 0.01048215, + "balance_loss_clip": 1.0195334, + "balance_loss_mlp": 1.01923358, + "epoch": 0.4435292349316098, + "flos": 19062227623680.0, + "grad_norm": 1.5761601048074176, + "language_loss": 0.79088855, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.81197381, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41015625, + "step": 7377, + "time_per_iteration": 2.3644888401031494 + }, + { + "auxiliary_loss_clip": 0.0105814, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.01520491, + "balance_loss_mlp": 1.01868999, + "epoch": 0.44358935818427775, + "flos": 21250269440640.0, + "grad_norm": 2.1335934662876106, + "language_loss": 0.77791131, + "learning_rate": 2.458374982357057e-06, + "loss": 0.7989158, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 7378, + "time_per_iteration": 2.400728702545166 + }, + { + "auxiliary_loss_clip": 0.0106021, + "auxiliary_loss_mlp": 0.01050818, + "balance_loss_clip": 1.02111101, + "balance_loss_mlp": 1.01847577, + "epoch": 0.4436494814369457, + "flos": 12494471391360.0, + "grad_norm": 3.9384389089010137, + "language_loss": 0.7177223, + "learning_rate": 2.457995878562982e-06, + "loss": 0.73883259, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41796875, + "step": 7379, + "time_per_iteration": 2.353121519088745 + }, + { + "auxiliary_loss_clip": 0.01062327, + "auxiliary_loss_mlp": 0.01050506, + "balance_loss_clip": 1.02094221, + "balance_loss_mlp": 1.02018976, + "epoch": 0.44370960468961373, + "flos": 23658717870720.0, + "grad_norm": 1.92187761155757, + "language_loss": 0.74544013, + "learning_rate": 2.457616757401656e-06, + "loss": 0.76656842, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 7380, + "time_per_iteration": 3.9661660194396973 + }, + { + "auxiliary_loss_clip": 0.01060903, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_clip": 1.01946115, + "balance_loss_mlp": 1.0191586, + "epoch": 0.4437697279422817, + "flos": 32415458526720.0, + "grad_norm": 1.5432808209276452, + "language_loss": 0.66076767, + "learning_rate": 2.457237618887458e-06, + "loss": 0.68187886, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41796875, + "step": 7381, + "time_per_iteration": 2.494781255722046 + }, + { + "auxiliary_loss_clip": 0.0106209, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_clip": 1.02159274, + "balance_loss_mlp": 1.02031851, + "epoch": 0.44382985119494966, + "flos": 18111926793600.0, + "grad_norm": 1.9444067520392052, + "language_loss": 0.81389618, + "learning_rate": 2.456858463034763e-06, + "loss": 0.83501518, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 7382, + "time_per_iteration": 2.361161947250366 + }, + { + "auxiliary_loss_clip": 0.01063373, + "auxiliary_loss_mlp": 0.01055597, + "balance_loss_clip": 1.02610445, + "balance_loss_mlp": 1.02054524, + "epoch": 0.44388997444761763, + "flos": 30772829969280.0, + "grad_norm": 1.6252828821653198, + "language_loss": 0.65851176, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67970145, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42773438, + "step": 7383, + "time_per_iteration": 2.472119092941284 + }, + { + "auxiliary_loss_clip": 0.01065371, + "auxiliary_loss_mlp": 0.01048482, + "balance_loss_clip": 1.01586664, + "balance_loss_mlp": 1.02106249, + "epoch": 0.4439500977002856, + "flos": 20338128593280.0, + "grad_norm": 2.1443905949071893, + "language_loss": 0.77766377, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.79880226, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.44335938, + "step": 7384, + "time_per_iteration": 2.3564555644989014 + }, + { + "auxiliary_loss_clip": 0.01062086, + "auxiliary_loss_mlp": 0.01052784, + "balance_loss_clip": 1.02231407, + "balance_loss_mlp": 1.02045798, + "epoch": 0.44401022095295356, + "flos": 20370318733440.0, + "grad_norm": 1.4853305932763385, + "language_loss": 0.82183659, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.84298521, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41601562, + "step": 7385, + "time_per_iteration": 2.4059641361236572 + }, + { + "auxiliary_loss_clip": 0.01064057, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.01886392, + "balance_loss_mlp": 1.02074432, + "epoch": 0.4440703442056215, + "flos": 20229583576320.0, + "grad_norm": 1.7714162826237845, + "language_loss": 0.83016968, + "learning_rate": 2.455341666526582e-06, + "loss": 0.85131049, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43359375, + "step": 7386, + "time_per_iteration": 2.364975929260254 + }, + { + "auxiliary_loss_clip": 0.01067995, + "auxiliary_loss_mlp": 0.01046131, + "balance_loss_clip": 1.01310968, + "balance_loss_mlp": 1.02298546, + "epoch": 0.4441304674582895, + "flos": 39493121299200.0, + "grad_norm": 1.8437513052332828, + "language_loss": 0.70450169, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72564292, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.44921875, + "step": 7387, + "time_per_iteration": 2.561840534210205 + }, + { + "auxiliary_loss_clip": 0.01063731, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.02169752, + "balance_loss_mlp": 1.02063203, + "epoch": 0.44419059071095746, + "flos": 14828799271680.0, + "grad_norm": 1.877917070581985, + "language_loss": 0.72565764, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.7468226, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43164062, + "step": 7388, + "time_per_iteration": 2.360121011734009 + }, + { + "auxiliary_loss_clip": 0.01065576, + "auxiliary_loss_mlp": 0.01042158, + "balance_loss_clip": 1.01164079, + "balance_loss_mlp": 1.02220988, + "epoch": 0.4442507139636254, + "flos": 22636740286080.0, + "grad_norm": 1.6623496588440339, + "language_loss": 0.70542103, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.72649837, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 7389, + "time_per_iteration": 2.418391704559326 + }, + { + "auxiliary_loss_clip": 0.01063165, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.01842284, + "balance_loss_mlp": 1.02063465, + "epoch": 0.4443108372162934, + "flos": 38289176375040.0, + "grad_norm": 1.8125730157072082, + "language_loss": 0.75250775, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77359056, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.42578125, + "step": 7390, + "time_per_iteration": 2.513326406478882 + }, + { + "auxiliary_loss_clip": 0.01062439, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.01139784, + "balance_loss_mlp": 1.02112341, + "epoch": 0.44437096046896135, + "flos": 17748027025920.0, + "grad_norm": 1.8447632521418622, + "language_loss": 0.83333737, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.85438365, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4140625, + "step": 7391, + "time_per_iteration": 2.375920534133911 + }, + { + "auxiliary_loss_clip": 0.01063194, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.01946473, + "balance_loss_mlp": 1.02130628, + "epoch": 0.4444310837216293, + "flos": 13731583973760.0, + "grad_norm": 2.753426804903021, + "language_loss": 0.74865478, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.76976097, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41992188, + "step": 7392, + "time_per_iteration": 2.3652689456939697 + }, + { + "auxiliary_loss_clip": 0.01061207, + "auxiliary_loss_mlp": 0.01049526, + "balance_loss_clip": 1.02173829, + "balance_loss_mlp": 1.01933861, + "epoch": 0.44449120697429734, + "flos": 25009053592320.0, + "grad_norm": 1.5493224362014473, + "language_loss": 0.80716985, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.82827717, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 7393, + "time_per_iteration": 2.4181606769561768 + }, + { + "auxiliary_loss_clip": 0.01064793, + "auxiliary_loss_mlp": 0.01045462, + "balance_loss_clip": 1.01358509, + "balance_loss_mlp": 1.02082825, + "epoch": 0.4445513302269653, + "flos": 32670324518400.0, + "grad_norm": 1.812638860839533, + "language_loss": 0.81585932, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83696187, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.43945312, + "step": 7394, + "time_per_iteration": 2.484042167663574 + }, + { + "auxiliary_loss_clip": 0.01059622, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_clip": 1.0170598, + "balance_loss_mlp": 1.01826406, + "epoch": 0.44461145347963327, + "flos": 11655019728000.0, + "grad_norm": 2.1098196255938166, + "language_loss": 0.81020403, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.831236, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 7395, + "time_per_iteration": 2.355422258377075 + }, + { + "auxiliary_loss_clip": 0.01061341, + "auxiliary_loss_mlp": 0.01047422, + "balance_loss_clip": 1.01940763, + "balance_loss_mlp": 1.01942027, + "epoch": 0.44467157673230123, + "flos": 20885706357120.0, + "grad_norm": 1.8054886468750466, + "language_loss": 0.69759274, + "learning_rate": 2.451548468607584e-06, + "loss": 0.71868038, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 7396, + "time_per_iteration": 2.4111058712005615 + }, + { + "auxiliary_loss_clip": 0.01060973, + "auxiliary_loss_mlp": 0.01044856, + "balance_loss_clip": 1.01563811, + "balance_loss_mlp": 1.01789892, + "epoch": 0.4447316999849692, + "flos": 18545303900160.0, + "grad_norm": 1.7186724628272425, + "language_loss": 0.81673181, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83779001, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 7397, + "time_per_iteration": 2.3499839305877686 + }, + { + "auxiliary_loss_clip": 0.01060885, + "auxiliary_loss_mlp": 0.01045054, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.0190537, + "epoch": 0.44479182323763716, + "flos": 23767926203520.0, + "grad_norm": 1.6328011961286737, + "language_loss": 0.68048555, + "learning_rate": 2.450789623090293e-06, + "loss": 0.70154494, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7398, + "time_per_iteration": 2.4421751499176025 + }, + { + "auxiliary_loss_clip": 0.01059671, + "auxiliary_loss_mlp": 0.01048535, + "balance_loss_clip": 1.02261901, + "balance_loss_mlp": 1.01852977, + "epoch": 0.44485194649030513, + "flos": 16542930026880.0, + "grad_norm": 1.7065551656218512, + "language_loss": 0.71188271, + "learning_rate": 2.450410174683472e-06, + "loss": 0.73296481, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41210938, + "step": 7399, + "time_per_iteration": 2.377603530883789 + }, + { + "auxiliary_loss_clip": 0.01058962, + "auxiliary_loss_mlp": 0.01044105, + "balance_loss_clip": 1.01735413, + "balance_loss_mlp": 1.01855385, + "epoch": 0.4449120697429731, + "flos": 22599872023680.0, + "grad_norm": 1.7119984809684277, + "language_loss": 0.73393667, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.75496733, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40429688, + "step": 7400, + "time_per_iteration": 2.3969647884368896 + }, + { + "auxiliary_loss_clip": 0.01061037, + "auxiliary_loss_mlp": 0.01048284, + "balance_loss_clip": 1.01974511, + "balance_loss_mlp": 1.01887512, + "epoch": 0.44497219299564106, + "flos": 20004010081920.0, + "grad_norm": 1.5764505486063307, + "language_loss": 0.86159092, + "learning_rate": 2.449651226645422e-06, + "loss": 0.88268411, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.421875, + "step": 7401, + "time_per_iteration": 2.362785816192627 + }, + { + "auxiliary_loss_clip": 0.01057367, + "auxiliary_loss_mlp": 0.01043757, + "balance_loss_clip": 1.01699448, + "balance_loss_mlp": 1.01826108, + "epoch": 0.445032316248309, + "flos": 25593045770880.0, + "grad_norm": 1.4781262823600751, + "language_loss": 0.84788442, + "learning_rate": 2.449271727042973e-06, + "loss": 0.86889565, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 7402, + "time_per_iteration": 2.434708595275879 + }, + { + "auxiliary_loss_clip": 0.01061354, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.01377177, + "balance_loss_mlp": 1.01943016, + "epoch": 0.445092439500977, + "flos": 21249396656640.0, + "grad_norm": 1.7459370908241383, + "language_loss": 0.78125179, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.80229056, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41992188, + "step": 7403, + "time_per_iteration": 2.3605964183807373 + }, + { + "auxiliary_loss_clip": 0.01013687, + "auxiliary_loss_mlp": 0.01007713, + "balance_loss_clip": 1.00486398, + "balance_loss_mlp": 1.0045979, + "epoch": 0.44515256275364495, + "flos": 57762051943680.0, + "grad_norm": 0.7503272162185524, + "language_loss": 0.60141772, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62163174, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.09082031, + "step": 7404, + "time_per_iteration": 3.0382485389709473 + }, + { + "auxiliary_loss_clip": 0.0106436, + "auxiliary_loss_mlp": 0.01050617, + "balance_loss_clip": 1.01702356, + "balance_loss_mlp": 1.01967871, + "epoch": 0.4452126860063129, + "flos": 15595107903360.0, + "grad_norm": 1.5107107827549195, + "language_loss": 0.83027613, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.85142595, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.44726562, + "step": 7405, + "time_per_iteration": 2.370190382003784 + }, + { + "auxiliary_loss_clip": 0.01061335, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.01428485, + "balance_loss_mlp": 1.01855719, + "epoch": 0.4452728092589809, + "flos": 21616298801280.0, + "grad_norm": 1.9100722170479782, + "language_loss": 0.76371241, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.78475106, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42773438, + "step": 7406, + "time_per_iteration": 2.39978289604187 + }, + { + "auxiliary_loss_clip": 0.01058877, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.01405871, + "balance_loss_mlp": 1.01890898, + "epoch": 0.4453329325116489, + "flos": 29496195861120.0, + "grad_norm": 1.9245763869952761, + "language_loss": 0.67185366, + "learning_rate": 2.447373973772129e-06, + "loss": 0.6928429, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40039062, + "step": 7407, + "time_per_iteration": 2.427783250808716 + }, + { + "auxiliary_loss_clip": 0.01065897, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.01547122, + "balance_loss_mlp": 1.02301908, + "epoch": 0.44539305576431687, + "flos": 21360071266560.0, + "grad_norm": 1.547990100049085, + "language_loss": 0.69185454, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.71294427, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4296875, + "step": 7408, + "time_per_iteration": 2.394376039505005 + }, + { + "auxiliary_loss_clip": 0.01062424, + "auxiliary_loss_mlp": 0.0105325, + "balance_loss_clip": 1.01972818, + "balance_loss_mlp": 1.01946771, + "epoch": 0.44545317901698483, + "flos": 41426017833600.0, + "grad_norm": 1.4146912970739978, + "language_loss": 0.72651309, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74766982, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.4296875, + "step": 7409, + "time_per_iteration": 2.5373780727386475 + }, + { + "auxiliary_loss_clip": 0.01063965, + "auxiliary_loss_mlp": 0.01047219, + "balance_loss_clip": 1.01697576, + "balance_loss_mlp": 1.02045476, + "epoch": 0.4455133022696528, + "flos": 22053900182400.0, + "grad_norm": 1.8702441018025147, + "language_loss": 0.65390164, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67501342, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.43554688, + "step": 7410, + "time_per_iteration": 2.4029409885406494 + }, + { + "auxiliary_loss_clip": 0.01067903, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.0177784, + "balance_loss_mlp": 1.02222455, + "epoch": 0.44557342552232077, + "flos": 23475842835840.0, + "grad_norm": 1.8106774757251416, + "language_loss": 0.75821018, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.77937138, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.45703125, + "step": 7411, + "time_per_iteration": 2.39731502532959 + }, + { + "auxiliary_loss_clip": 0.01060767, + "auxiliary_loss_mlp": 0.01039352, + "balance_loss_clip": 1.012959, + "balance_loss_mlp": 1.02131057, + "epoch": 0.44563354877498873, + "flos": 19133694910080.0, + "grad_norm": 1.7669878540799737, + "language_loss": 0.79891396, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81991518, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 7412, + "time_per_iteration": 2.382634401321411 + }, + { + "auxiliary_loss_clip": 0.01064134, + "auxiliary_loss_mlp": 0.0105076, + "balance_loss_clip": 1.01975346, + "balance_loss_mlp": 1.02069306, + "epoch": 0.4456936720276567, + "flos": 13620699895680.0, + "grad_norm": 2.401567271118526, + "language_loss": 0.815467, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.83661592, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43359375, + "step": 7413, + "time_per_iteration": 3.703620433807373 + }, + { + "auxiliary_loss_clip": 0.010607, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.01167154, + "balance_loss_mlp": 1.01992774, + "epoch": 0.44575379528032466, + "flos": 14713027603200.0, + "grad_norm": 1.974671180344838, + "language_loss": 0.78090119, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.80189574, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 7414, + "time_per_iteration": 3.7568609714508057 + }, + { + "auxiliary_loss_clip": 0.01061112, + "auxiliary_loss_mlp": 0.01042407, + "balance_loss_clip": 1.0142498, + "balance_loss_mlp": 1.01993585, + "epoch": 0.4458139185329926, + "flos": 24169532106240.0, + "grad_norm": 1.4685668184123488, + "language_loss": 0.84332961, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.86436486, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 7415, + "time_per_iteration": 3.776383399963379 + }, + { + "auxiliary_loss_clip": 0.01061792, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.01560044, + "balance_loss_mlp": 1.0196383, + "epoch": 0.4458740417856606, + "flos": 21761153498880.0, + "grad_norm": 1.7445033093781286, + "language_loss": 0.85369706, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.87475026, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.421875, + "step": 7416, + "time_per_iteration": 2.4076192378997803 + }, + { + "auxiliary_loss_clip": 0.01064266, + "auxiliary_loss_mlp": 0.01051588, + "balance_loss_clip": 1.02016461, + "balance_loss_mlp": 1.02020216, + "epoch": 0.44593416503832856, + "flos": 21067743519360.0, + "grad_norm": 1.5892124605191462, + "language_loss": 0.81942344, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.84058201, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 7417, + "time_per_iteration": 2.4322540760040283 + }, + { + "auxiliary_loss_clip": 0.01064531, + "auxiliary_loss_mlp": 0.0104738, + "balance_loss_clip": 1.01793551, + "balance_loss_mlp": 1.0216589, + "epoch": 0.4459942882909965, + "flos": 22599418176000.0, + "grad_norm": 2.0086329281685447, + "language_loss": 0.82794309, + "learning_rate": 2.443197426237077e-06, + "loss": 0.8490622, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 7418, + "time_per_iteration": 2.3648276329040527 + }, + { + "auxiliary_loss_clip": 0.01061693, + "auxiliary_loss_mlp": 0.01045824, + "balance_loss_clip": 1.01759553, + "balance_loss_mlp": 1.01971757, + "epoch": 0.4460544115436645, + "flos": 26504278922880.0, + "grad_norm": 1.7334981462282826, + "language_loss": 0.78695917, + "learning_rate": 2.442817638972991e-06, + "loss": 0.80803442, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 7419, + "time_per_iteration": 2.4358415603637695 + }, + { + "auxiliary_loss_clip": 0.01061916, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_clip": 1.02039087, + "balance_loss_mlp": 1.02051115, + "epoch": 0.4461145347963325, + "flos": 17603032682880.0, + "grad_norm": 1.498351492273621, + "language_loss": 0.73411441, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.75519204, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.4140625, + "step": 7420, + "time_per_iteration": 3.8257832527160645 + }, + { + "auxiliary_loss_clip": 0.01060458, + "auxiliary_loss_mlp": 0.01045493, + "balance_loss_clip": 1.01813459, + "balance_loss_mlp": 1.02025211, + "epoch": 0.44617465804900047, + "flos": 27267061507200.0, + "grad_norm": 1.9074377438836017, + "language_loss": 0.75608492, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77714443, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 7421, + "time_per_iteration": 2.4191856384277344 + }, + { + "auxiliary_loss_clip": 0.01058341, + "auxiliary_loss_mlp": 0.01048275, + "balance_loss_clip": 1.02221572, + "balance_loss_mlp": 1.01877809, + "epoch": 0.44623478130166844, + "flos": 17785418958720.0, + "grad_norm": 1.8564371978519814, + "language_loss": 0.773754, + "learning_rate": 2.44167817648821e-06, + "loss": 0.79482013, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 7422, + "time_per_iteration": 2.384601593017578 + }, + { + "auxiliary_loss_clip": 0.0106133, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_clip": 1.01732504, + "balance_loss_mlp": 1.01936913, + "epoch": 0.4462949045543364, + "flos": 23001896862720.0, + "grad_norm": 1.3601470128279411, + "language_loss": 0.66094363, + "learning_rate": 2.441298322143784e-06, + "loss": 0.68198782, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41796875, + "step": 7423, + "time_per_iteration": 2.3729543685913086 + }, + { + "auxiliary_loss_clip": 0.0105632, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.01888514, + "balance_loss_mlp": 1.01747656, + "epoch": 0.44635502780700437, + "flos": 17819180110080.0, + "grad_norm": 1.481183099833573, + "language_loss": 0.80978894, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.83079529, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38867188, + "step": 7424, + "time_per_iteration": 2.428053617477417 + }, + { + "auxiliary_loss_clip": 0.01057828, + "auxiliary_loss_mlp": 0.01047929, + "balance_loss_clip": 1.02340746, + "balance_loss_mlp": 1.01832557, + "epoch": 0.44641515105967233, + "flos": 26686804844160.0, + "grad_norm": 1.4083217951576967, + "language_loss": 0.81129557, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.83235312, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.39453125, + "step": 7425, + "time_per_iteration": 2.4148483276367188 + }, + { + "auxiliary_loss_clip": 0.01060138, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_clip": 1.02551568, + "balance_loss_mlp": 1.01915646, + "epoch": 0.4464752743123403, + "flos": 18912415512960.0, + "grad_norm": 5.76676989632695, + "language_loss": 0.78022385, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.80132556, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.41015625, + "step": 7426, + "time_per_iteration": 2.3825652599334717 + }, + { + "auxiliary_loss_clip": 0.01061218, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.02418876, + "balance_loss_mlp": 1.01914215, + "epoch": 0.44653539756500826, + "flos": 29569024690560.0, + "grad_norm": 1.7061085577252628, + "language_loss": 0.65887141, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.67999113, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 7427, + "time_per_iteration": 2.4287450313568115 + }, + { + "auxiliary_loss_clip": 0.0105894, + "auxiliary_loss_mlp": 0.01045897, + "balance_loss_clip": 1.0190506, + "balance_loss_mlp": 1.0188005, + "epoch": 0.44659552081767623, + "flos": 21467952967680.0, + "grad_norm": 1.7038854891567468, + "language_loss": 0.76216054, + "learning_rate": 2.439398799698608e-06, + "loss": 0.78320897, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 7428, + "time_per_iteration": 2.417336940765381 + }, + { + "auxiliary_loss_clip": 0.0105944, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_clip": 1.01908123, + "balance_loss_mlp": 1.01823556, + "epoch": 0.4466556440703442, + "flos": 17930902060800.0, + "grad_norm": 1.7557531392474424, + "language_loss": 0.79142976, + "learning_rate": 2.439018845165806e-06, + "loss": 0.81247306, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41210938, + "step": 7429, + "time_per_iteration": 2.3588950634002686 + }, + { + "auxiliary_loss_clip": 0.01063797, + "auxiliary_loss_mlp": 0.01045346, + "balance_loss_clip": 1.01729655, + "balance_loss_mlp": 1.02087939, + "epoch": 0.44671576732301216, + "flos": 21106322438400.0, + "grad_norm": 1.7622683032664104, + "language_loss": 0.91483247, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93592393, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4296875, + "step": 7430, + "time_per_iteration": 2.419955015182495 + }, + { + "auxiliary_loss_clip": 0.01065109, + "auxiliary_loss_mlp": 0.01050251, + "balance_loss_clip": 1.02012658, + "balance_loss_mlp": 1.02098382, + "epoch": 0.4467758905756801, + "flos": 23507928241920.0, + "grad_norm": 1.672524802184418, + "language_loss": 0.80359221, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.82474583, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44140625, + "step": 7431, + "time_per_iteration": 2.403712272644043 + }, + { + "auxiliary_loss_clip": 0.01063703, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0157907, + "balance_loss_mlp": 1.02051532, + "epoch": 0.4468360138283481, + "flos": 18733031614080.0, + "grad_norm": 2.032434057676348, + "language_loss": 0.81158793, + "learning_rate": 2.437878881739204e-06, + "loss": 0.83266687, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43164062, + "step": 7432, + "time_per_iteration": 2.348212957382202 + }, + { + "auxiliary_loss_clip": 0.01062998, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.01442015, + "balance_loss_mlp": 1.02033091, + "epoch": 0.4468961370810161, + "flos": 23476017392640.0, + "grad_norm": 1.8671244700388248, + "language_loss": 0.78401721, + "learning_rate": 2.437498860702301e-06, + "loss": 0.80506194, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.42773438, + "step": 7433, + "time_per_iteration": 2.4316258430480957 + }, + { + "auxiliary_loss_clip": 0.01059148, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.01798224, + "balance_loss_mlp": 1.02039838, + "epoch": 0.4469562603336841, + "flos": 30073903994880.0, + "grad_norm": 2.188048877768603, + "language_loss": 0.77925396, + "learning_rate": 2.437118823075398e-06, + "loss": 0.80026042, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.38671875, + "step": 7434, + "time_per_iteration": 2.442866325378418 + }, + { + "auxiliary_loss_clip": 0.01064535, + "auxiliary_loss_mlp": 0.01038968, + "balance_loss_clip": 1.01168156, + "balance_loss_mlp": 1.02152491, + "epoch": 0.44701638358635204, + "flos": 22455296616960.0, + "grad_norm": 1.7041198966513709, + "language_loss": 0.65700853, + "learning_rate": 2.436738768872905e-06, + "loss": 0.6780436, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4296875, + "step": 7435, + "time_per_iteration": 2.417102098464966 + }, + { + "auxiliary_loss_clip": 0.0106235, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.0161531, + "balance_loss_mlp": 1.02101243, + "epoch": 0.44707650683902, + "flos": 24056797726080.0, + "grad_norm": 2.06497543312855, + "language_loss": 0.84771824, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.86880058, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4140625, + "step": 7436, + "time_per_iteration": 2.4159233570098877 + }, + { + "auxiliary_loss_clip": 0.01066766, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.01429009, + "balance_loss_mlp": 1.02281952, + "epoch": 0.44713663009168797, + "flos": 23765866433280.0, + "grad_norm": 1.6726615927272321, + "language_loss": 0.80789399, + "learning_rate": 2.435978610798798e-06, + "loss": 0.82898903, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43945312, + "step": 7437, + "time_per_iteration": 2.464812994003296 + }, + { + "auxiliary_loss_clip": 0.01064903, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.01386225, + "balance_loss_mlp": 1.02102959, + "epoch": 0.44719675334435594, + "flos": 24498099711360.0, + "grad_norm": 1.8139162809730864, + "language_loss": 0.73439437, + "learning_rate": 2.435598506956009e-06, + "loss": 0.75546551, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.43945312, + "step": 7438, + "time_per_iteration": 2.403141498565674 + }, + { + "auxiliary_loss_clip": 0.01062635, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.01940143, + "balance_loss_mlp": 1.02087259, + "epoch": 0.4472568765970239, + "flos": 29780668552320.0, + "grad_norm": 1.6677403775960167, + "language_loss": 0.68405747, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.70515317, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 7439, + "time_per_iteration": 2.469871759414673 + }, + { + "auxiliary_loss_clip": 0.01065755, + "auxiliary_loss_mlp": 0.01047284, + "balance_loss_clip": 1.0179944, + "balance_loss_mlp": 1.02197158, + "epoch": 0.44731699984969187, + "flos": 24642011802240.0, + "grad_norm": 1.759812683303084, + "language_loss": 0.75411522, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.77524567, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4375, + "step": 7440, + "time_per_iteration": 2.433894634246826 + }, + { + "auxiliary_loss_clip": 0.0106427, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.01572061, + "balance_loss_mlp": 1.02146316, + "epoch": 0.44737712310235983, + "flos": 29454544742400.0, + "grad_norm": 1.7978331368072171, + "language_loss": 0.75141722, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.77248967, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.42773438, + "step": 7441, + "time_per_iteration": 2.48005747795105 + }, + { + "auxiliary_loss_clip": 0.01063659, + "auxiliary_loss_mlp": 0.0104959, + "balance_loss_clip": 1.02043176, + "balance_loss_mlp": 1.02035904, + "epoch": 0.4474372463550278, + "flos": 24895760630400.0, + "grad_norm": 2.1019543279721553, + "language_loss": 0.76137096, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.78250337, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 7442, + "time_per_iteration": 2.3905279636383057 + }, + { + "auxiliary_loss_clip": 0.01065136, + "auxiliary_loss_mlp": 0.01047085, + "balance_loss_clip": 1.01792622, + "balance_loss_mlp": 1.02093756, + "epoch": 0.44749736960769576, + "flos": 33180231058560.0, + "grad_norm": 1.9839847683388172, + "language_loss": 0.75856185, + "learning_rate": 2.433697740261273e-06, + "loss": 0.77968407, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44140625, + "step": 7443, + "time_per_iteration": 2.5045106410980225 + }, + { + "auxiliary_loss_clip": 0.01062318, + "auxiliary_loss_mlp": 0.01048801, + "balance_loss_clip": 1.02007103, + "balance_loss_mlp": 1.01941752, + "epoch": 0.4475574928603637, + "flos": 21070676073600.0, + "grad_norm": 1.492686661711205, + "language_loss": 0.78180754, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.80291873, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 7444, + "time_per_iteration": 2.4029104709625244 + }, + { + "auxiliary_loss_clip": 0.0105925, + "auxiliary_loss_mlp": 0.01045266, + "balance_loss_clip": 1.01614332, + "balance_loss_mlp": 1.01844728, + "epoch": 0.4476176161130317, + "flos": 21861703814400.0, + "grad_norm": 2.4355008600319787, + "language_loss": 0.85946393, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.88050902, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40820312, + "step": 7445, + "time_per_iteration": 2.390942096710205 + }, + { + "auxiliary_loss_clip": 0.01062238, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.01560259, + "balance_loss_mlp": 1.01954746, + "epoch": 0.4476777393656997, + "flos": 22527566864640.0, + "grad_norm": 2.1418163189396875, + "language_loss": 0.66012031, + "learning_rate": 2.432557082778765e-06, + "loss": 0.68118525, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42578125, + "step": 7446, + "time_per_iteration": 2.379326820373535 + }, + { + "auxiliary_loss_clip": 0.01014453, + "auxiliary_loss_mlp": 0.01009643, + "balance_loss_clip": 1.00712788, + "balance_loss_mlp": 1.00603938, + "epoch": 0.4477378626183677, + "flos": 49014283507200.0, + "grad_norm": 0.7394995978133688, + "language_loss": 0.50406832, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52430928, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.08398438, + "step": 7447, + "time_per_iteration": 2.917346239089966 + }, + { + "auxiliary_loss_clip": 0.01012499, + "auxiliary_loss_mlp": 0.01007694, + "balance_loss_clip": 1.00505912, + "balance_loss_mlp": 1.00400996, + "epoch": 0.44779798587103564, + "flos": 56538868993920.0, + "grad_norm": 0.7463159587188575, + "language_loss": 0.59377801, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61397994, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.08496094, + "step": 7448, + "time_per_iteration": 3.0910396575927734 + }, + { + "auxiliary_loss_clip": 0.01060034, + "auxiliary_loss_mlp": 0.01044537, + "balance_loss_clip": 1.01826298, + "balance_loss_mlp": 1.01884913, + "epoch": 0.4478581091237036, + "flos": 46496803167360.0, + "grad_norm": 1.9700735084543406, + "language_loss": 0.59584785, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61689359, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41210938, + "step": 7449, + "time_per_iteration": 2.6133785247802734 + }, + { + "auxiliary_loss_clip": 0.01059414, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02040768, + "balance_loss_mlp": 1.01825356, + "epoch": 0.4479182323763716, + "flos": 20813296464000.0, + "grad_norm": 3.239418436180509, + "language_loss": 0.81973028, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.84079581, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 7450, + "time_per_iteration": 2.387289047241211 + }, + { + "auxiliary_loss_clip": 0.01058619, + "auxiliary_loss_mlp": 0.01045275, + "balance_loss_clip": 1.01900172, + "balance_loss_mlp": 1.01862407, + "epoch": 0.44797835562903954, + "flos": 14245121295360.0, + "grad_norm": 1.9783622904797427, + "language_loss": 0.81283045, + "learning_rate": 2.430655659114697e-06, + "loss": 0.83386934, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 7451, + "time_per_iteration": 2.382765769958496 + }, + { + "auxiliary_loss_clip": 0.01011691, + "auxiliary_loss_mlp": 0.01007989, + "balance_loss_clip": 1.00534296, + "balance_loss_mlp": 1.00326574, + "epoch": 0.4480384788817075, + "flos": 63531414138240.0, + "grad_norm": 0.827160850581408, + "language_loss": 0.62832248, + "learning_rate": 2.430275325332681e-06, + "loss": 0.6485194, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.08398438, + "step": 7452, + "time_per_iteration": 4.529294490814209 + }, + { + "auxiliary_loss_clip": 0.0106111, + "auxiliary_loss_mlp": 0.01050297, + "balance_loss_clip": 1.02063775, + "balance_loss_mlp": 1.01897693, + "epoch": 0.44809860213437547, + "flos": 21651561141120.0, + "grad_norm": 1.831648228212988, + "language_loss": 0.64371121, + "learning_rate": 2.429894975234582e-06, + "loss": 0.6648252, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 7453, + "time_per_iteration": 2.376901149749756 + }, + { + "auxiliary_loss_clip": 0.01013009, + "auxiliary_loss_mlp": 0.01006127, + "balance_loss_clip": 1.00309908, + "balance_loss_mlp": 1.0045588, + "epoch": 0.44815872538704343, + "flos": 69187308814080.0, + "grad_norm": 0.7909282751716382, + "language_loss": 0.57159221, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59178364, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.03027344, + "router_z_loss_mlp": 0.08447266, + "step": 7454, + "time_per_iteration": 5.593378067016602 + }, + { + "auxiliary_loss_clip": 0.01060608, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.02405524, + "balance_loss_mlp": 1.01958728, + "epoch": 0.4482188486397114, + "flos": 12597640058880.0, + "grad_norm": 2.3634121033443685, + "language_loss": 0.76583117, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7869429, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41015625, + "step": 7455, + "time_per_iteration": 2.370837926864624 + }, + { + "auxiliary_loss_clip": 0.01060937, + "auxiliary_loss_mlp": 0.01055281, + "balance_loss_clip": 1.02743351, + "balance_loss_mlp": 1.01954722, + "epoch": 0.44827897189237936, + "flos": 34056760452480.0, + "grad_norm": 1.724433408342534, + "language_loss": 0.77031076, + "learning_rate": 2.428753827188016e-06, + "loss": 0.79147291, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 7456, + "time_per_iteration": 2.489419460296631 + }, + { + "auxiliary_loss_clip": 0.01061787, + "auxiliary_loss_mlp": 0.01049695, + "balance_loss_clip": 1.02356446, + "balance_loss_mlp": 1.02134752, + "epoch": 0.44833909514504733, + "flos": 25146472170240.0, + "grad_norm": 2.0334951594428476, + "language_loss": 0.77779019, + "learning_rate": 2.428373411969818e-06, + "loss": 0.7989049, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 7457, + "time_per_iteration": 2.4270012378692627 + }, + { + "auxiliary_loss_clip": 0.01061817, + "auxiliary_loss_mlp": 0.01043085, + "balance_loss_clip": 1.01416516, + "balance_loss_mlp": 1.02041936, + "epoch": 0.4483992183977153, + "flos": 16179065170560.0, + "grad_norm": 2.126938931187814, + "language_loss": 0.69004887, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.71109784, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 7458, + "time_per_iteration": 2.3531150817871094 + }, + { + "auxiliary_loss_clip": 0.01063861, + "auxiliary_loss_mlp": 0.01052192, + "balance_loss_clip": 1.02333188, + "balance_loss_mlp": 1.01998734, + "epoch": 0.44845934165038326, + "flos": 17745164294400.0, + "grad_norm": 1.516548979560439, + "language_loss": 0.72714198, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74830246, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4375, + "step": 7459, + "time_per_iteration": 2.4357681274414062 + }, + { + "auxiliary_loss_clip": 0.01060758, + "auxiliary_loss_mlp": 0.01051066, + "balance_loss_clip": 1.02383876, + "balance_loss_mlp": 1.01954532, + "epoch": 0.4485194649030513, + "flos": 21834820200960.0, + "grad_norm": 3.2218251676191842, + "language_loss": 0.70643497, + "learning_rate": 2.427232068909154e-06, + "loss": 0.72755313, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41210938, + "step": 7460, + "time_per_iteration": 3.8474650382995605 + }, + { + "auxiliary_loss_clip": 0.01063723, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_clip": 1.02850187, + "balance_loss_mlp": 1.02148438, + "epoch": 0.44857958815571924, + "flos": 20083472069760.0, + "grad_norm": 2.011588832194262, + "language_loss": 0.79714096, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.81833273, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.421875, + "step": 7461, + "time_per_iteration": 2.386831521987915 + }, + { + "auxiliary_loss_clip": 0.01064398, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_clip": 1.01949072, + "balance_loss_mlp": 1.0213151, + "epoch": 0.4486397114083872, + "flos": 27052275623040.0, + "grad_norm": 3.3465856170604398, + "language_loss": 0.69754052, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.7186597, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.43164062, + "step": 7462, + "time_per_iteration": 2.479278802871704 + }, + { + "auxiliary_loss_clip": 0.0102157, + "auxiliary_loss_mlp": 0.01021977, + "balance_loss_clip": 1.01940238, + "balance_loss_mlp": 1.01300263, + "epoch": 0.4486998346610552, + "flos": 67318164155520.0, + "grad_norm": 1.950634939162188, + "language_loss": 0.54497659, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56541204, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.0859375, + "step": 7463, + "time_per_iteration": 3.0324108600616455 + }, + { + "auxiliary_loss_clip": 0.0106194, + "auxiliary_loss_mlp": 0.01042193, + "balance_loss_clip": 1.01539493, + "balance_loss_mlp": 1.02131641, + "epoch": 0.44875995791372314, + "flos": 27635569574400.0, + "grad_norm": 2.0350491602530942, + "language_loss": 0.77395082, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.79499215, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 7464, + "time_per_iteration": 2.483410120010376 + }, + { + "auxiliary_loss_clip": 0.01063165, + "auxiliary_loss_mlp": 0.01040989, + "balance_loss_clip": 1.01655126, + "balance_loss_mlp": 1.02321362, + "epoch": 0.4488200811663911, + "flos": 13005111070080.0, + "grad_norm": 1.7632963398374, + "language_loss": 0.76112831, + "learning_rate": 2.425329506653441e-06, + "loss": 0.78216988, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.40039062, + "step": 7465, + "time_per_iteration": 2.414313316345215 + }, + { + "auxiliary_loss_clip": 0.01067352, + "auxiliary_loss_mlp": 0.0105169, + "balance_loss_clip": 1.02119672, + "balance_loss_mlp": 1.02281857, + "epoch": 0.44888020441905907, + "flos": 27488759840640.0, + "grad_norm": 1.9759589488394178, + "language_loss": 0.81716955, + "learning_rate": 2.424948945758966e-06, + "loss": 0.83835995, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4453125, + "step": 7466, + "time_per_iteration": 2.475558042526245 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.01640475, + "balance_loss_mlp": 1.02355838, + "epoch": 0.44894032767172704, + "flos": 18258701616000.0, + "grad_norm": 2.257730549369128, + "language_loss": 0.81741452, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.83847308, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.41015625, + "step": 7467, + "time_per_iteration": 2.357679843902588 + }, + { + "auxiliary_loss_clip": 0.0106217, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.01791418, + "balance_loss_mlp": 1.02341485, + "epoch": 0.449000450924395, + "flos": 21578767223040.0, + "grad_norm": 1.6218531905722746, + "language_loss": 0.76381624, + "learning_rate": 2.424187775642129e-06, + "loss": 0.7848528, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38671875, + "step": 7468, + "time_per_iteration": 2.403348207473755 + }, + { + "auxiliary_loss_clip": 0.01064981, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.01716709, + "balance_loss_mlp": 1.02384937, + "epoch": 0.44906057417706297, + "flos": 17966932450560.0, + "grad_norm": 1.6417705449681603, + "language_loss": 0.71900392, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.74006414, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.41210938, + "step": 7469, + "time_per_iteration": 2.381255626678467 + }, + { + "auxiliary_loss_clip": 0.0106224, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.01959944, + "balance_loss_mlp": 1.02101719, + "epoch": 0.44912069742973093, + "flos": 20046324516480.0, + "grad_norm": 1.6446459304072352, + "language_loss": 0.72535467, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74644083, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41210938, + "step": 7470, + "time_per_iteration": 2.4082746505737305 + }, + { + "auxiliary_loss_clip": 0.0106462, + "auxiliary_loss_mlp": 0.01045524, + "balance_loss_clip": 1.01814163, + "balance_loss_mlp": 1.02308202, + "epoch": 0.4491808206823989, + "flos": 21032446268160.0, + "grad_norm": 1.7836943497017024, + "language_loss": 0.78435224, + "learning_rate": 2.423045899863634e-06, + "loss": 0.80545366, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41601562, + "step": 7471, + "time_per_iteration": 2.381223440170288 + }, + { + "auxiliary_loss_clip": 0.01061788, + "auxiliary_loss_mlp": 0.01059628, + "balance_loss_clip": 1.03069615, + "balance_loss_mlp": 1.02044487, + "epoch": 0.44924094393506686, + "flos": 22966006118400.0, + "grad_norm": 1.6957215060082014, + "language_loss": 0.71949792, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.74071217, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 7472, + "time_per_iteration": 2.4694647789001465 + }, + { + "auxiliary_loss_clip": 0.01017179, + "auxiliary_loss_mlp": 0.01007838, + "balance_loss_clip": 1.00497687, + "balance_loss_mlp": 1.00875127, + "epoch": 0.4493010671877349, + "flos": 59230323838080.0, + "grad_norm": 0.7878824221482037, + "language_loss": 0.61807913, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63832927, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.08398438, + "step": 7473, + "time_per_iteration": 2.9801182746887207 + }, + { + "auxiliary_loss_clip": 0.01060488, + "auxiliary_loss_mlp": 0.01047885, + "balance_loss_clip": 1.02210021, + "balance_loss_mlp": 1.01947725, + "epoch": 0.44936119044040285, + "flos": 18003905447040.0, + "grad_norm": 1.9129298958746344, + "language_loss": 0.7905125, + "learning_rate": 2.421903879707657e-06, + "loss": 0.81159621, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 7474, + "time_per_iteration": 2.4021730422973633 + }, + { + "auxiliary_loss_clip": 0.01057973, + "auxiliary_loss_mlp": 0.01058705, + "balance_loss_clip": 1.03121495, + "balance_loss_mlp": 1.0182879, + "epoch": 0.4494213136930708, + "flos": 21250758199680.0, + "grad_norm": 1.6572647697526866, + "language_loss": 0.73197532, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.75314212, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.3984375, + "step": 7475, + "time_per_iteration": 2.3916754722595215 + }, + { + "auxiliary_loss_clip": 0.01059435, + "auxiliary_loss_mlp": 0.01058973, + "balance_loss_clip": 1.03219891, + "balance_loss_mlp": 1.01768494, + "epoch": 0.4494814369457388, + "flos": 27417432199680.0, + "grad_norm": 1.8982123997833262, + "language_loss": 0.78000748, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.80119157, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41796875, + "step": 7476, + "time_per_iteration": 2.45082426071167 + }, + { + "auxiliary_loss_clip": 0.01061794, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.03804803, + "balance_loss_mlp": 1.01924014, + "epoch": 0.44954156019840674, + "flos": 22853027358720.0, + "grad_norm": 2.3153479449573022, + "language_loss": 0.73268032, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.75398231, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42578125, + "step": 7477, + "time_per_iteration": 2.405015468597412 + }, + { + "auxiliary_loss_clip": 0.01063634, + "auxiliary_loss_mlp": 0.01055901, + "balance_loss_clip": 1.02607441, + "balance_loss_mlp": 1.02017474, + "epoch": 0.4496016834510747, + "flos": 17200623818880.0, + "grad_norm": 1.9902788312258757, + "language_loss": 0.70298278, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.72417808, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43554688, + "step": 7478, + "time_per_iteration": 2.411987781524658 + }, + { + "auxiliary_loss_clip": 0.0105736, + "auxiliary_loss_mlp": 0.010594, + "balance_loss_clip": 1.03255463, + "balance_loss_mlp": 1.01874053, + "epoch": 0.4496618067037427, + "flos": 18915627358080.0, + "grad_norm": 1.8224918742826195, + "language_loss": 0.9009285, + "learning_rate": 2.420000193000779e-06, + "loss": 0.92209613, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 7479, + "time_per_iteration": 2.372098207473755 + }, + { + "auxiliary_loss_clip": 0.01063053, + "auxiliary_loss_mlp": 0.01060557, + "balance_loss_clip": 1.03250718, + "balance_loss_mlp": 1.02134788, + "epoch": 0.44972192995641064, + "flos": 21030630877440.0, + "grad_norm": 1.665484434548596, + "language_loss": 0.76902831, + "learning_rate": 2.419619407822302e-06, + "loss": 0.79026437, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 7480, + "time_per_iteration": 2.429140567779541 + }, + { + "auxiliary_loss_clip": 0.01065452, + "auxiliary_loss_mlp": 0.01057482, + "balance_loss_clip": 1.02763176, + "balance_loss_mlp": 1.02212262, + "epoch": 0.4497820532090786, + "flos": 20776044176640.0, + "grad_norm": 2.1280579417771657, + "language_loss": 0.81380546, + "learning_rate": 2.419238606731815e-06, + "loss": 0.83503479, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43359375, + "step": 7481, + "time_per_iteration": 2.4074056148529053 + }, + { + "auxiliary_loss_clip": 0.01060795, + "auxiliary_loss_mlp": 0.01044928, + "balance_loss_clip": 1.01870203, + "balance_loss_mlp": 1.02140367, + "epoch": 0.44984217646174657, + "flos": 33801196233600.0, + "grad_norm": 1.7390548316514016, + "language_loss": 0.69721901, + "learning_rate": 2.418857789743758e-06, + "loss": 0.71827614, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 7482, + "time_per_iteration": 2.5256571769714355 + }, + { + "auxiliary_loss_clip": 0.01064254, + "auxiliary_loss_mlp": 0.01048692, + "balance_loss_clip": 1.01990283, + "balance_loss_mlp": 1.02246571, + "epoch": 0.44990229971441453, + "flos": 15517600951680.0, + "grad_norm": 4.112418058019232, + "language_loss": 0.85994375, + "learning_rate": 2.418476956872571e-06, + "loss": 0.88107324, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7483, + "time_per_iteration": 2.354069232940674 + }, + { + "auxiliary_loss_clip": 0.01068223, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.01977134, + "balance_loss_mlp": 1.02437997, + "epoch": 0.4499624229670825, + "flos": 29860619299200.0, + "grad_norm": 1.738103393695953, + "language_loss": 0.81887078, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.84003592, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4375, + "step": 7484, + "time_per_iteration": 2.502774953842163 + }, + { + "auxiliary_loss_clip": 0.01071005, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.01502812, + "balance_loss_mlp": 1.02381635, + "epoch": 0.45002254621975046, + "flos": 18512729735040.0, + "grad_norm": 2.609101713777786, + "language_loss": 0.76085693, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.78206676, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.47265625, + "step": 7485, + "time_per_iteration": 2.401819944381714 + }, + { + "auxiliary_loss_clip": 0.01025166, + "auxiliary_loss_mlp": 0.01023606, + "balance_loss_clip": 1.0210551, + "balance_loss_mlp": 1.01624537, + "epoch": 0.4500826694724185, + "flos": 70417334390400.0, + "grad_norm": 0.807937363010597, + "language_loss": 0.5890553, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60954297, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.08886719, + "step": 7486, + "time_per_iteration": 3.0681605339050293 + }, + { + "auxiliary_loss_clip": 0.01067933, + "auxiliary_loss_mlp": 0.01047795, + "balance_loss_clip": 1.01856518, + "balance_loss_mlp": 1.02484572, + "epoch": 0.45014279272508645, + "flos": 15777982938240.0, + "grad_norm": 1.9189894107674152, + "language_loss": 0.84294266, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.86409998, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 7487, + "time_per_iteration": 2.3868765830993652 + }, + { + "auxiliary_loss_clip": 0.01069, + "auxiliary_loss_mlp": 0.01043834, + "balance_loss_clip": 1.01582026, + "balance_loss_mlp": 1.02709186, + "epoch": 0.4502029159777544, + "flos": 21798475608960.0, + "grad_norm": 1.5297381873543638, + "language_loss": 0.78511143, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.80623972, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 7488, + "time_per_iteration": 2.4674696922302246 + }, + { + "auxiliary_loss_clip": 0.01072994, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.01295733, + "balance_loss_mlp": 1.02663648, + "epoch": 0.4502630392304224, + "flos": 28766685669120.0, + "grad_norm": 2.1089852196065686, + "language_loss": 0.73376125, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.75494695, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.46289062, + "step": 7489, + "time_per_iteration": 2.448042154312134 + }, + { + "auxiliary_loss_clip": 0.01072346, + "auxiliary_loss_mlp": 0.01054469, + "balance_loss_clip": 1.01939726, + "balance_loss_mlp": 1.02700043, + "epoch": 0.45032316248309034, + "flos": 15843480382080.0, + "grad_norm": 2.2370465351909483, + "language_loss": 0.70779502, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.72906315, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 0.453125, + "step": 7490, + "time_per_iteration": 2.4084415435791016 + }, + { + "auxiliary_loss_clip": 0.01031972, + "auxiliary_loss_mlp": 0.01004894, + "balance_loss_clip": 1.00240278, + "balance_loss_mlp": 1.02197397, + "epoch": 0.4503832857357583, + "flos": 57850311594240.0, + "grad_norm": 0.7854790760003224, + "language_loss": 0.56746721, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58783585, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.10009766, + "step": 7491, + "time_per_iteration": 4.339057922363281 + }, + { + "auxiliary_loss_clip": 0.0106863, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_clip": 1.02546573, + "balance_loss_mlp": 1.0264523, + "epoch": 0.4504434089884263, + "flos": 23876959979520.0, + "grad_norm": 1.742570288445148, + "language_loss": 0.80823934, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.82944769, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.421875, + "step": 7492, + "time_per_iteration": 2.481872081756592 + }, + { + "auxiliary_loss_clip": 0.01069872, + "auxiliary_loss_mlp": 0.01063411, + "balance_loss_clip": 1.03379965, + "balance_loss_mlp": 1.02482629, + "epoch": 0.45050353224109424, + "flos": 17784127238400.0, + "grad_norm": 2.3968384721121043, + "language_loss": 0.9345969, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.95592976, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 7493, + "time_per_iteration": 3.7811665534973145 + }, + { + "auxiliary_loss_clip": 0.01027315, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.03116536, + "balance_loss_mlp": 1.01757753, + "epoch": 0.4505636554937622, + "flos": 65060330797440.0, + "grad_norm": 0.8257773623827361, + "language_loss": 0.62939882, + "learning_rate": 2.4142867511336e-06, + "loss": 0.65000689, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.09716797, + "step": 7494, + "time_per_iteration": 4.562364816665649 + }, + { + "auxiliary_loss_clip": 0.01066032, + "auxiliary_loss_mlp": 0.01055043, + "balance_loss_clip": 1.02701688, + "balance_loss_mlp": 1.02344131, + "epoch": 0.45062377874643017, + "flos": 22198999259520.0, + "grad_norm": 1.4160153824057204, + "language_loss": 0.82280254, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84401333, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42578125, + "step": 7495, + "time_per_iteration": 2.431205987930298 + }, + { + "auxiliary_loss_clip": 0.01066117, + "auxiliary_loss_mlp": 0.0105786, + "balance_loss_clip": 1.02739, + "balance_loss_mlp": 1.02221024, + "epoch": 0.45068390199909814, + "flos": 37668769781760.0, + "grad_norm": 1.7182156437282932, + "language_loss": 0.86489022, + "learning_rate": 2.41352469075395e-06, + "loss": 0.88612998, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4375, + "step": 7496, + "time_per_iteration": 2.570542812347412 + }, + { + "auxiliary_loss_clip": 0.01065489, + "auxiliary_loss_mlp": 0.01052528, + "balance_loss_clip": 1.02037668, + "balance_loss_mlp": 1.02118182, + "epoch": 0.4507440252517661, + "flos": 22301609345280.0, + "grad_norm": 2.0406691434005944, + "language_loss": 0.76899946, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.79017961, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44335938, + "step": 7497, + "time_per_iteration": 2.3763365745544434 + }, + { + "auxiliary_loss_clip": 0.010652, + "auxiliary_loss_mlp": 0.01053849, + "balance_loss_clip": 1.02440405, + "balance_loss_mlp": 1.02122402, + "epoch": 0.45080414850443407, + "flos": 13187532257280.0, + "grad_norm": 1.9457869926186593, + "language_loss": 0.75808609, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77927649, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43945312, + "step": 7498, + "time_per_iteration": 2.449838399887085 + }, + { + "auxiliary_loss_clip": 0.01063421, + "auxiliary_loss_mlp": 0.01050808, + "balance_loss_clip": 1.01863396, + "balance_loss_mlp": 1.02036822, + "epoch": 0.4508642717571021, + "flos": 21943853976960.0, + "grad_norm": 2.542104794700752, + "language_loss": 0.71806896, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.7392112, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4296875, + "step": 7499, + "time_per_iteration": 2.402391195297241 + }, + { + "auxiliary_loss_clip": 0.0106586, + "auxiliary_loss_mlp": 0.01056884, + "balance_loss_clip": 1.02654552, + "balance_loss_mlp": 1.02161193, + "epoch": 0.45092439500977005, + "flos": 23366355212160.0, + "grad_norm": 2.1771482242174267, + "language_loss": 0.78594398, + "learning_rate": 2.412000381939477e-06, + "loss": 0.8071714, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44140625, + "step": 7500, + "time_per_iteration": 3.9439070224761963 + }, + { + "auxiliary_loss_clip": 0.01064382, + "auxiliary_loss_mlp": 0.01044894, + "balance_loss_clip": 1.01341081, + "balance_loss_mlp": 1.02159595, + "epoch": 0.450984518262438, + "flos": 20772029370240.0, + "grad_norm": 1.9239433131708068, + "language_loss": 0.6347084, + "learning_rate": 2.411619265641992e-06, + "loss": 0.65580118, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4296875, + "step": 7501, + "time_per_iteration": 2.3783490657806396 + }, + { + "auxiliary_loss_clip": 0.01066746, + "auxiliary_loss_mlp": 0.01052206, + "balance_loss_clip": 1.02118731, + "balance_loss_mlp": 1.02185822, + "epoch": 0.451044641515106, + "flos": 17706550464000.0, + "grad_norm": 2.459238839527523, + "language_loss": 0.8586114, + "learning_rate": 2.411238133735863e-06, + "loss": 0.87980092, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.44921875, + "step": 7502, + "time_per_iteration": 2.4061007499694824 + }, + { + "auxiliary_loss_clip": 0.01064241, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.01267207, + "balance_loss_mlp": 1.02265012, + "epoch": 0.45110476476777395, + "flos": 20593657900800.0, + "grad_norm": 1.3701952291222161, + "language_loss": 0.80600768, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.82707709, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41601562, + "step": 7503, + "time_per_iteration": 2.395280361175537 + }, + { + "auxiliary_loss_clip": 0.01066021, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.01604843, + "balance_loss_mlp": 1.02451015, + "epoch": 0.4511648880204419, + "flos": 16033128220800.0, + "grad_norm": 2.3016941142101417, + "language_loss": 0.81983912, + "learning_rate": 2.410475823155484e-06, + "loss": 0.8409363, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 7504, + "time_per_iteration": 2.426525592803955 + }, + { + "auxiliary_loss_clip": 0.01066357, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.01317477, + "balance_loss_mlp": 1.02408779, + "epoch": 0.4512250112731099, + "flos": 23977929231360.0, + "grad_norm": 1.6788049799273335, + "language_loss": 0.6470964, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.66817462, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 7505, + "time_per_iteration": 2.3967859745025635 + }, + { + "auxiliary_loss_clip": 0.01027602, + "auxiliary_loss_mlp": 0.01005706, + "balance_loss_clip": 1.00332224, + "balance_loss_mlp": 1.01846719, + "epoch": 0.45128513452577784, + "flos": 71458652868480.0, + "grad_norm": 0.8353640560549044, + "language_loss": 0.58959502, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60992813, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.09130859, + "step": 7506, + "time_per_iteration": 3.1318719387054443 + }, + { + "auxiliary_loss_clip": 0.01069628, + "auxiliary_loss_mlp": 0.01049037, + "balance_loss_clip": 1.01887751, + "balance_loss_mlp": 1.02651286, + "epoch": 0.4513452577784458, + "flos": 22089756015360.0, + "grad_norm": 1.624289331125538, + "language_loss": 0.80226862, + "learning_rate": 2.40933224058142e-06, + "loss": 0.82345533, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4296875, + "step": 7507, + "time_per_iteration": 2.3989713191986084 + }, + { + "auxiliary_loss_clip": 0.01070123, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.01767898, + "balance_loss_mlp": 1.02617681, + "epoch": 0.4514053810311138, + "flos": 24275354037120.0, + "grad_norm": 1.513531296535427, + "language_loss": 0.75116217, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.772367, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43945312, + "step": 7508, + "time_per_iteration": 2.4558751583099365 + }, + { + "auxiliary_loss_clip": 0.01066236, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_clip": 1.01772428, + "balance_loss_mlp": 1.02508795, + "epoch": 0.45146550428378174, + "flos": 17886039096960.0, + "grad_norm": 2.3790295099153083, + "language_loss": 0.80915803, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.83027494, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 7509, + "time_per_iteration": 2.3785064220428467 + }, + { + "auxiliary_loss_clip": 0.01068017, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.01950121, + "balance_loss_mlp": 1.02527213, + "epoch": 0.4515256275364497, + "flos": 24242291112960.0, + "grad_norm": 1.8872486767345809, + "language_loss": 0.75019985, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.77136743, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42773438, + "step": 7510, + "time_per_iteration": 2.487206220626831 + }, + { + "auxiliary_loss_clip": 0.01069483, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.01531506, + "balance_loss_mlp": 1.02554691, + "epoch": 0.45158575078911767, + "flos": 20630002492800.0, + "grad_norm": 1.7782379598352216, + "language_loss": 0.78783864, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.80899131, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43945312, + "step": 7511, + "time_per_iteration": 2.402622699737549 + }, + { + "auxiliary_loss_clip": 0.01066936, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.01261628, + "balance_loss_mlp": 1.02255726, + "epoch": 0.45164587404178563, + "flos": 23326728952320.0, + "grad_norm": 1.6629933873265164, + "language_loss": 0.80061173, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.8216992, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44335938, + "step": 7512, + "time_per_iteration": 2.440854787826538 + }, + { + "auxiliary_loss_clip": 0.01071546, + "auxiliary_loss_mlp": 0.01053005, + "balance_loss_clip": 1.01941216, + "balance_loss_mlp": 1.02508426, + "epoch": 0.45170599729445365, + "flos": 23804829377280.0, + "grad_norm": 2.0873175716152046, + "language_loss": 0.88352227, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.90476775, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.46484375, + "step": 7513, + "time_per_iteration": 2.4164586067199707 + }, + { + "auxiliary_loss_clip": 0.01061937, + "auxiliary_loss_mlp": 0.01040559, + "balance_loss_clip": 1.01553655, + "balance_loss_mlp": 1.02298594, + "epoch": 0.4517661205471216, + "flos": 23511838314240.0, + "grad_norm": 1.7516360184333193, + "language_loss": 0.68372703, + "learning_rate": 2.406663338649419e-06, + "loss": 0.70475203, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 7514, + "time_per_iteration": 2.448357343673706 + }, + { + "auxiliary_loss_clip": 0.01067267, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_clip": 1.01671004, + "balance_loss_mlp": 1.02399337, + "epoch": 0.4518262437997896, + "flos": 23512815832320.0, + "grad_norm": 1.8136106769651723, + "language_loss": 0.70960569, + "learning_rate": 2.406282005146318e-06, + "loss": 0.73077357, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43359375, + "step": 7515, + "time_per_iteration": 2.411487102508545 + }, + { + "auxiliary_loss_clip": 0.01067135, + "auxiliary_loss_mlp": 0.01055349, + "balance_loss_clip": 1.02262652, + "balance_loss_mlp": 1.02185559, + "epoch": 0.45188636705245755, + "flos": 14567369944320.0, + "grad_norm": 2.2380827510852037, + "language_loss": 0.82904404, + "learning_rate": 2.405900656236963e-06, + "loss": 0.8502689, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.453125, + "step": 7516, + "time_per_iteration": 2.379821300506592 + }, + { + "auxiliary_loss_clip": 0.01061956, + "auxiliary_loss_mlp": 0.01051877, + "balance_loss_clip": 1.02303994, + "balance_loss_mlp": 1.02099013, + "epoch": 0.4519464903051255, + "flos": 19900527212160.0, + "grad_norm": 1.776936438771301, + "language_loss": 0.6733954, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.69453371, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41015625, + "step": 7517, + "time_per_iteration": 2.3759145736694336 + }, + { + "auxiliary_loss_clip": 0.01063887, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.01359022, + "balance_loss_mlp": 1.02303934, + "epoch": 0.4520066135577935, + "flos": 18843357110400.0, + "grad_norm": 1.854733095290723, + "language_loss": 0.64080125, + "learning_rate": 2.405137912257333e-06, + "loss": 0.66184187, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40820312, + "step": 7518, + "time_per_iteration": 2.4072892665863037 + }, + { + "auxiliary_loss_clip": 0.01063657, + "auxiliary_loss_mlp": 0.01047478, + "balance_loss_clip": 1.0197736, + "balance_loss_mlp": 1.02139246, + "epoch": 0.45206673681046144, + "flos": 48212609667840.0, + "grad_norm": 1.3158955196885416, + "language_loss": 0.60289198, + "learning_rate": 2.404756517215982e-06, + "loss": 0.62400329, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42382812, + "step": 7519, + "time_per_iteration": 2.6101343631744385 + }, + { + "auxiliary_loss_clip": 0.01065322, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.01878345, + "balance_loss_mlp": 1.0223496, + "epoch": 0.4521268600631294, + "flos": 23841034323840.0, + "grad_norm": 1.5500488275541031, + "language_loss": 0.7340861, + "learning_rate": 2.404375106826223e-06, + "loss": 0.75523472, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4296875, + "step": 7520, + "time_per_iteration": 2.4613234996795654 + }, + { + "auxiliary_loss_clip": 0.01063357, + "auxiliary_loss_mlp": 0.01050845, + "balance_loss_clip": 1.0214839, + "balance_loss_mlp": 1.02024734, + "epoch": 0.4521869833157974, + "flos": 18842623971840.0, + "grad_norm": 2.3132486496994433, + "language_loss": 0.76998031, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.79112238, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43164062, + "step": 7521, + "time_per_iteration": 2.357624053955078 + }, + { + "auxiliary_loss_clip": 0.01065544, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_clip": 1.02214146, + "balance_loss_mlp": 1.02081573, + "epoch": 0.45224710656846534, + "flos": 19787164427520.0, + "grad_norm": 1.7082416112724692, + "language_loss": 0.68585873, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70703197, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44726562, + "step": 7522, + "time_per_iteration": 2.415496587753296 + }, + { + "auxiliary_loss_clip": 0.0106102, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.02298903, + "balance_loss_mlp": 1.01891303, + "epoch": 0.4523072298211333, + "flos": 28254893915520.0, + "grad_norm": 1.5460426862177024, + "language_loss": 0.62509149, + "learning_rate": 2.403230783711134e-06, + "loss": 0.64621246, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 7523, + "time_per_iteration": 2.43674898147583 + }, + { + "auxiliary_loss_clip": 0.01065695, + "auxiliary_loss_mlp": 0.01053911, + "balance_loss_clip": 1.02308369, + "balance_loss_mlp": 1.02156031, + "epoch": 0.45236735307380127, + "flos": 11180270793600.0, + "grad_norm": 2.2580041357568055, + "language_loss": 0.8005116, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.82170773, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.44140625, + "step": 7524, + "time_per_iteration": 2.388261556625366 + }, + { + "auxiliary_loss_clip": 0.01063154, + "auxiliary_loss_mlp": 0.01051813, + "balance_loss_clip": 1.02388227, + "balance_loss_mlp": 1.02094293, + "epoch": 0.45242747632646924, + "flos": 22600290960000.0, + "grad_norm": 1.9958813143792098, + "language_loss": 0.64987016, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.67101979, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.421875, + "step": 7525, + "time_per_iteration": 2.40183424949646 + }, + { + "auxiliary_loss_clip": 0.01061798, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_clip": 1.02179217, + "balance_loss_mlp": 1.02022898, + "epoch": 0.45248759957913726, + "flos": 18255385036800.0, + "grad_norm": 1.5839960717519521, + "language_loss": 0.80485368, + "learning_rate": 2.402086322981083e-06, + "loss": 0.82596231, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7526, + "time_per_iteration": 2.384517192840576 + }, + { + "auxiliary_loss_clip": 0.01060406, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02108049, + "balance_loss_mlp": 1.01951313, + "epoch": 0.4525477228318052, + "flos": 22449152217600.0, + "grad_norm": 1.62904524332469, + "language_loss": 0.81982303, + "learning_rate": 2.40170480555747e-06, + "loss": 0.84091592, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 7527, + "time_per_iteration": 2.3906197547912598 + }, + { + "auxiliary_loss_clip": 0.01063523, + "auxiliary_loss_mlp": 0.01043764, + "balance_loss_clip": 1.01589298, + "balance_loss_mlp": 1.02164245, + "epoch": 0.4526078460844732, + "flos": 29643529265280.0, + "grad_norm": 1.9612117783958356, + "language_loss": 0.66406012, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.68513298, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 7528, + "time_per_iteration": 2.5078327655792236 + }, + { + "auxiliary_loss_clip": 0.01060165, + "auxiliary_loss_mlp": 0.01045031, + "balance_loss_clip": 1.01782775, + "balance_loss_mlp": 1.01976204, + "epoch": 0.45266796933714115, + "flos": 23038625479680.0, + "grad_norm": 1.5443909066694728, + "language_loss": 0.76207799, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.78312999, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40429688, + "step": 7529, + "time_per_iteration": 2.4032959938049316 + }, + { + "auxiliary_loss_clip": 0.01062269, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_clip": 1.0177784, + "balance_loss_mlp": 1.02080059, + "epoch": 0.4527280925898091, + "flos": 14427542482560.0, + "grad_norm": 2.429139085087504, + "language_loss": 0.74597615, + "learning_rate": 2.400560161948384e-06, + "loss": 0.76705778, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 7530, + "time_per_iteration": 2.366192102432251 + }, + { + "auxiliary_loss_clip": 0.01063731, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.01528788, + "balance_loss_mlp": 1.02165318, + "epoch": 0.4527882158424771, + "flos": 22924529556480.0, + "grad_norm": 1.8486309003832744, + "language_loss": 0.77156699, + "learning_rate": 2.400178583680834e-06, + "loss": 0.79263824, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 7531, + "time_per_iteration": 3.754228115081787 + }, + { + "auxiliary_loss_clip": 0.01060354, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.01371408, + "balance_loss_mlp": 1.02066374, + "epoch": 0.45284833909514505, + "flos": 25554187560960.0, + "grad_norm": 2.630467720386154, + "language_loss": 0.6859082, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.70692718, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39648438, + "step": 7532, + "time_per_iteration": 2.449387788772583 + }, + { + "auxiliary_loss_clip": 0.01062368, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.01062393, + "balance_loss_mlp": 1.02224588, + "epoch": 0.452908462347813, + "flos": 18149039435520.0, + "grad_norm": 5.213538689237841, + "language_loss": 0.78910911, + "learning_rate": 2.399415381635768e-06, + "loss": 0.8101114, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 7533, + "time_per_iteration": 3.7315876483917236 + }, + { + "auxiliary_loss_clip": 0.01068391, + "auxiliary_loss_mlp": 0.01048682, + "balance_loss_clip": 1.0167582, + "balance_loss_mlp": 1.0218339, + "epoch": 0.452968585600481, + "flos": 19061738864640.0, + "grad_norm": 1.802402658000889, + "language_loss": 0.85077262, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.87194335, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46679688, + "step": 7534, + "time_per_iteration": 2.4087021350860596 + }, + { + "auxiliary_loss_clip": 0.01065911, + "auxiliary_loss_mlp": 0.01041273, + "balance_loss_clip": 1.01290154, + "balance_loss_mlp": 1.0240314, + "epoch": 0.45302870885314894, + "flos": 22050723248640.0, + "grad_norm": 1.7093093689011205, + "language_loss": 0.77556741, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.7966392, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 7535, + "time_per_iteration": 3.817349672317505 + }, + { + "auxiliary_loss_clip": 0.01062766, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_clip": 1.01454377, + "balance_loss_mlp": 1.02259207, + "epoch": 0.4530888321058169, + "flos": 20375171412480.0, + "grad_norm": 1.5338947837655712, + "language_loss": 0.82336545, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.84439719, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 7536, + "time_per_iteration": 2.3797943592071533 + }, + { + "auxiliary_loss_clip": 0.01063912, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.01387334, + "balance_loss_mlp": 1.02163112, + "epoch": 0.4531489553584849, + "flos": 14829532410240.0, + "grad_norm": 1.6289766735434226, + "language_loss": 0.77005279, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.79110122, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.421875, + "step": 7537, + "time_per_iteration": 2.408872127532959 + }, + { + "auxiliary_loss_clip": 0.01064275, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.01450133, + "balance_loss_mlp": 1.02171504, + "epoch": 0.45320907861115284, + "flos": 21943888888320.0, + "grad_norm": 2.0384824522201086, + "language_loss": 0.76922905, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.79027784, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.42578125, + "step": 7538, + "time_per_iteration": 2.40981125831604 + }, + { + "auxiliary_loss_clip": 0.01017276, + "auxiliary_loss_mlp": 0.01023723, + "balance_loss_clip": 1.02080202, + "balance_loss_mlp": 1.00921988, + "epoch": 0.45326920186382086, + "flos": 66247760782080.0, + "grad_norm": 0.8062914014356619, + "language_loss": 0.62353498, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64394498, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.0291748, + "router_z_loss_mlp": 0.08056641, + "step": 7539, + "time_per_iteration": 4.464643478393555 + }, + { + "auxiliary_loss_clip": 0.01062909, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.01495385, + "balance_loss_mlp": 1.02255869, + "epoch": 0.4533293251164888, + "flos": 14683351080960.0, + "grad_norm": 1.7206702170265986, + "language_loss": 0.67246509, + "learning_rate": 2.396743698142872e-06, + "loss": 0.69350743, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 7540, + "time_per_iteration": 2.411545753479004 + }, + { + "auxiliary_loss_clip": 0.01066391, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.02211607, + "epoch": 0.4533894483691568, + "flos": 22600116403200.0, + "grad_norm": 1.9064700828604393, + "language_loss": 0.86361408, + "learning_rate": 2.396361968778424e-06, + "loss": 0.88485807, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 7541, + "time_per_iteration": 2.4239509105682373 + }, + { + "auxiliary_loss_clip": 0.01063443, + "auxiliary_loss_mlp": 0.01052241, + "balance_loss_clip": 1.02587235, + "balance_loss_mlp": 1.02195084, + "epoch": 0.45344957162182475, + "flos": 34750170432000.0, + "grad_norm": 1.695000076332881, + "language_loss": 0.7811783, + "learning_rate": 2.395980224383889e-06, + "loss": 0.8023352, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41601562, + "step": 7542, + "time_per_iteration": 2.534183979034424 + }, + { + "auxiliary_loss_clip": 0.01063376, + "auxiliary_loss_mlp": 0.01049391, + "balance_loss_clip": 1.02222371, + "balance_loss_mlp": 1.02179372, + "epoch": 0.4535096948744927, + "flos": 23549090601600.0, + "grad_norm": 2.028592681332716, + "language_loss": 0.81631964, + "learning_rate": 2.395598464973746e-06, + "loss": 0.83744735, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41601562, + "step": 7543, + "time_per_iteration": 2.431640863418579 + }, + { + "auxiliary_loss_clip": 0.01064203, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03461576, + "balance_loss_mlp": 1.02202845, + "epoch": 0.4535698181271607, + "flos": 25556352065280.0, + "grad_norm": 1.7051442945146436, + "language_loss": 0.77486491, + "learning_rate": 2.395216690562469e-06, + "loss": 0.79611409, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.421875, + "step": 7544, + "time_per_iteration": 2.4908459186553955 + }, + { + "auxiliary_loss_clip": 0.010633, + "auxiliary_loss_mlp": 0.01063235, + "balance_loss_clip": 1.03616285, + "balance_loss_mlp": 1.02143908, + "epoch": 0.45362994137982865, + "flos": 24862942085760.0, + "grad_norm": 1.6798603648020651, + "language_loss": 0.76961583, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.79088116, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41796875, + "step": 7545, + "time_per_iteration": 2.421607255935669 + }, + { + "auxiliary_loss_clip": 0.01062926, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_clip": 1.02634287, + "balance_loss_mlp": 1.02091742, + "epoch": 0.4536900646324966, + "flos": 30805578691200.0, + "grad_norm": 1.6139418750589096, + "language_loss": 0.72845864, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74960148, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.41992188, + "step": 7546, + "time_per_iteration": 2.503974437713623 + }, + { + "auxiliary_loss_clip": 0.01062174, + "auxiliary_loss_mlp": 0.0106323, + "balance_loss_clip": 1.0345248, + "balance_loss_mlp": 1.01973009, + "epoch": 0.4537501878851646, + "flos": 23403188563200.0, + "grad_norm": 1.5735791045972793, + "language_loss": 0.77323651, + "learning_rate": 2.394071277466609e-06, + "loss": 0.79449058, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42578125, + "step": 7547, + "time_per_iteration": 2.4390709400177 + }, + { + "auxiliary_loss_clip": 0.01061625, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03235483, + "balance_loss_mlp": 1.0198431, + "epoch": 0.45381031113783254, + "flos": 18148341208320.0, + "grad_norm": 2.3845396588392993, + "language_loss": 0.70914245, + "learning_rate": 2.393689443195573e-06, + "loss": 0.73037684, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41796875, + "step": 7548, + "time_per_iteration": 2.3879354000091553 + }, + { + "auxiliary_loss_clip": 0.01059379, + "auxiliary_loss_mlp": 0.01064733, + "balance_loss_clip": 1.03879273, + "balance_loss_mlp": 1.01851368, + "epoch": 0.4538704343905005, + "flos": 25335526515840.0, + "grad_norm": 2.1761699459910853, + "language_loss": 0.73978591, + "learning_rate": 2.393307593995794e-06, + "loss": 0.7610271, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40820312, + "step": 7549, + "time_per_iteration": 2.434638261795044 + }, + { + "auxiliary_loss_clip": 0.01058528, + "auxiliary_loss_mlp": 0.01054712, + "balance_loss_clip": 1.0286293, + "balance_loss_mlp": 1.01839447, + "epoch": 0.4539305576431685, + "flos": 28730166520320.0, + "grad_norm": 1.4053676913984436, + "language_loss": 0.66369891, + "learning_rate": 2.392925729881751e-06, + "loss": 0.68483126, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 7550, + "time_per_iteration": 2.4462249279022217 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_clip": 1.02585125, + "balance_loss_mlp": 1.01892221, + "epoch": 0.45399068089583644, + "flos": 22491292095360.0, + "grad_norm": 1.607829597010775, + "language_loss": 0.69687402, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.71798658, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 7551, + "time_per_iteration": 2.4201266765594482 + }, + { + "auxiliary_loss_clip": 0.01060958, + "auxiliary_loss_mlp": 0.01050634, + "balance_loss_clip": 1.02368093, + "balance_loss_mlp": 1.01808548, + "epoch": 0.45405080414850446, + "flos": 12892655980800.0, + "grad_norm": 1.767975966681464, + "language_loss": 0.80474919, + "learning_rate": 2.392161956968798e-06, + "loss": 0.82586509, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4296875, + "step": 7552, + "time_per_iteration": 2.3344578742980957 + }, + { + "auxiliary_loss_clip": 0.01011773, + "auxiliary_loss_mlp": 0.01005534, + "balance_loss_clip": 1.00312567, + "balance_loss_mlp": 1.00369716, + "epoch": 0.4541109274011724, + "flos": 59764146526080.0, + "grad_norm": 0.8189387899927604, + "language_loss": 0.57869726, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59887034, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.08105469, + "step": 7553, + "time_per_iteration": 2.9818027019500732 + }, + { + "auxiliary_loss_clip": 0.01058432, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.01789212, + "balance_loss_mlp": 1.01884305, + "epoch": 0.4541710506538404, + "flos": 28510493045760.0, + "grad_norm": 1.374811528453148, + "language_loss": 0.77982134, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.80083573, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39648438, + "step": 7554, + "time_per_iteration": 2.4302072525024414 + }, + { + "auxiliary_loss_clip": 0.01065392, + "auxiliary_loss_mlp": 0.01044467, + "balance_loss_clip": 1.01460481, + "balance_loss_mlp": 1.02152681, + "epoch": 0.45423117390650836, + "flos": 17674639614720.0, + "grad_norm": 2.5767865161365298, + "language_loss": 0.78525281, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.80635142, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43945312, + "step": 7555, + "time_per_iteration": 2.3951215744018555 + }, + { + "auxiliary_loss_clip": 0.01061327, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.01185906, + "balance_loss_mlp": 1.02034211, + "epoch": 0.4542912971591763, + "flos": 28071355564800.0, + "grad_norm": 1.4496442320092022, + "language_loss": 0.73375082, + "learning_rate": 2.390634232808903e-06, + "loss": 0.75474042, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 7556, + "time_per_iteration": 2.4454572200775146 + }, + { + "auxiliary_loss_clip": 0.01065257, + "auxiliary_loss_mlp": 0.01042165, + "balance_loss_clip": 1.01076519, + "balance_loss_mlp": 1.02113295, + "epoch": 0.4543514204118443, + "flos": 22670745816960.0, + "grad_norm": 1.8180800008287303, + "language_loss": 0.64687508, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.66794926, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.44140625, + "step": 7557, + "time_per_iteration": 2.412522792816162 + }, + { + "auxiliary_loss_clip": 0.01018487, + "auxiliary_loss_mlp": 0.01004049, + "balance_loss_clip": 1.00129485, + "balance_loss_mlp": 1.00995886, + "epoch": 0.45441154366451225, + "flos": 58213303666560.0, + "grad_norm": 0.6929412655031698, + "language_loss": 0.57646191, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.5966872, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.08496094, + "step": 7558, + "time_per_iteration": 2.9754416942596436 + }, + { + "auxiliary_loss_clip": 0.01065701, + "auxiliary_loss_mlp": 0.0105064, + "balance_loss_clip": 1.01774967, + "balance_loss_mlp": 1.02225137, + "epoch": 0.4544716669171802, + "flos": 16763336640000.0, + "grad_norm": 3.085064424633043, + "language_loss": 0.5941056, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.61526906, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.43554688, + "step": 7559, + "time_per_iteration": 2.36574125289917 + }, + { + "auxiliary_loss_clip": 0.01061778, + "auxiliary_loss_mlp": 0.01047297, + "balance_loss_clip": 1.02003407, + "balance_loss_mlp": 1.02090061, + "epoch": 0.4545317901698482, + "flos": 15924303912960.0, + "grad_norm": 1.8276368582948233, + "language_loss": 0.73037326, + "learning_rate": 2.389106271642792e-06, + "loss": 0.75146401, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40820312, + "step": 7560, + "time_per_iteration": 2.4502646923065186 + }, + { + "auxiliary_loss_clip": 0.010648, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.01622701, + "balance_loss_mlp": 1.02125943, + "epoch": 0.45459191342251615, + "flos": 17638783781760.0, + "grad_norm": 2.8251203747107474, + "language_loss": 0.7071203, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.72824472, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43554688, + "step": 7561, + "time_per_iteration": 2.3899810314178467 + }, + { + "auxiliary_loss_clip": 0.01057716, + "auxiliary_loss_mlp": 0.01037013, + "balance_loss_clip": 1.0113945, + "balance_loss_mlp": 1.01873469, + "epoch": 0.4546520366751841, + "flos": 16175783502720.0, + "grad_norm": 1.990346769810943, + "language_loss": 0.86210561, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.88305283, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38867188, + "step": 7562, + "time_per_iteration": 2.372159004211426 + }, + { + "auxiliary_loss_clip": 0.01059113, + "auxiliary_loss_mlp": 0.01049142, + "balance_loss_clip": 1.02127051, + "balance_loss_mlp": 1.01989353, + "epoch": 0.4547121599278521, + "flos": 19750540544640.0, + "grad_norm": 1.816953152115645, + "language_loss": 0.90390152, + "learning_rate": 2.38796014579055e-06, + "loss": 0.9249841, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.390625, + "step": 7563, + "time_per_iteration": 2.3999404907226562 + }, + { + "auxiliary_loss_clip": 0.0106171, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.01653564, + "balance_loss_mlp": 1.01944625, + "epoch": 0.45477228318052004, + "flos": 19936452867840.0, + "grad_norm": 1.9939108152979832, + "language_loss": 0.73590481, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.75697684, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 7564, + "time_per_iteration": 2.3637027740478516 + }, + { + "auxiliary_loss_clip": 0.01061394, + "auxiliary_loss_mlp": 0.01044982, + "balance_loss_clip": 1.01489329, + "balance_loss_mlp": 1.01829994, + "epoch": 0.454832406433188, + "flos": 21287242437120.0, + "grad_norm": 1.9964359278868038, + "language_loss": 0.69341028, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.71447408, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43164062, + "step": 7565, + "time_per_iteration": 2.4022116661071777 + }, + { + "auxiliary_loss_clip": 0.01060489, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.01299179, + "balance_loss_mlp": 1.01997042, + "epoch": 0.45489252968585603, + "flos": 24497576040960.0, + "grad_norm": 2.187256620095692, + "language_loss": 0.81252712, + "learning_rate": 2.386813887534922e-06, + "loss": 0.83354449, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 7566, + "time_per_iteration": 2.401824474334717 + }, + { + "auxiliary_loss_clip": 0.01061344, + "auxiliary_loss_mlp": 0.01042364, + "balance_loss_clip": 1.01241922, + "balance_loss_mlp": 1.01971722, + "epoch": 0.454952652938524, + "flos": 17091520220160.0, + "grad_norm": 1.6854513653180536, + "language_loss": 0.75133491, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.77237195, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41601562, + "step": 7567, + "time_per_iteration": 2.3912529945373535 + }, + { + "auxiliary_loss_clip": 0.01063197, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_clip": 1.01832485, + "balance_loss_mlp": 1.01989758, + "epoch": 0.45501277619119196, + "flos": 27629320440960.0, + "grad_norm": 1.4186883110526398, + "language_loss": 0.81505972, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83616507, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 7568, + "time_per_iteration": 2.4388492107391357 + }, + { + "auxiliary_loss_clip": 0.01063974, + "auxiliary_loss_mlp": 0.01053732, + "balance_loss_clip": 1.02175987, + "balance_loss_mlp": 1.01916337, + "epoch": 0.4550728994438599, + "flos": 19973635332480.0, + "grad_norm": 1.7751440893209678, + "language_loss": 0.81376028, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.83493733, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44921875, + "step": 7569, + "time_per_iteration": 2.385939359664917 + }, + { + "auxiliary_loss_clip": 0.010636, + "auxiliary_loss_mlp": 0.01052147, + "balance_loss_clip": 1.02242851, + "balance_loss_mlp": 1.02007115, + "epoch": 0.4551330226965279, + "flos": 26065700023680.0, + "grad_norm": 1.3588551377855331, + "language_loss": 0.76063496, + "learning_rate": 2.385285337909412e-06, + "loss": 0.7817924, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 7570, + "time_per_iteration": 2.429088592529297 + }, + { + "auxiliary_loss_clip": 0.0105966, + "auxiliary_loss_mlp": 0.01046912, + "balance_loss_clip": 1.01831388, + "balance_loss_mlp": 1.0194478, + "epoch": 0.45519314594919585, + "flos": 32779707408000.0, + "grad_norm": 1.6901056792684868, + "language_loss": 0.75211442, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77318013, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40234375, + "step": 7571, + "time_per_iteration": 3.8648102283477783 + }, + { + "auxiliary_loss_clip": 0.01057397, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.01273251, + "balance_loss_mlp": 1.01805949, + "epoch": 0.4552532692018638, + "flos": 19171645424640.0, + "grad_norm": 1.482503826940187, + "language_loss": 0.82003731, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.84100252, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 7572, + "time_per_iteration": 3.858579635620117 + }, + { + "auxiliary_loss_clip": 0.01062283, + "auxiliary_loss_mlp": 0.01055682, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.01813948, + "epoch": 0.4553133924545318, + "flos": 26026073763840.0, + "grad_norm": 1.879501608298425, + "language_loss": 0.74073493, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.76191455, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44140625, + "step": 7573, + "time_per_iteration": 2.3973395824432373 + }, + { + "auxiliary_loss_clip": 0.0106437, + "auxiliary_loss_mlp": 0.01049186, + "balance_loss_clip": 1.01497269, + "balance_loss_mlp": 1.02009666, + "epoch": 0.45537351570719975, + "flos": 30660305057280.0, + "grad_norm": 2.063658661293185, + "language_loss": 0.75367129, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.7748068, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 0.44335938, + "step": 7574, + "time_per_iteration": 3.8338615894317627 + }, + { + "auxiliary_loss_clip": 0.0106248, + "auxiliary_loss_mlp": 0.01047302, + "balance_loss_clip": 1.01832235, + "balance_loss_mlp": 1.01946664, + "epoch": 0.4554336389598677, + "flos": 24352232584320.0, + "grad_norm": 1.7330775576866297, + "language_loss": 0.72254074, + "learning_rate": 2.383374322259915e-06, + "loss": 0.74363852, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4296875, + "step": 7575, + "time_per_iteration": 2.449084997177124 + }, + { + "auxiliary_loss_clip": 0.01060649, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.018695, + "balance_loss_mlp": 1.01874948, + "epoch": 0.4554937622125357, + "flos": 20556894372480.0, + "grad_norm": 1.835831977234608, + "language_loss": 0.74933523, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.77039534, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41796875, + "step": 7576, + "time_per_iteration": 2.358691930770874 + }, + { + "auxiliary_loss_clip": 0.01058994, + "auxiliary_loss_mlp": 0.01048946, + "balance_loss_clip": 1.02087271, + "balance_loss_mlp": 1.01832414, + "epoch": 0.45555388546520365, + "flos": 22819650232320.0, + "grad_norm": 2.134846264786994, + "language_loss": 0.68237299, + "learning_rate": 2.382609814135511e-06, + "loss": 0.70345241, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 7577, + "time_per_iteration": 2.3962113857269287 + }, + { + "auxiliary_loss_clip": 0.01061387, + "auxiliary_loss_mlp": 0.01054252, + "balance_loss_clip": 1.02183938, + "balance_loss_mlp": 1.01928806, + "epoch": 0.4556140087178716, + "flos": 21724913640960.0, + "grad_norm": 10.014538859408013, + "language_loss": 0.76007283, + "learning_rate": 2.382227538303157e-06, + "loss": 0.78122926, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.421875, + "step": 7578, + "time_per_iteration": 2.381537914276123 + }, + { + "auxiliary_loss_clip": 0.01059374, + "auxiliary_loss_mlp": 0.0104129, + "balance_loss_clip": 1.01344252, + "balance_loss_mlp": 1.01935494, + "epoch": 0.45567413197053963, + "flos": 25993325041920.0, + "grad_norm": 2.011837873609319, + "language_loss": 0.72267008, + "learning_rate": 2.381845247976697e-06, + "loss": 0.74367666, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40039062, + "step": 7579, + "time_per_iteration": 3.8259294033050537 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.01041817, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.01890194, + "epoch": 0.4557342552232076, + "flos": 21536697168000.0, + "grad_norm": 1.9618503421512454, + "language_loss": 0.79557502, + "learning_rate": 2.381462943170627e-06, + "loss": 0.81658286, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 7580, + "time_per_iteration": 2.3910229206085205 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.01554894, + "balance_loss_mlp": 1.01964641, + "epoch": 0.45579437847587556, + "flos": 40000479310080.0, + "grad_norm": 1.5560266412279595, + "language_loss": 0.69535816, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71639907, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 7581, + "time_per_iteration": 2.590573787689209 + }, + { + "auxiliary_loss_clip": 0.01059472, + "auxiliary_loss_mlp": 0.01044283, + "balance_loss_clip": 1.01703203, + "balance_loss_mlp": 1.01878095, + "epoch": 0.4558545017285435, + "flos": 31137183584640.0, + "grad_norm": 2.217065787033831, + "language_loss": 0.73471355, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75575113, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 7582, + "time_per_iteration": 2.4482460021972656 + }, + { + "auxiliary_loss_clip": 0.01063592, + "auxiliary_loss_mlp": 0.01049974, + "balance_loss_clip": 1.01750159, + "balance_loss_mlp": 1.02049029, + "epoch": 0.4559146249812115, + "flos": 21724704172800.0, + "grad_norm": 1.7539031666427538, + "language_loss": 0.73730284, + "learning_rate": 2.380315942019729e-06, + "loss": 0.75843853, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4296875, + "step": 7583, + "time_per_iteration": 2.4571034908294678 + }, + { + "auxiliary_loss_clip": 0.01063981, + "auxiliary_loss_mlp": 0.0105267, + "balance_loss_clip": 1.02167606, + "balance_loss_mlp": 1.01955557, + "epoch": 0.45597474823387946, + "flos": 23804829377280.0, + "grad_norm": 3.235966591266685, + "language_loss": 0.73899925, + "learning_rate": 2.379933579440195e-06, + "loss": 0.76016575, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4453125, + "step": 7584, + "time_per_iteration": 2.4091238975524902 + }, + { + "auxiliary_loss_clip": 0.01062235, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_clip": 1.01447773, + "balance_loss_mlp": 1.02083194, + "epoch": 0.4560348714865474, + "flos": 31904295177600.0, + "grad_norm": 1.5277985422426574, + "language_loss": 0.68786895, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70890927, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7585, + "time_per_iteration": 2.4985082149505615 + }, + { + "auxiliary_loss_clip": 0.01060292, + "auxiliary_loss_mlp": 0.01042792, + "balance_loss_clip": 1.01414585, + "balance_loss_mlp": 1.01996493, + "epoch": 0.4560949947392154, + "flos": 22047895428480.0, + "grad_norm": 1.7361174709269103, + "language_loss": 0.77246082, + "learning_rate": 2.379168811074267e-06, + "loss": 0.7934916, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40234375, + "step": 7586, + "time_per_iteration": 2.397373914718628 + }, + { + "auxiliary_loss_clip": 0.01058128, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.01408625, + "balance_loss_mlp": 1.01885104, + "epoch": 0.45615511799188335, + "flos": 24570649249920.0, + "grad_norm": 1.6794587565867642, + "language_loss": 0.79138744, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.81235003, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.39257812, + "step": 7587, + "time_per_iteration": 2.4596943855285645 + }, + { + "auxiliary_loss_clip": 0.01063069, + "auxiliary_loss_mlp": 0.01049842, + "balance_loss_clip": 1.02131486, + "balance_loss_mlp": 1.01965117, + "epoch": 0.4562152412445513, + "flos": 18329784877440.0, + "grad_norm": 1.8487835072057834, + "language_loss": 0.71118236, + "learning_rate": 2.378403985195863e-06, + "loss": 0.73231143, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43359375, + "step": 7588, + "time_per_iteration": 2.3714163303375244 + }, + { + "auxiliary_loss_clip": 0.01059172, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.01163566, + "balance_loss_mlp": 1.01910365, + "epoch": 0.4562753644972193, + "flos": 13515680926080.0, + "grad_norm": 1.7631616088138158, + "language_loss": 0.79918408, + "learning_rate": 2.378021550725735e-06, + "loss": 0.82015026, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 7589, + "time_per_iteration": 2.4145472049713135 + }, + { + "auxiliary_loss_clip": 0.010613, + "auxiliary_loss_mlp": 0.01044454, + "balance_loss_clip": 1.01785874, + "balance_loss_mlp": 1.01991022, + "epoch": 0.45633548774988725, + "flos": 29638502029440.0, + "grad_norm": 2.4656556069908757, + "language_loss": 0.63906503, + "learning_rate": 2.377639101920992e-06, + "loss": 0.66012257, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 7590, + "time_per_iteration": 2.444105625152588 + }, + { + "auxiliary_loss_clip": 0.01060569, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_clip": 1.01853657, + "balance_loss_mlp": 1.01978672, + "epoch": 0.4563956110025552, + "flos": 22232411297280.0, + "grad_norm": 1.7216062935071048, + "language_loss": 0.73730582, + "learning_rate": 2.377256638796135e-06, + "loss": 0.7583667, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 7591, + "time_per_iteration": 2.4021365642547607 + }, + { + "auxiliary_loss_clip": 0.01063799, + "auxiliary_loss_mlp": 0.01044425, + "balance_loss_clip": 1.01724482, + "balance_loss_mlp": 1.02140081, + "epoch": 0.45645573425522323, + "flos": 17091101283840.0, + "grad_norm": 2.3331162102960885, + "language_loss": 0.7872467, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.80832887, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.42382812, + "step": 7592, + "time_per_iteration": 2.342581033706665 + }, + { + "auxiliary_loss_clip": 0.01060247, + "auxiliary_loss_mlp": 0.01043497, + "balance_loss_clip": 1.01615071, + "balance_loss_mlp": 1.01889586, + "epoch": 0.4565158575078912, + "flos": 20331495434880.0, + "grad_norm": 2.027895791899318, + "language_loss": 0.71038616, + "learning_rate": 2.376491669644098e-06, + "loss": 0.73142362, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7593, + "time_per_iteration": 2.3955235481262207 + }, + { + "auxiliary_loss_clip": 0.01055686, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.01779103, + "balance_loss_mlp": 1.01791883, + "epoch": 0.45657598076055916, + "flos": 23982013860480.0, + "grad_norm": 1.8585867894698758, + "language_loss": 0.84834862, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86932695, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 7594, + "time_per_iteration": 2.3982200622558594 + }, + { + "auxiliary_loss_clip": 0.01011818, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 1.00093424, + "balance_loss_mlp": 1.00377011, + "epoch": 0.45663610401322713, + "flos": 69361211629440.0, + "grad_norm": 0.7811502830930908, + "language_loss": 0.52802372, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54817569, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.08007812, + "step": 7595, + "time_per_iteration": 3.0359461307525635 + }, + { + "auxiliary_loss_clip": 0.01063223, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.01524234, + "balance_loss_mlp": 1.01977324, + "epoch": 0.4566962272658951, + "flos": 15148464480000.0, + "grad_norm": 2.2313334549397204, + "language_loss": 0.8945027, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.91557634, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43554688, + "step": 7596, + "time_per_iteration": 2.3227362632751465 + }, + { + "auxiliary_loss_clip": 0.01061244, + "auxiliary_loss_mlp": 0.01045915, + "balance_loss_clip": 1.01803219, + "balance_loss_mlp": 1.01888835, + "epoch": 0.45675635051856306, + "flos": 18696477553920.0, + "grad_norm": 1.4996810455330196, + "language_loss": 0.78092468, + "learning_rate": 2.374961560136843e-06, + "loss": 0.80199629, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.42382812, + "step": 7597, + "time_per_iteration": 2.3933229446411133 + }, + { + "auxiliary_loss_clip": 0.01060592, + "auxiliary_loss_mlp": 0.01044486, + "balance_loss_clip": 1.01643634, + "balance_loss_mlp": 1.01873112, + "epoch": 0.456816473771231, + "flos": 19097315406720.0, + "grad_norm": 1.6232572055955337, + "language_loss": 0.79308867, + "learning_rate": 2.374578997177314e-06, + "loss": 0.81413949, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 7598, + "time_per_iteration": 2.3688669204711914 + }, + { + "auxiliary_loss_clip": 0.01056706, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.01106155, + "balance_loss_mlp": 1.01717114, + "epoch": 0.456876597023899, + "flos": 28948792654080.0, + "grad_norm": 2.0769349949547022, + "language_loss": 0.72700226, + "learning_rate": 2.374196420013712e-06, + "loss": 0.74792969, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39453125, + "step": 7599, + "time_per_iteration": 2.445981025695801 + }, + { + "auxiliary_loss_clip": 0.01056254, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_clip": 1.01742029, + "balance_loss_mlp": 1.01674163, + "epoch": 0.45693672027656695, + "flos": 23288499146880.0, + "grad_norm": 3.226841599092553, + "language_loss": 0.7097019, + "learning_rate": 2.373813828660544e-06, + "loss": 0.73069412, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 7600, + "time_per_iteration": 2.3679611682891846 + }, + { + "auxiliary_loss_clip": 0.01058438, + "auxiliary_loss_mlp": 0.01043846, + "balance_loss_clip": 1.01834762, + "balance_loss_mlp": 1.01847243, + "epoch": 0.4569968435292349, + "flos": 20557173663360.0, + "grad_norm": 1.7740166001843118, + "language_loss": 0.80736661, + "learning_rate": 2.373431223132319e-06, + "loss": 0.82838941, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 7601, + "time_per_iteration": 2.3913936614990234 + }, + { + "auxiliary_loss_clip": 0.01060042, + "auxiliary_loss_mlp": 0.01045829, + "balance_loss_clip": 1.02074718, + "balance_loss_mlp": 1.01895809, + "epoch": 0.4570569667819029, + "flos": 41280988579200.0, + "grad_norm": 1.6827401025545106, + "language_loss": 0.7319445, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.75300324, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.41015625, + "step": 7602, + "time_per_iteration": 2.535433769226074 + }, + { + "auxiliary_loss_clip": 0.01059539, + "auxiliary_loss_mlp": 0.01050032, + "balance_loss_clip": 1.02149343, + "balance_loss_mlp": 1.0189774, + "epoch": 0.45711709003457085, + "flos": 26030367861120.0, + "grad_norm": 1.9845510835016633, + "language_loss": 0.75118232, + "learning_rate": 2.372665969608729e-06, + "loss": 0.77227801, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40625, + "step": 7603, + "time_per_iteration": 2.4382476806640625 + }, + { + "auxiliary_loss_clip": 0.01057818, + "auxiliary_loss_mlp": 0.01053064, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.01768446, + "epoch": 0.4571772132872388, + "flos": 22157138672640.0, + "grad_norm": 2.0895261253163455, + "language_loss": 0.84190595, + "learning_rate": 2.372283321642383e-06, + "loss": 0.8630147, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40039062, + "step": 7604, + "time_per_iteration": 2.3670825958251953 + }, + { + "auxiliary_loss_clip": 0.01065413, + "auxiliary_loss_mlp": 0.01054225, + "balance_loss_clip": 1.02335012, + "balance_loss_mlp": 1.02102017, + "epoch": 0.45723733653990684, + "flos": 23877728029440.0, + "grad_norm": 2.05827111641087, + "language_loss": 0.87572074, + "learning_rate": 2.371900659559016e-06, + "loss": 0.8969171, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4453125, + "step": 7605, + "time_per_iteration": 2.3866117000579834 + }, + { + "auxiliary_loss_clip": 0.01060637, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.01635289, + "balance_loss_mlp": 1.01864743, + "epoch": 0.4572974597925748, + "flos": 16870904138880.0, + "grad_norm": 1.8144405181312455, + "language_loss": 0.7511003, + "learning_rate": 2.371517983373138e-06, + "loss": 0.7721346, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41992188, + "step": 7606, + "time_per_iteration": 2.3695812225341797 + }, + { + "auxiliary_loss_clip": 0.01061164, + "auxiliary_loss_mlp": 0.01042237, + "balance_loss_clip": 1.01438928, + "balance_loss_mlp": 1.0190742, + "epoch": 0.45735758304524277, + "flos": 13770651651840.0, + "grad_norm": 2.034643272951108, + "language_loss": 0.81941718, + "learning_rate": 2.371135293099262e-06, + "loss": 0.84045118, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41992188, + "step": 7607, + "time_per_iteration": 2.3556575775146484 + }, + { + "auxiliary_loss_clip": 0.01061068, + "auxiliary_loss_mlp": 0.01046162, + "balance_loss_clip": 1.01830292, + "balance_loss_mlp": 1.02012479, + "epoch": 0.45741770629791073, + "flos": 21099828925440.0, + "grad_norm": 1.7336368565476437, + "language_loss": 0.81407315, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83514547, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 7608, + "time_per_iteration": 2.398467779159546 + }, + { + "auxiliary_loss_clip": 0.01057934, + "auxiliary_loss_mlp": 0.01039612, + "balance_loss_clip": 1.01376772, + "balance_loss_mlp": 1.01694822, + "epoch": 0.4574778295505787, + "flos": 23111768511360.0, + "grad_norm": 1.6206119650206945, + "language_loss": 0.69531363, + "learning_rate": 2.370369870345559e-06, + "loss": 0.7162891, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 7609, + "time_per_iteration": 2.3712785243988037 + }, + { + "auxiliary_loss_clip": 0.01059942, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_clip": 1.02018452, + "balance_loss_mlp": 1.01918185, + "epoch": 0.45753795280324666, + "flos": 24351778736640.0, + "grad_norm": 1.804244782556104, + "language_loss": 0.82632053, + "learning_rate": 2.369987137894757e-06, + "loss": 0.84738392, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40820312, + "step": 7610, + "time_per_iteration": 2.41630220413208 + }, + { + "auxiliary_loss_clip": 0.01061698, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.0168817, + "balance_loss_mlp": 1.01871932, + "epoch": 0.4575980760559146, + "flos": 16652871498240.0, + "grad_norm": 2.6771548215740855, + "language_loss": 0.83400738, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.85507792, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 7611, + "time_per_iteration": 5.056148290634155 + }, + { + "auxiliary_loss_clip": 0.01061539, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.01798773, + "balance_loss_mlp": 1.01969194, + "epoch": 0.4576581993085826, + "flos": 35910160087680.0, + "grad_norm": 1.7711288939672405, + "language_loss": 0.75233096, + "learning_rate": 2.369221630917819e-06, + "loss": 0.77341306, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7612, + "time_per_iteration": 2.49027681350708 + }, + { + "auxiliary_loss_clip": 0.01058126, + "auxiliary_loss_mlp": 0.01038345, + "balance_loss_clip": 1.01258361, + "balance_loss_mlp": 1.01804602, + "epoch": 0.45771832256125056, + "flos": 20079492174720.0, + "grad_norm": 1.848481831453945, + "language_loss": 0.86142892, + "learning_rate": 2.368838856420711e-06, + "loss": 0.8823936, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 7613, + "time_per_iteration": 2.3878791332244873 + }, + { + "auxiliary_loss_clip": 0.01062908, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.01422787, + "balance_loss_mlp": 1.02091157, + "epoch": 0.4577784458139185, + "flos": 10743542196480.0, + "grad_norm": 2.169837578846919, + "language_loss": 0.77329063, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.79433972, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41992188, + "step": 7614, + "time_per_iteration": 3.80096697807312 + }, + { + "auxiliary_loss_clip": 0.01061079, + "auxiliary_loss_mlp": 0.01043943, + "balance_loss_clip": 1.01703739, + "balance_loss_mlp": 1.02024269, + "epoch": 0.4578385690665865, + "flos": 21906217664640.0, + "grad_norm": 1.5408649038362092, + "language_loss": 0.7531724, + "learning_rate": 2.368073265481791e-06, + "loss": 0.77422261, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 7615, + "time_per_iteration": 2.3691372871398926 + }, + { + "auxiliary_loss_clip": 0.01011812, + "auxiliary_loss_mlp": 0.0101266, + "balance_loss_clip": 1.01010847, + "balance_loss_mlp": 1.00328565, + "epoch": 0.45789869231925445, + "flos": 64755574606080.0, + "grad_norm": 0.7929262380307466, + "language_loss": 0.57771432, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59795904, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.08496094, + "step": 7616, + "time_per_iteration": 2.9558324813842773 + }, + { + "auxiliary_loss_clip": 0.01059394, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.0164727, + "balance_loss_mlp": 1.0181725, + "epoch": 0.4579588155719224, + "flos": 16143069692160.0, + "grad_norm": 1.5137194189612533, + "language_loss": 0.72087508, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.74190301, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41210938, + "step": 7617, + "time_per_iteration": 2.3521227836608887 + }, + { + "auxiliary_loss_clip": 0.01059546, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.01112926, + "balance_loss_mlp": 1.01940811, + "epoch": 0.45801893882459044, + "flos": 21394530645120.0, + "grad_norm": 2.0287878278241243, + "language_loss": 0.7770983, + "learning_rate": 2.36692477442939e-06, + "loss": 0.79805708, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40234375, + "step": 7618, + "time_per_iteration": 2.3814167976379395 + }, + { + "auxiliary_loss_clip": 0.01062869, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.01591396, + "balance_loss_mlp": 1.02017009, + "epoch": 0.4580790620772584, + "flos": 19535545192320.0, + "grad_norm": 1.8061431895305433, + "language_loss": 0.78766, + "learning_rate": 2.366541916231585e-06, + "loss": 0.80873358, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42578125, + "step": 7619, + "time_per_iteration": 3.767815589904785 + }, + { + "auxiliary_loss_clip": 0.0105877, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_clip": 1.01759958, + "balance_loss_mlp": 1.0191834, + "epoch": 0.45813918532992637, + "flos": 16580147402880.0, + "grad_norm": 1.7793254827838023, + "language_loss": 0.72770739, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74872029, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39453125, + "step": 7620, + "time_per_iteration": 2.370732545852661 + }, + { + "auxiliary_loss_clip": 0.01058109, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.01349545, + "balance_loss_mlp": 1.0183847, + "epoch": 0.45819930858259433, + "flos": 42228671057280.0, + "grad_norm": 1.6928584031994731, + "language_loss": 0.78981167, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.81077659, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3984375, + "step": 7621, + "time_per_iteration": 2.5454959869384766 + }, + { + "auxiliary_loss_clip": 0.01009818, + "auxiliary_loss_mlp": 0.01003715, + "balance_loss_clip": 1.00110412, + "balance_loss_mlp": 1.00182438, + "epoch": 0.4582594318352623, + "flos": 63711705732480.0, + "grad_norm": 0.7850306508084729, + "language_loss": 0.65066141, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67079669, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.08007812, + "step": 7622, + "time_per_iteration": 3.0070178508758545 + }, + { + "auxiliary_loss_clip": 0.01062607, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.01110125, + "balance_loss_mlp": 1.01954579, + "epoch": 0.45831955508793026, + "flos": 26868772183680.0, + "grad_norm": 1.7081261105395746, + "language_loss": 0.80907011, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.8301149, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4296875, + "step": 7623, + "time_per_iteration": 2.4171340465545654 + }, + { + "auxiliary_loss_clip": 0.01060623, + "auxiliary_loss_mlp": 0.0104322, + "balance_loss_clip": 1.0141449, + "balance_loss_mlp": 1.01814699, + "epoch": 0.45837967834059823, + "flos": 18732961791360.0, + "grad_norm": 2.185298001555558, + "language_loss": 0.72116017, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.74219865, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42382812, + "step": 7624, + "time_per_iteration": 2.3985538482666016 + }, + { + "auxiliary_loss_clip": 0.01060244, + "auxiliary_loss_mlp": 0.01044027, + "balance_loss_clip": 1.0153569, + "balance_loss_mlp": 1.01787329, + "epoch": 0.4584398015932662, + "flos": 21177056586240.0, + "grad_norm": 1.789770086953154, + "language_loss": 0.7426964, + "learning_rate": 2.364244475667491e-06, + "loss": 0.76373911, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42382812, + "step": 7625, + "time_per_iteration": 2.379972457885742 + }, + { + "auxiliary_loss_clip": 0.01058787, + "auxiliary_loss_mlp": 0.01047727, + "balance_loss_clip": 1.01980877, + "balance_loss_mlp": 1.01818991, + "epoch": 0.45849992484593416, + "flos": 19789084552320.0, + "grad_norm": 2.16751493550557, + "language_loss": 0.79168308, + "learning_rate": 2.363861520479451e-06, + "loss": 0.81274825, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 7626, + "time_per_iteration": 2.392838716506958 + }, + { + "auxiliary_loss_clip": 0.01061693, + "auxiliary_loss_mlp": 0.01047822, + "balance_loss_clip": 1.01691151, + "balance_loss_mlp": 1.01852036, + "epoch": 0.4585600480986021, + "flos": 18222287201280.0, + "grad_norm": 2.04518428941276, + "language_loss": 0.8645786, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.88567376, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43164062, + "step": 7627, + "time_per_iteration": 2.331693649291992 + }, + { + "auxiliary_loss_clip": 0.01063703, + "auxiliary_loss_mlp": 0.01051944, + "balance_loss_clip": 1.01653886, + "balance_loss_mlp": 1.01875734, + "epoch": 0.4586201713512701, + "flos": 29020958167680.0, + "grad_norm": 1.5458735696990489, + "language_loss": 0.70731962, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.72847605, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.44921875, + "step": 7628, + "time_per_iteration": 2.43790602684021 + }, + { + "auxiliary_loss_clip": 0.0105915, + "auxiliary_loss_mlp": 0.01040991, + "balance_loss_clip": 1.0128814, + "balance_loss_mlp": 1.01757836, + "epoch": 0.45868029460393805, + "flos": 23403467854080.0, + "grad_norm": 1.48292274360582, + "language_loss": 0.78972363, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.81072497, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41601562, + "step": 7629, + "time_per_iteration": 2.4037492275238037 + }, + { + "auxiliary_loss_clip": 0.01064378, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_clip": 1.01663113, + "balance_loss_mlp": 1.01858902, + "epoch": 0.458740417856606, + "flos": 18221030392320.0, + "grad_norm": 2.7075777117511466, + "language_loss": 0.8125459, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.83367252, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.45703125, + "step": 7630, + "time_per_iteration": 2.3722639083862305 + }, + { + "auxiliary_loss_clip": 0.01062811, + "auxiliary_loss_mlp": 0.01051262, + "balance_loss_clip": 1.01907539, + "balance_loss_mlp": 1.01846266, + "epoch": 0.458800541109274, + "flos": 34567330308480.0, + "grad_norm": 1.8666396289257838, + "language_loss": 0.73632479, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.75746548, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4453125, + "step": 7631, + "time_per_iteration": 2.4682650566101074 + }, + { + "auxiliary_loss_clip": 0.0106263, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.02127957, + "balance_loss_mlp": 1.01934266, + "epoch": 0.458860664361942, + "flos": 17711158763520.0, + "grad_norm": 2.1365234652590637, + "language_loss": 0.73511523, + "learning_rate": 2.361563500108531e-06, + "loss": 0.75626802, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.43359375, + "step": 7632, + "time_per_iteration": 2.396500825881958 + }, + { + "auxiliary_loss_clip": 0.01063565, + "auxiliary_loss_mlp": 0.01048166, + "balance_loss_clip": 1.01535964, + "balance_loss_mlp": 1.0180341, + "epoch": 0.45892078761460997, + "flos": 18440913335040.0, + "grad_norm": 2.461462919057188, + "language_loss": 0.71278334, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.73390067, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.45507812, + "step": 7633, + "time_per_iteration": 2.349790334701538 + }, + { + "auxiliary_loss_clip": 0.01062441, + "auxiliary_loss_mlp": 0.01055773, + "balance_loss_clip": 1.02482665, + "balance_loss_mlp": 1.01929379, + "epoch": 0.45898091086727794, + "flos": 22671897891840.0, + "grad_norm": 1.5036837676908137, + "language_loss": 0.82565147, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.84683359, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43164062, + "step": 7634, + "time_per_iteration": 2.4247701168060303 + }, + { + "auxiliary_loss_clip": 0.01064401, + "auxiliary_loss_mlp": 0.0105527, + "balance_loss_clip": 1.01881623, + "balance_loss_mlp": 1.01875639, + "epoch": 0.4590410341199459, + "flos": 21651875343360.0, + "grad_norm": 5.120830366455669, + "language_loss": 0.82799065, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84918737, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.45703125, + "step": 7635, + "time_per_iteration": 2.3572170734405518 + }, + { + "auxiliary_loss_clip": 0.01061608, + "auxiliary_loss_mlp": 0.01060603, + "balance_loss_clip": 1.03057456, + "balance_loss_mlp": 1.01916265, + "epoch": 0.45910115737261387, + "flos": 36533987994240.0, + "grad_norm": 1.5108601783086366, + "language_loss": 0.66209006, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.68331218, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42578125, + "step": 7636, + "time_per_iteration": 2.523144483566284 + }, + { + "auxiliary_loss_clip": 0.01061144, + "auxiliary_loss_mlp": 0.01042301, + "balance_loss_clip": 1.01414418, + "balance_loss_mlp": 1.01966608, + "epoch": 0.45916128062528183, + "flos": 24418882103040.0, + "grad_norm": 1.4665530940942548, + "language_loss": 0.81398308, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.83501756, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 7637, + "time_per_iteration": 2.409820318222046 + }, + { + "auxiliary_loss_clip": 0.01064537, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.0142405, + "balance_loss_mlp": 1.01931977, + "epoch": 0.4592214038779498, + "flos": 23220837198720.0, + "grad_norm": 1.6578607587424679, + "language_loss": 0.76383936, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.78495145, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.453125, + "step": 7638, + "time_per_iteration": 2.4298181533813477 + }, + { + "auxiliary_loss_clip": 0.01060398, + "auxiliary_loss_mlp": 0.01053038, + "balance_loss_clip": 1.02098298, + "balance_loss_mlp": 1.01837587, + "epoch": 0.45928152713061776, + "flos": 19171121754240.0, + "grad_norm": 1.7229400916486581, + "language_loss": 0.75006276, + "learning_rate": 2.358881852733989e-06, + "loss": 0.77119714, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.41992188, + "step": 7639, + "time_per_iteration": 2.3341550827026367 + }, + { + "auxiliary_loss_clip": 0.01061717, + "auxiliary_loss_mlp": 0.01047964, + "balance_loss_clip": 1.01661181, + "balance_loss_mlp": 1.01809335, + "epoch": 0.4593416503832857, + "flos": 22413715320960.0, + "grad_norm": 1.7656305632229226, + "language_loss": 0.69466132, + "learning_rate": 2.358498705700346e-06, + "loss": 0.71575814, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43554688, + "step": 7640, + "time_per_iteration": 2.4109485149383545 + }, + { + "auxiliary_loss_clip": 0.01063456, + "auxiliary_loss_mlp": 0.01052766, + "balance_loss_clip": 1.02108002, + "balance_loss_mlp": 1.01819742, + "epoch": 0.4594017736359537, + "flos": 18879212943360.0, + "grad_norm": 1.85300086816382, + "language_loss": 0.76823634, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.78939855, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.453125, + "step": 7641, + "time_per_iteration": 2.352515697479248 + }, + { + "auxiliary_loss_clip": 0.01062886, + "auxiliary_loss_mlp": 0.01046741, + "balance_loss_clip": 1.01370871, + "balance_loss_mlp": 1.01848102, + "epoch": 0.45946189688862166, + "flos": 20517617226240.0, + "grad_norm": 2.782012021907221, + "language_loss": 0.76368207, + "learning_rate": 2.357732370864668e-06, + "loss": 0.78477836, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.4453125, + "step": 7642, + "time_per_iteration": 2.4027509689331055 + }, + { + "auxiliary_loss_clip": 0.01011666, + "auxiliary_loss_mlp": 0.01007238, + "balance_loss_clip": 1.00471127, + "balance_loss_mlp": 1.00339127, + "epoch": 0.4595220201412896, + "flos": 61403249036160.0, + "grad_norm": 0.8396385840465036, + "language_loss": 0.58259141, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60278046, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.08300781, + "step": 7643, + "time_per_iteration": 2.7522642612457275 + }, + { + "auxiliary_loss_clip": 0.0106424, + "auxiliary_loss_mlp": 0.01051437, + "balance_loss_clip": 1.01839197, + "balance_loss_mlp": 1.01814687, + "epoch": 0.4595821433939576, + "flos": 23329836063360.0, + "grad_norm": 1.4743489910457728, + "language_loss": 0.93959427, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.96075106, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.4609375, + "step": 7644, + "time_per_iteration": 2.4043054580688477 + }, + { + "auxiliary_loss_clip": 0.01063857, + "auxiliary_loss_mlp": 0.01047967, + "balance_loss_clip": 1.01535177, + "balance_loss_mlp": 1.01848757, + "epoch": 0.4596422666466256, + "flos": 14281500798720.0, + "grad_norm": 2.0799827875031767, + "language_loss": 0.84561551, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.86673379, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.453125, + "step": 7645, + "time_per_iteration": 2.33394455909729 + }, + { + "auxiliary_loss_clip": 0.0100959, + "auxiliary_loss_mlp": 0.01007081, + "balance_loss_clip": 1.0043031, + "balance_loss_mlp": 1.00165439, + "epoch": 0.4597023898992936, + "flos": 65724029343360.0, + "grad_norm": 0.7834073321176499, + "language_loss": 0.59890699, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61907375, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.07910156, + "step": 7646, + "time_per_iteration": 2.944260358810425 + }, + { + "auxiliary_loss_clip": 0.01062275, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.01312137, + "balance_loss_mlp": 1.01878631, + "epoch": 0.45976251315196154, + "flos": 26905849914240.0, + "grad_norm": 1.564301884401858, + "language_loss": 0.73685789, + "learning_rate": 2.355816296637939e-06, + "loss": 0.75791228, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.43554688, + "step": 7647, + "time_per_iteration": 2.4041519165039062 + }, + { + "auxiliary_loss_clip": 0.01063284, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.01775026, + "balance_loss_mlp": 1.01946545, + "epoch": 0.4598226364046295, + "flos": 26616768923520.0, + "grad_norm": 1.6791709403390798, + "language_loss": 0.68000674, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.7011224, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4375, + "step": 7648, + "time_per_iteration": 2.436443567276001 + }, + { + "auxiliary_loss_clip": 0.01062548, + "auxiliary_loss_mlp": 0.01048077, + "balance_loss_clip": 1.01665306, + "balance_loss_mlp": 1.01888943, + "epoch": 0.45988275965729747, + "flos": 24386657051520.0, + "grad_norm": 1.7325451442904258, + "language_loss": 0.79666764, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81777382, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4375, + "step": 7649, + "time_per_iteration": 2.398726463317871 + }, + { + "auxiliary_loss_clip": 0.01062495, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.01744676, + "balance_loss_mlp": 1.0202316, + "epoch": 0.45994288290996543, + "flos": 24534653771520.0, + "grad_norm": 1.9541120109094536, + "language_loss": 0.70800239, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.729096, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.421875, + "step": 7650, + "time_per_iteration": 3.7953343391418457 + }, + { + "auxiliary_loss_clip": 0.01065938, + "auxiliary_loss_mlp": 0.01051906, + "balance_loss_clip": 1.01604831, + "balance_loss_mlp": 1.01963496, + "epoch": 0.4600030061626334, + "flos": 14829357853440.0, + "grad_norm": 1.9806342668653707, + "language_loss": 0.85886681, + "learning_rate": 2.354283194302761e-06, + "loss": 0.88004518, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.46289062, + "step": 7651, + "time_per_iteration": 3.7432727813720703 + }, + { + "auxiliary_loss_clip": 0.01060872, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.01496863, + "balance_loss_mlp": 1.01840699, + "epoch": 0.46006312941530136, + "flos": 18112869400320.0, + "grad_norm": 1.6921433007267233, + "language_loss": 0.76261151, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.78367198, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42382812, + "step": 7652, + "time_per_iteration": 2.3489830493927 + }, + { + "auxiliary_loss_clip": 0.01060988, + "auxiliary_loss_mlp": 0.01045086, + "balance_loss_clip": 1.01593959, + "balance_loss_mlp": 1.01834929, + "epoch": 0.46012325266796933, + "flos": 21975520446720.0, + "grad_norm": 1.8641937943133977, + "language_loss": 0.76947695, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.79053771, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 7653, + "time_per_iteration": 3.834712266921997 + }, + { + "auxiliary_loss_clip": 0.01067405, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_clip": 1.01473594, + "balance_loss_mlp": 1.02177024, + "epoch": 0.4601833759206373, + "flos": 15267168702720.0, + "grad_norm": 2.0952528042894794, + "language_loss": 0.67590415, + "learning_rate": 2.353133226438741e-06, + "loss": 0.69705886, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.45703125, + "step": 7654, + "time_per_iteration": 2.3489742279052734 + }, + { + "auxiliary_loss_clip": 0.01060338, + "auxiliary_loss_mlp": 0.01047123, + "balance_loss_clip": 1.01839375, + "balance_loss_mlp": 1.01836538, + "epoch": 0.46024349917330526, + "flos": 27087782342400.0, + "grad_norm": 1.6670386250789198, + "language_loss": 0.80507815, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.82615274, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 7655, + "time_per_iteration": 2.419264793395996 + }, + { + "auxiliary_loss_clip": 0.01059178, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.01706934, + "balance_loss_mlp": 1.01776731, + "epoch": 0.4603036224259732, + "flos": 24461755119360.0, + "grad_norm": 1.5823434977786461, + "language_loss": 0.68747461, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.70852977, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4140625, + "step": 7656, + "time_per_iteration": 2.3946292400360107 + }, + { + "auxiliary_loss_clip": 0.01062804, + "auxiliary_loss_mlp": 0.01049971, + "balance_loss_clip": 1.019418, + "balance_loss_mlp": 1.01955247, + "epoch": 0.4603637456786412, + "flos": 28108084181760.0, + "grad_norm": 1.5597807719779306, + "language_loss": 0.81988567, + "learning_rate": 2.351983138057098e-06, + "loss": 0.84101337, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 7657, + "time_per_iteration": 2.4348127841949463 + }, + { + "auxiliary_loss_clip": 0.0106059, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.01164961, + "balance_loss_mlp": 1.01795316, + "epoch": 0.4604238689313092, + "flos": 24347903575680.0, + "grad_norm": 2.0213099870495426, + "language_loss": 0.71669883, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.7377333, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.42578125, + "step": 7658, + "time_per_iteration": 2.386352062225342 + }, + { + "auxiliary_loss_clip": 0.01010643, + "auxiliary_loss_mlp": 0.0100841, + "balance_loss_clip": 1.00603759, + "balance_loss_mlp": 1.00214195, + "epoch": 0.4604839921839772, + "flos": 53603477280000.0, + "grad_norm": 0.9768039702030216, + "language_loss": 0.62233955, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64253008, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.08496094, + "step": 7659, + "time_per_iteration": 4.465997695922852 + }, + { + "auxiliary_loss_clip": 0.01061234, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.02347445, + "balance_loss_mlp": 1.02039206, + "epoch": 0.46054411543664514, + "flos": 31247090144640.0, + "grad_norm": 1.5450528965580168, + "language_loss": 0.69146204, + "learning_rate": 2.350832929550336e-06, + "loss": 0.71260166, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40820312, + "step": 7660, + "time_per_iteration": 2.523466110229492 + }, + { + "auxiliary_loss_clip": 0.01059799, + "auxiliary_loss_mlp": 0.0105093, + "balance_loss_clip": 1.02090085, + "balance_loss_mlp": 1.0177896, + "epoch": 0.4606042386893131, + "flos": 24091850597760.0, + "grad_norm": 1.6188750548987032, + "language_loss": 0.77859831, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79970562, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41992188, + "step": 7661, + "time_per_iteration": 2.404334783554077 + }, + { + "auxiliary_loss_clip": 0.01060999, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.01480389, + "balance_loss_mlp": 1.01966918, + "epoch": 0.46066436194198107, + "flos": 26577247397760.0, + "grad_norm": 1.8889096207113918, + "language_loss": 0.75287342, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77390939, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 7662, + "time_per_iteration": 2.4249329566955566 + }, + { + "auxiliary_loss_clip": 0.01064028, + "auxiliary_loss_mlp": 0.01046763, + "balance_loss_clip": 1.01648355, + "balance_loss_mlp": 1.01876152, + "epoch": 0.46072448519464904, + "flos": 17774910639360.0, + "grad_norm": 2.477153087973142, + "language_loss": 0.81325567, + "learning_rate": 2.349682601310998e-06, + "loss": 0.83436358, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.453125, + "step": 7663, + "time_per_iteration": 2.308344602584839 + }, + { + "auxiliary_loss_clip": 0.01058906, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.01721907, + "balance_loss_mlp": 1.01836646, + "epoch": 0.460784608447317, + "flos": 15085201363200.0, + "grad_norm": 1.7934225103637977, + "language_loss": 0.74696875, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.7679913, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 7664, + "time_per_iteration": 2.4094228744506836 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.01050042, + "balance_loss_clip": 1.02078807, + "balance_loss_mlp": 1.01915848, + "epoch": 0.46084473169998497, + "flos": 18587269221120.0, + "grad_norm": 1.47343522674296, + "language_loss": 0.73796451, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.75908232, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 7665, + "time_per_iteration": 2.345404863357544 + }, + { + "auxiliary_loss_clip": 0.01059196, + "auxiliary_loss_mlp": 0.01043869, + "balance_loss_clip": 1.01503253, + "balance_loss_mlp": 1.01765633, + "epoch": 0.46090485495265293, + "flos": 19493928984960.0, + "grad_norm": 1.7989892762261108, + "language_loss": 0.78770357, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80873418, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 7666, + "time_per_iteration": 2.356403350830078 + }, + { + "auxiliary_loss_clip": 0.01058037, + "auxiliary_loss_mlp": 0.01042689, + "balance_loss_clip": 1.01336384, + "balance_loss_mlp": 1.01700056, + "epoch": 0.4609649782053209, + "flos": 33363525029760.0, + "grad_norm": 1.3889926790483555, + "language_loss": 0.7494669, + "learning_rate": 2.348148644753088e-06, + "loss": 0.7704742, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 7667, + "time_per_iteration": 2.474224328994751 + }, + { + "auxiliary_loss_clip": 0.01058388, + "auxiliary_loss_mlp": 0.01043133, + "balance_loss_clip": 1.01576209, + "balance_loss_mlp": 1.01702714, + "epoch": 0.46102510145798886, + "flos": 23768030937600.0, + "grad_norm": 1.3712990062022, + "language_loss": 0.77223027, + "learning_rate": 2.347765122572676e-06, + "loss": 0.79324543, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7668, + "time_per_iteration": 2.3919124603271484 + }, + { + "auxiliary_loss_clip": 0.01057571, + "auxiliary_loss_mlp": 0.01041727, + "balance_loss_clip": 1.01618063, + "balance_loss_mlp": 1.01799047, + "epoch": 0.4610852247106568, + "flos": 23293700939520.0, + "grad_norm": 1.5792744580482982, + "language_loss": 0.78558874, + "learning_rate": 2.347381587204975e-06, + "loss": 0.8065818, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39453125, + "step": 7669, + "time_per_iteration": 2.413553237915039 + }, + { + "auxiliary_loss_clip": 0.01061122, + "auxiliary_loss_mlp": 0.01044074, + "balance_loss_clip": 1.01497483, + "balance_loss_mlp": 1.01951885, + "epoch": 0.4611453479633248, + "flos": 25446270948480.0, + "grad_norm": 1.731573701298637, + "language_loss": 0.84048313, + "learning_rate": 2.34699803866453e-06, + "loss": 0.86153507, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41796875, + "step": 7670, + "time_per_iteration": 2.4130802154541016 + }, + { + "auxiliary_loss_clip": 0.0105846, + "auxiliary_loss_mlp": 0.01044136, + "balance_loss_clip": 1.01761222, + "balance_loss_mlp": 1.01795208, + "epoch": 0.4612054712159928, + "flos": 21138617312640.0, + "grad_norm": 1.5463473670581702, + "language_loss": 0.64363927, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.66466522, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40429688, + "step": 7671, + "time_per_iteration": 2.394991159439087 + }, + { + "auxiliary_loss_clip": 0.01009781, + "auxiliary_loss_mlp": 0.01004605, + "balance_loss_clip": 1.00173247, + "balance_loss_mlp": 1.00176406, + "epoch": 0.4612655944686608, + "flos": 69955851772800.0, + "grad_norm": 0.7019538810597213, + "language_loss": 0.55946207, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57960594, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.02868652, + "router_z_loss_mlp": 0.08007812, + "step": 7672, + "time_per_iteration": 3.112905263900757 + }, + { + "auxiliary_loss_clip": 0.01060729, + "auxiliary_loss_mlp": 0.01046606, + "balance_loss_clip": 1.01682758, + "balance_loss_mlp": 1.01834154, + "epoch": 0.46132571772132874, + "flos": 16836200380800.0, + "grad_norm": 1.87682454826404, + "language_loss": 0.72567713, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.74675047, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42382812, + "step": 7673, + "time_per_iteration": 2.3370959758758545 + }, + { + "auxiliary_loss_clip": 0.01060442, + "auxiliary_loss_mlp": 0.01042072, + "balance_loss_clip": 1.01405752, + "balance_loss_mlp": 1.01972985, + "epoch": 0.4613858409739967, + "flos": 35807480179200.0, + "grad_norm": 1.8743863304483863, + "language_loss": 0.71922696, + "learning_rate": 2.345463713066195e-06, + "loss": 0.74025208, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 7674, + "time_per_iteration": 2.516575336456299 + }, + { + "auxiliary_loss_clip": 0.01059705, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.01224649, + "balance_loss_mlp": 1.01746237, + "epoch": 0.4614459642266647, + "flos": 35265174030720.0, + "grad_norm": 1.619378595854543, + "language_loss": 0.67201805, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.69303429, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 7675, + "time_per_iteration": 2.5457570552825928 + }, + { + "auxiliary_loss_clip": 0.0100992, + "auxiliary_loss_mlp": 0.01005098, + "balance_loss_clip": 1.00248718, + "balance_loss_mlp": 1.00206184, + "epoch": 0.46150608747933264, + "flos": 66701493077760.0, + "grad_norm": 0.7299214303106868, + "language_loss": 0.58669358, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60684377, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.07861328, + "step": 7676, + "time_per_iteration": 3.0389010906219482 + }, + { + "auxiliary_loss_clip": 0.01010756, + "auxiliary_loss_mlp": 0.01005114, + "balance_loss_clip": 1.00257456, + "balance_loss_mlp": 1.00285244, + "epoch": 0.4615662107320006, + "flos": 55827409841280.0, + "grad_norm": 0.7923994863483977, + "language_loss": 0.6277504, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64790916, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.07910156, + "step": 7677, + "time_per_iteration": 2.9186391830444336 + }, + { + "auxiliary_loss_clip": 0.01059187, + "auxiliary_loss_mlp": 0.01041322, + "balance_loss_clip": 1.01517916, + "balance_loss_mlp": 1.01883626, + "epoch": 0.46162633398466857, + "flos": 15482443345920.0, + "grad_norm": 2.1757796344568723, + "language_loss": 0.77656114, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.79756629, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 7678, + "time_per_iteration": 2.3908486366271973 + }, + { + "auxiliary_loss_clip": 0.01062353, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.01276493, + "balance_loss_mlp": 1.02057958, + "epoch": 0.46168645723733653, + "flos": 20010398860800.0, + "grad_norm": 2.261113559921667, + "language_loss": 0.67602885, + "learning_rate": 2.343545511426974e-06, + "loss": 0.69706815, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7679, + "time_per_iteration": 2.3506360054016113 + }, + { + "auxiliary_loss_clip": 0.01060606, + "auxiliary_loss_mlp": 0.01049096, + "balance_loss_clip": 1.02356148, + "balance_loss_mlp": 1.01986957, + "epoch": 0.4617465804900045, + "flos": 20297629549440.0, + "grad_norm": 2.370558510213089, + "language_loss": 0.71851933, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.73961639, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40820312, + "step": 7680, + "time_per_iteration": 2.385673761367798 + }, + { + "auxiliary_loss_clip": 0.01065588, + "auxiliary_loss_mlp": 0.01051335, + "balance_loss_clip": 1.02197361, + "balance_loss_mlp": 1.02152634, + "epoch": 0.46180670374267246, + "flos": 22345215500160.0, + "grad_norm": 1.890668783587243, + "language_loss": 0.65564239, + "learning_rate": 2.342778139478487e-06, + "loss": 0.6768117, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 7681, + "time_per_iteration": 2.3830442428588867 + }, + { + "auxiliary_loss_clip": 0.01058409, + "auxiliary_loss_mlp": 0.01046292, + "balance_loss_clip": 1.01927876, + "balance_loss_mlp": 1.01890743, + "epoch": 0.46186682699534043, + "flos": 19894836660480.0, + "grad_norm": 1.5360773257238376, + "language_loss": 0.68287838, + "learning_rate": 2.342394433999697e-06, + "loss": 0.70392537, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 7682, + "time_per_iteration": 2.4170122146606445 + }, + { + "auxiliary_loss_clip": 0.01061421, + "auxiliary_loss_mlp": 0.01056368, + "balance_loss_clip": 1.02760339, + "balance_loss_mlp": 1.02001143, + "epoch": 0.4619269502480084, + "flos": 31502235427200.0, + "grad_norm": 2.227603967223568, + "language_loss": 0.75753176, + "learning_rate": 2.342010715537275e-06, + "loss": 0.77870965, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4140625, + "step": 7683, + "time_per_iteration": 2.4309606552124023 + }, + { + "auxiliary_loss_clip": 0.01059883, + "auxiliary_loss_mlp": 0.01049756, + "balance_loss_clip": 1.02195692, + "balance_loss_mlp": 1.01962864, + "epoch": 0.46198707350067636, + "flos": 25008320453760.0, + "grad_norm": 1.9371825872341872, + "language_loss": 0.77618849, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.79728496, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40234375, + "step": 7684, + "time_per_iteration": 2.430478096008301 + }, + { + "auxiliary_loss_clip": 0.01064731, + "auxiliary_loss_mlp": 0.01068618, + "balance_loss_clip": 1.03973353, + "balance_loss_mlp": 1.02013838, + "epoch": 0.4620471967533444, + "flos": 18291485249280.0, + "grad_norm": 2.0823064843299774, + "language_loss": 0.80327106, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82460457, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4453125, + "step": 7685, + "time_per_iteration": 2.3590614795684814 + }, + { + "auxiliary_loss_clip": 0.01059987, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_clip": 1.02099061, + "balance_loss_mlp": 1.01940513, + "epoch": 0.46210732000601235, + "flos": 33983687243520.0, + "grad_norm": 2.182972644051508, + "language_loss": 0.67955768, + "learning_rate": 2.340859482393731e-06, + "loss": 0.7006377, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 7686, + "time_per_iteration": 2.5280306339263916 + }, + { + "auxiliary_loss_clip": 0.01061872, + "auxiliary_loss_mlp": 0.01055669, + "balance_loss_clip": 1.02678442, + "balance_loss_mlp": 1.018888, + "epoch": 0.4621674432586803, + "flos": 25008250631040.0, + "grad_norm": 2.106244347642457, + "language_loss": 0.7517575, + "learning_rate": 2.340475712142296e-06, + "loss": 0.77293289, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 7687, + "time_per_iteration": 2.3905930519104004 + }, + { + "auxiliary_loss_clip": 0.01061993, + "auxiliary_loss_mlp": 0.01052546, + "balance_loss_clip": 1.02499628, + "balance_loss_mlp": 1.02004194, + "epoch": 0.4622275665113483, + "flos": 22013052024960.0, + "grad_norm": 2.747064401471875, + "language_loss": 0.76409316, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.7852385, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41992188, + "step": 7688, + "time_per_iteration": 2.400160789489746 + }, + { + "auxiliary_loss_clip": 0.01058016, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.02740204, + "balance_loss_mlp": 1.01738143, + "epoch": 0.46228768976401624, + "flos": 24057740332800.0, + "grad_norm": 1.6287726889385674, + "language_loss": 0.8009401, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.82206237, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 7689, + "time_per_iteration": 3.6335947513580322 + }, + { + "auxiliary_loss_clip": 0.01063244, + "auxiliary_loss_mlp": 0.01051281, + "balance_loss_clip": 1.02054858, + "balance_loss_mlp": 1.01947033, + "epoch": 0.4623478130166842, + "flos": 26650180961280.0, + "grad_norm": 2.3943845148046434, + "language_loss": 0.57749492, + "learning_rate": 2.339324323980964e-06, + "loss": 0.5986402, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4375, + "step": 7690, + "time_per_iteration": 2.470726251602173 + }, + { + "auxiliary_loss_clip": 0.01059342, + "auxiliary_loss_mlp": 0.01052902, + "balance_loss_clip": 1.0259254, + "balance_loss_mlp": 1.01798975, + "epoch": 0.46240793626935217, + "flos": 20557383131520.0, + "grad_norm": 1.92670327346222, + "language_loss": 0.84115297, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.86227536, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 7691, + "time_per_iteration": 3.7698986530303955 + }, + { + "auxiliary_loss_clip": 0.01060796, + "auxiliary_loss_mlp": 0.01049967, + "balance_loss_clip": 1.02293086, + "balance_loss_mlp": 1.01965714, + "epoch": 0.46246805952202014, + "flos": 22454947503360.0, + "grad_norm": 1.5368733351370816, + "language_loss": 0.76215923, + "learning_rate": 2.338556667513091e-06, + "loss": 0.7832669, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41210938, + "step": 7692, + "time_per_iteration": 3.9141244888305664 + }, + { + "auxiliary_loss_clip": 0.01062271, + "auxiliary_loss_mlp": 0.01049786, + "balance_loss_clip": 1.01993632, + "balance_loss_mlp": 1.01964116, + "epoch": 0.4625281827746881, + "flos": 35039914738560.0, + "grad_norm": 1.5133440548305996, + "language_loss": 0.75203216, + "learning_rate": 2.338172820014723e-06, + "loss": 0.77315271, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42578125, + "step": 7693, + "time_per_iteration": 2.5443947315216064 + }, + { + "auxiliary_loss_clip": 0.01063309, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_clip": 1.01914263, + "balance_loss_mlp": 1.02149534, + "epoch": 0.46258830602735607, + "flos": 21067603873920.0, + "grad_norm": 1.4585410889391353, + "language_loss": 0.86225206, + "learning_rate": 2.337788959692808e-06, + "loss": 0.88335878, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 7694, + "time_per_iteration": 2.3847227096557617 + }, + { + "auxiliary_loss_clip": 0.01061944, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.0186919, + "balance_loss_mlp": 1.02085781, + "epoch": 0.46264842928002403, + "flos": 26176025520000.0, + "grad_norm": 1.9104033688223259, + "language_loss": 0.79886782, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81994528, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 7695, + "time_per_iteration": 2.454000234603882 + }, + { + "auxiliary_loss_clip": 0.01061023, + "auxiliary_loss_mlp": 0.01045295, + "balance_loss_clip": 1.01898599, + "balance_loss_mlp": 1.01997781, + "epoch": 0.462708552532692, + "flos": 16763266817280.0, + "grad_norm": 1.97390896321549, + "language_loss": 0.73658907, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.75765216, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41015625, + "step": 7696, + "time_per_iteration": 2.381009340286255 + }, + { + "auxiliary_loss_clip": 0.01062951, + "auxiliary_loss_mlp": 0.01044103, + "balance_loss_clip": 1.01612425, + "balance_loss_mlp": 1.02176893, + "epoch": 0.46276867578535996, + "flos": 15559531361280.0, + "grad_norm": 2.482232221662205, + "language_loss": 0.70685434, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.72792488, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41210938, + "step": 7697, + "time_per_iteration": 2.3951914310455322 + }, + { + "auxiliary_loss_clip": 0.01062014, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.0136373, + "balance_loss_mlp": 1.02193022, + "epoch": 0.462828799038028, + "flos": 22414413548160.0, + "grad_norm": 1.8223132282354166, + "language_loss": 0.8530032, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.87402141, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 7698, + "time_per_iteration": 3.85278582572937 + }, + { + "auxiliary_loss_clip": 0.01063638, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.01204515, + "balance_loss_mlp": 1.02225828, + "epoch": 0.46288892229069595, + "flos": 21068511569280.0, + "grad_norm": 2.0397901865622745, + "language_loss": 0.72687161, + "learning_rate": 2.335869466239502e-06, + "loss": 0.74789774, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 7699, + "time_per_iteration": 2.3980605602264404 + }, + { + "auxiliary_loss_clip": 0.01064537, + "auxiliary_loss_mlp": 0.0104751, + "balance_loss_clip": 1.01762414, + "balance_loss_mlp": 1.02153587, + "epoch": 0.4629490455433639, + "flos": 23184562429440.0, + "grad_norm": 1.8110953944847128, + "language_loss": 0.72656739, + "learning_rate": 2.335485529281996e-06, + "loss": 0.74768794, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4296875, + "step": 7700, + "time_per_iteration": 2.402522563934326 + }, + { + "auxiliary_loss_clip": 0.01061705, + "auxiliary_loss_mlp": 0.01042833, + "balance_loss_clip": 1.01427007, + "balance_loss_mlp": 1.02075362, + "epoch": 0.4630091687960319, + "flos": 18834768915840.0, + "grad_norm": 5.84540738630768, + "language_loss": 0.73579967, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.756845, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 7701, + "time_per_iteration": 2.397334575653076 + }, + { + "auxiliary_loss_clip": 0.01067739, + "auxiliary_loss_mlp": 0.01044157, + "balance_loss_clip": 1.01502204, + "balance_loss_mlp": 1.02386963, + "epoch": 0.46306929204869984, + "flos": 38905568161920.0, + "grad_norm": 3.797362018590303, + "language_loss": 0.65673232, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67785126, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43945312, + "step": 7702, + "time_per_iteration": 2.5191195011138916 + }, + { + "auxiliary_loss_clip": 0.01063107, + "auxiliary_loss_mlp": 0.01039007, + "balance_loss_clip": 1.01195908, + "balance_loss_mlp": 1.02171767, + "epoch": 0.4631294153013678, + "flos": 19643217425280.0, + "grad_norm": 2.020100199358354, + "language_loss": 0.74693066, + "learning_rate": 2.33433364213785e-06, + "loss": 0.76795185, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 7703, + "time_per_iteration": 2.4120192527770996 + }, + { + "auxiliary_loss_clip": 0.01064731, + "auxiliary_loss_mlp": 0.01050493, + "balance_loss_clip": 1.01871192, + "balance_loss_mlp": 1.02170181, + "epoch": 0.4631895385540358, + "flos": 24607098576000.0, + "grad_norm": 1.5794271138002782, + "language_loss": 0.6981045, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.7192567, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4296875, + "step": 7704, + "time_per_iteration": 2.403841972351074 + }, + { + "auxiliary_loss_clip": 0.01063504, + "auxiliary_loss_mlp": 0.01047391, + "balance_loss_clip": 1.01880431, + "balance_loss_mlp": 1.02057266, + "epoch": 0.46324966180670374, + "flos": 26318995004160.0, + "grad_norm": 2.072470200963084, + "language_loss": 0.82362366, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.84473258, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 7705, + "time_per_iteration": 2.4477264881134033 + }, + { + "auxiliary_loss_clip": 0.0106365, + "auxiliary_loss_mlp": 0.0104771, + "balance_loss_clip": 1.01809812, + "balance_loss_mlp": 1.02010608, + "epoch": 0.4633097850593717, + "flos": 19239621575040.0, + "grad_norm": 2.416199490240897, + "language_loss": 0.78609657, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.80721009, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43554688, + "step": 7706, + "time_per_iteration": 2.363382577896118 + }, + { + "auxiliary_loss_clip": 0.01059658, + "auxiliary_loss_mlp": 0.01051072, + "balance_loss_clip": 1.02215242, + "balance_loss_mlp": 1.01965928, + "epoch": 0.46336990831203967, + "flos": 22782083742720.0, + "grad_norm": 1.8433642272977389, + "language_loss": 0.71520853, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.73631585, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40039062, + "step": 7707, + "time_per_iteration": 2.410961627960205 + }, + { + "auxiliary_loss_clip": 0.01063291, + "auxiliary_loss_mlp": 0.01055759, + "balance_loss_clip": 1.02438319, + "balance_loss_mlp": 1.01885915, + "epoch": 0.46343003156470763, + "flos": 38209260539520.0, + "grad_norm": 1.9605414941144852, + "language_loss": 0.62307763, + "learning_rate": 2.332413576865791e-06, + "loss": 0.64426804, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 7708, + "time_per_iteration": 2.4999494552612305 + }, + { + "auxiliary_loss_clip": 0.01062186, + "auxiliary_loss_mlp": 0.01055731, + "balance_loss_clip": 1.02600026, + "balance_loss_mlp": 1.01976109, + "epoch": 0.4634901548173756, + "flos": 31937288279040.0, + "grad_norm": 2.407911473774212, + "language_loss": 0.78146595, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.80264515, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42382812, + "step": 7709, + "time_per_iteration": 2.472043991088867 + }, + { + "auxiliary_loss_clip": 0.01063394, + "auxiliary_loss_mlp": 0.01054934, + "balance_loss_clip": 1.02396369, + "balance_loss_mlp": 1.01973319, + "epoch": 0.46355027807004356, + "flos": 20081551944960.0, + "grad_norm": 1.7252107878954535, + "language_loss": 0.78522819, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.8064115, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.4375, + "step": 7710, + "time_per_iteration": 2.374325752258301 + }, + { + "auxiliary_loss_clip": 0.01064597, + "auxiliary_loss_mlp": 0.01052578, + "balance_loss_clip": 1.0206058, + "balance_loss_mlp": 1.02014267, + "epoch": 0.4636104013227116, + "flos": 24060219039360.0, + "grad_norm": 2.343754833877412, + "language_loss": 0.7461611, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.76733285, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4453125, + "step": 7711, + "time_per_iteration": 2.467454671859741 + }, + { + "auxiliary_loss_clip": 0.01061971, + "auxiliary_loss_mlp": 0.01056806, + "balance_loss_clip": 1.02646708, + "balance_loss_mlp": 1.02045226, + "epoch": 0.46367052457537955, + "flos": 23913514039680.0, + "grad_norm": 1.2824809445630814, + "language_loss": 0.72637415, + "learning_rate": 2.33087729766797e-06, + "loss": 0.74756193, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.41601562, + "step": 7712, + "time_per_iteration": 2.4148452281951904 + }, + { + "auxiliary_loss_clip": 0.01065983, + "auxiliary_loss_mlp": 0.01065809, + "balance_loss_clip": 1.03219175, + "balance_loss_mlp": 1.02013326, + "epoch": 0.4637306478280475, + "flos": 26395314969600.0, + "grad_norm": 1.6689263978641424, + "language_loss": 0.74379748, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.76511538, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.45898438, + "step": 7713, + "time_per_iteration": 2.4980030059814453 + }, + { + "auxiliary_loss_clip": 0.01063818, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.02416134, + "balance_loss_mlp": 1.0190475, + "epoch": 0.4637907710807155, + "flos": 21979639987200.0, + "grad_norm": 1.8692476723845421, + "language_loss": 0.60609603, + "learning_rate": 2.3301090827294e-06, + "loss": 0.62730056, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.44726562, + "step": 7714, + "time_per_iteration": 2.385563611984253 + }, + { + "auxiliary_loss_clip": 0.01059588, + "auxiliary_loss_mlp": 0.01056283, + "balance_loss_clip": 1.02447832, + "balance_loss_mlp": 1.01804137, + "epoch": 0.46385089433338345, + "flos": 12421468005120.0, + "grad_norm": 4.365879384273636, + "language_loss": 0.70870769, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72986639, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.41601562, + "step": 7715, + "time_per_iteration": 2.385610342025757 + }, + { + "auxiliary_loss_clip": 0.01066641, + "auxiliary_loss_mlp": 0.01052573, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.0205164, + "epoch": 0.4639110175860514, + "flos": 23914596291840.0, + "grad_norm": 2.9956151696584064, + "language_loss": 0.69622141, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.71741354, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.4609375, + "step": 7716, + "time_per_iteration": 2.3896665573120117 + }, + { + "auxiliary_loss_clip": 0.01064866, + "auxiliary_loss_mlp": 0.01051327, + "balance_loss_clip": 1.01933169, + "balance_loss_mlp": 1.019701, + "epoch": 0.4639711408387194, + "flos": 25299251746560.0, + "grad_norm": 1.605371228142037, + "language_loss": 0.82143563, + "learning_rate": 2.328956666474691e-06, + "loss": 0.8425976, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.453125, + "step": 7717, + "time_per_iteration": 2.4502758979797363 + }, + { + "auxiliary_loss_clip": 0.01065031, + "auxiliary_loss_mlp": 0.01046272, + "balance_loss_clip": 1.0146811, + "balance_loss_mlp": 1.02015507, + "epoch": 0.46403126409138734, + "flos": 21210852648960.0, + "grad_norm": 1.7405919977552307, + "language_loss": 0.74390274, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.76501578, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.44921875, + "step": 7718, + "time_per_iteration": 2.378511905670166 + }, + { + "auxiliary_loss_clip": 0.01062829, + "auxiliary_loss_mlp": 0.01054168, + "balance_loss_clip": 1.02357841, + "balance_loss_mlp": 1.01924551, + "epoch": 0.4640913873440553, + "flos": 35844104062080.0, + "grad_norm": 1.7298828723218822, + "language_loss": 0.71640426, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.73757422, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43554688, + "step": 7719, + "time_per_iteration": 2.5230586528778076 + }, + { + "auxiliary_loss_clip": 0.01067119, + "auxiliary_loss_mlp": 0.01051906, + "balance_loss_clip": 1.01878953, + "balance_loss_mlp": 1.02075315, + "epoch": 0.46415151059672327, + "flos": 19165361379840.0, + "grad_norm": 2.5785232113690895, + "language_loss": 0.87531042, + "learning_rate": 2.327804137953357e-06, + "loss": 0.89650071, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.46289062, + "step": 7720, + "time_per_iteration": 2.363158702850342 + }, + { + "auxiliary_loss_clip": 0.01014888, + "auxiliary_loss_mlp": 0.0100487, + "balance_loss_clip": 1.00231934, + "balance_loss_mlp": 1.00667787, + "epoch": 0.46421163384939124, + "flos": 58909401820800.0, + "grad_norm": 0.7286691291723035, + "language_loss": 0.55092412, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57112163, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.08203125, + "step": 7721, + "time_per_iteration": 3.057159185409546 + }, + { + "auxiliary_loss_clip": 0.01063572, + "auxiliary_loss_mlp": 0.0105306, + "balance_loss_clip": 1.02001524, + "balance_loss_mlp": 1.0210247, + "epoch": 0.4642717571020592, + "flos": 20156300899200.0, + "grad_norm": 2.2341439918799986, + "language_loss": 0.8064093, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.82757568, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.42578125, + "step": 7722, + "time_per_iteration": 2.3673644065856934 + }, + { + "auxiliary_loss_clip": 0.01066621, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.0127095, + "balance_loss_mlp": 1.02177405, + "epoch": 0.46433188035472717, + "flos": 25045014159360.0, + "grad_norm": 2.197113484047951, + "language_loss": 0.78682667, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80795974, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.44921875, + "step": 7723, + "time_per_iteration": 2.4375202655792236 + }, + { + "auxiliary_loss_clip": 0.01063149, + "auxiliary_loss_mlp": 0.010496, + "balance_loss_clip": 1.01665092, + "balance_loss_mlp": 1.01938748, + "epoch": 0.4643920036073952, + "flos": 28074357941760.0, + "grad_norm": 1.5947324440446287, + "language_loss": 0.69365025, + "learning_rate": 2.326267259301118e-06, + "loss": 0.71477771, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4375, + "step": 7724, + "time_per_iteration": 2.437103033065796 + }, + { + "auxiliary_loss_clip": 0.01064569, + "auxiliary_loss_mlp": 0.01046423, + "balance_loss_clip": 1.01720488, + "balance_loss_mlp": 1.0215894, + "epoch": 0.46445212686006315, + "flos": 18368363796480.0, + "grad_norm": 2.153775602909375, + "language_loss": 0.69129717, + "learning_rate": 2.325883008671415e-06, + "loss": 0.71240711, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 7725, + "time_per_iteration": 2.404132127761841 + }, + { + "auxiliary_loss_clip": 0.01061175, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.01756334, + "balance_loss_mlp": 1.02018833, + "epoch": 0.4645122501127311, + "flos": 31720302979200.0, + "grad_norm": 1.7071924832533731, + "language_loss": 0.66462874, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.68570125, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 7726, + "time_per_iteration": 2.458108901977539 + }, + { + "auxiliary_loss_clip": 0.01064198, + "auxiliary_loss_mlp": 0.01057885, + "balance_loss_clip": 1.02789223, + "balance_loss_mlp": 1.02132297, + "epoch": 0.4645723733653991, + "flos": 23767681824000.0, + "grad_norm": 1.8202246219584926, + "language_loss": 0.75917971, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.78040057, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 7727, + "time_per_iteration": 2.4059643745422363 + }, + { + "auxiliary_loss_clip": 0.01064471, + "auxiliary_loss_mlp": 0.01051123, + "balance_loss_clip": 1.02112961, + "balance_loss_mlp": 1.02076662, + "epoch": 0.46463249661806705, + "flos": 33144130846080.0, + "grad_norm": 1.963373560009747, + "language_loss": 0.80413675, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.82529265, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4375, + "step": 7728, + "time_per_iteration": 2.4513487815856934 + }, + { + "auxiliary_loss_clip": 0.01063946, + "auxiliary_loss_mlp": 0.0105348, + "balance_loss_clip": 1.02067339, + "balance_loss_mlp": 1.02091074, + "epoch": 0.464692619870735, + "flos": 18295046208000.0, + "grad_norm": 1.7256566729337512, + "language_loss": 0.76694024, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78811449, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.4296875, + "step": 7729, + "time_per_iteration": 3.691314458847046 + }, + { + "auxiliary_loss_clip": 0.01063767, + "auxiliary_loss_mlp": 0.01058449, + "balance_loss_clip": 1.02721667, + "balance_loss_mlp": 1.02081048, + "epoch": 0.464752743123403, + "flos": 22636949754240.0, + "grad_norm": 1.6607633307326528, + "language_loss": 0.81447208, + "learning_rate": 2.323961570451588e-06, + "loss": 0.83569425, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4296875, + "step": 7730, + "time_per_iteration": 2.38246488571167 + }, + { + "auxiliary_loss_clip": 0.01060608, + "auxiliary_loss_mlp": 0.01059458, + "balance_loss_clip": 1.02841544, + "balance_loss_mlp": 1.01895368, + "epoch": 0.46481286637607094, + "flos": 20411097068160.0, + "grad_norm": 1.566540440584837, + "language_loss": 0.78260458, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.80380523, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41796875, + "step": 7731, + "time_per_iteration": 3.768162727355957 + }, + { + "auxiliary_loss_clip": 0.01060589, + "auxiliary_loss_mlp": 0.01045408, + "balance_loss_clip": 1.01870561, + "balance_loss_mlp": 1.01968622, + "epoch": 0.4648729896287389, + "flos": 34274025043200.0, + "grad_norm": 1.688239820417337, + "language_loss": 0.66610718, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68716717, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 7732, + "time_per_iteration": 3.838547945022583 + }, + { + "auxiliary_loss_clip": 0.01063776, + "auxiliary_loss_mlp": 0.01057396, + "balance_loss_clip": 1.02156186, + "balance_loss_mlp": 1.01896298, + "epoch": 0.4649331128814069, + "flos": 21320794120320.0, + "grad_norm": 3.3408094665503043, + "language_loss": 0.73912078, + "learning_rate": 2.32280855998725e-06, + "loss": 0.76033247, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 0.44921875, + "step": 7733, + "time_per_iteration": 2.374772548675537 + }, + { + "auxiliary_loss_clip": 0.01010184, + "auxiliary_loss_mlp": 0.01015728, + "balance_loss_clip": 1.01285481, + "balance_loss_mlp": 1.00250912, + "epoch": 0.46499323613407484, + "flos": 58305754546560.0, + "grad_norm": 1.2600022761213459, + "language_loss": 0.52085185, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54111099, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.02868652, + "router_z_loss_mlp": 0.07666016, + "step": 7734, + "time_per_iteration": 2.9554405212402344 + }, + { + "auxiliary_loss_clip": 0.01061063, + "auxiliary_loss_mlp": 0.0104802, + "balance_loss_clip": 1.0166204, + "balance_loss_mlp": 1.01855171, + "epoch": 0.4650533593867428, + "flos": 10888885653120.0, + "grad_norm": 2.037897062118562, + "language_loss": 0.77063197, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.79172277, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.42578125, + "step": 7735, + "time_per_iteration": 2.331630229949951 + }, + { + "auxiliary_loss_clip": 0.01060078, + "auxiliary_loss_mlp": 0.01055329, + "balance_loss_clip": 1.02519262, + "balance_loss_mlp": 1.0188632, + "epoch": 0.46511348263941077, + "flos": 19973565509760.0, + "grad_norm": 2.303358795571282, + "language_loss": 0.71072978, + "learning_rate": 2.321655439354519e-06, + "loss": 0.73188382, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41210938, + "step": 7736, + "time_per_iteration": 2.404212474822998 + }, + { + "auxiliary_loss_clip": 0.01059184, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02118635, + "balance_loss_mlp": 1.01852643, + "epoch": 0.46517360589207873, + "flos": 19677502247040.0, + "grad_norm": 1.5584611025866622, + "language_loss": 0.73010939, + "learning_rate": 2.321271041396427e-06, + "loss": 0.75118762, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40820312, + "step": 7737, + "time_per_iteration": 2.3608734607696533 + }, + { + "auxiliary_loss_clip": 0.01063431, + "auxiliary_loss_mlp": 0.01052656, + "balance_loss_clip": 1.02129221, + "balance_loss_mlp": 1.02106786, + "epoch": 0.46523372914474675, + "flos": 16871742011520.0, + "grad_norm": 1.9097089420678, + "language_loss": 0.84505951, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.86622036, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.42382812, + "step": 7738, + "time_per_iteration": 3.7910706996917725 + }, + { + "auxiliary_loss_clip": 0.01010856, + "auxiliary_loss_mlp": 0.01018455, + "balance_loss_clip": 1.01577246, + "balance_loss_mlp": 1.0031625, + "epoch": 0.4652938523974147, + "flos": 53435963243520.0, + "grad_norm": 0.7670022416207088, + "language_loss": 0.57852137, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59881449, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.07714844, + "step": 7739, + "time_per_iteration": 3.0808160305023193 + }, + { + "auxiliary_loss_clip": 0.01060667, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.02944613, + "balance_loss_mlp": 1.0195415, + "epoch": 0.4653539756500827, + "flos": 15230405174400.0, + "grad_norm": 1.7329680802560934, + "language_loss": 0.86130267, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.88248408, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41015625, + "step": 7740, + "time_per_iteration": 2.343319892883301 + }, + { + "auxiliary_loss_clip": 0.01062243, + "auxiliary_loss_mlp": 0.01050439, + "balance_loss_clip": 1.01954007, + "balance_loss_mlp": 1.02116334, + "epoch": 0.46541409890275065, + "flos": 23731127763840.0, + "grad_norm": 1.6479007503068148, + "language_loss": 0.76378405, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78491092, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41015625, + "step": 7741, + "time_per_iteration": 2.4303243160247803 + }, + { + "auxiliary_loss_clip": 0.01067115, + "auxiliary_loss_mlp": 0.01050415, + "balance_loss_clip": 1.02035046, + "balance_loss_mlp": 1.02211905, + "epoch": 0.4654742221554186, + "flos": 20846359388160.0, + "grad_norm": 2.1694290513296877, + "language_loss": 0.81759471, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83877003, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45117188, + "step": 7742, + "time_per_iteration": 2.389361619949341 + }, + { + "auxiliary_loss_clip": 0.01065799, + "auxiliary_loss_mlp": 0.01054087, + "balance_loss_clip": 1.02297306, + "balance_loss_mlp": 1.02185822, + "epoch": 0.4655343454080866, + "flos": 20703773928960.0, + "grad_norm": 1.6044049654842119, + "language_loss": 0.73701543, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.75821435, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43945312, + "step": 7743, + "time_per_iteration": 2.4023377895355225 + }, + { + "auxiliary_loss_clip": 0.01063307, + "auxiliary_loss_mlp": 0.01046971, + "balance_loss_clip": 1.01758623, + "balance_loss_mlp": 1.02071619, + "epoch": 0.46559446866075455, + "flos": 18988840212480.0, + "grad_norm": 2.0952100409196093, + "language_loss": 0.72614896, + "learning_rate": 2.318579915392483e-06, + "loss": 0.74725169, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42578125, + "step": 7744, + "time_per_iteration": 2.384129047393799 + }, + { + "auxiliary_loss_clip": 0.01062438, + "auxiliary_loss_mlp": 0.01039248, + "balance_loss_clip": 1.01482213, + "balance_loss_mlp": 1.02252662, + "epoch": 0.4656545919134225, + "flos": 34494920415360.0, + "grad_norm": 1.4858152459723244, + "language_loss": 0.85575444, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87677127, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3984375, + "step": 7745, + "time_per_iteration": 2.51816463470459 + }, + { + "auxiliary_loss_clip": 0.01064294, + "auxiliary_loss_mlp": 0.01043372, + "balance_loss_clip": 1.01447582, + "balance_loss_mlp": 1.02382565, + "epoch": 0.4657147151660905, + "flos": 24309569036160.0, + "grad_norm": 1.5945846523857399, + "language_loss": 0.73737228, + "learning_rate": 2.317810913304574e-06, + "loss": 0.7584489, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40429688, + "step": 7746, + "time_per_iteration": 2.4029455184936523 + }, + { + "auxiliary_loss_clip": 0.01063042, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_clip": 1.01941335, + "balance_loss_mlp": 1.02306592, + "epoch": 0.46577483841875844, + "flos": 58793038525440.0, + "grad_norm": 1.505215186546429, + "language_loss": 0.71050096, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.73159063, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40039062, + "step": 7747, + "time_per_iteration": 2.7494115829467773 + }, + { + "auxiliary_loss_clip": 0.01061836, + "auxiliary_loss_mlp": 0.01048918, + "balance_loss_clip": 1.02170205, + "balance_loss_mlp": 1.02175891, + "epoch": 0.4658349616714264, + "flos": 31320617201280.0, + "grad_norm": 1.58263099299657, + "language_loss": 0.68362588, + "learning_rate": 2.317041863010978e-06, + "loss": 0.70473343, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 7748, + "time_per_iteration": 2.4643189907073975 + }, + { + "auxiliary_loss_clip": 0.01065293, + "auxiliary_loss_mlp": 0.01050598, + "balance_loss_clip": 1.01950788, + "balance_loss_mlp": 1.02212608, + "epoch": 0.46589508492409437, + "flos": 14859627868800.0, + "grad_norm": 2.025016069604226, + "language_loss": 0.6630739, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.68423277, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43359375, + "step": 7749, + "time_per_iteration": 2.3929193019866943 + }, + { + "auxiliary_loss_clip": 0.01065433, + "auxiliary_loss_mlp": 0.01048806, + "balance_loss_clip": 1.01975453, + "balance_loss_mlp": 1.02284026, + "epoch": 0.46595520817676234, + "flos": 12895169598720.0, + "grad_norm": 1.9886957123345985, + "language_loss": 0.7543211, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.77546352, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 7750, + "time_per_iteration": 2.3537771701812744 + }, + { + "auxiliary_loss_clip": 0.01064314, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.01531219, + "balance_loss_mlp": 1.02130795, + "epoch": 0.46601533142943036, + "flos": 32852780616960.0, + "grad_norm": 2.020092971078369, + "language_loss": 0.75695062, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.7780121, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.43164062, + "step": 7751, + "time_per_iteration": 2.493079423904419 + }, + { + "auxiliary_loss_clip": 0.01065635, + "auxiliary_loss_mlp": 0.01051298, + "balance_loss_clip": 1.02188873, + "balance_loss_mlp": 1.02236414, + "epoch": 0.4660754546820983, + "flos": 19966687971840.0, + "grad_norm": 1.9821187078305789, + "language_loss": 0.74980283, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.77097207, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 7752, + "time_per_iteration": 2.348349094390869 + }, + { + "auxiliary_loss_clip": 0.01063773, + "auxiliary_loss_mlp": 0.0105543, + "balance_loss_clip": 1.02659297, + "balance_loss_mlp": 1.02042866, + "epoch": 0.4661355779347663, + "flos": 26686944489600.0, + "grad_norm": 2.2402111051636253, + "language_loss": 0.71250015, + "learning_rate": 2.315119027142644e-06, + "loss": 0.73369217, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43359375, + "step": 7753, + "time_per_iteration": 2.4525625705718994 + }, + { + "auxiliary_loss_clip": 0.01059783, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.01849484, + "balance_loss_mlp": 1.02001381, + "epoch": 0.46619570118743425, + "flos": 20958395541120.0, + "grad_norm": 2.9785662837282287, + "language_loss": 0.74098778, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.76202893, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 7754, + "time_per_iteration": 2.3645730018615723 + }, + { + "auxiliary_loss_clip": 0.01065109, + "auxiliary_loss_mlp": 0.01052445, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.02102852, + "epoch": 0.4662558244401022, + "flos": 24424921768320.0, + "grad_norm": 2.089828090416352, + "language_loss": 0.80062532, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.82180095, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 7755, + "time_per_iteration": 2.4428043365478516 + }, + { + "auxiliary_loss_clip": 0.01057903, + "auxiliary_loss_mlp": 0.01051967, + "balance_loss_clip": 1.0245136, + "balance_loss_mlp": 1.0179708, + "epoch": 0.4663159476927702, + "flos": 20594391039360.0, + "grad_norm": 1.5952910574245276, + "language_loss": 0.73218393, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.75328261, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 7756, + "time_per_iteration": 2.3651952743530273 + }, + { + "auxiliary_loss_clip": 0.01059477, + "auxiliary_loss_mlp": 0.0104534, + "balance_loss_clip": 1.0180409, + "balance_loss_mlp": 1.01927757, + "epoch": 0.46637607094543815, + "flos": 25660812453120.0, + "grad_norm": 1.6253400368963045, + "language_loss": 0.78906095, + "learning_rate": 2.313580543272274e-06, + "loss": 0.81010914, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 7757, + "time_per_iteration": 2.4442760944366455 + }, + { + "auxiliary_loss_clip": 0.01057763, + "auxiliary_loss_mlp": 0.01049617, + "balance_loss_clip": 1.02173448, + "balance_loss_mlp": 1.01706851, + "epoch": 0.4664361941981061, + "flos": 24272875330560.0, + "grad_norm": 1.910035993011496, + "language_loss": 0.67820388, + "learning_rate": 2.313195892540705e-06, + "loss": 0.6992777, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 7758, + "time_per_iteration": 2.4202370643615723 + }, + { + "auxiliary_loss_clip": 0.01060399, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02212429, + "balance_loss_mlp": 1.01894784, + "epoch": 0.4664963174507741, + "flos": 18404882945280.0, + "grad_norm": 1.7201659397462121, + "language_loss": 0.76238203, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.7834872, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.4140625, + "step": 7759, + "time_per_iteration": 2.4078283309936523 + }, + { + "auxiliary_loss_clip": 0.0105965, + "auxiliary_loss_mlp": 0.01051883, + "balance_loss_clip": 1.02490568, + "balance_loss_mlp": 1.0189147, + "epoch": 0.46655644070344204, + "flos": 22454039808000.0, + "grad_norm": 1.5974373699986775, + "language_loss": 0.78132343, + "learning_rate": 2.312426555462893e-06, + "loss": 0.8024388, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 7760, + "time_per_iteration": 2.4319489002227783 + }, + { + "auxiliary_loss_clip": 0.01057208, + "auxiliary_loss_mlp": 0.01046074, + "balance_loss_clip": 1.01990795, + "balance_loss_mlp": 1.01800978, + "epoch": 0.46661656395611, + "flos": 13807554825600.0, + "grad_norm": 1.6902864471619454, + "language_loss": 0.76029134, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.78132421, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 7761, + "time_per_iteration": 2.3493776321411133 + }, + { + "auxiliary_loss_clip": 0.01061971, + "auxiliary_loss_mlp": 0.01053256, + "balance_loss_clip": 1.02195215, + "balance_loss_mlp": 1.01953053, + "epoch": 0.466676687208778, + "flos": 21651107293440.0, + "grad_norm": 1.6775834476930387, + "language_loss": 0.79803693, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81918919, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.42382812, + "step": 7762, + "time_per_iteration": 2.401060104370117 + }, + { + "auxiliary_loss_clip": 0.010115, + "auxiliary_loss_mlp": 0.01013796, + "balance_loss_clip": 1.01104236, + "balance_loss_mlp": 1.00389266, + "epoch": 0.46673681046144594, + "flos": 68530941653760.0, + "grad_norm": 0.8213144378579171, + "language_loss": 0.59910226, + "learning_rate": 2.311272461028297e-06, + "loss": 0.6193552, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.07617188, + "step": 7763, + "time_per_iteration": 3.1485788822174072 + }, + { + "auxiliary_loss_clip": 0.0106285, + "auxiliary_loss_mlp": 0.01045583, + "balance_loss_clip": 1.01503003, + "balance_loss_mlp": 1.01999879, + "epoch": 0.46679693371411396, + "flos": 15813559480320.0, + "grad_norm": 2.2742388297086094, + "language_loss": 0.80113053, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.82221484, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4296875, + "step": 7764, + "time_per_iteration": 2.350856065750122 + }, + { + "auxiliary_loss_clip": 0.01060644, + "auxiliary_loss_mlp": 0.01042896, + "balance_loss_clip": 1.01818383, + "balance_loss_mlp": 1.02149522, + "epoch": 0.4668570569667819, + "flos": 18513602519040.0, + "grad_norm": 1.7937899392023746, + "language_loss": 0.73363829, + "learning_rate": 2.310503005696839e-06, + "loss": 0.75467366, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.39257812, + "step": 7765, + "time_per_iteration": 2.376589775085449 + }, + { + "auxiliary_loss_clip": 0.01062561, + "auxiliary_loss_mlp": 0.01046108, + "balance_loss_clip": 1.0172112, + "balance_loss_mlp": 1.02068698, + "epoch": 0.4669171802194499, + "flos": 19205685866880.0, + "grad_norm": 3.288266368161227, + "language_loss": 0.79914534, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.82023203, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7766, + "time_per_iteration": 2.352276563644409 + }, + { + "auxiliary_loss_clip": 0.01061471, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.0189743, + "balance_loss_mlp": 1.02104187, + "epoch": 0.46697730347211786, + "flos": 12275321587200.0, + "grad_norm": 2.9695605022363205, + "language_loss": 0.6661284, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.68720949, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40429688, + "step": 7767, + "time_per_iteration": 2.3489773273468018 + }, + { + "auxiliary_loss_clip": 0.01065729, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_clip": 1.01625514, + "balance_loss_mlp": 1.02327025, + "epoch": 0.4670374267247858, + "flos": 23585609750400.0, + "grad_norm": 2.046740949835028, + "language_loss": 0.75515103, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.77624559, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.42578125, + "step": 7768, + "time_per_iteration": 2.4242918491363525 + }, + { + "auxiliary_loss_clip": 0.01064529, + "auxiliary_loss_mlp": 0.010399, + "balance_loss_clip": 1.01491404, + "balance_loss_mlp": 1.02302027, + "epoch": 0.4670975499774538, + "flos": 15990359938560.0, + "grad_norm": 1.7072447625060694, + "language_loss": 0.71459734, + "learning_rate": 2.308963953858982e-06, + "loss": 0.7356416, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.4140625, + "step": 7769, + "time_per_iteration": 3.642825126647949 + }, + { + "auxiliary_loss_clip": 0.01062159, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.01300299, + "balance_loss_mlp": 1.02084136, + "epoch": 0.46715767323012175, + "flos": 15376691237760.0, + "grad_norm": 3.2356923136944884, + "language_loss": 0.83394587, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.85495496, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.4140625, + "step": 7770, + "time_per_iteration": 3.704324960708618 + }, + { + "auxiliary_loss_clip": 0.01016936, + "auxiliary_loss_mlp": 0.01006257, + "balance_loss_clip": 1.00364602, + "balance_loss_mlp": 1.00895143, + "epoch": 0.4672177964827897, + "flos": 60249124488960.0, + "grad_norm": 0.8004469076539501, + "language_loss": 0.55693418, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57716614, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.08007812, + "step": 7771, + "time_per_iteration": 4.359293460845947 + }, + { + "auxiliary_loss_clip": 0.01062585, + "auxiliary_loss_mlp": 0.01048482, + "balance_loss_clip": 1.02408004, + "balance_loss_mlp": 1.02246726, + "epoch": 0.4672779197354577, + "flos": 27634906258560.0, + "grad_norm": 2.2655550019433814, + "language_loss": 0.67073905, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.69184971, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.40234375, + "step": 7772, + "time_per_iteration": 2.453249931335449 + }, + { + "auxiliary_loss_clip": 0.01061236, + "auxiliary_loss_mlp": 0.01050693, + "balance_loss_clip": 1.02486038, + "balance_loss_mlp": 1.02130628, + "epoch": 0.46733804298812565, + "flos": 31392922360320.0, + "grad_norm": 2.4313814338856523, + "language_loss": 0.64693952, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.66805887, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 7773, + "time_per_iteration": 2.4556703567504883 + }, + { + "auxiliary_loss_clip": 0.0106096, + "auxiliary_loss_mlp": 0.01051724, + "balance_loss_clip": 1.02412701, + "balance_loss_mlp": 1.01996052, + "epoch": 0.4673981662407936, + "flos": 19499584625280.0, + "grad_norm": 1.7950671575375483, + "language_loss": 0.81768447, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.83881134, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 7774, + "time_per_iteration": 2.395684242248535 + }, + { + "auxiliary_loss_clip": 0.010629, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.0236938, + "balance_loss_mlp": 1.02128112, + "epoch": 0.4674582894934616, + "flos": 20520794160000.0, + "grad_norm": 1.7850005108627631, + "language_loss": 0.79257977, + "learning_rate": 2.306655024915726e-06, + "loss": 0.81372023, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41601562, + "step": 7775, + "time_per_iteration": 2.3757708072662354 + }, + { + "auxiliary_loss_clip": 0.01060608, + "auxiliary_loss_mlp": 0.01054369, + "balance_loss_clip": 1.02488828, + "balance_loss_mlp": 1.02059889, + "epoch": 0.46751841274612954, + "flos": 22089860749440.0, + "grad_norm": 1.793971568873115, + "language_loss": 0.70931977, + "learning_rate": 2.306270162640694e-06, + "loss": 0.73046958, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40039062, + "step": 7776, + "time_per_iteration": 2.4042041301727295 + }, + { + "auxiliary_loss_clip": 0.0106286, + "auxiliary_loss_mlp": 0.01052545, + "balance_loss_clip": 1.02695072, + "balance_loss_mlp": 1.02126336, + "epoch": 0.46757853599879756, + "flos": 26978853300480.0, + "grad_norm": 1.389042603820999, + "language_loss": 0.74478555, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.76593965, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.41601562, + "step": 7777, + "time_per_iteration": 2.415757656097412 + }, + { + "auxiliary_loss_clip": 0.01060533, + "auxiliary_loss_mlp": 0.01055913, + "balance_loss_clip": 1.03000855, + "balance_loss_mlp": 1.0186249, + "epoch": 0.4676386592514655, + "flos": 24132908223360.0, + "grad_norm": 1.8890180696293926, + "language_loss": 0.71421552, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.73537993, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41796875, + "step": 7778, + "time_per_iteration": 3.861370325088501 + }, + { + "auxiliary_loss_clip": 0.01060938, + "auxiliary_loss_mlp": 0.01045741, + "balance_loss_clip": 1.01723814, + "balance_loss_mlp": 1.01942217, + "epoch": 0.4676987825041335, + "flos": 25482545717760.0, + "grad_norm": 1.6841242781431371, + "language_loss": 0.74807322, + "learning_rate": 2.305115506191206e-06, + "loss": 0.76914001, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 7779, + "time_per_iteration": 2.423671007156372 + }, + { + "auxiliary_loss_clip": 0.01059824, + "auxiliary_loss_mlp": 0.01053717, + "balance_loss_clip": 1.02561975, + "balance_loss_mlp": 1.01956666, + "epoch": 0.46775890575680146, + "flos": 21944203090560.0, + "grad_norm": 1.4726677163064938, + "language_loss": 0.73534739, + "learning_rate": 2.304730597548562e-06, + "loss": 0.75648272, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 7780, + "time_per_iteration": 2.415987014770508 + }, + { + "auxiliary_loss_clip": 0.01061991, + "auxiliary_loss_mlp": 0.01047539, + "balance_loss_clip": 1.01923823, + "balance_loss_mlp": 1.01835346, + "epoch": 0.4678190290094694, + "flos": 25227225878400.0, + "grad_norm": 2.1084711443683033, + "language_loss": 0.74959117, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.77068651, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4375, + "step": 7781, + "time_per_iteration": 2.3927628993988037 + }, + { + "auxiliary_loss_clip": 0.01059705, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.01684546, + "balance_loss_mlp": 1.01778555, + "epoch": 0.4678791522621374, + "flos": 32267042870400.0, + "grad_norm": 1.9137160181325696, + "language_loss": 0.63836265, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65941852, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41796875, + "step": 7782, + "time_per_iteration": 2.4727392196655273 + }, + { + "auxiliary_loss_clip": 0.0106424, + "auxiliary_loss_mlp": 0.01047014, + "balance_loss_clip": 1.01812983, + "balance_loss_mlp": 1.01988888, + "epoch": 0.46793927551480535, + "flos": 27045432996480.0, + "grad_norm": 2.0700181493198593, + "language_loss": 0.64627683, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.66738939, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.44335938, + "step": 7783, + "time_per_iteration": 2.4182987213134766 + }, + { + "auxiliary_loss_clip": 0.01064562, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.02011967, + "epoch": 0.4679993987674733, + "flos": 17456432417280.0, + "grad_norm": 2.474503611536118, + "language_loss": 0.69663328, + "learning_rate": 2.303190847569801e-06, + "loss": 0.71778804, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.44335938, + "step": 7784, + "time_per_iteration": 2.3916144371032715 + }, + { + "auxiliary_loss_clip": 0.01061346, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.01763642, + "balance_loss_mlp": 1.02065396, + "epoch": 0.4680595220201413, + "flos": 17164174492800.0, + "grad_norm": 2.7222036535210345, + "language_loss": 0.85957915, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.88064241, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 7785, + "time_per_iteration": 2.363743305206299 + }, + { + "auxiliary_loss_clip": 0.01064041, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_clip": 1.02103329, + "balance_loss_mlp": 1.02117395, + "epoch": 0.46811964527280925, + "flos": 11326801236480.0, + "grad_norm": 1.9248623707658652, + "language_loss": 0.78930551, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.81045175, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 7786, + "time_per_iteration": 2.4428436756134033 + }, + { + "auxiliary_loss_clip": 0.01058889, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.01221967, + "balance_loss_mlp": 1.01956677, + "epoch": 0.4681797685254772, + "flos": 24277693098240.0, + "grad_norm": 1.7604738604904435, + "language_loss": 0.75292897, + "learning_rate": 2.302035914315856e-06, + "loss": 0.77388787, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39453125, + "step": 7787, + "time_per_iteration": 2.3883590698242188 + }, + { + "auxiliary_loss_clip": 0.0106369, + "auxiliary_loss_mlp": 0.01048068, + "balance_loss_clip": 1.02037585, + "balance_loss_mlp": 1.0221467, + "epoch": 0.4682398917781452, + "flos": 31649010249600.0, + "grad_norm": 1.67846293910167, + "language_loss": 0.6696822, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.69079977, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41601562, + "step": 7788, + "time_per_iteration": 2.4894304275512695 + }, + { + "auxiliary_loss_clip": 0.01062401, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.01548874, + "balance_loss_mlp": 1.02156472, + "epoch": 0.46830001503081314, + "flos": 28109515547520.0, + "grad_norm": 2.7033969479384563, + "language_loss": 0.65038407, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.67141545, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40820312, + "step": 7789, + "time_per_iteration": 2.4446873664855957 + }, + { + "auxiliary_loss_clip": 0.01019191, + "auxiliary_loss_mlp": 0.01012611, + "balance_loss_clip": 1.0099889, + "balance_loss_mlp": 1.01134038, + "epoch": 0.4683601382834811, + "flos": 57878661484800.0, + "grad_norm": 0.7014177006080612, + "language_loss": 0.6190331, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63935113, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.078125, + "step": 7790, + "time_per_iteration": 3.112109661102295 + }, + { + "auxiliary_loss_clip": 0.01063111, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.01455307, + "balance_loss_mlp": 1.02358985, + "epoch": 0.46842026153614913, + "flos": 21870850590720.0, + "grad_norm": 1.6788595373636737, + "language_loss": 0.80050588, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.8215338, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 7791, + "time_per_iteration": 2.3955464363098145 + }, + { + "auxiliary_loss_clip": 0.01063238, + "auxiliary_loss_mlp": 0.01046635, + "balance_loss_clip": 1.01889455, + "balance_loss_mlp": 1.02193964, + "epoch": 0.4684803847888171, + "flos": 24899635791360.0, + "grad_norm": 1.5662915910041637, + "language_loss": 0.75373697, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77483571, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 7792, + "time_per_iteration": 2.4700374603271484 + }, + { + "auxiliary_loss_clip": 0.01060798, + "auxiliary_loss_mlp": 0.01038489, + "balance_loss_clip": 1.01338339, + "balance_loss_mlp": 1.02095056, + "epoch": 0.46854050804148506, + "flos": 26250425360640.0, + "grad_norm": 2.2396379676281657, + "language_loss": 0.68921649, + "learning_rate": 2.299725738964898e-06, + "loss": 0.71020937, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3984375, + "step": 7793, + "time_per_iteration": 2.4273033142089844 + }, + { + "auxiliary_loss_clip": 0.01062898, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.01478648, + "balance_loss_mlp": 1.02228665, + "epoch": 0.468600631294153, + "flos": 21578732311680.0, + "grad_norm": 1.687992561788393, + "language_loss": 0.75001907, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.77104974, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40625, + "step": 7794, + "time_per_iteration": 2.418278455734253 + }, + { + "auxiliary_loss_clip": 0.01064155, + "auxiliary_loss_mlp": 0.01042154, + "balance_loss_clip": 1.01483154, + "balance_loss_mlp": 1.02222824, + "epoch": 0.468660754546821, + "flos": 25884430911360.0, + "grad_norm": 2.2692688428806975, + "language_loss": 0.64738476, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.66844779, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41796875, + "step": 7795, + "time_per_iteration": 2.420742988586426 + }, + { + "auxiliary_loss_clip": 0.01061066, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.01582646, + "balance_loss_mlp": 1.02060652, + "epoch": 0.46872087779948896, + "flos": 35473710781440.0, + "grad_norm": 1.6186563108088952, + "language_loss": 0.69756842, + "learning_rate": 2.298570497656304e-06, + "loss": 0.71860445, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 7796, + "time_per_iteration": 2.5142431259155273 + }, + { + "auxiliary_loss_clip": 0.01063076, + "auxiliary_loss_mlp": 0.01043243, + "balance_loss_clip": 1.01808953, + "balance_loss_mlp": 1.02168667, + "epoch": 0.4687810010521569, + "flos": 26395210235520.0, + "grad_norm": 1.6388877722670203, + "language_loss": 0.71434015, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.73540336, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.4140625, + "step": 7797, + "time_per_iteration": 2.4260106086730957 + }, + { + "auxiliary_loss_clip": 0.01063726, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.02040744, + "balance_loss_mlp": 1.02202451, + "epoch": 0.4688411243048249, + "flos": 19971785030400.0, + "grad_norm": 1.954974415853462, + "language_loss": 0.68431717, + "learning_rate": 2.297800280150454e-06, + "loss": 0.70544755, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7798, + "time_per_iteration": 2.4706454277038574 + }, + { + "auxiliary_loss_clip": 0.01018268, + "auxiliary_loss_mlp": 0.01002047, + "balance_loss_clip": 0.99954391, + "balance_loss_mlp": 1.01013756, + "epoch": 0.46890124755749285, + "flos": 63973728552960.0, + "grad_norm": 0.9294963703961346, + "language_loss": 0.64680713, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66701031, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.08105469, + "step": 7799, + "time_per_iteration": 3.1854240894317627 + }, + { + "auxiliary_loss_clip": 0.01061554, + "auxiliary_loss_mlp": 0.01043608, + "balance_loss_clip": 1.01590335, + "balance_loss_mlp": 1.02108741, + "epoch": 0.4689613708101608, + "flos": 23767856380800.0, + "grad_norm": 1.3337267406610476, + "language_loss": 0.73268652, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.75373811, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40429688, + "step": 7800, + "time_per_iteration": 2.4434688091278076 + }, + { + "auxiliary_loss_clip": 0.01058901, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.01920414, + "balance_loss_mlp": 1.0198276, + "epoch": 0.4690214940628288, + "flos": 24787599638400.0, + "grad_norm": 3.1813351888853427, + "language_loss": 0.73504543, + "learning_rate": 2.296644869233568e-06, + "loss": 0.7560755, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.390625, + "step": 7801, + "time_per_iteration": 2.417325496673584 + }, + { + "auxiliary_loss_clip": 0.01065723, + "auxiliary_loss_mlp": 0.01052232, + "balance_loss_clip": 1.02077246, + "balance_loss_mlp": 1.02125835, + "epoch": 0.46908161731549675, + "flos": 18076350251520.0, + "grad_norm": 1.8351219924079774, + "language_loss": 0.65107512, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.67225468, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 7802, + "time_per_iteration": 2.3957178592681885 + }, + { + "auxiliary_loss_clip": 0.01061129, + "auxiliary_loss_mlp": 0.01052377, + "balance_loss_clip": 1.02505374, + "balance_loss_mlp": 1.0185926, + "epoch": 0.4691417405681647, + "flos": 25702149369600.0, + "grad_norm": 2.125524417458925, + "language_loss": 0.74776351, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76889861, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42578125, + "step": 7803, + "time_per_iteration": 2.4061081409454346 + }, + { + "auxiliary_loss_clip": 0.01058735, + "auxiliary_loss_mlp": 0.01057211, + "balance_loss_clip": 1.03152144, + "balance_loss_mlp": 1.0185411, + "epoch": 0.46920186382083273, + "flos": 17456083303680.0, + "grad_norm": 2.6816017453627414, + "language_loss": 0.79472256, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.81588203, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 7804, + "time_per_iteration": 2.378934383392334 + }, + { + "auxiliary_loss_clip": 0.01059265, + "auxiliary_loss_mlp": 0.01059238, + "balance_loss_clip": 1.03160524, + "balance_loss_mlp": 1.01773214, + "epoch": 0.4692619870735007, + "flos": 20338407884160.0, + "grad_norm": 1.9509879593078754, + "language_loss": 0.78403449, + "learning_rate": 2.295104163929305e-06, + "loss": 0.80521953, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41601562, + "step": 7805, + "time_per_iteration": 2.363140106201172 + }, + { + "auxiliary_loss_clip": 0.01064621, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.02397394, + "balance_loss_mlp": 1.02036357, + "epoch": 0.46932211032616866, + "flos": 29495288165760.0, + "grad_norm": 1.61192931744742, + "language_loss": 0.83707368, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85825253, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 7806, + "time_per_iteration": 2.4540414810180664 + }, + { + "auxiliary_loss_clip": 0.01061937, + "auxiliary_loss_mlp": 0.01060603, + "balance_loss_clip": 1.03167117, + "balance_loss_mlp": 1.01981521, + "epoch": 0.4693822335788366, + "flos": 36209749397760.0, + "grad_norm": 3.237683308904276, + "language_loss": 0.78777957, + "learning_rate": 2.294333744076472e-06, + "loss": 0.8090049, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 7807, + "time_per_iteration": 2.4886929988861084 + }, + { + "auxiliary_loss_clip": 0.01061042, + "auxiliary_loss_mlp": 0.01048526, + "balance_loss_clip": 1.01966572, + "balance_loss_mlp": 1.01963115, + "epoch": 0.4694423568315046, + "flos": 20337954036480.0, + "grad_norm": 2.630599248059758, + "language_loss": 0.51941347, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.54050916, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 7808, + "time_per_iteration": 3.5892951488494873 + }, + { + "auxiliary_loss_clip": 0.01013969, + "auxiliary_loss_mlp": 0.01011676, + "balance_loss_clip": 1.00925624, + "balance_loss_mlp": 1.00530887, + "epoch": 0.46950248008417256, + "flos": 64323489219840.0, + "grad_norm": 0.7946352889237837, + "language_loss": 0.57825267, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59850907, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.08691406, + "step": 7809, + "time_per_iteration": 2.869509220123291 + }, + { + "auxiliary_loss_clip": 0.01064775, + "auxiliary_loss_mlp": 0.01057394, + "balance_loss_clip": 1.02699566, + "balance_loss_mlp": 1.02146876, + "epoch": 0.4695626033368405, + "flos": 19199331999360.0, + "grad_norm": 1.9682407570279368, + "language_loss": 0.73312759, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.75434929, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 7810, + "time_per_iteration": 5.289606094360352 + }, + { + "auxiliary_loss_clip": 0.01063802, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_clip": 1.01970887, + "balance_loss_mlp": 1.02234745, + "epoch": 0.4696227265895085, + "flos": 23001338280960.0, + "grad_norm": 2.4616524056753937, + "language_loss": 0.82589269, + "learning_rate": 2.29279277055369e-06, + "loss": 0.84701741, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 7811, + "time_per_iteration": 2.4006187915802 + }, + { + "auxiliary_loss_clip": 0.01064951, + "auxiliary_loss_mlp": 0.01049723, + "balance_loss_clip": 1.02125561, + "balance_loss_mlp": 1.02149642, + "epoch": 0.46968284984217645, + "flos": 21869803249920.0, + "grad_norm": 1.9474656082962947, + "language_loss": 0.81178689, + "learning_rate": 2.292407499379644e-06, + "loss": 0.83293366, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.43554688, + "step": 7812, + "time_per_iteration": 2.45174503326416 + }, + { + "auxiliary_loss_clip": 0.01063853, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.0153594, + "balance_loss_mlp": 1.02306271, + "epoch": 0.4697429730948444, + "flos": 19973949534720.0, + "grad_norm": 1.9192964158845418, + "language_loss": 0.75676513, + "learning_rate": 2.292022217117477e-06, + "loss": 0.77781177, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40820312, + "step": 7813, + "time_per_iteration": 2.3654377460479736 + }, + { + "auxiliary_loss_clip": 0.01064599, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.01361895, + "balance_loss_mlp": 1.02255487, + "epoch": 0.4698030963475124, + "flos": 15155376929280.0, + "grad_norm": 2.4554212054812736, + "language_loss": 0.85337442, + "learning_rate": 2.291636923781798e-06, + "loss": 0.87443876, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 7814, + "time_per_iteration": 2.387277364730835 + }, + { + "auxiliary_loss_clip": 0.01064851, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_clip": 1.01942801, + "balance_loss_mlp": 1.02315056, + "epoch": 0.46986321960018035, + "flos": 15150489338880.0, + "grad_norm": 2.3330959518573184, + "language_loss": 0.82538867, + "learning_rate": 2.291251619387217e-06, + "loss": 0.84650171, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 7815, + "time_per_iteration": 2.3585915565490723 + }, + { + "auxiliary_loss_clip": 0.01067831, + "auxiliary_loss_mlp": 0.01046913, + "balance_loss_clip": 1.01594281, + "balance_loss_mlp": 1.02403307, + "epoch": 0.4699233428528483, + "flos": 23107893350400.0, + "grad_norm": 1.886207091183442, + "language_loss": 0.79008448, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.81123191, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.4375, + "step": 7816, + "time_per_iteration": 2.441071033477783 + }, + { + "auxiliary_loss_clip": 0.01020418, + "auxiliary_loss_mlp": 0.01009001, + "balance_loss_clip": 1.00639009, + "balance_loss_mlp": 1.01148772, + "epoch": 0.46998346610551633, + "flos": 68101998289920.0, + "grad_norm": 0.8515240677606969, + "language_loss": 0.5915395, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61183369, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.08935547, + "step": 7817, + "time_per_iteration": 3.0076546669006348 + }, + { + "auxiliary_loss_clip": 0.01065779, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.01421833, + "balance_loss_mlp": 1.02519393, + "epoch": 0.4700435893581843, + "flos": 24128439569280.0, + "grad_norm": 1.7806709751254075, + "language_loss": 0.80183744, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.82290661, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 7818, + "time_per_iteration": 3.90934419631958 + }, + { + "auxiliary_loss_clip": 0.01064948, + "auxiliary_loss_mlp": 0.01044695, + "balance_loss_clip": 1.01589417, + "balance_loss_mlp": 1.022156, + "epoch": 0.47010371261085226, + "flos": 20149667740800.0, + "grad_norm": 1.8004634963754325, + "language_loss": 0.85007942, + "learning_rate": 2.289710291512104e-06, + "loss": 0.87117589, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42773438, + "step": 7819, + "time_per_iteration": 2.390690565109253 + }, + { + "auxiliary_loss_clip": 0.01067135, + "auxiliary_loss_mlp": 0.01052719, + "balance_loss_clip": 1.02087855, + "balance_loss_mlp": 1.0218389, + "epoch": 0.47016383586352023, + "flos": 15121301575680.0, + "grad_norm": 7.837933408911312, + "language_loss": 0.7843461, + "learning_rate": 2.289324932042186e-06, + "loss": 0.80554473, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.453125, + "step": 7820, + "time_per_iteration": 2.373380184173584 + }, + { + "auxiliary_loss_clip": 0.01064066, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_clip": 1.01929903, + "balance_loss_mlp": 1.02341878, + "epoch": 0.4702239591161882, + "flos": 13552130252160.0, + "grad_norm": 2.130901903397475, + "language_loss": 0.7666679, + "learning_rate": 2.288939561601039e-06, + "loss": 0.78777748, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 7821, + "time_per_iteration": 2.390869140625 + }, + { + "auxiliary_loss_clip": 0.01063819, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.0178833, + "balance_loss_mlp": 1.0225215, + "epoch": 0.47028408236885616, + "flos": 24275458771200.0, + "grad_norm": 2.67522461115942, + "language_loss": 0.90054327, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.92162675, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.4140625, + "step": 7822, + "time_per_iteration": 2.4590673446655273 + }, + { + "auxiliary_loss_clip": 0.01063074, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02428699, + "balance_loss_mlp": 1.02150655, + "epoch": 0.4703442056215241, + "flos": 22855820267520.0, + "grad_norm": 1.4774753809896473, + "language_loss": 0.80569232, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.82684559, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41601562, + "step": 7823, + "time_per_iteration": 2.41172456741333 + }, + { + "auxiliary_loss_clip": 0.01016544, + "auxiliary_loss_mlp": 0.01016124, + "balance_loss_clip": 1.01346612, + "balance_loss_mlp": 1.0076133, + "epoch": 0.4704043288741921, + "flos": 69236535697920.0, + "grad_norm": 0.7068002695197454, + "language_loss": 0.56792927, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58825588, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.08886719, + "step": 7824, + "time_per_iteration": 3.0900912284851074 + }, + { + "auxiliary_loss_clip": 0.01063529, + "auxiliary_loss_mlp": 0.01055299, + "balance_loss_clip": 1.02549708, + "balance_loss_mlp": 1.02036715, + "epoch": 0.47046445212686006, + "flos": 18040110393600.0, + "grad_norm": 1.6573664019394294, + "language_loss": 0.81818438, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83937263, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43164062, + "step": 7825, + "time_per_iteration": 2.358977794647217 + }, + { + "auxiliary_loss_clip": 0.01062556, + "auxiliary_loss_mlp": 0.01051842, + "balance_loss_clip": 1.02320826, + "balance_loss_mlp": 1.02103484, + "epoch": 0.470524575379528, + "flos": 23950312479360.0, + "grad_norm": 1.9895817876258772, + "language_loss": 0.68394744, + "learning_rate": 2.287012545338324e-06, + "loss": 0.70509142, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4140625, + "step": 7826, + "time_per_iteration": 2.4384548664093018 + }, + { + "auxiliary_loss_clip": 0.01063773, + "auxiliary_loss_mlp": 0.01054307, + "balance_loss_clip": 1.02498198, + "balance_loss_mlp": 1.02051139, + "epoch": 0.470584698632196, + "flos": 18112590109440.0, + "grad_norm": 2.3312876414048564, + "language_loss": 0.84191203, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86309278, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43164062, + "step": 7827, + "time_per_iteration": 2.361374855041504 + }, + { + "auxiliary_loss_clip": 0.01012544, + "auxiliary_loss_mlp": 0.01016209, + "balance_loss_clip": 1.01372898, + "balance_loss_mlp": 1.00412416, + "epoch": 0.47064482188486395, + "flos": 57249143026560.0, + "grad_norm": 0.8092041398675978, + "language_loss": 0.55681646, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57710397, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.08398438, + "step": 7828, + "time_per_iteration": 3.0136284828186035 + }, + { + "auxiliary_loss_clip": 0.01059896, + "auxiliary_loss_mlp": 0.01047645, + "balance_loss_clip": 1.02113247, + "balance_loss_mlp": 1.01968753, + "epoch": 0.4707049451375319, + "flos": 17894103621120.0, + "grad_norm": 2.579353567452235, + "language_loss": 0.82184583, + "learning_rate": 2.285856204861245e-06, + "loss": 0.84292126, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 7829, + "time_per_iteration": 2.364840030670166 + }, + { + "auxiliary_loss_clip": 0.01060694, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.01429033, + "balance_loss_mlp": 1.02002835, + "epoch": 0.47076506839019994, + "flos": 25231380330240.0, + "grad_norm": 1.267241125002107, + "language_loss": 0.76309586, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78410882, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 7830, + "time_per_iteration": 2.462272882461548 + }, + { + "auxiliary_loss_clip": 0.0106076, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.01388013, + "balance_loss_mlp": 1.01971281, + "epoch": 0.4708251916428679, + "flos": 13478847575040.0, + "grad_norm": 1.8626475272001433, + "language_loss": 0.79476857, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81581509, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41015625, + "step": 7831, + "time_per_iteration": 2.359921932220459 + }, + { + "auxiliary_loss_clip": 0.01065733, + "auxiliary_loss_mlp": 0.01048935, + "balance_loss_clip": 1.01836991, + "balance_loss_mlp": 1.02104664, + "epoch": 0.47088531489553587, + "flos": 30146697912960.0, + "grad_norm": 1.8965456425185359, + "language_loss": 0.76548743, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78663415, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.44726562, + "step": 7832, + "time_per_iteration": 2.4623947143554688 + }, + { + "auxiliary_loss_clip": 0.01061189, + "auxiliary_loss_mlp": 0.01044454, + "balance_loss_clip": 1.01641607, + "balance_loss_mlp": 1.02046347, + "epoch": 0.47094543814820383, + "flos": 21797218800000.0, + "grad_norm": 1.4806490513808277, + "language_loss": 0.75598192, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.7770384, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 7833, + "time_per_iteration": 2.389000415802002 + }, + { + "auxiliary_loss_clip": 0.01061954, + "auxiliary_loss_mlp": 0.0104794, + "balance_loss_clip": 1.01888847, + "balance_loss_mlp": 1.0203079, + "epoch": 0.4710055614008718, + "flos": 23001896862720.0, + "grad_norm": 1.7392210252883136, + "language_loss": 0.7693541, + "learning_rate": 2.283928754133762e-06, + "loss": 0.79045296, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41601562, + "step": 7834, + "time_per_iteration": 2.429323196411133 + }, + { + "auxiliary_loss_clip": 0.0106265, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.01861322, + "balance_loss_mlp": 1.02129984, + "epoch": 0.47106568465353976, + "flos": 42739694760960.0, + "grad_norm": 1.4780861198969002, + "language_loss": 0.67282194, + "learning_rate": 2.283543231629972e-06, + "loss": 0.69391704, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 7835, + "time_per_iteration": 2.5691614151000977 + }, + { + "auxiliary_loss_clip": 0.01018496, + "auxiliary_loss_mlp": 0.01003799, + "balance_loss_clip": 1.00112867, + "balance_loss_mlp": 1.01022196, + "epoch": 0.4711258079062077, + "flos": 68551157197440.0, + "grad_norm": 0.8710584516514523, + "language_loss": 0.62270784, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64293081, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.08300781, + "step": 7836, + "time_per_iteration": 3.005786418914795 + }, + { + "auxiliary_loss_clip": 0.01068135, + "auxiliary_loss_mlp": 0.01046407, + "balance_loss_clip": 1.01555538, + "balance_loss_mlp": 1.02232933, + "epoch": 0.4711859311588757, + "flos": 25445433075840.0, + "grad_norm": 1.695939790642757, + "language_loss": 0.70698118, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.72812659, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.45703125, + "step": 7837, + "time_per_iteration": 2.4421114921569824 + }, + { + "auxiliary_loss_clip": 0.01063229, + "auxiliary_loss_mlp": 0.01043786, + "balance_loss_clip": 1.01446104, + "balance_loss_mlp": 1.02144003, + "epoch": 0.47124605441154366, + "flos": 21980792062080.0, + "grad_norm": 1.7510343002849857, + "language_loss": 0.68025219, + "learning_rate": 2.282386599665153e-06, + "loss": 0.70132232, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41796875, + "step": 7838, + "time_per_iteration": 2.4117214679718018 + }, + { + "auxiliary_loss_clip": 0.01064812, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.02073324, + "epoch": 0.4713061776642116, + "flos": 25411462456320.0, + "grad_norm": 2.3565392412622175, + "language_loss": 0.78492486, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.80604887, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.44140625, + "step": 7839, + "time_per_iteration": 2.450242042541504 + }, + { + "auxiliary_loss_clip": 0.01059758, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.015172, + "balance_loss_mlp": 1.01973522, + "epoch": 0.4713663009168796, + "flos": 26541042451200.0, + "grad_norm": 1.8485421048320883, + "language_loss": 0.73770714, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75873363, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40039062, + "step": 7840, + "time_per_iteration": 2.4319353103637695 + }, + { + "auxiliary_loss_clip": 0.01061336, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_clip": 1.01893258, + "balance_loss_mlp": 1.02029705, + "epoch": 0.47142642416954755, + "flos": 23622443101440.0, + "grad_norm": 1.7460929192326227, + "language_loss": 0.76705557, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.78813493, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 7841, + "time_per_iteration": 2.394404649734497 + }, + { + "auxiliary_loss_clip": 0.0106301, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.01758242, + "balance_loss_mlp": 1.02188301, + "epoch": 0.4714865474222155, + "flos": 22309045464960.0, + "grad_norm": 1.5163726485814648, + "language_loss": 0.71930885, + "learning_rate": 2.280844273866501e-06, + "loss": 0.74037862, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41210938, + "step": 7842, + "time_per_iteration": 2.449740171432495 + }, + { + "auxiliary_loss_clip": 0.01063271, + "auxiliary_loss_mlp": 0.0104566, + "balance_loss_clip": 1.01693022, + "balance_loss_mlp": 1.02149248, + "epoch": 0.4715466706748835, + "flos": 17821449348480.0, + "grad_norm": 2.049990938121652, + "language_loss": 0.79677904, + "learning_rate": 2.280458665756177e-06, + "loss": 0.81786835, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7843, + "time_per_iteration": 2.399085283279419 + }, + { + "auxiliary_loss_clip": 0.01060428, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.01869535, + "balance_loss_mlp": 1.01936042, + "epoch": 0.4716067939275515, + "flos": 23658403668480.0, + "grad_norm": 1.7096829830547409, + "language_loss": 0.750718, + "learning_rate": 2.280073047010832e-06, + "loss": 0.77176845, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 7844, + "time_per_iteration": 2.4345993995666504 + }, + { + "auxiliary_loss_clip": 0.01059994, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_clip": 1.01727462, + "balance_loss_mlp": 1.01925492, + "epoch": 0.47166691718021947, + "flos": 17929226315520.0, + "grad_norm": 1.4865601655496228, + "language_loss": 0.79794782, + "learning_rate": 2.279687417645088e-06, + "loss": 0.8190164, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40820312, + "step": 7845, + "time_per_iteration": 2.363325595855713 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.01045455, + "balance_loss_clip": 1.02029014, + "balance_loss_mlp": 1.01992524, + "epoch": 0.47172704043288743, + "flos": 26613382521600.0, + "grad_norm": 1.366678485527142, + "language_loss": 0.74106073, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.76210493, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 7846, + "time_per_iteration": 2.440913200378418 + }, + { + "auxiliary_loss_clip": 0.0105703, + "auxiliary_loss_mlp": 0.01041553, + "balance_loss_clip": 1.01536238, + "balance_loss_mlp": 1.0180552, + "epoch": 0.4717871636855554, + "flos": 27921613276800.0, + "grad_norm": 1.3936492314109212, + "language_loss": 0.75225455, + "learning_rate": 2.2789161271109e-06, + "loss": 0.77324033, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 7847, + "time_per_iteration": 3.8726038932800293 + }, + { + "auxiliary_loss_clip": 0.01062518, + "auxiliary_loss_mlp": 0.01046895, + "balance_loss_clip": 1.01827312, + "balance_loss_mlp": 1.02069759, + "epoch": 0.47184728693822336, + "flos": 14501348830080.0, + "grad_norm": 1.7724984047949506, + "language_loss": 0.81881046, + "learning_rate": 2.278530465971703e-06, + "loss": 0.83990461, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7848, + "time_per_iteration": 2.3496346473693848 + }, + { + "auxiliary_loss_clip": 0.01062889, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_clip": 1.01646459, + "balance_loss_mlp": 1.02146685, + "epoch": 0.47190741019089133, + "flos": 17855629436160.0, + "grad_norm": 1.7339397575819548, + "language_loss": 0.7132417, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.7342943, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.4140625, + "step": 7849, + "time_per_iteration": 3.850473403930664 + }, + { + "auxiliary_loss_clip": 0.01063384, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_clip": 1.02538633, + "balance_loss_mlp": 1.01968169, + "epoch": 0.4719675334435593, + "flos": 17894487646080.0, + "grad_norm": 2.243314898675664, + "language_loss": 0.7086339, + "learning_rate": 2.277759112022224e-06, + "loss": 0.72980952, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4375, + "step": 7850, + "time_per_iteration": 3.811210870742798 + }, + { + "auxiliary_loss_clip": 0.01060346, + "auxiliary_loss_mlp": 0.01044394, + "balance_loss_clip": 1.01695251, + "balance_loss_mlp": 1.01882422, + "epoch": 0.47202765669622726, + "flos": 20703320081280.0, + "grad_norm": 1.8576121757386315, + "language_loss": 0.76421678, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.78526413, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 7851, + "time_per_iteration": 2.364367723464966 + }, + { + "auxiliary_loss_clip": 0.01061517, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.0267179, + "balance_loss_mlp": 1.01981974, + "epoch": 0.4720877799488952, + "flos": 16359391676160.0, + "grad_norm": 1.6796844507535476, + "language_loss": 0.77641487, + "learning_rate": 2.276987715942132e-06, + "loss": 0.79757118, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41796875, + "step": 7852, + "time_per_iteration": 2.3586106300354004 + }, + { + "auxiliary_loss_clip": 0.01060999, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.0132643, + "balance_loss_mlp": 1.02007806, + "epoch": 0.4721479032015632, + "flos": 20667115134720.0, + "grad_norm": 1.6320181709078498, + "language_loss": 0.70009482, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.72111142, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 7853, + "time_per_iteration": 2.369363307952881 + }, + { + "auxiliary_loss_clip": 0.01012204, + "auxiliary_loss_mlp": 0.01006129, + "balance_loss_clip": 1.00344646, + "balance_loss_mlp": 1.00376964, + "epoch": 0.47220802645423116, + "flos": 67746616894080.0, + "grad_norm": 0.6941354192692705, + "language_loss": 0.50237358, + "learning_rate": 2.276216277848432e-06, + "loss": 0.5225569, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.08447266, + "step": 7854, + "time_per_iteration": 3.141892671585083 + }, + { + "auxiliary_loss_clip": 0.01061296, + "auxiliary_loss_mlp": 0.0104694, + "balance_loss_clip": 1.01750755, + "balance_loss_mlp": 1.01903963, + "epoch": 0.4722681497068991, + "flos": 20920445026560.0, + "grad_norm": 1.7653264717101098, + "language_loss": 0.64525592, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66633826, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 7855, + "time_per_iteration": 2.372880458831787 + }, + { + "auxiliary_loss_clip": 0.0106022, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02101421, + "balance_loss_mlp": 1.01897001, + "epoch": 0.4723282729595671, + "flos": 28291832000640.0, + "grad_norm": 2.3226950435079474, + "language_loss": 0.76613164, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.78722906, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41210938, + "step": 7856, + "time_per_iteration": 2.4617667198181152 + }, + { + "auxiliary_loss_clip": 0.01057217, + "auxiliary_loss_mlp": 0.01044197, + "balance_loss_clip": 1.01775646, + "balance_loss_mlp": 1.01784384, + "epoch": 0.4723883962122351, + "flos": 27123847643520.0, + "grad_norm": 1.9377869842617634, + "language_loss": 0.76283681, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.78385091, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 7857, + "time_per_iteration": 2.421924352645874 + }, + { + "auxiliary_loss_clip": 0.01057634, + "auxiliary_loss_mlp": 0.01048505, + "balance_loss_clip": 1.02343571, + "balance_loss_mlp": 1.01788068, + "epoch": 0.47244851946490307, + "flos": 31535996578560.0, + "grad_norm": 1.71778160277913, + "language_loss": 0.65685982, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.67792118, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3984375, + "step": 7858, + "time_per_iteration": 3.9005205631256104 + }, + { + "auxiliary_loss_clip": 0.01057568, + "auxiliary_loss_mlp": 0.01047294, + "balance_loss_clip": 1.02061534, + "balance_loss_mlp": 1.0182873, + "epoch": 0.47250864271757104, + "flos": 20885496888960.0, + "grad_norm": 1.4708174141107118, + "language_loss": 0.71407509, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.73512369, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39257812, + "step": 7859, + "time_per_iteration": 2.4208033084869385 + }, + { + "auxiliary_loss_clip": 0.01063894, + "auxiliary_loss_mlp": 0.01048864, + "balance_loss_clip": 1.02064705, + "balance_loss_mlp": 1.02017546, + "epoch": 0.472568765970239, + "flos": 20521038539520.0, + "grad_norm": 1.7918033355708163, + "language_loss": 0.63885432, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.65998197, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4375, + "step": 7860, + "time_per_iteration": 2.395272731781006 + }, + { + "auxiliary_loss_clip": 0.01060218, + "auxiliary_loss_mlp": 0.01045435, + "balance_loss_clip": 1.01893497, + "balance_loss_mlp": 1.01941049, + "epoch": 0.47262888922290697, + "flos": 35803849397760.0, + "grad_norm": 1.9952109809236571, + "language_loss": 0.72740829, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74846476, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40820312, + "step": 7861, + "time_per_iteration": 2.510735511779785 + }, + { + "auxiliary_loss_clip": 0.01060432, + "auxiliary_loss_mlp": 0.0104829, + "balance_loss_clip": 1.02015686, + "balance_loss_mlp": 1.01893723, + "epoch": 0.47268901247557493, + "flos": 20666696198400.0, + "grad_norm": 1.8882236758463984, + "language_loss": 0.86304462, + "learning_rate": 2.273130107677896e-06, + "loss": 0.88413185, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 7862, + "time_per_iteration": 2.4680874347686768 + }, + { + "auxiliary_loss_clip": 0.01061675, + "auxiliary_loss_mlp": 0.0105114, + "balance_loss_clip": 1.02401996, + "balance_loss_mlp": 1.01935112, + "epoch": 0.4727491357282429, + "flos": 19572273809280.0, + "grad_norm": 1.8700632174242922, + "language_loss": 0.85424006, + "learning_rate": 2.272744289645927e-06, + "loss": 0.87536824, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.421875, + "step": 7863, + "time_per_iteration": 2.3868300914764404 + }, + { + "auxiliary_loss_clip": 0.01061941, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.0219053, + "balance_loss_mlp": 1.01945078, + "epoch": 0.47280925898091086, + "flos": 18216422092800.0, + "grad_norm": 1.8371830289639786, + "language_loss": 0.66718853, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68829751, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.42578125, + "step": 7864, + "time_per_iteration": 2.4481844902038574 + }, + { + "auxiliary_loss_clip": 0.01062084, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.01349831, + "balance_loss_mlp": 1.02106917, + "epoch": 0.4728693822335788, + "flos": 17820855855360.0, + "grad_norm": 1.7859299693093245, + "language_loss": 0.66958207, + "learning_rate": 2.271972622569147e-06, + "loss": 0.6906091, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41015625, + "step": 7865, + "time_per_iteration": 2.373434543609619 + }, + { + "auxiliary_loss_clip": 0.0105977, + "auxiliary_loss_mlp": 0.0104681, + "balance_loss_clip": 1.01829469, + "balance_loss_mlp": 1.0187093, + "epoch": 0.4729295054862468, + "flos": 20594007014400.0, + "grad_norm": 1.818020094920685, + "language_loss": 0.74843353, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.7694993, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 7866, + "time_per_iteration": 2.401461124420166 + }, + { + "auxiliary_loss_clip": 0.01062694, + "auxiliary_loss_mlp": 0.01036372, + "balance_loss_clip": 1.00923979, + "balance_loss_mlp": 1.02046347, + "epoch": 0.47298962873891476, + "flos": 23366948705280.0, + "grad_norm": 8.746038079266748, + "language_loss": 0.84280336, + "learning_rate": 2.271200914239451e-06, + "loss": 0.86379397, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.421875, + "step": 7867, + "time_per_iteration": 2.397719144821167 + }, + { + "auxiliary_loss_clip": 0.01060193, + "auxiliary_loss_mlp": 0.01042467, + "balance_loss_clip": 1.01507294, + "balance_loss_mlp": 1.01995039, + "epoch": 0.4730497519915827, + "flos": 22051212007680.0, + "grad_norm": 1.554149708412771, + "language_loss": 0.80036402, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.82139051, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 7868, + "time_per_iteration": 2.4186081886291504 + }, + { + "auxiliary_loss_clip": 0.01063098, + "auxiliary_loss_mlp": 0.01050726, + "balance_loss_clip": 1.02012515, + "balance_loss_mlp": 1.02063918, + "epoch": 0.4731098752442507, + "flos": 21068651214720.0, + "grad_norm": 2.3728147371392696, + "language_loss": 0.7690652, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.79020345, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42578125, + "step": 7869, + "time_per_iteration": 2.3728842735290527 + }, + { + "auxiliary_loss_clip": 0.01062723, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_clip": 1.01432729, + "balance_loss_mlp": 1.02079797, + "epoch": 0.4731699984969187, + "flos": 22527671598720.0, + "grad_norm": 1.4538496652214903, + "language_loss": 0.74254495, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76361537, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41992188, + "step": 7870, + "time_per_iteration": 2.4109201431274414 + }, + { + "auxiliary_loss_clip": 0.01064783, + "auxiliary_loss_mlp": 0.01048158, + "balance_loss_clip": 1.01649642, + "balance_loss_mlp": 1.02087641, + "epoch": 0.4732301217495867, + "flos": 24897017439360.0, + "grad_norm": 1.9039865651274053, + "language_loss": 0.82800937, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.8491388, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43945312, + "step": 7871, + "time_per_iteration": 2.418464422225952 + }, + { + "auxiliary_loss_clip": 0.01062583, + "auxiliary_loss_mlp": 0.0104667, + "balance_loss_clip": 1.01791668, + "balance_loss_mlp": 1.02084756, + "epoch": 0.47329024500225464, + "flos": 22783305640320.0, + "grad_norm": 1.5811866917624622, + "language_loss": 0.77068722, + "learning_rate": 2.269271463701879e-06, + "loss": 0.79177982, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7872, + "time_per_iteration": 2.4526121616363525 + }, + { + "auxiliary_loss_clip": 0.01061298, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_clip": 1.01992309, + "balance_loss_mlp": 1.01938295, + "epoch": 0.4733503682549226, + "flos": 38694238502400.0, + "grad_norm": 1.8189947377360534, + "language_loss": 0.68725634, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70834804, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41992188, + "step": 7873, + "time_per_iteration": 2.5501456260681152 + }, + { + "auxiliary_loss_clip": 0.01060465, + "auxiliary_loss_mlp": 0.01042354, + "balance_loss_clip": 1.01532972, + "balance_loss_mlp": 1.01996303, + "epoch": 0.47341049150759057, + "flos": 22965726827520.0, + "grad_norm": 1.5322898657719985, + "language_loss": 0.73691487, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75794309, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 7874, + "time_per_iteration": 2.4464874267578125 + }, + { + "auxiliary_loss_clip": 0.0106359, + "auxiliary_loss_mlp": 0.01047603, + "balance_loss_clip": 1.01981497, + "balance_loss_mlp": 1.02120256, + "epoch": 0.47347061476025853, + "flos": 14537588688000.0, + "grad_norm": 2.169001746031164, + "language_loss": 0.66784632, + "learning_rate": 2.26811367073266e-06, + "loss": 0.68895829, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42382812, + "step": 7875, + "time_per_iteration": 2.346041679382324 + }, + { + "auxiliary_loss_clip": 0.01063412, + "auxiliary_loss_mlp": 0.01054282, + "balance_loss_clip": 1.02524304, + "balance_loss_mlp": 1.02167737, + "epoch": 0.4735307380129265, + "flos": 30261945911040.0, + "grad_norm": 2.223903411234512, + "language_loss": 0.81933606, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.84051299, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41796875, + "step": 7876, + "time_per_iteration": 2.4821507930755615 + }, + { + "auxiliary_loss_clip": 0.01060497, + "auxiliary_loss_mlp": 0.01044842, + "balance_loss_clip": 1.01779306, + "balance_loss_mlp": 1.01910758, + "epoch": 0.47359086126559446, + "flos": 19390027178880.0, + "grad_norm": 2.9075930633100318, + "language_loss": 0.80027872, + "learning_rate": 2.267341757894304e-06, + "loss": 0.8213321, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 7877, + "time_per_iteration": 2.36898136138916 + }, + { + "auxiliary_loss_clip": 0.01062352, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.02027822, + "balance_loss_mlp": 1.02139211, + "epoch": 0.47365098451826243, + "flos": 21938477627520.0, + "grad_norm": 1.9175271349607586, + "language_loss": 0.71843958, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.7395215, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.41015625, + "step": 7878, + "time_per_iteration": 2.450464963912964 + }, + { + "auxiliary_loss_clip": 0.01058946, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.0171479, + "balance_loss_mlp": 1.01970458, + "epoch": 0.4737111077709304, + "flos": 25843966778880.0, + "grad_norm": 2.1203891881962313, + "language_loss": 0.76207554, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.78308511, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39257812, + "step": 7879, + "time_per_iteration": 2.452617645263672 + }, + { + "auxiliary_loss_clip": 0.01011554, + "auxiliary_loss_mlp": 0.01008566, + "balance_loss_clip": 1.00599074, + "balance_loss_mlp": 1.00308561, + "epoch": 0.47377123102359836, + "flos": 67757790395520.0, + "grad_norm": 0.7332812914740752, + "language_loss": 0.61345857, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63365978, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.08496094, + "step": 7880, + "time_per_iteration": 3.0479912757873535 + }, + { + "auxiliary_loss_clip": 0.01059456, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_clip": 1.01511526, + "balance_loss_mlp": 1.01948261, + "epoch": 0.4738313542762663, + "flos": 24314840651520.0, + "grad_norm": 1.4991735449154941, + "language_loss": 0.6928196, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.71385437, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40039062, + "step": 7881, + "time_per_iteration": 2.4042608737945557 + }, + { + "auxiliary_loss_clip": 0.01059299, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.01318336, + "balance_loss_mlp": 1.01862609, + "epoch": 0.4738914775289343, + "flos": 20704262688000.0, + "grad_norm": 1.5602559070946251, + "language_loss": 0.77969015, + "learning_rate": 2.265411798646092e-06, + "loss": 0.80067289, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40625, + "step": 7882, + "time_per_iteration": 2.433347463607788 + }, + { + "auxiliary_loss_clip": 0.01059974, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_clip": 1.02436447, + "balance_loss_mlp": 1.01862907, + "epoch": 0.4739516007816023, + "flos": 25445188696320.0, + "grad_norm": 2.715900533342876, + "language_loss": 0.77013183, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.79124624, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 7883, + "time_per_iteration": 2.412372589111328 + }, + { + "auxiliary_loss_clip": 0.01058661, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02129316, + "balance_loss_mlp": 1.01879668, + "epoch": 0.4740117240342703, + "flos": 19973321130240.0, + "grad_norm": 1.865736679086033, + "language_loss": 0.73615783, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.75721252, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 7884, + "time_per_iteration": 2.402825355529785 + }, + { + "auxiliary_loss_clip": 0.01062584, + "auxiliary_loss_mlp": 0.01042892, + "balance_loss_clip": 1.01357865, + "balance_loss_mlp": 1.01930642, + "epoch": 0.47407184728693824, + "flos": 15660465701760.0, + "grad_norm": 2.118256616332098, + "language_loss": 0.83172506, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.85277987, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 7885, + "time_per_iteration": 2.3320119380950928 + }, + { + "auxiliary_loss_clip": 0.01060269, + "auxiliary_loss_mlp": 0.01043466, + "balance_loss_clip": 1.01689434, + "balance_loss_mlp": 1.01893687, + "epoch": 0.4741319705396062, + "flos": 18587792891520.0, + "grad_norm": 2.09479738543879, + "language_loss": 0.74958456, + "learning_rate": 2.263867649999751e-06, + "loss": 0.7706219, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 7886, + "time_per_iteration": 2.428664207458496 + }, + { + "auxiliary_loss_clip": 0.01063474, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_clip": 1.01746738, + "balance_loss_mlp": 1.01863647, + "epoch": 0.47419209379227417, + "flos": 13260256352640.0, + "grad_norm": 1.8485934958996502, + "language_loss": 0.75198752, + "learning_rate": 2.263481587786849e-06, + "loss": 0.77314597, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.44921875, + "step": 7887, + "time_per_iteration": 3.785902976989746 + }, + { + "auxiliary_loss_clip": 0.01057515, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.01485169, + "balance_loss_mlp": 1.01777506, + "epoch": 0.47425221704494214, + "flos": 20043112671360.0, + "grad_norm": 1.7798335676387431, + "language_loss": 0.78066963, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.80164284, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39648438, + "step": 7888, + "time_per_iteration": 3.7688424587249756 + }, + { + "auxiliary_loss_clip": 0.01059762, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.01165116, + "balance_loss_mlp": 1.01803899, + "epoch": 0.4743123402976101, + "flos": 27270657377280.0, + "grad_norm": 1.5581905272536558, + "language_loss": 0.73835409, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.75932896, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41796875, + "step": 7889, + "time_per_iteration": 2.433394193649292 + }, + { + "auxiliary_loss_clip": 0.01021805, + "auxiliary_loss_mlp": 0.01003993, + "balance_loss_clip": 1.00101292, + "balance_loss_mlp": 1.01238465, + "epoch": 0.47437246355027807, + "flos": 55391170003200.0, + "grad_norm": 0.7141560876123028, + "language_loss": 0.56168926, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58194721, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.02978516, + "router_z_loss_mlp": 0.09423828, + "step": 7890, + "time_per_iteration": 4.476652145385742 + }, + { + "auxiliary_loss_clip": 0.01061518, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.01433587, + "balance_loss_mlp": 1.01929581, + "epoch": 0.47443258680294603, + "flos": 23877344004480.0, + "grad_norm": 2.0731866835392316, + "language_loss": 0.66745013, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.68850315, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.421875, + "step": 7891, + "time_per_iteration": 2.4517343044281006 + }, + { + "auxiliary_loss_clip": 0.01065677, + "auxiliary_loss_mlp": 0.01050057, + "balance_loss_clip": 1.01696491, + "balance_loss_mlp": 1.0200789, + "epoch": 0.474492710055614, + "flos": 21976777255680.0, + "grad_norm": 2.229793800607644, + "language_loss": 0.72340447, + "learning_rate": 2.26155112714642e-06, + "loss": 0.74456179, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45507812, + "step": 7892, + "time_per_iteration": 2.3907554149627686 + }, + { + "auxiliary_loss_clip": 0.01019446, + "auxiliary_loss_mlp": 0.01006315, + "balance_loss_clip": 1.0036447, + "balance_loss_mlp": 1.01039839, + "epoch": 0.47455283330828196, + "flos": 62553845669760.0, + "grad_norm": 0.7989489478679904, + "language_loss": 0.58658564, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60684323, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.09033203, + "step": 7893, + "time_per_iteration": 3.130575656890869 + }, + { + "auxiliary_loss_clip": 0.01060155, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.01876974, + "balance_loss_mlp": 1.01935196, + "epoch": 0.47461295656094993, + "flos": 12092830577280.0, + "grad_norm": 1.6434069367865434, + "language_loss": 0.78929567, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.81034899, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40820312, + "step": 7894, + "time_per_iteration": 2.358513355255127 + }, + { + "auxiliary_loss_clip": 0.01060217, + "auxiliary_loss_mlp": 0.01046637, + "balance_loss_clip": 1.02084053, + "balance_loss_mlp": 1.01891112, + "epoch": 0.4746730798136179, + "flos": 20883576764160.0, + "grad_norm": 1.6535621950616135, + "language_loss": 0.75888336, + "learning_rate": 2.260392731628497e-06, + "loss": 0.77995193, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.4140625, + "step": 7895, + "time_per_iteration": 2.36547589302063 + }, + { + "auxiliary_loss_clip": 0.01060439, + "auxiliary_loss_mlp": 0.01047422, + "balance_loss_clip": 1.0182631, + "balance_loss_mlp": 1.01948082, + "epoch": 0.4747332030662859, + "flos": 19973774977920.0, + "grad_norm": 2.5183694286012726, + "language_loss": 0.83292663, + "learning_rate": 2.260006580021429e-06, + "loss": 0.85400522, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41015625, + "step": 7896, + "time_per_iteration": 2.392159938812256 + }, + { + "auxiliary_loss_clip": 0.01060242, + "auxiliary_loss_mlp": 0.01042907, + "balance_loss_clip": 1.01310468, + "balance_loss_mlp": 1.01865304, + "epoch": 0.4747933263189539, + "flos": 16033267866240.0, + "grad_norm": 2.8830506358675265, + "language_loss": 0.77156091, + "learning_rate": 2.259620418554886e-06, + "loss": 0.79259241, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41601562, + "step": 7897, + "time_per_iteration": 3.7698278427124023 + }, + { + "auxiliary_loss_clip": 0.01064238, + "auxiliary_loss_mlp": 0.01052344, + "balance_loss_clip": 1.02261293, + "balance_loss_mlp": 1.01994908, + "epoch": 0.47485344957162184, + "flos": 13954224913920.0, + "grad_norm": 2.012769271147644, + "language_loss": 0.65536821, + "learning_rate": 2.25923424724351e-06, + "loss": 0.67653406, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44335938, + "step": 7898, + "time_per_iteration": 2.3681626319885254 + }, + { + "auxiliary_loss_clip": 0.01060222, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.01888096, + "balance_loss_mlp": 1.01893878, + "epoch": 0.4749135728242898, + "flos": 20448035153280.0, + "grad_norm": 2.5914080369277577, + "language_loss": 0.7112968, + "learning_rate": 2.258848066101946e-06, + "loss": 0.73238397, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4140625, + "step": 7899, + "time_per_iteration": 2.371290922164917 + }, + { + "auxiliary_loss_clip": 0.01061835, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.01523662, + "balance_loss_mlp": 1.0202986, + "epoch": 0.4749736960769578, + "flos": 28948687920000.0, + "grad_norm": 1.8220344496806027, + "language_loss": 0.69975889, + "learning_rate": 2.258461875144837e-06, + "loss": 0.72081405, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4140625, + "step": 7900, + "time_per_iteration": 2.4827263355255127 + }, + { + "auxiliary_loss_clip": 0.0105956, + "auxiliary_loss_mlp": 0.01045767, + "balance_loss_clip": 1.01931417, + "balance_loss_mlp": 1.01913297, + "epoch": 0.47503381932962574, + "flos": 31937497747200.0, + "grad_norm": 1.8101877782906082, + "language_loss": 0.72177327, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.74282652, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40429688, + "step": 7901, + "time_per_iteration": 2.5073821544647217 + }, + { + "auxiliary_loss_clip": 0.0106134, + "auxiliary_loss_mlp": 0.01053827, + "balance_loss_clip": 1.027637, + "balance_loss_mlp": 1.02050233, + "epoch": 0.4750939425822937, + "flos": 22126170430080.0, + "grad_norm": 1.6538023132123763, + "language_loss": 0.74812269, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.76927435, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40820312, + "step": 7902, + "time_per_iteration": 2.5126986503601074 + }, + { + "auxiliary_loss_clip": 0.01058128, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.01541197, + "balance_loss_mlp": 1.01909876, + "epoch": 0.47515406583496167, + "flos": 20849047562880.0, + "grad_norm": 1.8579596080030094, + "language_loss": 0.69843209, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71941108, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.390625, + "step": 7903, + "time_per_iteration": 2.3729562759399414 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.0179106, + "balance_loss_mlp": 1.01932693, + "epoch": 0.47521418908762963, + "flos": 17523989631360.0, + "grad_norm": 2.3182482308590218, + "language_loss": 0.73449039, + "learning_rate": 2.256917013453848e-06, + "loss": 0.75550532, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.40234375, + "step": 7904, + "time_per_iteration": 2.4128899574279785 + }, + { + "auxiliary_loss_clip": 0.01059321, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_clip": 1.01630092, + "balance_loss_mlp": 1.01980805, + "epoch": 0.4752743123402976, + "flos": 20558360649600.0, + "grad_norm": 1.7670281386018132, + "language_loss": 0.87266958, + "learning_rate": 2.25653077363869e-06, + "loss": 0.89367509, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39453125, + "step": 7905, + "time_per_iteration": 2.370866537094116 + }, + { + "auxiliary_loss_clip": 0.01056679, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.01985335, + "balance_loss_mlp": 1.01829696, + "epoch": 0.47533443559296557, + "flos": 26359389313920.0, + "grad_norm": 1.5983772291181422, + "language_loss": 0.83119142, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.85220158, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3828125, + "step": 7906, + "time_per_iteration": 2.5027198791503906 + }, + { + "auxiliary_loss_clip": 0.010162, + "auxiliary_loss_mlp": 0.01003095, + "balance_loss_clip": 1.00018597, + "balance_loss_mlp": 1.00767934, + "epoch": 0.47539455884563353, + "flos": 65946251347200.0, + "grad_norm": 0.6692666830014874, + "language_loss": 0.59060419, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61079717, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.08496094, + "step": 7907, + "time_per_iteration": 3.123603343963623 + }, + { + "auxiliary_loss_clip": 0.01059609, + "auxiliary_loss_mlp": 0.01043456, + "balance_loss_clip": 1.01793361, + "balance_loss_mlp": 1.0195725, + "epoch": 0.4754546820983015, + "flos": 17237177879040.0, + "grad_norm": 1.7792303860108796, + "language_loss": 0.81759417, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83862484, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 7908, + "time_per_iteration": 2.3709094524383545 + }, + { + "auxiliary_loss_clip": 0.01059773, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.02105343, + "balance_loss_mlp": 1.01939416, + "epoch": 0.47551480535096946, + "flos": 19824940385280.0, + "grad_norm": 1.8243332316400749, + "language_loss": 0.75678802, + "learning_rate": 2.254985717247797e-06, + "loss": 0.77784413, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.40429688, + "step": 7909, + "time_per_iteration": 2.383418560028076 + }, + { + "auxiliary_loss_clip": 0.01058313, + "auxiliary_loss_mlp": 0.01043443, + "balance_loss_clip": 1.01833773, + "balance_loss_mlp": 1.01934505, + "epoch": 0.4755749286036375, + "flos": 22162864135680.0, + "grad_norm": 1.6005404906931817, + "language_loss": 0.7588768, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77989435, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 7910, + "time_per_iteration": 2.412743091583252 + }, + { + "auxiliary_loss_clip": 0.01056196, + "auxiliary_loss_mlp": 0.0104116, + "balance_loss_clip": 1.01738977, + "balance_loss_mlp": 1.01784372, + "epoch": 0.47563505185630545, + "flos": 21647336866560.0, + "grad_norm": 1.9028034395074134, + "language_loss": 0.79583287, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81680644, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 7911, + "time_per_iteration": 2.3711490631103516 + }, + { + "auxiliary_loss_clip": 0.01060949, + "auxiliary_loss_mlp": 0.01047678, + "balance_loss_clip": 1.02130878, + "balance_loss_mlp": 1.01942194, + "epoch": 0.4756951751089734, + "flos": 20627803077120.0, + "grad_norm": 1.841054313976103, + "language_loss": 0.77002585, + "learning_rate": 2.253826823377983e-06, + "loss": 0.79111207, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.4140625, + "step": 7912, + "time_per_iteration": 2.4338953495025635 + }, + { + "auxiliary_loss_clip": 0.01056899, + "auxiliary_loss_mlp": 0.01046844, + "balance_loss_clip": 1.02229881, + "balance_loss_mlp": 1.01768708, + "epoch": 0.4757552983616414, + "flos": 25847597560320.0, + "grad_norm": 1.6608778744668011, + "language_loss": 0.75815189, + "learning_rate": 2.253440506151569e-06, + "loss": 0.77918935, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.390625, + "step": 7913, + "time_per_iteration": 2.4204211235046387 + }, + { + "auxiliary_loss_clip": 0.01058687, + "auxiliary_loss_mlp": 0.01041167, + "balance_loss_clip": 1.01749194, + "balance_loss_mlp": 1.01978958, + "epoch": 0.47581542161430934, + "flos": 18222042821760.0, + "grad_norm": 2.0097608947127363, + "language_loss": 0.73485464, + "learning_rate": 2.253054179314666e-06, + "loss": 0.75585318, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38867188, + "step": 7914, + "time_per_iteration": 2.368048667907715 + }, + { + "auxiliary_loss_clip": 0.01060226, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.01539183, + "balance_loss_mlp": 1.02011776, + "epoch": 0.4758755448669773, + "flos": 21578697400320.0, + "grad_norm": 1.9439937539422598, + "language_loss": 0.65901661, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.68003988, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40039062, + "step": 7915, + "time_per_iteration": 2.3776769638061523 + }, + { + "auxiliary_loss_clip": 0.01056588, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.01758039, + "balance_loss_mlp": 1.01879764, + "epoch": 0.47593566811964527, + "flos": 15230265528960.0, + "grad_norm": 1.6698076935776078, + "language_loss": 0.78398323, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.80495703, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37890625, + "step": 7916, + "time_per_iteration": 2.3720967769622803 + }, + { + "auxiliary_loss_clip": 0.01056862, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.01873469, + "balance_loss_mlp": 1.01851726, + "epoch": 0.47599579137231324, + "flos": 21542178251520.0, + "grad_norm": 2.025042098630158, + "language_loss": 0.64896262, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66994649, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3828125, + "step": 7917, + "time_per_iteration": 2.3721086978912354 + }, + { + "auxiliary_loss_clip": 0.01013841, + "auxiliary_loss_mlp": 0.01016819, + "balance_loss_clip": 1.01364779, + "balance_loss_mlp": 1.00521624, + "epoch": 0.4760559146249812, + "flos": 64551471598080.0, + "grad_norm": 0.8499760382628104, + "language_loss": 0.65765858, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67796516, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.03173828, + "router_z_loss_mlp": 0.0859375, + "step": 7918, + "time_per_iteration": 3.0589041709899902 + }, + { + "auxiliary_loss_clip": 0.01061458, + "auxiliary_loss_mlp": 0.01039208, + "balance_loss_clip": 1.01441288, + "balance_loss_mlp": 1.02099872, + "epoch": 0.47611603787764917, + "flos": 22232865144960.0, + "grad_norm": 1.642339122181158, + "language_loss": 0.69426382, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.71527052, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.40429688, + "step": 7919, + "time_per_iteration": 2.390829563140869 + }, + { + "auxiliary_loss_clip": 0.01060425, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.01638699, + "balance_loss_mlp": 1.01960862, + "epoch": 0.47617616113031713, + "flos": 22779011543040.0, + "grad_norm": 1.6768083449851492, + "language_loss": 0.7566812, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.77768064, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.40820312, + "step": 7920, + "time_per_iteration": 2.488020896911621 + }, + { + "auxiliary_loss_clip": 0.01063272, + "auxiliary_loss_mlp": 0.01041352, + "balance_loss_clip": 1.01399398, + "balance_loss_mlp": 1.02147782, + "epoch": 0.4762362843829851, + "flos": 24132663843840.0, + "grad_norm": 1.8783873190070004, + "language_loss": 0.7823692, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.80341542, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41796875, + "step": 7921, + "time_per_iteration": 2.416640520095825 + }, + { + "auxiliary_loss_clip": 0.01061202, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.01355922, + "balance_loss_mlp": 1.02138436, + "epoch": 0.47629640763565306, + "flos": 22451072342400.0, + "grad_norm": 1.492012331170785, + "language_loss": 0.79174507, + "learning_rate": 2.249963220399845e-06, + "loss": 0.81275827, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 7922, + "time_per_iteration": 2.4559710025787354 + }, + { + "auxiliary_loss_clip": 0.0106218, + "auxiliary_loss_mlp": 0.01044443, + "balance_loss_clip": 1.01698923, + "balance_loss_mlp": 1.02121449, + "epoch": 0.4763565308883211, + "flos": 11180619907200.0, + "grad_norm": 1.6958723170596361, + "language_loss": 0.73853838, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75960457, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 7923, + "time_per_iteration": 2.347822666168213 + }, + { + "auxiliary_loss_clip": 0.01060632, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.01663613, + "balance_loss_mlp": 1.02072465, + "epoch": 0.47641665414098905, + "flos": 22381071333120.0, + "grad_norm": 1.7167714609210076, + "language_loss": 0.83854806, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.85955721, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3984375, + "step": 7924, + "time_per_iteration": 2.4255478382110596 + }, + { + "auxiliary_loss_clip": 0.01064913, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02028751, + "balance_loss_mlp": 1.02201819, + "epoch": 0.476476777393657, + "flos": 25044979248000.0, + "grad_norm": 1.707890724411671, + "language_loss": 0.8188948, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.84004128, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 7925, + "time_per_iteration": 2.5074949264526367 + }, + { + "auxiliary_loss_clip": 0.01059375, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.01948082, + "balance_loss_mlp": 1.02027202, + "epoch": 0.476536900646325, + "flos": 27268737252480.0, + "grad_norm": 1.652447227243215, + "language_loss": 0.73637593, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.75741488, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 7926, + "time_per_iteration": 2.510043144226074 + }, + { + "auxiliary_loss_clip": 0.01062578, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.01474714, + "balance_loss_mlp": 1.02122426, + "epoch": 0.47659702389899294, + "flos": 25300229264640.0, + "grad_norm": 1.8511155592438404, + "language_loss": 0.7035659, + "learning_rate": 2.248031062546432e-06, + "loss": 0.72460544, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 7927, + "time_per_iteration": 3.806177854537964 + }, + { + "auxiliary_loss_clip": 0.01057988, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.01701641, + "balance_loss_mlp": 1.0196979, + "epoch": 0.4766571471516609, + "flos": 25991719119360.0, + "grad_norm": 1.5316927208180362, + "language_loss": 0.68599045, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70696765, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3828125, + "step": 7928, + "time_per_iteration": 3.814880609512329 + }, + { + "auxiliary_loss_clip": 0.01061353, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.02041864, + "balance_loss_mlp": 1.02090621, + "epoch": 0.4767172704043289, + "flos": 16031347741440.0, + "grad_norm": 1.9185391139380377, + "language_loss": 0.79401451, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81510448, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 7929, + "time_per_iteration": 2.3753602504730225 + }, + { + "auxiliary_loss_clip": 0.0105974, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.01952219, + "balance_loss_mlp": 1.02062917, + "epoch": 0.47677739365699684, + "flos": 39233891387520.0, + "grad_norm": 1.8064559585972237, + "language_loss": 0.67381382, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.69485027, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.390625, + "step": 7930, + "time_per_iteration": 3.8612468242645264 + }, + { + "auxiliary_loss_clip": 0.01058361, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02310836, + "balance_loss_mlp": 1.0190506, + "epoch": 0.4768375169096648, + "flos": 24716621111040.0, + "grad_norm": 1.8157186513081014, + "language_loss": 0.80433702, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82538295, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.39453125, + "step": 7931, + "time_per_iteration": 2.427858591079712 + }, + { + "auxiliary_loss_clip": 0.01060011, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.01849413, + "balance_loss_mlp": 1.02050138, + "epoch": 0.47689764016233277, + "flos": 22527566864640.0, + "grad_norm": 1.807340716065254, + "language_loss": 0.77676904, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.79782039, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 7932, + "time_per_iteration": 2.3967626094818115 + }, + { + "auxiliary_loss_clip": 0.01057494, + "auxiliary_loss_mlp": 0.01043437, + "balance_loss_clip": 1.01806927, + "balance_loss_mlp": 1.01998055, + "epoch": 0.47695776341500074, + "flos": 15119765475840.0, + "grad_norm": 1.7504829617371402, + "language_loss": 0.81167185, + "learning_rate": 2.245712162906593e-06, + "loss": 0.83268118, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 7933, + "time_per_iteration": 2.5742087364196777 + }, + { + "auxiliary_loss_clip": 0.01064573, + "auxiliary_loss_mlp": 0.01049864, + "balance_loss_clip": 1.01951337, + "balance_loss_mlp": 1.02044344, + "epoch": 0.4770178866676687, + "flos": 14678184199680.0, + "grad_norm": 1.915105015780136, + "language_loss": 0.75262022, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.77376461, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 7934, + "time_per_iteration": 2.339354991912842 + }, + { + "auxiliary_loss_clip": 0.01062312, + "auxiliary_loss_mlp": 0.01044906, + "balance_loss_clip": 1.01744008, + "balance_loss_mlp": 1.02021837, + "epoch": 0.47707800992033667, + "flos": 22564470038400.0, + "grad_norm": 1.7894954610540696, + "language_loss": 0.81110859, + "learning_rate": 2.244939121664211e-06, + "loss": 0.83218074, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.421875, + "step": 7935, + "time_per_iteration": 2.4033567905426025 + }, + { + "auxiliary_loss_clip": 0.01064807, + "auxiliary_loss_mlp": 0.01050696, + "balance_loss_clip": 1.02156138, + "balance_loss_mlp": 1.0201534, + "epoch": 0.4771381331730047, + "flos": 30916951528320.0, + "grad_norm": 1.70908421706898, + "language_loss": 0.72219789, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.74335289, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.44726562, + "step": 7936, + "time_per_iteration": 3.949907064437866 + }, + { + "auxiliary_loss_clip": 0.01061948, + "auxiliary_loss_mlp": 0.01044688, + "balance_loss_clip": 1.01668537, + "balance_loss_mlp": 1.01948583, + "epoch": 0.47719825642567265, + "flos": 25737725911680.0, + "grad_norm": 2.0316751095921783, + "language_loss": 0.68567377, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.70674014, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42578125, + "step": 7937, + "time_per_iteration": 2.4147164821624756 + }, + { + "auxiliary_loss_clip": 0.01012546, + "auxiliary_loss_mlp": 0.01008079, + "balance_loss_clip": 1.005445, + "balance_loss_mlp": 1.00438857, + "epoch": 0.4772583796783406, + "flos": 66351592765440.0, + "grad_norm": 0.7146660475141763, + "language_loss": 0.56536514, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58557135, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.08154297, + "step": 7938, + "time_per_iteration": 3.1767678260803223 + }, + { + "auxiliary_loss_clip": 0.01063052, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.01201534, + "balance_loss_mlp": 1.02129292, + "epoch": 0.4773185029310086, + "flos": 22050094844160.0, + "grad_norm": 1.7128713193066494, + "language_loss": 0.90203929, + "learning_rate": 2.243392927839317e-06, + "loss": 0.92307556, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 7939, + "time_per_iteration": 2.3885273933410645 + }, + { + "auxiliary_loss_clip": 0.010623, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.0201546, + "balance_loss_mlp": 1.02073681, + "epoch": 0.47737862618367655, + "flos": 16726852402560.0, + "grad_norm": 1.9393402592246656, + "language_loss": 0.78655005, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.80763626, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41601562, + "step": 7940, + "time_per_iteration": 2.367142915725708 + }, + { + "auxiliary_loss_clip": 0.0105999, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.0178194, + "balance_loss_mlp": 1.02121401, + "epoch": 0.4774387494363445, + "flos": 19608443844480.0, + "grad_norm": 1.6866227812753702, + "language_loss": 0.85956085, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.8805865, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38671875, + "step": 7941, + "time_per_iteration": 2.390547513961792 + }, + { + "auxiliary_loss_clip": 0.01064488, + "auxiliary_loss_mlp": 0.01045942, + "balance_loss_clip": 1.01711738, + "balance_loss_mlp": 1.02161062, + "epoch": 0.4774988726890125, + "flos": 16653046055040.0, + "grad_norm": 1.77094756236938, + "language_loss": 0.77148718, + "learning_rate": 2.24223318550976e-06, + "loss": 0.79259145, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42773438, + "step": 7942, + "time_per_iteration": 2.3902204036712646 + }, + { + "auxiliary_loss_clip": 0.0106327, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.02350736, + "balance_loss_mlp": 1.02199805, + "epoch": 0.47755899594168044, + "flos": 20484519390720.0, + "grad_norm": 1.754314422679348, + "language_loss": 0.64877015, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66991019, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41210938, + "step": 7943, + "time_per_iteration": 2.3747384548187256 + }, + { + "auxiliary_loss_clip": 0.01064407, + "auxiliary_loss_mlp": 0.01047097, + "balance_loss_clip": 1.01762831, + "balance_loss_mlp": 1.02122486, + "epoch": 0.4776191191943484, + "flos": 21651735697920.0, + "grad_norm": 1.6469644641716175, + "language_loss": 0.75208992, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.77320504, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43164062, + "step": 7944, + "time_per_iteration": 2.4242780208587646 + }, + { + "auxiliary_loss_clip": 0.0106384, + "auxiliary_loss_mlp": 0.01048116, + "balance_loss_clip": 1.01938629, + "balance_loss_mlp": 1.02157235, + "epoch": 0.4776792424470164, + "flos": 18769236560640.0, + "grad_norm": 1.9983627527367607, + "language_loss": 0.69708908, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.71820867, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 7945, + "time_per_iteration": 2.3689916133880615 + }, + { + "auxiliary_loss_clip": 0.01061631, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.01478791, + "balance_loss_mlp": 1.01950097, + "epoch": 0.47773936569968434, + "flos": 29714542704000.0, + "grad_norm": 2.0508922019754694, + "language_loss": 0.7703656, + "learning_rate": 2.240686733875009e-06, + "loss": 0.79140007, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.421875, + "step": 7946, + "time_per_iteration": 2.4657633304595947 + }, + { + "auxiliary_loss_clip": 0.01066322, + "auxiliary_loss_mlp": 0.01055389, + "balance_loss_clip": 1.02357221, + "balance_loss_mlp": 1.02274418, + "epoch": 0.4777994889523523, + "flos": 24790357635840.0, + "grad_norm": 1.7950525779694382, + "language_loss": 0.80673069, + "learning_rate": 2.240300098112506e-06, + "loss": 0.82794774, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.43554688, + "step": 7947, + "time_per_iteration": 2.417748212814331 + }, + { + "auxiliary_loss_clip": 0.01060142, + "auxiliary_loss_mlp": 0.01043863, + "balance_loss_clip": 1.01470399, + "balance_loss_mlp": 1.01977658, + "epoch": 0.47785961220502027, + "flos": 17857200447360.0, + "grad_norm": 2.0710250440307076, + "language_loss": 0.74775869, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.76879871, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40429688, + "step": 7948, + "time_per_iteration": 2.3824448585510254 + }, + { + "auxiliary_loss_clip": 0.0106357, + "auxiliary_loss_mlp": 0.01040549, + "balance_loss_clip": 1.01258254, + "balance_loss_mlp": 1.0217768, + "epoch": 0.4779197354576883, + "flos": 20265509232000.0, + "grad_norm": 1.5179100543911805, + "language_loss": 0.79379153, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.81483269, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 7949, + "time_per_iteration": 2.3861618041992188 + }, + { + "auxiliary_loss_clip": 0.01060648, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.01593721, + "balance_loss_mlp": 1.0208683, + "epoch": 0.47797985871035625, + "flos": 17055629475840.0, + "grad_norm": 2.272126439049425, + "language_loss": 0.7621702, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.78319341, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 7950, + "time_per_iteration": 2.3618481159210205 + }, + { + "auxiliary_loss_clip": 0.01061488, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_clip": 1.01868272, + "balance_loss_mlp": 1.02065134, + "epoch": 0.4780399819630242, + "flos": 31357066527360.0, + "grad_norm": 1.5513406557281755, + "language_loss": 0.75801873, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.77910483, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 7951, + "time_per_iteration": 2.4895410537719727 + }, + { + "auxiliary_loss_clip": 0.01063456, + "auxiliary_loss_mlp": 0.01044256, + "balance_loss_clip": 1.01595592, + "balance_loss_mlp": 1.02039564, + "epoch": 0.4781001052156922, + "flos": 24898448805120.0, + "grad_norm": 2.0924915635689807, + "language_loss": 0.82129991, + "learning_rate": 2.238366782910174e-06, + "loss": 0.84237707, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.43164062, + "step": 7952, + "time_per_iteration": 2.400447368621826 + }, + { + "auxiliary_loss_clip": 0.01062824, + "auxiliary_loss_mlp": 0.01043737, + "balance_loss_clip": 1.01495957, + "balance_loss_mlp": 1.02025104, + "epoch": 0.47816022846836015, + "flos": 18696721933440.0, + "grad_norm": 2.039262991219003, + "language_loss": 0.80140108, + "learning_rate": 2.23798009269438e-06, + "loss": 0.82246673, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 7953, + "time_per_iteration": 2.380124092102051 + }, + { + "auxiliary_loss_clip": 0.01063391, + "auxiliary_loss_mlp": 0.01043345, + "balance_loss_clip": 1.0142343, + "balance_loss_mlp": 1.02036643, + "epoch": 0.4782203517210281, + "flos": 11976954174720.0, + "grad_norm": 2.0630334579386953, + "language_loss": 0.85164297, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.87271035, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4296875, + "step": 7954, + "time_per_iteration": 2.3169827461242676 + }, + { + "auxiliary_loss_clip": 0.01062229, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.01386893, + "balance_loss_mlp": 1.02058673, + "epoch": 0.4782804749736961, + "flos": 20812458591360.0, + "grad_norm": 1.3724927982050883, + "language_loss": 0.71145171, + "learning_rate": 2.237206685204768e-06, + "loss": 0.73251832, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41601562, + "step": 7955, + "time_per_iteration": 2.398909568786621 + }, + { + "auxiliary_loss_clip": 0.01061837, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.01393282, + "balance_loss_mlp": 1.01999807, + "epoch": 0.47834059822636404, + "flos": 23839218933120.0, + "grad_norm": 1.5277184697611863, + "language_loss": 0.83092427, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.85196871, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 7956, + "time_per_iteration": 2.391289234161377 + }, + { + "auxiliary_loss_clip": 0.01060208, + "auxiliary_loss_mlp": 0.01044911, + "balance_loss_clip": 1.01739788, + "balance_loss_mlp": 1.02017093, + "epoch": 0.478400721479032, + "flos": 22632795302400.0, + "grad_norm": 1.7793498693584444, + "language_loss": 0.85705292, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87810415, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 7957, + "time_per_iteration": 2.3942055702209473 + }, + { + "auxiliary_loss_clip": 0.0106039, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.01447558, + "balance_loss_mlp": 1.01967001, + "epoch": 0.4784608447317, + "flos": 19353926966400.0, + "grad_norm": 1.5830615652275102, + "language_loss": 0.80698413, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.82798851, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40625, + "step": 7958, + "time_per_iteration": 2.381096839904785 + }, + { + "auxiliary_loss_clip": 0.01059264, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.01637328, + "balance_loss_mlp": 1.01863146, + "epoch": 0.47852096798436794, + "flos": 24020069109120.0, + "grad_norm": 2.2716173378074718, + "language_loss": 0.84527659, + "learning_rate": 2.235659762404047e-06, + "loss": 0.86631286, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 7959, + "time_per_iteration": 2.4463672637939453 + }, + { + "auxiliary_loss_clip": 0.01059807, + "auxiliary_loss_mlp": 0.0104386, + "balance_loss_clip": 1.01715708, + "balance_loss_mlp": 1.02043223, + "epoch": 0.4785810912370359, + "flos": 25665246195840.0, + "grad_norm": 2.1401209338887774, + "language_loss": 0.74313933, + "learning_rate": 2.235273009326599e-06, + "loss": 0.76417601, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 7960, + "time_per_iteration": 2.4222304821014404 + }, + { + "auxiliary_loss_clip": 0.01059299, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.01470411, + "balance_loss_mlp": 1.01977968, + "epoch": 0.47864121448970387, + "flos": 21431119616640.0, + "grad_norm": 1.617371804236592, + "language_loss": 0.7816987, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.80269945, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 7961, + "time_per_iteration": 2.4220728874206543 + }, + { + "auxiliary_loss_clip": 0.01059034, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_clip": 1.01412368, + "balance_loss_mlp": 1.01900935, + "epoch": 0.47870133774237184, + "flos": 16142964958080.0, + "grad_norm": 1.6092551590337072, + "language_loss": 0.79346073, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.81446517, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 7962, + "time_per_iteration": 2.3809974193573 + }, + { + "auxiliary_loss_clip": 0.01061262, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.01215696, + "balance_loss_mlp": 1.02027321, + "epoch": 0.47876146099503986, + "flos": 26905570623360.0, + "grad_norm": 1.6836193942477087, + "language_loss": 0.66574401, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.68675953, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 7963, + "time_per_iteration": 2.5148062705993652 + }, + { + "auxiliary_loss_clip": 0.01058818, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.01507103, + "balance_loss_mlp": 1.01917386, + "epoch": 0.4788215842477078, + "flos": 45330354910080.0, + "grad_norm": 1.7615332186158594, + "language_loss": 0.79694748, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.81795186, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 7964, + "time_per_iteration": 2.6183600425720215 + }, + { + "auxiliary_loss_clip": 0.01061554, + "auxiliary_loss_mlp": 0.01045717, + "balance_loss_clip": 1.01527071, + "balance_loss_mlp": 1.01885378, + "epoch": 0.4788817075003758, + "flos": 22236076990080.0, + "grad_norm": 1.7904226823603722, + "language_loss": 0.77887094, + "learning_rate": 2.233339110409044e-06, + "loss": 0.79994363, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42773438, + "step": 7965, + "time_per_iteration": 2.4420006275177 + }, + { + "auxiliary_loss_clip": 0.01059836, + "auxiliary_loss_mlp": 0.01044794, + "balance_loss_clip": 1.01667213, + "balance_loss_mlp": 1.01885939, + "epoch": 0.47894183075304375, + "flos": 16470275754240.0, + "grad_norm": 1.635084174245906, + "language_loss": 0.76080096, + "learning_rate": 2.232952304022137e-06, + "loss": 0.78184724, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 7966, + "time_per_iteration": 2.384932041168213 + }, + { + "auxiliary_loss_clip": 0.01061666, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_clip": 1.01482487, + "balance_loss_mlp": 1.0199405, + "epoch": 0.4790019540057117, + "flos": 24281463525120.0, + "grad_norm": 1.6441350949909974, + "language_loss": 0.74409735, + "learning_rate": 2.232565488801655e-06, + "loss": 0.76515245, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 7967, + "time_per_iteration": 3.8738393783569336 + }, + { + "auxiliary_loss_clip": 0.0105576, + "auxiliary_loss_mlp": 0.01042088, + "balance_loss_clip": 1.01611257, + "balance_loss_mlp": 1.0180409, + "epoch": 0.4790620772583797, + "flos": 25665281107200.0, + "grad_norm": 1.9835282571599955, + "language_loss": 0.8067832, + "learning_rate": 2.232178664762267e-06, + "loss": 0.82776171, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37695312, + "step": 7968, + "time_per_iteration": 3.81965708732605 + }, + { + "auxiliary_loss_clip": 0.01012443, + "auxiliary_loss_mlp": 0.01011891, + "balance_loss_clip": 1.00924408, + "balance_loss_mlp": 1.00415611, + "epoch": 0.47912220051104765, + "flos": 69424228500480.0, + "grad_norm": 0.7766479473014086, + "language_loss": 0.62345088, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64369422, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.08300781, + "step": 7969, + "time_per_iteration": 3.183363676071167 + }, + { + "auxiliary_loss_clip": 0.01058515, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_clip": 1.01900673, + "balance_loss_mlp": 1.01913476, + "epoch": 0.4791823237637156, + "flos": 24167821449600.0, + "grad_norm": 1.3514346964038169, + "language_loss": 0.78355289, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.80458474, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 7970, + "time_per_iteration": 3.8085100650787354 + }, + { + "auxiliary_loss_clip": 0.01060362, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.0174222, + "balance_loss_mlp": 1.0193373, + "epoch": 0.4792424470163836, + "flos": 24750382262400.0, + "grad_norm": 1.529345933331152, + "language_loss": 0.7093336, + "learning_rate": 2.231018139877349e-06, + "loss": 0.73039961, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41015625, + "step": 7971, + "time_per_iteration": 2.4606971740722656 + }, + { + "auxiliary_loss_clip": 0.0105912, + "auxiliary_loss_mlp": 0.01043842, + "balance_loss_clip": 1.0153749, + "balance_loss_mlp": 1.01838326, + "epoch": 0.47930257026905154, + "flos": 23256797765760.0, + "grad_norm": 1.3157229223490616, + "language_loss": 0.80565399, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82668364, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 7972, + "time_per_iteration": 2.4166030883789062 + }, + { + "auxiliary_loss_clip": 0.0106075, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.0149076, + "balance_loss_mlp": 1.01902723, + "epoch": 0.4793626935217195, + "flos": 14063223778560.0, + "grad_norm": 2.169576561754285, + "language_loss": 0.7101447, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.73118258, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 7973, + "time_per_iteration": 2.4202895164489746 + }, + { + "auxiliary_loss_clip": 0.01058399, + "auxiliary_loss_mlp": 0.01040641, + "balance_loss_clip": 1.0155952, + "balance_loss_mlp": 1.02019405, + "epoch": 0.4794228167743875, + "flos": 21797777381760.0, + "grad_norm": 1.7654522865993942, + "language_loss": 0.80228341, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.82327378, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 7974, + "time_per_iteration": 2.4075348377227783 + }, + { + "auxiliary_loss_clip": 0.01013917, + "auxiliary_loss_mlp": 0.01006328, + "balance_loss_clip": 1.00346696, + "balance_loss_mlp": 1.00598717, + "epoch": 0.47948294002705544, + "flos": 66965436023040.0, + "grad_norm": 0.7531833735395026, + "language_loss": 0.54119623, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56139874, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.07910156, + "step": 7975, + "time_per_iteration": 3.1026434898376465 + }, + { + "auxiliary_loss_clip": 0.01064634, + "auxiliary_loss_mlp": 0.01053456, + "balance_loss_clip": 1.02312875, + "balance_loss_mlp": 1.01979208, + "epoch": 0.47954306327972346, + "flos": 12421642561920.0, + "grad_norm": 2.325431168621703, + "language_loss": 0.90496719, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92614806, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.44921875, + "step": 7976, + "time_per_iteration": 3.7761037349700928 + }, + { + "auxiliary_loss_clip": 0.01063131, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_clip": 1.01403058, + "balance_loss_mlp": 1.02013135, + "epoch": 0.4796031865323914, + "flos": 18361172056320.0, + "grad_norm": 2.3733002177993865, + "language_loss": 0.75489306, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.7759819, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4296875, + "step": 7977, + "time_per_iteration": 2.4004733562469482 + }, + { + "auxiliary_loss_clip": 0.01058709, + "auxiliary_loss_mlp": 0.01040914, + "balance_loss_clip": 1.01433015, + "balance_loss_mlp": 1.01919246, + "epoch": 0.4796633097850594, + "flos": 21834017239680.0, + "grad_norm": 1.614953109361554, + "language_loss": 0.79110324, + "learning_rate": 2.228309942555734e-06, + "loss": 0.81209946, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 7978, + "time_per_iteration": 2.3940184116363525 + }, + { + "auxiliary_loss_clip": 0.01060351, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.01930285, + "epoch": 0.47972343303772735, + "flos": 23436321310080.0, + "grad_norm": 1.618265869151182, + "language_loss": 0.90060294, + "learning_rate": 2.22792302247656e-06, + "loss": 0.92169279, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 7979, + "time_per_iteration": 2.432506799697876 + }, + { + "auxiliary_loss_clip": 0.01062635, + "auxiliary_loss_mlp": 0.01044551, + "balance_loss_clip": 1.0158689, + "balance_loss_mlp": 1.01988387, + "epoch": 0.4797835562903953, + "flos": 24898623361920.0, + "grad_norm": 1.504373453106211, + "language_loss": 0.77306986, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79414171, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.42773438, + "step": 7980, + "time_per_iteration": 2.4225895404815674 + }, + { + "auxiliary_loss_clip": 0.01064015, + "auxiliary_loss_mlp": 0.01041311, + "balance_loss_clip": 1.0094465, + "balance_loss_mlp": 1.02082705, + "epoch": 0.4798436795430633, + "flos": 35041555572480.0, + "grad_norm": 1.6206162487552431, + "language_loss": 0.73755252, + "learning_rate": 2.227149156404295e-06, + "loss": 0.75860572, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.43164062, + "step": 7981, + "time_per_iteration": 2.55383038520813 + }, + { + "auxiliary_loss_clip": 0.01060655, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.01227665, + "balance_loss_mlp": 1.02098441, + "epoch": 0.47990380279573125, + "flos": 20589293980800.0, + "grad_norm": 1.841161528644146, + "language_loss": 0.70978743, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.73078215, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 7982, + "time_per_iteration": 2.3786492347717285 + }, + { + "auxiliary_loss_clip": 0.01056963, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.01770175, + "balance_loss_mlp": 1.01982927, + "epoch": 0.4799639260483992, + "flos": 26358202327680.0, + "grad_norm": 1.6737597323687665, + "language_loss": 0.7253623, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.74635255, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 7983, + "time_per_iteration": 2.4813530445098877 + }, + { + "auxiliary_loss_clip": 0.0101457, + "auxiliary_loss_mlp": 0.01018907, + "balance_loss_clip": 1.01626039, + "balance_loss_mlp": 1.00590038, + "epoch": 0.4800240493010672, + "flos": 70975629941760.0, + "grad_norm": 0.8086045979361574, + "language_loss": 0.59554565, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61588043, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.08691406, + "step": 7984, + "time_per_iteration": 2.981422185897827 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01049052, + "balance_loss_clip": 1.02139497, + "balance_loss_mlp": 1.01925886, + "epoch": 0.48008417255373514, + "flos": 17085864579840.0, + "grad_norm": 1.747363970224287, + "language_loss": 0.68021798, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.70130926, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 7985, + "time_per_iteration": 2.439094066619873 + }, + { + "auxiliary_loss_clip": 0.01062362, + "auxiliary_loss_mlp": 0.01046268, + "balance_loss_clip": 1.01699018, + "balance_loss_mlp": 1.01990592, + "epoch": 0.4801442958064031, + "flos": 15412547070720.0, + "grad_norm": 1.7999700993751266, + "language_loss": 0.72314841, + "learning_rate": 2.225214340743835e-06, + "loss": 0.74423468, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42382812, + "step": 7986, + "time_per_iteration": 2.357844114303589 + }, + { + "auxiliary_loss_clip": 0.01063716, + "auxiliary_loss_mlp": 0.01057923, + "balance_loss_clip": 1.02901459, + "balance_loss_mlp": 1.01976895, + "epoch": 0.4802044190590711, + "flos": 11472947654400.0, + "grad_norm": 2.406035625379, + "language_loss": 0.8104918, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.83170819, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43945312, + "step": 7987, + "time_per_iteration": 2.4091708660125732 + }, + { + "auxiliary_loss_clip": 0.01060186, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02292442, + "balance_loss_mlp": 1.01883626, + "epoch": 0.48026454231173904, + "flos": 20950191371520.0, + "grad_norm": 2.0826859362368895, + "language_loss": 0.7618891, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.78300428, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 7988, + "time_per_iteration": 2.3661227226257324 + }, + { + "auxiliary_loss_clip": 0.01063115, + "auxiliary_loss_mlp": 0.01043499, + "balance_loss_clip": 1.01290989, + "balance_loss_mlp": 1.02055001, + "epoch": 0.48032466556440706, + "flos": 20447092546560.0, + "grad_norm": 2.0915148262583685, + "language_loss": 0.80472958, + "learning_rate": 2.224053348748365e-06, + "loss": 0.82579565, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42578125, + "step": 7989, + "time_per_iteration": 2.4025328159332275 + }, + { + "auxiliary_loss_clip": 0.01063302, + "auxiliary_loss_mlp": 0.01054335, + "balance_loss_clip": 1.02218437, + "balance_loss_mlp": 1.01922274, + "epoch": 0.480384788817075, + "flos": 37119376627200.0, + "grad_norm": 1.694231167244884, + "language_loss": 0.74831814, + "learning_rate": 2.223666334404724e-06, + "loss": 0.76949453, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.44140625, + "step": 7990, + "time_per_iteration": 2.5351386070251465 + }, + { + "auxiliary_loss_clip": 0.0101136, + "auxiliary_loss_mlp": 0.01004103, + "balance_loss_clip": 1.00143242, + "balance_loss_mlp": 1.00335741, + "epoch": 0.480444912069743, + "flos": 69549323368320.0, + "grad_norm": 0.7715540747618471, + "language_loss": 0.59119737, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61135197, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.08007812, + "step": 7991, + "time_per_iteration": 3.1314985752105713 + }, + { + "auxiliary_loss_clip": 0.01059167, + "auxiliary_loss_mlp": 0.0104813, + "balance_loss_clip": 1.01827955, + "balance_loss_mlp": 1.0181191, + "epoch": 0.48050503532241096, + "flos": 29821027950720.0, + "grad_norm": 2.0284037342332204, + "language_loss": 0.68455309, + "learning_rate": 2.222892280287768e-06, + "loss": 0.70562601, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41015625, + "step": 7992, + "time_per_iteration": 2.50402569770813 + }, + { + "auxiliary_loss_clip": 0.01060077, + "auxiliary_loss_mlp": 0.0104621, + "balance_loss_clip": 1.01821947, + "balance_loss_mlp": 1.01792324, + "epoch": 0.4805651585750789, + "flos": 23947484659200.0, + "grad_norm": 1.8645538036855487, + "language_loss": 0.77351332, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.79457617, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.421875, + "step": 7993, + "time_per_iteration": 2.4181883335113525 + }, + { + "auxiliary_loss_clip": 0.01057612, + "auxiliary_loss_mlp": 0.01045691, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.01833129, + "epoch": 0.4806252818277469, + "flos": 25664268677760.0, + "grad_norm": 1.7133155469880301, + "language_loss": 0.80282724, + "learning_rate": 2.222118192362422e-06, + "loss": 0.82386023, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39257812, + "step": 7994, + "time_per_iteration": 2.414649486541748 + }, + { + "auxiliary_loss_clip": 0.01061678, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_clip": 1.01524913, + "balance_loss_mlp": 1.02055192, + "epoch": 0.48068540508041485, + "flos": 13151152753920.0, + "grad_norm": 1.933677776376886, + "language_loss": 0.80957055, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.83061731, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 7995, + "time_per_iteration": 2.414883852005005 + }, + { + "auxiliary_loss_clip": 0.01061661, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.01339269, + "balance_loss_mlp": 1.02064788, + "epoch": 0.4807455283330828, + "flos": 21175729954560.0, + "grad_norm": 1.423762021082029, + "language_loss": 0.84352142, + "learning_rate": 2.2213440707461e-06, + "loss": 0.86454409, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 7996, + "time_per_iteration": 2.4238340854644775 + }, + { + "auxiliary_loss_clip": 0.01061499, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.01524556, + "balance_loss_mlp": 1.02088451, + "epoch": 0.4808056515857508, + "flos": 12275181941760.0, + "grad_norm": 1.629648839991111, + "language_loss": 0.81723809, + "learning_rate": 2.220956997340516e-06, + "loss": 0.83827114, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40625, + "step": 7997, + "time_per_iteration": 2.4251325130462646 + }, + { + "auxiliary_loss_clip": 0.01060761, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.016119, + "balance_loss_mlp": 1.01979065, + "epoch": 0.48086577483841875, + "flos": 24824921748480.0, + "grad_norm": 1.6476983165119004, + "language_loss": 0.73857147, + "learning_rate": 2.220569915556221e-06, + "loss": 0.75960362, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41015625, + "step": 7998, + "time_per_iteration": 2.4322750568389893 + }, + { + "auxiliary_loss_clip": 0.0106139, + "auxiliary_loss_mlp": 0.01040056, + "balance_loss_clip": 1.01313841, + "balance_loss_mlp": 1.02010572, + "epoch": 0.4809258980910867, + "flos": 24464129091840.0, + "grad_norm": 1.8193181126772409, + "language_loss": 0.72179097, + "learning_rate": 2.220182825407892e-06, + "loss": 0.74280536, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 7999, + "time_per_iteration": 2.4700052738189697 + }, + { + "auxiliary_loss_clip": 0.0106265, + "auxiliary_loss_mlp": 0.01047317, + "balance_loss_clip": 1.01924372, + "balance_loss_mlp": 1.02112222, + "epoch": 0.4809860213437547, + "flos": 21214867455360.0, + "grad_norm": 1.434568128841026, + "language_loss": 0.72275746, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.74385709, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41601562, + "step": 8000, + "time_per_iteration": 2.399415969848633 + }, + { + "auxiliary_loss_clip": 0.01064335, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.0117476, + "balance_loss_mlp": 1.0225805, + "epoch": 0.48104614459642264, + "flos": 37630609799040.0, + "grad_norm": 1.4747237513619706, + "language_loss": 0.75711292, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77815831, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 8001, + "time_per_iteration": 2.5292582511901855 + }, + { + "auxiliary_loss_clip": 0.01061589, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.0153296, + "balance_loss_mlp": 1.01994359, + "epoch": 0.48110626784909066, + "flos": 18405127324800.0, + "grad_norm": 1.7024790864301895, + "language_loss": 0.82216793, + "learning_rate": 2.219021504925493e-06, + "loss": 0.8432169, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 8002, + "time_per_iteration": 2.3702197074890137 + }, + { + "auxiliary_loss_clip": 0.01062448, + "auxiliary_loss_mlp": 0.01043527, + "balance_loss_clip": 1.01285434, + "balance_loss_mlp": 1.02093077, + "epoch": 0.48116639110175863, + "flos": 28438537000320.0, + "grad_norm": 1.7963281434255978, + "language_loss": 0.73075515, + "learning_rate": 2.218634381467819e-06, + "loss": 0.75181484, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.4140625, + "step": 8003, + "time_per_iteration": 2.453856945037842 + }, + { + "auxiliary_loss_clip": 0.01059124, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.01969695, + "balance_loss_mlp": 1.02090263, + "epoch": 0.4812265143544266, + "flos": 21724180502400.0, + "grad_norm": 1.552743945959638, + "language_loss": 0.83509409, + "learning_rate": 2.218247249719507e-06, + "loss": 0.85612798, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3828125, + "step": 8004, + "time_per_iteration": 2.386887788772583 + }, + { + "auxiliary_loss_clip": 0.0106818, + "auxiliary_loss_mlp": 0.0105335, + "balance_loss_clip": 1.02031744, + "balance_loss_mlp": 1.0224719, + "epoch": 0.48128663760709456, + "flos": 13223841937920.0, + "grad_norm": 1.9072155334844356, + "language_loss": 0.80002147, + "learning_rate": 2.217860109695239e-06, + "loss": 0.82123679, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.45703125, + "step": 8005, + "time_per_iteration": 2.374980926513672 + }, + { + "auxiliary_loss_clip": 0.0106099, + "auxiliary_loss_mlp": 0.01049052, + "balance_loss_clip": 1.02103758, + "balance_loss_mlp": 1.0193224, + "epoch": 0.4813467608597625, + "flos": 24242291112960.0, + "grad_norm": 1.872877695950415, + "language_loss": 0.72127068, + "learning_rate": 2.217472961409692e-06, + "loss": 0.74237108, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41796875, + "step": 8006, + "time_per_iteration": 3.8522467613220215 + }, + { + "auxiliary_loss_clip": 0.01061459, + "auxiliary_loss_mlp": 0.01046194, + "balance_loss_clip": 1.01933587, + "balance_loss_mlp": 1.01970983, + "epoch": 0.4814068841124305, + "flos": 27479473418880.0, + "grad_norm": 2.0719733901746764, + "language_loss": 0.71288252, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.73395902, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41796875, + "step": 8007, + "time_per_iteration": 3.8084208965301514 + }, + { + "auxiliary_loss_clip": 0.01061291, + "auxiliary_loss_mlp": 0.0103897, + "balance_loss_clip": 1.01275611, + "balance_loss_mlp": 1.01967597, + "epoch": 0.48146700736509845, + "flos": 19571889784320.0, + "grad_norm": 1.7628924614841701, + "language_loss": 0.72925931, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7502619, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41601562, + "step": 8008, + "time_per_iteration": 2.4252769947052 + }, + { + "auxiliary_loss_clip": 0.01063055, + "auxiliary_loss_mlp": 0.01049505, + "balance_loss_clip": 1.01936936, + "balance_loss_mlp": 1.02068663, + "epoch": 0.4815271306177664, + "flos": 20626825559040.0, + "grad_norm": 1.7728533209613722, + "language_loss": 0.62160355, + "learning_rate": 2.216311467132199e-06, + "loss": 0.64272916, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42382812, + "step": 8009, + "time_per_iteration": 3.8353755474090576 + }, + { + "auxiliary_loss_clip": 0.01016746, + "auxiliary_loss_mlp": 0.01004499, + "balance_loss_clip": 1.00157797, + "balance_loss_mlp": 1.00861692, + "epoch": 0.4815872538704344, + "flos": 67687894120320.0, + "grad_norm": 0.8830933847516163, + "language_loss": 0.61492455, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63513708, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.0291748, + "router_z_loss_mlp": 0.08105469, + "step": 8010, + "time_per_iteration": 3.0942647457122803 + }, + { + "auxiliary_loss_clip": 0.01060641, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_clip": 1.02256203, + "balance_loss_mlp": 1.01913714, + "epoch": 0.48164737712310235, + "flos": 22819650232320.0, + "grad_norm": 1.8451646024837405, + "language_loss": 0.74504673, + "learning_rate": 2.215537096576639e-06, + "loss": 0.76617318, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4140625, + "step": 8011, + "time_per_iteration": 2.4487714767456055 + }, + { + "auxiliary_loss_clip": 0.0105713, + "auxiliary_loss_mlp": 0.01041816, + "balance_loss_clip": 1.01848674, + "balance_loss_mlp": 1.01791811, + "epoch": 0.4817075003757703, + "flos": 23732698775040.0, + "grad_norm": 1.7095045016521477, + "language_loss": 0.80160868, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82259816, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.390625, + "step": 8012, + "time_per_iteration": 2.4836971759796143 + }, + { + "auxiliary_loss_clip": 0.0105939, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.02272701, + "balance_loss_mlp": 1.0186826, + "epoch": 0.4817676236284383, + "flos": 28181681061120.0, + "grad_norm": 1.9165683230687593, + "language_loss": 0.75377458, + "learning_rate": 2.214762693328326e-06, + "loss": 0.774876, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 8013, + "time_per_iteration": 2.4618914127349854 + }, + { + "auxiliary_loss_clip": 0.01057585, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.01931596, + "balance_loss_mlp": 1.01822019, + "epoch": 0.48182774688110624, + "flos": 17090821992960.0, + "grad_norm": 2.0389655413353562, + "language_loss": 0.92383814, + "learning_rate": 2.214375479481094e-06, + "loss": 0.94486058, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39453125, + "step": 8014, + "time_per_iteration": 2.343515396118164 + }, + { + "auxiliary_loss_clip": 0.01062754, + "auxiliary_loss_mlp": 0.01049924, + "balance_loss_clip": 1.01953804, + "balance_loss_mlp": 1.01866388, + "epoch": 0.4818878701337742, + "flos": 12567055841280.0, + "grad_norm": 2.071723259852537, + "language_loss": 0.76103985, + "learning_rate": 2.213988257504722e-06, + "loss": 0.7821666, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.44140625, + "step": 8015, + "time_per_iteration": 2.369561195373535 + }, + { + "auxiliary_loss_clip": 0.01062535, + "auxiliary_loss_mlp": 0.01045238, + "balance_loss_clip": 1.01669931, + "balance_loss_mlp": 1.01953793, + "epoch": 0.48194799338644223, + "flos": 24607342955520.0, + "grad_norm": 2.0135890836369392, + "language_loss": 0.8156938, + "learning_rate": 2.213601027413894e-06, + "loss": 0.83677155, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 8016, + "time_per_iteration": 3.8353145122528076 + }, + { + "auxiliary_loss_clip": 0.01059663, + "auxiliary_loss_mlp": 0.01040311, + "balance_loss_clip": 1.01540852, + "balance_loss_mlp": 1.02077579, + "epoch": 0.4820081166391102, + "flos": 21104157934080.0, + "grad_norm": 1.7765442418953632, + "language_loss": 0.78748012, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.80847991, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38867188, + "step": 8017, + "time_per_iteration": 2.405628204345703 + }, + { + "auxiliary_loss_clip": 0.01058742, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.01322079, + "balance_loss_mlp": 1.01991296, + "epoch": 0.48206823989177816, + "flos": 25263430824960.0, + "grad_norm": 1.8421142344840846, + "language_loss": 0.81302094, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.83399177, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38867188, + "step": 8018, + "time_per_iteration": 2.3974337577819824 + }, + { + "auxiliary_loss_clip": 0.01059309, + "auxiliary_loss_mlp": 0.0104333, + "balance_loss_clip": 1.01781893, + "balance_loss_mlp": 1.01903677, + "epoch": 0.4821283631444461, + "flos": 24643897015680.0, + "grad_norm": 2.931411638672019, + "language_loss": 0.77539766, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.79642409, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 8019, + "time_per_iteration": 2.4267358779907227 + }, + { + "auxiliary_loss_clip": 0.01060731, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_clip": 1.01743543, + "balance_loss_mlp": 1.01903856, + "epoch": 0.4821884863971141, + "flos": 23950940883840.0, + "grad_norm": 2.0656954819571034, + "language_loss": 0.80130744, + "learning_rate": 2.212052026199701e-06, + "loss": 0.82235068, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41796875, + "step": 8020, + "time_per_iteration": 2.416184425354004 + }, + { + "auxiliary_loss_clip": 0.0105909, + "auxiliary_loss_mlp": 0.01043928, + "balance_loss_clip": 1.01988363, + "balance_loss_mlp": 1.02049625, + "epoch": 0.48224860964978206, + "flos": 17159845484160.0, + "grad_norm": 1.7813388599644817, + "language_loss": 0.71371877, + "learning_rate": 2.211664755756855e-06, + "loss": 0.73474896, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38671875, + "step": 8021, + "time_per_iteration": 2.415107488632202 + }, + { + "auxiliary_loss_clip": 0.01065447, + "auxiliary_loss_mlp": 0.01047897, + "balance_loss_clip": 1.01773679, + "balance_loss_mlp": 1.02267277, + "epoch": 0.48230873290245, + "flos": 23074725692160.0, + "grad_norm": 1.9226023024343395, + "language_loss": 0.63644552, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.657579, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4296875, + "step": 8022, + "time_per_iteration": 2.396003246307373 + }, + { + "auxiliary_loss_clip": 0.01058628, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.01320922, + "balance_loss_mlp": 1.01982522, + "epoch": 0.482368856155118, + "flos": 19352530512000.0, + "grad_norm": 2.2151863152926263, + "language_loss": 0.67269373, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.69364673, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.38867188, + "step": 8023, + "time_per_iteration": 2.3957715034484863 + }, + { + "auxiliary_loss_clip": 0.01060887, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.01115143, + "balance_loss_mlp": 1.02069521, + "epoch": 0.48242897940778595, + "flos": 20078095720320.0, + "grad_norm": 2.445648796893174, + "language_loss": 0.77758944, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.7985602, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.40234375, + "step": 8024, + "time_per_iteration": 2.392923355102539 + }, + { + "auxiliary_loss_clip": 0.01061727, + "auxiliary_loss_mlp": 0.01043353, + "balance_loss_clip": 1.01620865, + "balance_loss_mlp": 1.0214386, + "epoch": 0.4824891026604539, + "flos": 23402874360960.0, + "grad_norm": 1.5716824555792122, + "language_loss": 0.76492053, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.78597128, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 8025, + "time_per_iteration": 2.464078903198242 + }, + { + "auxiliary_loss_clip": 0.01061643, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.01424885, + "balance_loss_mlp": 1.02174592, + "epoch": 0.4825492259131219, + "flos": 20367840026880.0, + "grad_norm": 1.8161741688680966, + "language_loss": 0.72677803, + "learning_rate": 2.209728283441112e-06, + "loss": 0.74780035, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 8026, + "time_per_iteration": 2.366476535797119 + }, + { + "auxiliary_loss_clip": 0.01063072, + "auxiliary_loss_mlp": 0.01048757, + "balance_loss_clip": 1.01881123, + "balance_loss_mlp": 1.02096319, + "epoch": 0.48260934916578985, + "flos": 14318159592960.0, + "grad_norm": 1.931193755784965, + "language_loss": 0.76832134, + "learning_rate": 2.209340965060465e-06, + "loss": 0.78943962, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.421875, + "step": 8027, + "time_per_iteration": 2.385864019393921 + }, + { + "auxiliary_loss_clip": 0.01064211, + "auxiliary_loss_mlp": 0.01044218, + "balance_loss_clip": 1.01926792, + "balance_loss_mlp": 1.02253222, + "epoch": 0.4826694724184578, + "flos": 22120235498880.0, + "grad_norm": 3.8439574714607807, + "language_loss": 0.68394703, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.70503134, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.41796875, + "step": 8028, + "time_per_iteration": 2.4028079509735107 + }, + { + "auxiliary_loss_clip": 0.01060827, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.0173763, + "balance_loss_mlp": 1.02118289, + "epoch": 0.48272959567112583, + "flos": 16180217245440.0, + "grad_norm": 1.575623083158319, + "language_loss": 0.73615682, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75720388, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 8029, + "time_per_iteration": 2.3875839710235596 + }, + { + "auxiliary_loss_clip": 0.01063854, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.0091089, + "balance_loss_mlp": 1.02205467, + "epoch": 0.4827897189237938, + "flos": 23179465370880.0, + "grad_norm": 1.8068717393750544, + "language_loss": 0.85839581, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87940294, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 8030, + "time_per_iteration": 2.3997063636779785 + }, + { + "auxiliary_loss_clip": 0.0106108, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.01349521, + "balance_loss_mlp": 1.02089107, + "epoch": 0.48284984217646176, + "flos": 21651561141120.0, + "grad_norm": 1.9310018928010162, + "language_loss": 0.74761051, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.7686066, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.40234375, + "step": 8031, + "time_per_iteration": 2.423278570175171 + }, + { + "auxiliary_loss_clip": 0.01064863, + "auxiliary_loss_mlp": 0.01047083, + "balance_loss_clip": 1.01711345, + "balance_loss_mlp": 1.02136087, + "epoch": 0.48290996542912973, + "flos": 31466100303360.0, + "grad_norm": 3.563608809081684, + "language_loss": 0.7276665, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.74878597, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43554688, + "step": 8032, + "time_per_iteration": 2.471867322921753 + }, + { + "auxiliary_loss_clip": 0.01061929, + "auxiliary_loss_mlp": 0.01044998, + "balance_loss_clip": 1.017663, + "balance_loss_mlp": 1.02126431, + "epoch": 0.4829700886817977, + "flos": 24460812512640.0, + "grad_norm": 1.3440796598123754, + "language_loss": 0.75173253, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.77280182, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 8033, + "time_per_iteration": 2.4587459564208984 + }, + { + "auxiliary_loss_clip": 0.01064154, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.02155495, + "balance_loss_mlp": 1.02120948, + "epoch": 0.48303021193446566, + "flos": 25700997294720.0, + "grad_norm": 1.482503072336305, + "language_loss": 0.84330475, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.8644436, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4296875, + "step": 8034, + "time_per_iteration": 2.418597936630249 + }, + { + "auxiliary_loss_clip": 0.01059149, + "auxiliary_loss_mlp": 0.0104317, + "balance_loss_clip": 1.0190897, + "balance_loss_mlp": 1.02017319, + "epoch": 0.4830903351871336, + "flos": 20084170296960.0, + "grad_norm": 1.7357396468416317, + "language_loss": 0.80176806, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.82279128, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.390625, + "step": 8035, + "time_per_iteration": 2.4087443351745605 + }, + { + "auxiliary_loss_clip": 0.01061801, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.01881099, + "balance_loss_mlp": 1.02064776, + "epoch": 0.4831504584398016, + "flos": 39450806864640.0, + "grad_norm": 2.1747378045962793, + "language_loss": 0.71372616, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.7348181, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41210938, + "step": 8036, + "time_per_iteration": 2.523103713989258 + }, + { + "auxiliary_loss_clip": 0.01059952, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_clip": 1.0180527, + "balance_loss_mlp": 1.01902413, + "epoch": 0.48321058169246955, + "flos": 20005685827200.0, + "grad_norm": 1.9362753290644112, + "language_loss": 0.74236047, + "learning_rate": 2.205467347074847e-06, + "loss": 0.7634021, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40820312, + "step": 8037, + "time_per_iteration": 2.3917489051818848 + }, + { + "auxiliary_loss_clip": 0.01063101, + "auxiliary_loss_mlp": 0.01050369, + "balance_loss_clip": 1.02199769, + "balance_loss_mlp": 1.02010298, + "epoch": 0.4832707049451375, + "flos": 20740397811840.0, + "grad_norm": 2.2643846088141637, + "language_loss": 0.71441704, + "learning_rate": 2.205079942181525e-06, + "loss": 0.73555171, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4296875, + "step": 8038, + "time_per_iteration": 2.3755311965942383 + }, + { + "auxiliary_loss_clip": 0.01058801, + "auxiliary_loss_mlp": 0.01047557, + "balance_loss_clip": 1.01943612, + "balance_loss_mlp": 1.01869202, + "epoch": 0.4833308281978055, + "flos": 33144200668800.0, + "grad_norm": 1.8644357691772886, + "language_loss": 0.80313265, + "learning_rate": 2.20469252951155e-06, + "loss": 0.82419622, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40039062, + "step": 8039, + "time_per_iteration": 2.512798309326172 + }, + { + "auxiliary_loss_clip": 0.01060613, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.01259017, + "balance_loss_mlp": 1.01947165, + "epoch": 0.48339095145047345, + "flos": 19098223102080.0, + "grad_norm": 1.5275951104537393, + "language_loss": 0.78619558, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.80719656, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41210938, + "step": 8040, + "time_per_iteration": 2.365368604660034 + }, + { + "auxiliary_loss_clip": 0.01060257, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.01907325, + "balance_loss_mlp": 1.01944625, + "epoch": 0.4834510747031414, + "flos": 34458017241600.0, + "grad_norm": 1.4497888540116088, + "language_loss": 0.76651305, + "learning_rate": 2.203917680900409e-06, + "loss": 0.78758407, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 8041, + "time_per_iteration": 2.5252156257629395 + }, + { + "auxiliary_loss_clip": 0.01060722, + "auxiliary_loss_mlp": 0.01039638, + "balance_loss_clip": 1.01191008, + "balance_loss_mlp": 1.01999557, + "epoch": 0.48351119795580944, + "flos": 27379621330560.0, + "grad_norm": 3.339787986100837, + "language_loss": 0.68387473, + "learning_rate": 2.203530244988624e-06, + "loss": 0.70487833, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 8042, + "time_per_iteration": 2.428579330444336 + }, + { + "auxiliary_loss_clip": 0.01012975, + "auxiliary_loss_mlp": 0.01006221, + "balance_loss_clip": 1.00355053, + "balance_loss_mlp": 1.00462937, + "epoch": 0.4835713212084774, + "flos": 67140770204160.0, + "grad_norm": 0.6966085158274868, + "language_loss": 0.58610052, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60629243, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.08349609, + "step": 8043, + "time_per_iteration": 3.09352707862854 + }, + { + "auxiliary_loss_clip": 0.01061253, + "auxiliary_loss_mlp": 0.01046991, + "balance_loss_clip": 1.01804709, + "balance_loss_mlp": 1.0189321, + "epoch": 0.48363144446114537, + "flos": 17966513514240.0, + "grad_norm": 2.0704928003168184, + "language_loss": 0.73590451, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.75698698, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 8044, + "time_per_iteration": 2.3639345169067383 + }, + { + "auxiliary_loss_clip": 0.01057398, + "auxiliary_loss_mlp": 0.01046247, + "balance_loss_clip": 1.0176487, + "balance_loss_mlp": 1.01789069, + "epoch": 0.48369156771381333, + "flos": 20592505825920.0, + "grad_norm": 1.7359384405607368, + "language_loss": 0.76873255, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78976893, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39453125, + "step": 8045, + "time_per_iteration": 2.4688022136688232 + }, + { + "auxiliary_loss_clip": 0.01060465, + "auxiliary_loss_mlp": 0.01050887, + "balance_loss_clip": 1.02013111, + "balance_loss_mlp": 1.01780367, + "epoch": 0.4837516909664813, + "flos": 22673957662080.0, + "grad_norm": 1.6054229880531283, + "language_loss": 0.70305145, + "learning_rate": 2.201980424309533e-06, + "loss": 0.72416496, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.42578125, + "step": 8046, + "time_per_iteration": 3.753631353378296 + }, + { + "auxiliary_loss_clip": 0.01059684, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02224159, + "balance_loss_mlp": 1.0192275, + "epoch": 0.48381181421914926, + "flos": 25517493855360.0, + "grad_norm": 1.925554935005815, + "language_loss": 0.83608121, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.85718644, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40429688, + "step": 8047, + "time_per_iteration": 3.8124001026153564 + }, + { + "auxiliary_loss_clip": 0.01057859, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.01468158, + "balance_loss_mlp": 1.01742637, + "epoch": 0.4838719374718172, + "flos": 24206330545920.0, + "grad_norm": 1.7251425102606115, + "language_loss": 0.81340367, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.83440197, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 8048, + "time_per_iteration": 3.7799034118652344 + }, + { + "auxiliary_loss_clip": 0.0106176, + "auxiliary_loss_mlp": 0.010474, + "balance_loss_clip": 1.0178839, + "balance_loss_mlp": 1.01931429, + "epoch": 0.4839320607244852, + "flos": 26723358904320.0, + "grad_norm": 1.8285384499786943, + "language_loss": 0.8286863, + "learning_rate": 2.200817978328054e-06, + "loss": 0.84977794, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42578125, + "step": 8049, + "time_per_iteration": 2.4199976921081543 + }, + { + "auxiliary_loss_clip": 0.0105799, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.01603889, + "balance_loss_mlp": 1.01972604, + "epoch": 0.48399218397715316, + "flos": 20447860596480.0, + "grad_norm": 1.833263095569868, + "language_loss": 0.74039996, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.76137102, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3828125, + "step": 8050, + "time_per_iteration": 2.393047571182251 + }, + { + "auxiliary_loss_clip": 0.01012864, + "auxiliary_loss_mlp": 0.010129, + "balance_loss_clip": 1.01006293, + "balance_loss_mlp": 1.00475752, + "epoch": 0.4840523072298211, + "flos": 67177394087040.0, + "grad_norm": 0.7103506286266804, + "language_loss": 0.56460959, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58486724, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.02832031, + "router_z_loss_mlp": 0.08105469, + "step": 8051, + "time_per_iteration": 3.056006669998169 + }, + { + "auxiliary_loss_clip": 0.01062354, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_clip": 1.01256704, + "balance_loss_mlp": 1.02063739, + "epoch": 0.4841124304824891, + "flos": 22410608209920.0, + "grad_norm": 2.315457239549307, + "language_loss": 0.76556981, + "learning_rate": 2.199655463811236e-06, + "loss": 0.78661323, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41796875, + "step": 8052, + "time_per_iteration": 2.4224650859832764 + }, + { + "auxiliary_loss_clip": 0.01060694, + "auxiliary_loss_mlp": 0.01043382, + "balance_loss_clip": 1.01554632, + "balance_loss_mlp": 1.01906514, + "epoch": 0.48417255373515705, + "flos": 13843131367680.0, + "grad_norm": 2.1340151369139053, + "language_loss": 0.67835009, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.69939089, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41601562, + "step": 8053, + "time_per_iteration": 2.3431830406188965 + }, + { + "auxiliary_loss_clip": 0.01059392, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.01585054, + "balance_loss_mlp": 1.01886106, + "epoch": 0.484232676987825, + "flos": 31648346933760.0, + "grad_norm": 2.1391783866765084, + "language_loss": 0.72056293, + "learning_rate": 2.198880416254091e-06, + "loss": 0.74159819, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 8054, + "time_per_iteration": 2.47769832611084 + }, + { + "auxiliary_loss_clip": 0.01059199, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.01236737, + "balance_loss_mlp": 1.0183965, + "epoch": 0.48429280024049304, + "flos": 24094294392960.0, + "grad_norm": 1.7760997181835392, + "language_loss": 0.70525956, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.72625983, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 8055, + "time_per_iteration": 3.847093343734741 + }, + { + "auxiliary_loss_clip": 0.01063039, + "auxiliary_loss_mlp": 0.0104482, + "balance_loss_clip": 1.01625717, + "balance_loss_mlp": 1.02061236, + "epoch": 0.484352923493161, + "flos": 17529121601280.0, + "grad_norm": 1.9450355446291452, + "language_loss": 0.64475727, + "learning_rate": 2.198105338530685e-06, + "loss": 0.6658358, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42382812, + "step": 8056, + "time_per_iteration": 2.3498189449310303 + }, + { + "auxiliary_loss_clip": 0.01061323, + "auxiliary_loss_mlp": 0.01048363, + "balance_loss_clip": 1.01901388, + "balance_loss_mlp": 1.01967168, + "epoch": 0.48441304674582897, + "flos": 29165638308480.0, + "grad_norm": 1.9800532263813526, + "language_loss": 0.68398285, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.70507973, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41601562, + "step": 8057, + "time_per_iteration": 2.4741296768188477 + }, + { + "auxiliary_loss_clip": 0.01059538, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.01409256, + "balance_loss_mlp": 1.01940477, + "epoch": 0.48447316999849693, + "flos": 15885829728000.0, + "grad_norm": 1.7893056373759102, + "language_loss": 0.82688767, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.84788871, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 8058, + "time_per_iteration": 2.388723611831665 + }, + { + "auxiliary_loss_clip": 0.01062737, + "auxiliary_loss_mlp": 0.01048815, + "balance_loss_clip": 1.02052665, + "balance_loss_mlp": 1.02086866, + "epoch": 0.4845332932511649, + "flos": 24380477740800.0, + "grad_norm": 2.1184651071538894, + "language_loss": 0.81194675, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.83306223, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 8059, + "time_per_iteration": 2.426980495452881 + }, + { + "auxiliary_loss_clip": 0.01065121, + "auxiliary_loss_mlp": 0.01045581, + "balance_loss_clip": 1.01340675, + "balance_loss_mlp": 1.02215052, + "epoch": 0.48459341650383286, + "flos": 37115152352640.0, + "grad_norm": 2.576206693789027, + "language_loss": 0.68010509, + "learning_rate": 2.196555093055352e-06, + "loss": 0.70121217, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4296875, + "step": 8060, + "time_per_iteration": 2.4971303939819336 + }, + { + "auxiliary_loss_clip": 0.01064193, + "auxiliary_loss_mlp": 0.01044886, + "balance_loss_clip": 1.01726556, + "balance_loss_mlp": 1.02245545, + "epoch": 0.48465353975650083, + "flos": 22965657004800.0, + "grad_norm": 1.7006119168574179, + "language_loss": 0.68245995, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.70355076, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 8061, + "time_per_iteration": 2.4359328746795654 + }, + { + "auxiliary_loss_clip": 0.01064843, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_clip": 1.02249384, + "balance_loss_mlp": 1.02270436, + "epoch": 0.4847136630091688, + "flos": 17706864666240.0, + "grad_norm": 2.5435696283075497, + "language_loss": 0.83263326, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.8538025, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 8062, + "time_per_iteration": 2.3625669479370117 + }, + { + "auxiliary_loss_clip": 0.01063008, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.01842654, + "balance_loss_mlp": 1.02211249, + "epoch": 0.48477378626183676, + "flos": 22017171565440.0, + "grad_norm": 1.5345641087261925, + "language_loss": 0.75190526, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.77298844, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 8063, + "time_per_iteration": 2.4452805519104004 + }, + { + "auxiliary_loss_clip": 0.01063247, + "auxiliary_loss_mlp": 0.01042106, + "balance_loss_clip": 1.01460457, + "balance_loss_mlp": 1.02146769, + "epoch": 0.4848339095145047, + "flos": 27961763207040.0, + "grad_norm": 3.5037733619661626, + "language_loss": 0.80110419, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.82215774, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 8064, + "time_per_iteration": 2.4386181831359863 + }, + { + "auxiliary_loss_clip": 0.0106256, + "auxiliary_loss_mlp": 0.01045424, + "balance_loss_clip": 1.01941299, + "balance_loss_mlp": 1.02346563, + "epoch": 0.4848940327671727, + "flos": 21687696264960.0, + "grad_norm": 1.8195053785379132, + "language_loss": 0.80000329, + "learning_rate": 2.194617118620173e-06, + "loss": 0.82108313, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 8065, + "time_per_iteration": 2.4277796745300293 + }, + { + "auxiliary_loss_clip": 0.01057869, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.01719797, + "balance_loss_mlp": 1.01993489, + "epoch": 0.48495415601984065, + "flos": 20630526163200.0, + "grad_norm": 1.9725729833189676, + "language_loss": 0.76840496, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78940737, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 8066, + "time_per_iteration": 2.384957790374756 + }, + { + "auxiliary_loss_clip": 0.01061273, + "auxiliary_loss_mlp": 0.01049005, + "balance_loss_clip": 1.02373254, + "balance_loss_mlp": 1.02234769, + "epoch": 0.4850142792725086, + "flos": 25627016390400.0, + "grad_norm": 1.7089037807898715, + "language_loss": 0.73058748, + "learning_rate": 2.193841877083912e-06, + "loss": 0.75169027, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 8067, + "time_per_iteration": 2.465919017791748 + }, + { + "auxiliary_loss_clip": 0.01063691, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.01580095, + "balance_loss_mlp": 1.02271903, + "epoch": 0.4850744025251766, + "flos": 13771105499520.0, + "grad_norm": 2.18132537363447, + "language_loss": 0.8131094, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.83416158, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 8068, + "time_per_iteration": 2.3778936862945557 + }, + { + "auxiliary_loss_clip": 0.01057858, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.01476908, + "balance_loss_mlp": 1.01921439, + "epoch": 0.4851345257778446, + "flos": 20260447084800.0, + "grad_norm": 1.6384515057075235, + "language_loss": 0.85551858, + "learning_rate": 2.193066606145638e-06, + "loss": 0.87648481, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38671875, + "step": 8069, + "time_per_iteration": 2.3884708881378174 + }, + { + "auxiliary_loss_clip": 0.01059839, + "auxiliary_loss_mlp": 0.01042606, + "balance_loss_clip": 1.01661849, + "balance_loss_mlp": 1.0208931, + "epoch": 0.48519464903051257, + "flos": 27088445658240.0, + "grad_norm": 1.7052585531909965, + "language_loss": 0.78706878, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80809325, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38867188, + "step": 8070, + "time_per_iteration": 2.4462618827819824 + }, + { + "auxiliary_loss_clip": 0.01059335, + "auxiliary_loss_mlp": 0.01047904, + "balance_loss_clip": 1.02003312, + "balance_loss_mlp": 1.01967001, + "epoch": 0.48525477228318054, + "flos": 17126328712320.0, + "grad_norm": 2.9424992332236273, + "language_loss": 0.79881787, + "learning_rate": 2.192291305922943e-06, + "loss": 0.81989026, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.39648438, + "step": 8071, + "time_per_iteration": 2.3527936935424805 + }, + { + "auxiliary_loss_clip": 0.01059927, + "auxiliary_loss_mlp": 0.01047527, + "balance_loss_clip": 1.02059793, + "balance_loss_mlp": 1.01952791, + "epoch": 0.4853148955358485, + "flos": 28179167443200.0, + "grad_norm": 1.8395574634045238, + "language_loss": 0.73164386, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.75271839, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40429688, + "step": 8072, + "time_per_iteration": 2.4669339656829834 + }, + { + "auxiliary_loss_clip": 0.01061407, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.02721095, + "balance_loss_mlp": 1.01981699, + "epoch": 0.48537501878851647, + "flos": 17492358072960.0, + "grad_norm": 2.4880003475827874, + "language_loss": 0.89084065, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.91201174, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41601562, + "step": 8073, + "time_per_iteration": 2.3430569171905518 + }, + { + "auxiliary_loss_clip": 0.01056152, + "auxiliary_loss_mlp": 0.01045087, + "balance_loss_clip": 1.02032733, + "balance_loss_mlp": 1.01853752, + "epoch": 0.48543514204118443, + "flos": 28583601166080.0, + "grad_norm": 1.9392747162390804, + "language_loss": 0.61998683, + "learning_rate": 2.19112830093786e-06, + "loss": 0.64099926, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 8074, + "time_per_iteration": 2.457200527191162 + }, + { + "auxiliary_loss_clip": 0.0105867, + "auxiliary_loss_mlp": 0.01052167, + "balance_loss_clip": 1.02486801, + "balance_loss_mlp": 1.01763248, + "epoch": 0.4854952652938524, + "flos": 20958919211520.0, + "grad_norm": 1.7517166035769813, + "language_loss": 0.74097854, + "learning_rate": 2.19074061809469e-06, + "loss": 0.76208687, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 8075, + "time_per_iteration": 2.3599302768707275 + }, + { + "auxiliary_loss_clip": 0.01056391, + "auxiliary_loss_mlp": 0.01044059, + "balance_loss_clip": 1.01829815, + "balance_loss_mlp": 1.01852012, + "epoch": 0.48555538854652036, + "flos": 66527243015040.0, + "grad_norm": 1.4979667256479785, + "language_loss": 0.82537305, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84637755, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 8076, + "time_per_iteration": 2.7755625247955322 + }, + { + "auxiliary_loss_clip": 0.01057296, + "auxiliary_loss_mlp": 0.0104243, + "balance_loss_clip": 1.01628745, + "balance_loss_mlp": 1.01882005, + "epoch": 0.4856155117991883, + "flos": 15924059533440.0, + "grad_norm": 4.501197650386905, + "language_loss": 0.88062614, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.90162343, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 8077, + "time_per_iteration": 2.3368592262268066 + }, + { + "auxiliary_loss_clip": 0.01011103, + "auxiliary_loss_mlp": 0.01007815, + "balance_loss_clip": 1.00493002, + "balance_loss_mlp": 1.00290966, + "epoch": 0.4856756350518563, + "flos": 71044129762560.0, + "grad_norm": 0.9067521289282683, + "language_loss": 0.5864867, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60667586, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.02880859, + "router_z_loss_mlp": 0.08203125, + "step": 8078, + "time_per_iteration": 2.975743055343628 + }, + { + "auxiliary_loss_clip": 0.01061316, + "auxiliary_loss_mlp": 0.01050173, + "balance_loss_clip": 1.02143192, + "balance_loss_mlp": 1.01963353, + "epoch": 0.48573575830452426, + "flos": 29824379441280.0, + "grad_norm": 1.6266849230439848, + "language_loss": 0.73697048, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.75808537, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 8079, + "time_per_iteration": 2.4719154834747314 + }, + { + "auxiliary_loss_clip": 0.01058907, + "auxiliary_loss_mlp": 0.01048035, + "balance_loss_clip": 1.02036643, + "balance_loss_mlp": 1.01896513, + "epoch": 0.4857958815571922, + "flos": 17638539402240.0, + "grad_norm": 2.0479324251504396, + "language_loss": 0.80472726, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.82579672, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 8080, + "time_per_iteration": 2.383028984069824 + }, + { + "auxiliary_loss_clip": 0.01059277, + "auxiliary_loss_mlp": 0.01046492, + "balance_loss_clip": 1.01899004, + "balance_loss_mlp": 1.01845491, + "epoch": 0.4858560048098602, + "flos": 21104437224960.0, + "grad_norm": 1.9529380928397921, + "language_loss": 0.84759653, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86865419, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40820312, + "step": 8081, + "time_per_iteration": 2.398174285888672 + }, + { + "auxiliary_loss_clip": 0.01058631, + "auxiliary_loss_mlp": 0.0105019, + "balance_loss_clip": 1.02304602, + "balance_loss_mlp": 1.01811934, + "epoch": 0.4859161280625282, + "flos": 22089756015360.0, + "grad_norm": 1.4376535649861486, + "language_loss": 0.84286141, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.8639496, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 8082, + "time_per_iteration": 2.4224324226379395 + }, + { + "auxiliary_loss_clip": 0.01058369, + "auxiliary_loss_mlp": 0.01042978, + "balance_loss_clip": 1.01825428, + "balance_loss_mlp": 1.01951754, + "epoch": 0.4859762513151962, + "flos": 17492497718400.0, + "grad_norm": 1.9993672834047864, + "language_loss": 0.88750064, + "learning_rate": 2.187638896199746e-06, + "loss": 0.90851414, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38867188, + "step": 8083, + "time_per_iteration": 2.359990358352661 + }, + { + "auxiliary_loss_clip": 0.01057636, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.01951551, + "balance_loss_mlp": 1.01936674, + "epoch": 0.48603637456786414, + "flos": 18003277042560.0, + "grad_norm": 1.9081140289359209, + "language_loss": 0.82845592, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.84947711, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 8084, + "time_per_iteration": 2.384178400039673 + }, + { + "auxiliary_loss_clip": 0.01061854, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_clip": 1.01774073, + "balance_loss_mlp": 1.02001762, + "epoch": 0.4860964978205321, + "flos": 22490942981760.0, + "grad_norm": 2.301997060448946, + "language_loss": 0.70044768, + "learning_rate": 2.186863394279098e-06, + "loss": 0.72152495, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 8085, + "time_per_iteration": 3.6870505809783936 + }, + { + "auxiliary_loss_clip": 0.01061918, + "auxiliary_loss_mlp": 0.01043019, + "balance_loss_clip": 1.01558888, + "balance_loss_mlp": 1.02136874, + "epoch": 0.48615662107320007, + "flos": 23371277713920.0, + "grad_norm": 1.534410328821934, + "language_loss": 0.78643644, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.80748576, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 8086, + "time_per_iteration": 3.8431646823883057 + }, + { + "auxiliary_loss_clip": 0.01060772, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.01373518, + "balance_loss_mlp": 1.02054751, + "epoch": 0.48621674432586803, + "flos": 34417518197760.0, + "grad_norm": 2.4310641183148367, + "language_loss": 0.71677953, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.73780447, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8087, + "time_per_iteration": 2.478919267654419 + }, + { + "auxiliary_loss_clip": 0.01062563, + "auxiliary_loss_mlp": 0.01050302, + "balance_loss_clip": 1.02179885, + "balance_loss_mlp": 1.019526, + "epoch": 0.486276867578536, + "flos": 33106215242880.0, + "grad_norm": 1.8777814888201116, + "language_loss": 0.7463752, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.76750386, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 8088, + "time_per_iteration": 3.9433329105377197 + }, + { + "auxiliary_loss_clip": 0.01059021, + "auxiliary_loss_mlp": 0.01050942, + "balance_loss_clip": 1.02494216, + "balance_loss_mlp": 1.01965523, + "epoch": 0.48633699083120396, + "flos": 21469628712960.0, + "grad_norm": 1.4672822386875704, + "language_loss": 0.7678957, + "learning_rate": 2.185312305524892e-06, + "loss": 0.78899533, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 8089, + "time_per_iteration": 2.419008255004883 + }, + { + "auxiliary_loss_clip": 0.01062613, + "auxiliary_loss_mlp": 0.01043621, + "balance_loss_clip": 1.01482022, + "balance_loss_mlp": 1.02160764, + "epoch": 0.48639711408387193, + "flos": 20083297512960.0, + "grad_norm": 1.5465988301845353, + "language_loss": 0.85029054, + "learning_rate": 2.184924515731926e-06, + "loss": 0.87135291, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41015625, + "step": 8090, + "time_per_iteration": 2.410463809967041 + }, + { + "auxiliary_loss_clip": 0.01059474, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.01222634, + "balance_loss_mlp": 1.02122462, + "epoch": 0.4864572373365399, + "flos": 20777789744640.0, + "grad_norm": 1.510495849341002, + "language_loss": 0.77306741, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.79403192, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 8091, + "time_per_iteration": 2.396533966064453 + }, + { + "auxiliary_loss_clip": 0.01060904, + "auxiliary_loss_mlp": 0.01041816, + "balance_loss_clip": 1.01448107, + "balance_loss_mlp": 1.02073884, + "epoch": 0.48651736058920786, + "flos": 26024328195840.0, + "grad_norm": 1.4587298215944733, + "language_loss": 0.8071444, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82817167, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 8092, + "time_per_iteration": 2.4713478088378906 + }, + { + "auxiliary_loss_clip": 0.010632, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.01429582, + "balance_loss_mlp": 1.02187884, + "epoch": 0.4865774838418758, + "flos": 20484554302080.0, + "grad_norm": 1.8965175223986472, + "language_loss": 0.72490811, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.74595988, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 8093, + "time_per_iteration": 2.4030678272247314 + }, + { + "auxiliary_loss_clip": 0.01060147, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.01742268, + "balance_loss_mlp": 1.02070558, + "epoch": 0.4866376070945438, + "flos": 23546646806400.0, + "grad_norm": 1.7805349735882785, + "language_loss": 0.68812656, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70916533, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 8094, + "time_per_iteration": 3.8735239505767822 + }, + { + "auxiliary_loss_clip": 0.01062914, + "auxiliary_loss_mlp": 0.01055519, + "balance_loss_clip": 1.0271945, + "balance_loss_mlp": 1.02074194, + "epoch": 0.4866977303472118, + "flos": 16689669937920.0, + "grad_norm": 2.424768384797598, + "language_loss": 0.68485612, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.70604044, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 8095, + "time_per_iteration": 2.355020046234131 + }, + { + "auxiliary_loss_clip": 0.01060978, + "auxiliary_loss_mlp": 0.01048791, + "balance_loss_clip": 1.0209918, + "balance_loss_mlp": 1.01989233, + "epoch": 0.4867578535998798, + "flos": 17895011316480.0, + "grad_norm": 2.0170220308569995, + "language_loss": 0.80598712, + "learning_rate": 2.182597630229345e-06, + "loss": 0.8270849, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 8096, + "time_per_iteration": 2.440167188644409 + }, + { + "auxiliary_loss_clip": 0.01057541, + "auxiliary_loss_mlp": 0.01050655, + "balance_loss_clip": 1.02318954, + "balance_loss_mlp": 1.01867473, + "epoch": 0.48681797685254774, + "flos": 22636705374720.0, + "grad_norm": 1.8329422382741873, + "language_loss": 0.69659472, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.7176767, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.38867188, + "step": 8097, + "time_per_iteration": 2.386955499649048 + }, + { + "auxiliary_loss_clip": 0.01057859, + "auxiliary_loss_mlp": 0.010478, + "balance_loss_clip": 1.02042913, + "balance_loss_mlp": 1.01834512, + "epoch": 0.4868781001052157, + "flos": 20885043041280.0, + "grad_norm": 1.4748607858451788, + "language_loss": 0.72770947, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.74876606, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 8098, + "time_per_iteration": 2.401756763458252 + }, + { + "auxiliary_loss_clip": 0.01063607, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.01738083, + "balance_loss_mlp": 1.02042031, + "epoch": 0.48693822335788367, + "flos": 41973316306560.0, + "grad_norm": 1.5982524191001612, + "language_loss": 0.67768139, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.69878256, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43164062, + "step": 8099, + "time_per_iteration": 2.54843807220459 + }, + { + "auxiliary_loss_clip": 0.01056702, + "auxiliary_loss_mlp": 0.01047173, + "balance_loss_clip": 1.02203155, + "balance_loss_mlp": 1.01751184, + "epoch": 0.48699834661055164, + "flos": 24242151467520.0, + "grad_norm": 1.6529325220241322, + "language_loss": 0.684488, + "learning_rate": 2.181046234549138e-06, + "loss": 0.70552677, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 8100, + "time_per_iteration": 2.420623779296875 + }, + { + "auxiliary_loss_clip": 0.01054873, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.01787019, + "balance_loss_mlp": 1.01639938, + "epoch": 0.4870584698632196, + "flos": 25922625805440.0, + "grad_norm": 1.5637623936031626, + "language_loss": 0.7737264, + "learning_rate": 2.180658368429088e-06, + "loss": 0.7947017, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 8101, + "time_per_iteration": 2.4278759956359863 + }, + { + "auxiliary_loss_clip": 0.01011069, + "auxiliary_loss_mlp": 0.01011928, + "balance_loss_clip": 1.00928152, + "balance_loss_mlp": 1.00325632, + "epoch": 0.48711859311588757, + "flos": 70208588171520.0, + "grad_norm": 0.6987598935858327, + "language_loss": 0.52428693, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54451692, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.078125, + "step": 8102, + "time_per_iteration": 3.173898696899414 + }, + { + "auxiliary_loss_clip": 0.01058586, + "auxiliary_loss_mlp": 0.01045068, + "balance_loss_clip": 1.01871049, + "balance_loss_mlp": 1.0190202, + "epoch": 0.48717871636855553, + "flos": 12342320219520.0, + "grad_norm": 1.8631496123715359, + "language_loss": 0.75103474, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.77207136, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 8103, + "time_per_iteration": 2.3232340812683105 + }, + { + "auxiliary_loss_clip": 0.01061523, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.02205729, + "balance_loss_mlp": 1.02022684, + "epoch": 0.4872388396212235, + "flos": 23476017392640.0, + "grad_norm": 1.7650212617703493, + "language_loss": 0.64478081, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.66590106, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41210938, + "step": 8104, + "time_per_iteration": 2.45658016204834 + }, + { + "auxiliary_loss_clip": 0.0105861, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.01600075, + "balance_loss_mlp": 1.01911306, + "epoch": 0.48729896287389146, + "flos": 31426334398080.0, + "grad_norm": 1.721731285338277, + "language_loss": 0.69923645, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.72024524, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 8105, + "time_per_iteration": 2.4437413215637207 + }, + { + "auxiliary_loss_clip": 0.01057165, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.01168013, + "balance_loss_mlp": 1.01817691, + "epoch": 0.4873590861265594, + "flos": 19057060742400.0, + "grad_norm": 1.7281528761776228, + "language_loss": 0.74703848, + "learning_rate": 2.178718935364259e-06, + "loss": 0.76798046, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 8106, + "time_per_iteration": 2.399144172668457 + }, + { + "auxiliary_loss_clip": 0.01062503, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.01553941, + "balance_loss_mlp": 1.02061248, + "epoch": 0.4874192093792274, + "flos": 24347275171200.0, + "grad_norm": 1.8912377794399549, + "language_loss": 0.78096199, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.80204821, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.41796875, + "step": 8107, + "time_per_iteration": 2.401186466217041 + }, + { + "auxiliary_loss_clip": 0.01057599, + "auxiliary_loss_mlp": 0.0104132, + "balance_loss_clip": 1.01565385, + "balance_loss_mlp": 1.0197196, + "epoch": 0.4874793326318954, + "flos": 23111489220480.0, + "grad_norm": 1.8239428648996252, + "language_loss": 0.76115012, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.7821393, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 8108, + "time_per_iteration": 2.430654525756836 + }, + { + "auxiliary_loss_clip": 0.01059204, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.01523042, + "balance_loss_mlp": 1.02066612, + "epoch": 0.4875394558845634, + "flos": 19025149893120.0, + "grad_norm": 1.6714873124160845, + "language_loss": 0.74079216, + "learning_rate": 2.177555194083212e-06, + "loss": 0.76178163, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 8109, + "time_per_iteration": 2.375513792037964 + }, + { + "auxiliary_loss_clip": 0.01059649, + "auxiliary_loss_mlp": 0.01045333, + "balance_loss_clip": 1.01790297, + "balance_loss_mlp": 1.02019727, + "epoch": 0.48759957913723134, + "flos": 21432550982400.0, + "grad_norm": 1.8339476738649023, + "language_loss": 0.7937634, + "learning_rate": 2.177167266837428e-06, + "loss": 0.81481314, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 8110, + "time_per_iteration": 2.425811767578125 + }, + { + "auxiliary_loss_clip": 0.01060352, + "auxiliary_loss_mlp": 0.01044469, + "balance_loss_clip": 1.01802826, + "balance_loss_mlp": 1.01995802, + "epoch": 0.4876597023898993, + "flos": 17747712823680.0, + "grad_norm": 1.9448574640947216, + "language_loss": 0.7354126, + "learning_rate": 2.176779332873444e-06, + "loss": 0.75646079, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 8111, + "time_per_iteration": 2.353019952774048 + }, + { + "auxiliary_loss_clip": 0.010621, + "auxiliary_loss_mlp": 0.01045663, + "balance_loss_clip": 1.01683807, + "balance_loss_mlp": 1.02169824, + "epoch": 0.4877198256425673, + "flos": 17018691390720.0, + "grad_norm": 1.7307651338923498, + "language_loss": 0.77338839, + "learning_rate": 2.17639139220597e-06, + "loss": 0.79446602, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40429688, + "step": 8112, + "time_per_iteration": 2.401118040084839 + }, + { + "auxiliary_loss_clip": 0.01064581, + "auxiliary_loss_mlp": 0.01044541, + "balance_loss_clip": 1.01570439, + "balance_loss_mlp": 1.02138877, + "epoch": 0.48777994889523524, + "flos": 22382956546560.0, + "grad_norm": 1.603476162715847, + "language_loss": 0.76233637, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.7834276, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43164062, + "step": 8113, + "time_per_iteration": 2.397122621536255 + }, + { + "auxiliary_loss_clip": 0.010184, + "auxiliary_loss_mlp": 0.01006601, + "balance_loss_clip": 1.00377584, + "balance_loss_mlp": 1.01040721, + "epoch": 0.4878400721479032, + "flos": 61238527908480.0, + "grad_norm": 0.7856095335632013, + "language_loss": 0.48953632, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50978637, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.08007812, + "step": 8114, + "time_per_iteration": 2.978111743927002 + }, + { + "auxiliary_loss_clip": 0.01063286, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.01738262, + "balance_loss_mlp": 1.02175331, + "epoch": 0.48790019540057117, + "flos": 24535421821440.0, + "grad_norm": 1.4085129442140978, + "language_loss": 0.77793092, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.79902345, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4140625, + "step": 8115, + "time_per_iteration": 2.422938346862793 + }, + { + "auxiliary_loss_clip": 0.01063013, + "auxiliary_loss_mlp": 0.01045988, + "balance_loss_clip": 1.01679325, + "balance_loss_mlp": 1.02075887, + "epoch": 0.48796031865323913, + "flos": 21832900076160.0, + "grad_norm": 2.072144954818526, + "language_loss": 0.73311788, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.75420785, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 8116, + "time_per_iteration": 2.427475929260254 + }, + { + "auxiliary_loss_clip": 0.0105896, + "auxiliary_loss_mlp": 0.01045113, + "balance_loss_clip": 1.0173372, + "balance_loss_mlp": 1.0195266, + "epoch": 0.4880204419059071, + "flos": 18587897625600.0, + "grad_norm": 1.670177532949957, + "language_loss": 0.63914466, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.66018534, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39453125, + "step": 8117, + "time_per_iteration": 2.3777389526367188 + }, + { + "auxiliary_loss_clip": 0.01058629, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.01562738, + "balance_loss_mlp": 1.01883805, + "epoch": 0.48808056515857506, + "flos": 19171156665600.0, + "grad_norm": 1.6803930142318335, + "language_loss": 0.80434316, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.82534605, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8118, + "time_per_iteration": 2.421194076538086 + }, + { + "auxiliary_loss_clip": 0.01062077, + "auxiliary_loss_mlp": 0.01048497, + "balance_loss_clip": 1.02049494, + "balance_loss_mlp": 1.02069759, + "epoch": 0.48814068841124303, + "flos": 20119467548160.0, + "grad_norm": 2.6399460646830035, + "language_loss": 0.64873171, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.66983747, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 8119, + "time_per_iteration": 2.3728177547454834 + }, + { + "auxiliary_loss_clip": 0.01058614, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.01191497, + "balance_loss_mlp": 1.01933479, + "epoch": 0.488200811663911, + "flos": 22964504929920.0, + "grad_norm": 2.166139340929017, + "language_loss": 0.72830266, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74926734, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39257812, + "step": 8120, + "time_per_iteration": 2.4339663982391357 + }, + { + "auxiliary_loss_clip": 0.01060997, + "auxiliary_loss_mlp": 0.01045335, + "balance_loss_clip": 1.01693928, + "balance_loss_mlp": 1.01879168, + "epoch": 0.48826093491657896, + "flos": 33909322314240.0, + "grad_norm": 1.6001621993353154, + "language_loss": 0.64905918, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.6701225, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 8121, + "time_per_iteration": 2.4859707355499268 + }, + { + "auxiliary_loss_clip": 0.0106228, + "auxiliary_loss_mlp": 0.01043163, + "balance_loss_clip": 1.01469553, + "balance_loss_mlp": 1.01985073, + "epoch": 0.488321058169247, + "flos": 23069349342720.0, + "grad_norm": 2.056472661197915, + "language_loss": 0.84618223, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.86723655, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42382812, + "step": 8122, + "time_per_iteration": 2.396568775177002 + }, + { + "auxiliary_loss_clip": 0.01060277, + "auxiliary_loss_mlp": 0.01041841, + "balance_loss_clip": 1.01392221, + "balance_loss_mlp": 1.01854479, + "epoch": 0.48838118142191494, + "flos": 19316709590400.0, + "grad_norm": 1.7044265461870343, + "language_loss": 0.86421931, + "learning_rate": 2.172123606640866e-06, + "loss": 0.88524055, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 8123, + "time_per_iteration": 2.358893632888794 + }, + { + "auxiliary_loss_clip": 0.01061028, + "auxiliary_loss_mlp": 0.0104393, + "balance_loss_clip": 1.01597571, + "balance_loss_mlp": 1.01903796, + "epoch": 0.4884413046745829, + "flos": 25409507420160.0, + "grad_norm": 1.4460434426878785, + "language_loss": 0.86532879, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.88637829, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41992188, + "step": 8124, + "time_per_iteration": 2.4393153190612793 + }, + { + "auxiliary_loss_clip": 0.0106151, + "auxiliary_loss_mlp": 0.01040951, + "balance_loss_clip": 1.01441503, + "balance_loss_mlp": 1.01960158, + "epoch": 0.4885014279272509, + "flos": 20990620592640.0, + "grad_norm": 2.091484521835699, + "language_loss": 0.8139267, + "learning_rate": 2.171347560204948e-06, + "loss": 0.83495134, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41992188, + "step": 8125, + "time_per_iteration": 3.8356614112854004 + }, + { + "auxiliary_loss_clip": 0.01060151, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_clip": 1.01478946, + "balance_loss_mlp": 1.01894784, + "epoch": 0.48856155117991884, + "flos": 13770756385920.0, + "grad_norm": 2.2162273453194694, + "language_loss": 0.73798937, + "learning_rate": 2.170959527233356e-06, + "loss": 0.75902212, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41210938, + "step": 8126, + "time_per_iteration": 3.752007246017456 + }, + { + "auxiliary_loss_clip": 0.01059996, + "auxiliary_loss_mlp": 0.01044361, + "balance_loss_clip": 1.01725304, + "balance_loss_mlp": 1.01875424, + "epoch": 0.4886216744325868, + "flos": 32086402162560.0, + "grad_norm": 1.6515808268554515, + "language_loss": 0.69848514, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.71952868, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41210938, + "step": 8127, + "time_per_iteration": 2.469999313354492 + }, + { + "auxiliary_loss_clip": 0.01060062, + "auxiliary_loss_mlp": 0.01039233, + "balance_loss_clip": 1.01084948, + "balance_loss_mlp": 1.01787996, + "epoch": 0.48868179768525477, + "flos": 19609037337600.0, + "grad_norm": 2.3586792623036077, + "language_loss": 0.77954853, + "learning_rate": 2.170183441856481e-06, + "loss": 0.80054152, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 8128, + "time_per_iteration": 3.859971523284912 + }, + { + "auxiliary_loss_clip": 0.01059228, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.01478064, + "balance_loss_mlp": 1.01954341, + "epoch": 0.48874192093792274, + "flos": 21285880894080.0, + "grad_norm": 2.103170232123045, + "language_loss": 0.76854444, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78955531, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 8129, + "time_per_iteration": 2.3888838291168213 + }, + { + "auxiliary_loss_clip": 0.01060037, + "auxiliary_loss_mlp": 0.01043241, + "balance_loss_clip": 1.01378417, + "balance_loss_mlp": 1.01907015, + "epoch": 0.4888020441905907, + "flos": 14172571756800.0, + "grad_norm": 2.1013502591445543, + "language_loss": 0.66263652, + "learning_rate": 2.169407330666114e-06, + "loss": 0.68366933, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41015625, + "step": 8130, + "time_per_iteration": 2.345329999923706 + }, + { + "auxiliary_loss_clip": 0.01057346, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.01649261, + "balance_loss_mlp": 1.0177108, + "epoch": 0.48886216744325867, + "flos": 24096738188160.0, + "grad_norm": 1.7445132482629475, + "language_loss": 0.73151457, + "learning_rate": 2.169019265427658e-06, + "loss": 0.75252819, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39648438, + "step": 8131, + "time_per_iteration": 2.413877248764038 + }, + { + "auxiliary_loss_clip": 0.01060871, + "auxiliary_loss_mlp": 0.01050328, + "balance_loss_clip": 1.02174115, + "balance_loss_mlp": 1.01956081, + "epoch": 0.48892229069592663, + "flos": 38430016266240.0, + "grad_norm": 1.4926496599573764, + "language_loss": 0.702613, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.72372496, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41210938, + "step": 8132, + "time_per_iteration": 2.508917808532715 + }, + { + "auxiliary_loss_clip": 0.01058378, + "auxiliary_loss_mlp": 0.01039136, + "balance_loss_clip": 1.01260018, + "balance_loss_mlp": 1.01850069, + "epoch": 0.4889824139485946, + "flos": 23842151487360.0, + "grad_norm": 1.4565060458573194, + "language_loss": 0.70995712, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.73093224, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 8133, + "time_per_iteration": 2.4350242614746094 + }, + { + "auxiliary_loss_clip": 0.01057206, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.01608658, + "balance_loss_mlp": 1.01826143, + "epoch": 0.48904253720126256, + "flos": 24424677388800.0, + "grad_norm": 1.6225443384002662, + "language_loss": 0.71776098, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.73874807, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 8134, + "time_per_iteration": 3.956407308578491 + }, + { + "auxiliary_loss_clip": 0.01062676, + "auxiliary_loss_mlp": 0.01044791, + "balance_loss_clip": 1.01567972, + "balance_loss_mlp": 1.02005744, + "epoch": 0.4891026604539306, + "flos": 24169532106240.0, + "grad_norm": 1.943324556443965, + "language_loss": 0.81701458, + "learning_rate": 2.167466940528718e-06, + "loss": 0.83808923, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 8135, + "time_per_iteration": 2.4218056201934814 + }, + { + "auxiliary_loss_clip": 0.01058166, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.01522803, + "balance_loss_mlp": 1.01934624, + "epoch": 0.48916278370659855, + "flos": 21469873092480.0, + "grad_norm": 1.8301609115127278, + "language_loss": 0.75882578, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.77979732, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.38867188, + "step": 8136, + "time_per_iteration": 2.401322364807129 + }, + { + "auxiliary_loss_clip": 0.01057548, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.01452565, + "balance_loss_mlp": 1.01862967, + "epoch": 0.4892229069592665, + "flos": 22308661440000.0, + "grad_norm": 2.0507477659623485, + "language_loss": 0.73856843, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75954747, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 8137, + "time_per_iteration": 2.3714816570281982 + }, + { + "auxiliary_loss_clip": 0.01059938, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.01327133, + "balance_loss_mlp": 1.02017057, + "epoch": 0.4892830302119345, + "flos": 12786031088640.0, + "grad_norm": 2.0077215783291686, + "language_loss": 0.77492034, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.79589933, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3984375, + "step": 8138, + "time_per_iteration": 2.38264536857605 + }, + { + "auxiliary_loss_clip": 0.01059546, + "auxiliary_loss_mlp": 0.01044583, + "balance_loss_clip": 1.01917982, + "balance_loss_mlp": 1.01975203, + "epoch": 0.48934315346460244, + "flos": 20812842616320.0, + "grad_norm": 1.5701867538500145, + "language_loss": 0.75384945, + "learning_rate": 2.165914514023972e-06, + "loss": 0.77489078, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 8139, + "time_per_iteration": 2.3868191242218018 + }, + { + "auxiliary_loss_clip": 0.01058944, + "auxiliary_loss_mlp": 0.01043908, + "balance_loss_clip": 1.01807523, + "balance_loss_mlp": 1.01868844, + "epoch": 0.4894032767172704, + "flos": 19754520439680.0, + "grad_norm": 1.697186804427206, + "language_loss": 0.63513237, + "learning_rate": 2.165526391632255e-06, + "loss": 0.65616089, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 8140, + "time_per_iteration": 2.391818046569824 + }, + { + "auxiliary_loss_clip": 0.01060612, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_clip": 1.01529145, + "balance_loss_mlp": 1.01955295, + "epoch": 0.4894633999699384, + "flos": 17818097857920.0, + "grad_norm": 1.6024904548050614, + "language_loss": 0.83456266, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.85561198, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41015625, + "step": 8141, + "time_per_iteration": 2.375199794769287 + }, + { + "auxiliary_loss_clip": 0.01059696, + "auxiliary_loss_mlp": 0.01042069, + "balance_loss_clip": 1.01593792, + "balance_loss_mlp": 1.01891637, + "epoch": 0.48952352322260634, + "flos": 25521962509440.0, + "grad_norm": 1.5298406019735393, + "language_loss": 0.7323736, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.75339127, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40820312, + "step": 8142, + "time_per_iteration": 2.416191577911377 + }, + { + "auxiliary_loss_clip": 0.0105703, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.01183546, + "balance_loss_mlp": 1.01805663, + "epoch": 0.4895836464752743, + "flos": 29054335294080.0, + "grad_norm": 1.8107850355168307, + "language_loss": 0.6865229, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.70746773, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38867188, + "step": 8143, + "time_per_iteration": 2.451817274093628 + }, + { + "auxiliary_loss_clip": 0.01056952, + "auxiliary_loss_mlp": 0.01037709, + "balance_loss_clip": 1.01356912, + "balance_loss_mlp": 1.01808393, + "epoch": 0.48964376972794227, + "flos": 33545562192000.0, + "grad_norm": 1.4563195439832757, + "language_loss": 0.76001084, + "learning_rate": 2.163973839444793e-06, + "loss": 0.78095746, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38867188, + "step": 8144, + "time_per_iteration": 2.4901671409606934 + }, + { + "auxiliary_loss_clip": 0.01058542, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.01518679, + "balance_loss_mlp": 1.01942992, + "epoch": 0.48970389298061023, + "flos": 22052957575680.0, + "grad_norm": 1.997204317839989, + "language_loss": 0.77532852, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.79631376, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 8145, + "time_per_iteration": 2.420436382293701 + }, + { + "auxiliary_loss_clip": 0.01059815, + "auxiliary_loss_mlp": 0.01040108, + "balance_loss_clip": 1.01425135, + "balance_loss_mlp": 1.01974666, + "epoch": 0.4897640162332782, + "flos": 20083262601600.0, + "grad_norm": 2.658579771504605, + "language_loss": 0.81556249, + "learning_rate": 2.163197525984761e-06, + "loss": 0.83656168, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 8146, + "time_per_iteration": 2.361771821975708 + }, + { + "auxiliary_loss_clip": 0.01056258, + "auxiliary_loss_mlp": 0.01039236, + "balance_loss_clip": 1.01544154, + "balance_loss_mlp": 1.01829481, + "epoch": 0.48982413948594616, + "flos": 23805073756800.0, + "grad_norm": 1.5335395060797647, + "language_loss": 0.75156212, + "learning_rate": 2.162809359964687e-06, + "loss": 0.77251703, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37890625, + "step": 8147, + "time_per_iteration": 2.4015231132507324 + }, + { + "auxiliary_loss_clip": 0.01057512, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.0115248, + "balance_loss_mlp": 1.01877642, + "epoch": 0.4898842627386142, + "flos": 17638679047680.0, + "grad_norm": 2.122172553736516, + "language_loss": 0.84629315, + "learning_rate": 2.162421187770864e-06, + "loss": 0.86723077, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 8148, + "time_per_iteration": 2.3507580757141113 + }, + { + "auxiliary_loss_clip": 0.0105364, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.01344705, + "balance_loss_mlp": 1.016891, + "epoch": 0.48994438599128215, + "flos": 16616980753920.0, + "grad_norm": 2.052022882087746, + "language_loss": 0.7555837, + "learning_rate": 2.162033009418015e-06, + "loss": 0.77647418, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3671875, + "step": 8149, + "time_per_iteration": 2.3609838485717773 + }, + { + "auxiliary_loss_clip": 0.01061359, + "auxiliary_loss_mlp": 0.01042738, + "balance_loss_clip": 1.01715565, + "balance_loss_mlp": 1.01986074, + "epoch": 0.4900045092439501, + "flos": 26613626901120.0, + "grad_norm": 2.7346469601259393, + "language_loss": 0.76905018, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.79009116, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.4140625, + "step": 8150, + "time_per_iteration": 2.4113337993621826 + }, + { + "auxiliary_loss_clip": 0.01058695, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.01811862, + "balance_loss_mlp": 1.01863587, + "epoch": 0.4900646324966181, + "flos": 19901085793920.0, + "grad_norm": 1.8817951707232807, + "language_loss": 0.7348972, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75592327, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 8151, + "time_per_iteration": 2.3761820793151855 + }, + { + "auxiliary_loss_clip": 0.01009405, + "auxiliary_loss_mlp": 0.01002731, + "balance_loss_clip": 1.00035918, + "balance_loss_mlp": 1.00150132, + "epoch": 0.49012475574928605, + "flos": 59186682771840.0, + "grad_norm": 0.8253647462615042, + "language_loss": 0.54429889, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56442022, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07910156, + "step": 8152, + "time_per_iteration": 2.9942619800567627 + }, + { + "auxiliary_loss_clip": 0.01057742, + "auxiliary_loss_mlp": 0.01040264, + "balance_loss_clip": 1.01554012, + "balance_loss_mlp": 1.01732433, + "epoch": 0.490184879001954, + "flos": 45258049751040.0, + "grad_norm": 1.6768208377986304, + "language_loss": 0.62249172, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.64347172, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.40429688, + "step": 8153, + "time_per_iteration": 2.5772602558135986 + }, + { + "auxiliary_loss_clip": 0.01057699, + "auxiliary_loss_mlp": 0.01038388, + "balance_loss_clip": 1.01468897, + "balance_loss_mlp": 1.01795018, + "epoch": 0.490245002254622, + "flos": 28000865796480.0, + "grad_norm": 1.6370589119291723, + "language_loss": 0.77561718, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79657805, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3984375, + "step": 8154, + "time_per_iteration": 2.4156250953674316 + }, + { + "auxiliary_loss_clip": 0.01009132, + "auxiliary_loss_mlp": 0.01002908, + "balance_loss_clip": 1.00066733, + "balance_loss_mlp": 1.00142813, + "epoch": 0.49030512550728994, + "flos": 58947910917120.0, + "grad_norm": 0.9727950300438993, + "language_loss": 0.66964078, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.68976116, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.07714844, + "step": 8155, + "time_per_iteration": 3.1068997383117676 + }, + { + "auxiliary_loss_clip": 0.01058089, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.01710272, + "balance_loss_mlp": 1.01878119, + "epoch": 0.4903652487599579, + "flos": 19790830120320.0, + "grad_norm": 2.940843269192393, + "language_loss": 0.77508807, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79607999, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.39257812, + "step": 8156, + "time_per_iteration": 2.3568928241729736 + }, + { + "auxiliary_loss_clip": 0.0105592, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.01494098, + "balance_loss_mlp": 1.01700771, + "epoch": 0.49042537201262587, + "flos": 21761013853440.0, + "grad_norm": 2.066935992227891, + "language_loss": 0.84754622, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.86849958, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38867188, + "step": 8157, + "time_per_iteration": 2.394874095916748 + }, + { + "auxiliary_loss_clip": 0.01055969, + "auxiliary_loss_mlp": 0.01042962, + "balance_loss_clip": 1.01816607, + "balance_loss_mlp": 1.01750171, + "epoch": 0.49048549526529384, + "flos": 18952041772800.0, + "grad_norm": 2.364658509754103, + "language_loss": 0.80293334, + "learning_rate": 2.158539129514956e-06, + "loss": 0.82392263, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 8158, + "time_per_iteration": 2.354426383972168 + }, + { + "auxiliary_loss_clip": 0.01060396, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.01986194, + "balance_loss_mlp": 1.01937056, + "epoch": 0.4905456185179618, + "flos": 26905186598400.0, + "grad_norm": 2.054086097220237, + "language_loss": 0.70778286, + "learning_rate": 2.158150890381454e-06, + "loss": 0.72884345, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 8159, + "time_per_iteration": 2.439462900161743 + }, + { + "auxiliary_loss_clip": 0.01055996, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.01636696, + "balance_loss_mlp": 1.01696575, + "epoch": 0.49060574177062977, + "flos": 20411306536320.0, + "grad_norm": 1.9521441464888611, + "language_loss": 0.73831618, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75928795, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 8160, + "time_per_iteration": 2.356128215789795 + }, + { + "auxiliary_loss_clip": 0.01059439, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.01824069, + "balance_loss_mlp": 1.0183388, + "epoch": 0.4906658650232978, + "flos": 17492742097920.0, + "grad_norm": 1.8289368882976844, + "language_loss": 0.72796524, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.7490257, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41015625, + "step": 8161, + "time_per_iteration": 2.3589413166046143 + }, + { + "auxiliary_loss_clip": 0.01056837, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.01815271, + "epoch": 0.49072598827596575, + "flos": 26613242876160.0, + "grad_norm": 1.600070159444585, + "language_loss": 0.70210409, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.7231276, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 8162, + "time_per_iteration": 2.3958933353424072 + }, + { + "auxiliary_loss_clip": 0.010605, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.01318741, + "balance_loss_mlp": 1.01808453, + "epoch": 0.4907861115286337, + "flos": 20411550915840.0, + "grad_norm": 1.8377627708938684, + "language_loss": 0.65298927, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.67401046, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42382812, + "step": 8163, + "time_per_iteration": 2.3561549186706543 + }, + { + "auxiliary_loss_clip": 0.01055658, + "auxiliary_loss_mlp": 0.01038754, + "balance_loss_clip": 1.01466203, + "balance_loss_mlp": 1.01761353, + "epoch": 0.4908462347813017, + "flos": 14063398335360.0, + "grad_norm": 2.2513844148426827, + "language_loss": 0.78545928, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.8064034, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38085938, + "step": 8164, + "time_per_iteration": 2.3424456119537354 + }, + { + "auxiliary_loss_clip": 0.01059656, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.01293588, + "balance_loss_mlp": 1.01826119, + "epoch": 0.49090635803396965, + "flos": 18734078954880.0, + "grad_norm": 1.5587581116206426, + "language_loss": 0.78153801, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.80253935, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.4140625, + "step": 8165, + "time_per_iteration": 5.008649110794067 + }, + { + "auxiliary_loss_clip": 0.01057022, + "auxiliary_loss_mlp": 0.01042096, + "balance_loss_clip": 1.01709807, + "balance_loss_mlp": 1.01797605, + "epoch": 0.4909664812866376, + "flos": 20557452954240.0, + "grad_norm": 1.707293990604302, + "language_loss": 0.78386128, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.80485243, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 8166, + "time_per_iteration": 2.4067025184631348 + }, + { + "auxiliary_loss_clip": 0.01009786, + "auxiliary_loss_mlp": 0.01004439, + "balance_loss_clip": 1.00228119, + "balance_loss_mlp": 1.00187361, + "epoch": 0.4910266045393056, + "flos": 54680686502400.0, + "grad_norm": 0.7930940472735298, + "language_loss": 0.54189771, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56203997, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.07910156, + "step": 8167, + "time_per_iteration": 4.395890712738037 + }, + { + "auxiliary_loss_clip": 0.01056674, + "auxiliary_loss_mlp": 0.01037215, + "balance_loss_clip": 1.01228786, + "balance_loss_mlp": 1.01771259, + "epoch": 0.49108672779197354, + "flos": 16245714689280.0, + "grad_norm": 1.866298655890613, + "language_loss": 0.8730545, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.89399338, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.390625, + "step": 8168, + "time_per_iteration": 2.333305597305298 + }, + { + "auxiliary_loss_clip": 0.01056552, + "auxiliary_loss_mlp": 0.01042329, + "balance_loss_clip": 1.0177598, + "balance_loss_mlp": 1.01893497, + "epoch": 0.4911468510446415, + "flos": 19824486537600.0, + "grad_norm": 1.685539208713929, + "language_loss": 0.74043083, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.76141965, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 8169, + "time_per_iteration": 2.377804756164551 + }, + { + "auxiliary_loss_clip": 0.01056392, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.00966084, + "balance_loss_mlp": 1.01764989, + "epoch": 0.4912069742973095, + "flos": 21211690521600.0, + "grad_norm": 1.5335941734494924, + "language_loss": 0.79503214, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.81592232, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.38671875, + "step": 8170, + "time_per_iteration": 2.3817551136016846 + }, + { + "auxiliary_loss_clip": 0.01057949, + "auxiliary_loss_mlp": 0.01040259, + "balance_loss_clip": 1.01491547, + "balance_loss_mlp": 1.01820052, + "epoch": 0.49126709754997744, + "flos": 19536103774080.0, + "grad_norm": 2.0576197823686733, + "language_loss": 0.77556258, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.79654467, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 8171, + "time_per_iteration": 2.391784429550171 + }, + { + "auxiliary_loss_clip": 0.01059434, + "auxiliary_loss_mlp": 0.01041316, + "balance_loss_clip": 1.01585281, + "balance_loss_mlp": 1.01899576, + "epoch": 0.4913272208026454, + "flos": 12238872261120.0, + "grad_norm": 1.7586130310535992, + "language_loss": 0.82804209, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.84904957, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40429688, + "step": 8172, + "time_per_iteration": 2.3469550609588623 + }, + { + "auxiliary_loss_clip": 0.01011973, + "auxiliary_loss_mlp": 0.01008965, + "balance_loss_clip": 1.00677204, + "balance_loss_mlp": 1.0038892, + "epoch": 0.49138734405531337, + "flos": 65462739661440.0, + "grad_norm": 0.6922227780624317, + "language_loss": 0.53407866, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55428803, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.08105469, + "step": 8173, + "time_per_iteration": 3.0295400619506836 + }, + { + "auxiliary_loss_clip": 0.01060324, + "auxiliary_loss_mlp": 0.01044419, + "balance_loss_clip": 1.01677406, + "balance_loss_mlp": 1.01982617, + "epoch": 0.4914474673079814, + "flos": 18438155337600.0, + "grad_norm": 1.9509399630197848, + "language_loss": 0.64335835, + "learning_rate": 2.152326591972107e-06, + "loss": 0.66440582, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 8174, + "time_per_iteration": 3.874443769454956 + }, + { + "auxiliary_loss_clip": 0.01058399, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.01603317, + "balance_loss_mlp": 1.01862979, + "epoch": 0.49150759056064935, + "flos": 21684100394880.0, + "grad_norm": 1.70332980914012, + "language_loss": 0.70608956, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.7270903, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8175, + "time_per_iteration": 2.3995161056518555 + }, + { + "auxiliary_loss_clip": 0.01058477, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.01506162, + "balance_loss_mlp": 1.01926398, + "epoch": 0.4915677138133173, + "flos": 22381350624000.0, + "grad_norm": 1.5661662011380222, + "language_loss": 0.75732219, + "learning_rate": 2.151549919570068e-06, + "loss": 0.77831054, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 8176, + "time_per_iteration": 2.396594524383545 + }, + { + "auxiliary_loss_clip": 0.01059683, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.01195168, + "balance_loss_mlp": 1.01991248, + "epoch": 0.4916278370659853, + "flos": 18401985302400.0, + "grad_norm": 1.9207388945249415, + "language_loss": 0.71162635, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.73260748, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 8177, + "time_per_iteration": 2.3862104415893555 + }, + { + "auxiliary_loss_clip": 0.01013131, + "auxiliary_loss_mlp": 0.01003184, + "balance_loss_clip": 1.00085938, + "balance_loss_mlp": 1.00515008, + "epoch": 0.49168796031865325, + "flos": 66605620884480.0, + "grad_norm": 0.6881099443012091, + "language_loss": 0.46261072, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48277387, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.07958984, + "step": 8178, + "time_per_iteration": 2.994487762451172 + }, + { + "auxiliary_loss_clip": 0.01062561, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_clip": 1.0166415, + "balance_loss_mlp": 1.02044821, + "epoch": 0.4917480835713212, + "flos": 20958290807040.0, + "grad_norm": 2.032064382773035, + "language_loss": 0.66825706, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68932664, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41992188, + "step": 8179, + "time_per_iteration": 2.4026243686676025 + }, + { + "auxiliary_loss_clip": 0.01061591, + "auxiliary_loss_mlp": 0.01042112, + "balance_loss_clip": 1.01551652, + "balance_loss_mlp": 1.01939702, + "epoch": 0.4918082068239892, + "flos": 15772152741120.0, + "grad_norm": 2.067148499491524, + "language_loss": 0.71206617, + "learning_rate": 2.149996505922343e-06, + "loss": 0.73310316, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 8180, + "time_per_iteration": 2.3385813236236572 + }, + { + "auxiliary_loss_clip": 0.01057598, + "auxiliary_loss_mlp": 0.01048606, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.01857948, + "epoch": 0.49186833007665715, + "flos": 24603747085440.0, + "grad_norm": 1.6716814887533704, + "language_loss": 0.85415155, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.87521362, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 8181, + "time_per_iteration": 2.4223759174346924 + }, + { + "auxiliary_loss_clip": 0.01054489, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.01671863, + "epoch": 0.4919284533293251, + "flos": 22089476724480.0, + "grad_norm": 1.908224590801519, + "language_loss": 0.74953169, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.77048808, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37890625, + "step": 8182, + "time_per_iteration": 2.3814311027526855 + }, + { + "auxiliary_loss_clip": 0.01058169, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.01803279, + "balance_loss_mlp": 1.0189625, + "epoch": 0.4919885765819931, + "flos": 23366913793920.0, + "grad_norm": 1.7460629112421506, + "language_loss": 0.73675412, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.75777221, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39257812, + "step": 8183, + "time_per_iteration": 2.4056015014648438 + }, + { + "auxiliary_loss_clip": 0.01059989, + "auxiliary_loss_mlp": 0.01044878, + "balance_loss_clip": 1.01630306, + "balance_loss_mlp": 1.01805937, + "epoch": 0.49204869983466104, + "flos": 21359442862080.0, + "grad_norm": 2.3384715469122948, + "language_loss": 0.78907377, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.81012237, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41992188, + "step": 8184, + "time_per_iteration": 2.3771708011627197 + }, + { + "auxiliary_loss_clip": 0.01058861, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02183437, + "balance_loss_mlp": 1.01941299, + "epoch": 0.492108823087329, + "flos": 21141619689600.0, + "grad_norm": 2.419587360844459, + "language_loss": 0.72046697, + "learning_rate": 2.148054610995789e-06, + "loss": 0.74152064, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.39453125, + "step": 8185, + "time_per_iteration": 2.431516647338867 + }, + { + "auxiliary_loss_clip": 0.01058876, + "auxiliary_loss_mlp": 0.01051892, + "balance_loss_clip": 1.02588093, + "balance_loss_mlp": 1.01873326, + "epoch": 0.49216894633999697, + "flos": 25115503927680.0, + "grad_norm": 1.616783326946073, + "language_loss": 0.75547659, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77658427, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40234375, + "step": 8186, + "time_per_iteration": 2.407060146331787 + }, + { + "auxiliary_loss_clip": 0.01058643, + "auxiliary_loss_mlp": 0.01044112, + "balance_loss_clip": 1.01848185, + "balance_loss_mlp": 1.01809597, + "epoch": 0.49222906959266494, + "flos": 22636845020160.0, + "grad_norm": 2.244494213405238, + "language_loss": 0.69259828, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.71362585, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40429688, + "step": 8187, + "time_per_iteration": 2.4409854412078857 + }, + { + "auxiliary_loss_clip": 0.01057681, + "auxiliary_loss_mlp": 0.01051286, + "balance_loss_clip": 1.02542973, + "balance_loss_mlp": 1.01781988, + "epoch": 0.49228919284533296, + "flos": 20409560968320.0, + "grad_norm": 1.4235746634950386, + "language_loss": 0.67824864, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69933832, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8188, + "time_per_iteration": 2.3628766536712646 + }, + { + "auxiliary_loss_clip": 0.0105937, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_clip": 1.01963496, + "balance_loss_mlp": 1.019894, + "epoch": 0.4923493160980009, + "flos": 27121229291520.0, + "grad_norm": 1.6463921577382392, + "language_loss": 0.75319231, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.7742331, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39453125, + "step": 8189, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01055319, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.01618814, + "balance_loss_mlp": 1.0178678, + "epoch": 0.4924094393506689, + "flos": 35735209931520.0, + "grad_norm": 1.8768953848888983, + "language_loss": 0.65482336, + "learning_rate": 2.146112575713104e-06, + "loss": 0.67576849, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.375, + "step": 8190, + "time_per_iteration": 2.510164260864258 + }, + { + "auxiliary_loss_clip": 0.01059002, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.01700461, + "balance_loss_mlp": 1.0202961, + "epoch": 0.49246956260333685, + "flos": 20411446181760.0, + "grad_norm": 1.9522367787871127, + "language_loss": 0.72348535, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.74447846, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38671875, + "step": 8191, + "time_per_iteration": 2.38382625579834 + }, + { + "auxiliary_loss_clip": 0.01058811, + "auxiliary_loss_mlp": 0.01042657, + "balance_loss_clip": 1.01788497, + "balance_loss_mlp": 1.01925564, + "epoch": 0.4925296858560048, + "flos": 38975569171200.0, + "grad_norm": 1.6071144186086403, + "language_loss": 0.72665834, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.74767303, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.39453125, + "step": 8192, + "time_per_iteration": 2.5199601650238037 + }, + { + "auxiliary_loss_clip": 0.01014084, + "auxiliary_loss_mlp": 0.01019631, + "balance_loss_clip": 1.01707995, + "balance_loss_mlp": 1.00607872, + "epoch": 0.4925898091086728, + "flos": 64275342721920.0, + "grad_norm": 0.7420362473476545, + "language_loss": 0.52282012, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54315722, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.08007812, + "step": 8193, + "time_per_iteration": 3.100430488586426 + }, + { + "auxiliary_loss_clip": 0.01058477, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.01847744, + "balance_loss_mlp": 1.01972675, + "epoch": 0.49264993236134075, + "flos": 23035343811840.0, + "grad_norm": 1.587204482645328, + "language_loss": 0.78063792, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.80166173, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 8194, + "time_per_iteration": 2.4033992290496826 + }, + { + "auxiliary_loss_clip": 0.01057854, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.01098454, + "balance_loss_mlp": 1.01891828, + "epoch": 0.4927100556140087, + "flos": 24717040047360.0, + "grad_norm": 2.095925504159037, + "language_loss": 0.71014535, + "learning_rate": 2.144170401915341e-06, + "loss": 0.73108298, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38867188, + "step": 8195, + "time_per_iteration": 2.4212963581085205 + }, + { + "auxiliary_loss_clip": 0.01061196, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.01575696, + "balance_loss_mlp": 1.02058458, + "epoch": 0.4927701788666767, + "flos": 23504646574080.0, + "grad_norm": 2.053252842670129, + "language_loss": 0.82147312, + "learning_rate": 2.143781950696001e-06, + "loss": 0.84248459, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.40625, + "step": 8196, + "time_per_iteration": 2.402113676071167 + }, + { + "auxiliary_loss_clip": 0.01059531, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.01334882, + "balance_loss_mlp": 1.01924455, + "epoch": 0.49283030211934464, + "flos": 22927811224320.0, + "grad_norm": 2.0380283180650056, + "language_loss": 0.71790075, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.7388894, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40234375, + "step": 8197, + "time_per_iteration": 2.4418551921844482 + }, + { + "auxiliary_loss_clip": 0.01056619, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.0131768, + "balance_loss_mlp": 1.01907778, + "epoch": 0.4928904253720126, + "flos": 16872091125120.0, + "grad_norm": 1.803349432958303, + "language_loss": 0.86203557, + "learning_rate": 2.143005031915374e-06, + "loss": 0.88295263, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.375, + "step": 8198, + "time_per_iteration": 2.3887691497802734 + }, + { + "auxiliary_loss_clip": 0.01062613, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_clip": 1.02104998, + "balance_loss_mlp": 1.02215624, + "epoch": 0.4929505486246806, + "flos": 14865667534080.0, + "grad_norm": 2.0101488196083723, + "language_loss": 0.77146447, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.79255277, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.40429688, + "step": 8199, + "time_per_iteration": 2.410046339035034 + }, + { + "auxiliary_loss_clip": 0.0106521, + "auxiliary_loss_mlp": 0.01050747, + "balance_loss_clip": 1.0217793, + "balance_loss_mlp": 1.02253222, + "epoch": 0.49301067187734854, + "flos": 23841208880640.0, + "grad_norm": 1.734005350232476, + "language_loss": 0.60565978, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62681937, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42773438, + "step": 8200, + "time_per_iteration": 2.4281396865844727 + }, + { + "auxiliary_loss_clip": 0.01057032, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.02030206, + "balance_loss_mlp": 1.02029479, + "epoch": 0.49307079513001656, + "flos": 22490209843200.0, + "grad_norm": 1.3854640134829341, + "language_loss": 0.80026418, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.82126689, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3671875, + "step": 8201, + "time_per_iteration": 2.474414348602295 + }, + { + "auxiliary_loss_clip": 0.01064648, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_clip": 1.01577067, + "balance_loss_mlp": 1.02217472, + "epoch": 0.4931309183826845, + "flos": 15923675508480.0, + "grad_norm": 1.9912349292768199, + "language_loss": 0.68719321, + "learning_rate": 2.141451129398785e-06, + "loss": 0.7082743, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42578125, + "step": 8202, + "time_per_iteration": 2.3576419353485107 + }, + { + "auxiliary_loss_clip": 0.0105939, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.0154022, + "balance_loss_mlp": 1.02025831, + "epoch": 0.4931910416353525, + "flos": 27307804930560.0, + "grad_norm": 1.9342093569552468, + "language_loss": 0.76569545, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.78668821, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.390625, + "step": 8203, + "time_per_iteration": 2.494159698486328 + }, + { + "auxiliary_loss_clip": 0.0105823, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02187729, + "balance_loss_mlp": 1.01929832, + "epoch": 0.49325116488802045, + "flos": 20805301762560.0, + "grad_norm": 2.0748903592297796, + "language_loss": 0.81646097, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.83750737, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38867188, + "step": 8204, + "time_per_iteration": 3.678524971008301 + }, + { + "auxiliary_loss_clip": 0.01058667, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.02204037, + "balance_loss_mlp": 1.02051842, + "epoch": 0.4933112881406884, + "flos": 19864915758720.0, + "grad_norm": 1.9999615758946698, + "language_loss": 0.66783094, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68887484, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.38085938, + "step": 8205, + "time_per_iteration": 3.7977397441864014 + }, + { + "auxiliary_loss_clip": 0.01062808, + "auxiliary_loss_mlp": 0.01049974, + "balance_loss_clip": 1.02015948, + "balance_loss_mlp": 1.02042866, + "epoch": 0.4933714113933564, + "flos": 21827104790400.0, + "grad_norm": 1.8410449372653892, + "language_loss": 0.67192423, + "learning_rate": 2.139897141060744e-06, + "loss": 0.69305205, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.42382812, + "step": 8206, + "time_per_iteration": 3.7696752548217773 + }, + { + "auxiliary_loss_clip": 0.01058987, + "auxiliary_loss_mlp": 0.01051431, + "balance_loss_clip": 1.02668333, + "balance_loss_mlp": 1.01902962, + "epoch": 0.49343153464602435, + "flos": 27888934377600.0, + "grad_norm": 1.7751824703527865, + "language_loss": 0.77282161, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.79392576, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3984375, + "step": 8207, + "time_per_iteration": 2.4647908210754395 + }, + { + "auxiliary_loss_clip": 0.01058454, + "auxiliary_loss_mlp": 0.01047279, + "balance_loss_clip": 1.02015889, + "balance_loss_mlp": 1.01943207, + "epoch": 0.4934916578986923, + "flos": 24679927405440.0, + "grad_norm": 2.068294801362506, + "language_loss": 0.61973751, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.64079487, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 8208, + "time_per_iteration": 2.4150383472442627 + }, + { + "auxiliary_loss_clip": 0.01060296, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.01521969, + "balance_loss_mlp": 1.02012634, + "epoch": 0.4935517811513603, + "flos": 23403991524480.0, + "grad_norm": 2.0389138260299857, + "language_loss": 0.80702686, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.82805717, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40234375, + "step": 8209, + "time_per_iteration": 2.3893864154815674 + }, + { + "auxiliary_loss_clip": 0.01057415, + "auxiliary_loss_mlp": 0.01042129, + "balance_loss_clip": 1.01710665, + "balance_loss_mlp": 1.01856017, + "epoch": 0.49361190440402825, + "flos": 21943435040640.0, + "grad_norm": 2.5814654532514822, + "language_loss": 0.80068839, + "learning_rate": 2.138343067844089e-06, + "loss": 0.82168382, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 8210, + "time_per_iteration": 2.4062931537628174 + }, + { + "auxiliary_loss_clip": 0.01059778, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_clip": 1.02545047, + "balance_loss_mlp": 1.01803637, + "epoch": 0.4936720276566962, + "flos": 25114596232320.0, + "grad_norm": 2.3068374399036147, + "language_loss": 0.82420647, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.84534812, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 8211, + "time_per_iteration": 2.419793128967285 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01047263, + "balance_loss_clip": 1.02112007, + "balance_loss_mlp": 1.01956129, + "epoch": 0.4937321509093642, + "flos": 26357748480000.0, + "grad_norm": 2.359071535339109, + "language_loss": 0.92891288, + "learning_rate": 2.137565999700933e-06, + "loss": 0.94998014, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8212, + "time_per_iteration": 2.422281503677368 + }, + { + "auxiliary_loss_clip": 0.0105867, + "auxiliary_loss_mlp": 0.01045832, + "balance_loss_clip": 1.01979685, + "balance_loss_mlp": 1.01768172, + "epoch": 0.49379227416203214, + "flos": 22960420300800.0, + "grad_norm": 3.214978919155108, + "language_loss": 0.6608308, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.68187582, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41015625, + "step": 8213, + "time_per_iteration": 3.9596500396728516 + }, + { + "auxiliary_loss_clip": 0.01057959, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.01541507, + "balance_loss_mlp": 1.01793206, + "epoch": 0.49385239741470016, + "flos": 32487728774400.0, + "grad_norm": 2.8797496189176037, + "language_loss": 0.77202147, + "learning_rate": 2.136788910691711e-06, + "loss": 0.79301405, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40039062, + "step": 8214, + "time_per_iteration": 2.502980947494507 + }, + { + "auxiliary_loss_clip": 0.01059831, + "auxiliary_loss_mlp": 0.01047782, + "balance_loss_clip": 1.0199945, + "balance_loss_mlp": 1.01952517, + "epoch": 0.4939125206673681, + "flos": 22491745943040.0, + "grad_norm": 1.7245932757777973, + "language_loss": 0.85474902, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.87582517, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40234375, + "step": 8215, + "time_per_iteration": 2.4322280883789062 + }, + { + "auxiliary_loss_clip": 0.01055726, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.01541233, + "balance_loss_mlp": 1.0175302, + "epoch": 0.4939726439200361, + "flos": 31174994453760.0, + "grad_norm": 1.7278753821708002, + "language_loss": 0.84423363, + "learning_rate": 2.136011800934292e-06, + "loss": 0.8651849, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3828125, + "step": 8216, + "time_per_iteration": 2.4457359313964844 + }, + { + "auxiliary_loss_clip": 0.01057039, + "auxiliary_loss_mlp": 0.01049341, + "balance_loss_clip": 1.02247143, + "balance_loss_mlp": 1.01816869, + "epoch": 0.49403276717270406, + "flos": 22673119789440.0, + "grad_norm": 1.4016205944531097, + "language_loss": 0.74982679, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.77089059, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38867188, + "step": 8217, + "time_per_iteration": 2.4102461338043213 + }, + { + "auxiliary_loss_clip": 0.01056106, + "auxiliary_loss_mlp": 0.01042741, + "balance_loss_clip": 1.01429737, + "balance_loss_mlp": 1.0179292, + "epoch": 0.494092890425372, + "flos": 20740013786880.0, + "grad_norm": 2.874638033780201, + "language_loss": 0.79207373, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.81306219, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.3828125, + "step": 8218, + "time_per_iteration": 2.3693618774414062 + }, + { + "auxiliary_loss_clip": 0.01055265, + "auxiliary_loss_mlp": 0.01040624, + "balance_loss_clip": 1.0154705, + "balance_loss_mlp": 1.01732385, + "epoch": 0.49415301367804, + "flos": 18368049594240.0, + "grad_norm": 2.1609581600777026, + "language_loss": 0.77207458, + "learning_rate": 2.134846097653142e-06, + "loss": 0.79303348, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 8219, + "time_per_iteration": 2.3816721439361572 + }, + { + "auxiliary_loss_clip": 0.01058578, + "auxiliary_loss_mlp": 0.01044757, + "balance_loss_clip": 1.01870966, + "balance_loss_mlp": 1.01919842, + "epoch": 0.49421313693070795, + "flos": 17529645271680.0, + "grad_norm": 1.7449219582666309, + "language_loss": 0.63312006, + "learning_rate": 2.134457519646357e-06, + "loss": 0.65415347, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 8220, + "time_per_iteration": 2.345715045928955 + }, + { + "auxiliary_loss_clip": 0.01057716, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.01442099, + "balance_loss_mlp": 1.0182333, + "epoch": 0.4942732601833759, + "flos": 20811166871040.0, + "grad_norm": 2.6347165693063754, + "language_loss": 0.73311687, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.75409198, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39453125, + "step": 8221, + "time_per_iteration": 2.379610776901245 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_clip": 1.0167861, + "balance_loss_mlp": 1.02154303, + "epoch": 0.4943333834360439, + "flos": 15048053809920.0, + "grad_norm": 1.683034357917035, + "language_loss": 0.80705798, + "learning_rate": 2.133680348351595e-06, + "loss": 0.82809675, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.37890625, + "step": 8222, + "time_per_iteration": 2.383652925491333 + }, + { + "auxiliary_loss_clip": 0.01058995, + "auxiliary_loss_mlp": 0.01043173, + "balance_loss_clip": 1.01596963, + "balance_loss_mlp": 1.01968575, + "epoch": 0.49439350668871185, + "flos": 16069507724160.0, + "grad_norm": 2.7672905962714838, + "language_loss": 0.73846442, + "learning_rate": 2.133291755093088e-06, + "loss": 0.75948608, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39453125, + "step": 8223, + "time_per_iteration": 2.3419268131256104 + }, + { + "auxiliary_loss_clip": 0.01059319, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.02006686, + "balance_loss_mlp": 1.01865244, + "epoch": 0.4944536299413798, + "flos": 20879212844160.0, + "grad_norm": 1.83431303896175, + "language_loss": 0.76387358, + "learning_rate": 2.132903156780144e-06, + "loss": 0.784953, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40625, + "step": 8224, + "time_per_iteration": 2.402845621109009 + }, + { + "auxiliary_loss_clip": 0.01061231, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.01034784, + "balance_loss_mlp": 1.02050757, + "epoch": 0.4945137531940478, + "flos": 26607866526720.0, + "grad_norm": 7.233911542803125, + "language_loss": 0.65572631, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.67672509, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 8225, + "time_per_iteration": 2.4194650650024414 + }, + { + "auxiliary_loss_clip": 0.01058097, + "auxiliary_loss_mlp": 0.01045157, + "balance_loss_clip": 1.01839447, + "balance_loss_mlp": 1.01845479, + "epoch": 0.49457387644671574, + "flos": 23987006184960.0, + "grad_norm": 2.0878148974518727, + "language_loss": 0.78442425, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.80545682, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39648438, + "step": 8226, + "time_per_iteration": 2.406301259994507 + }, + { + "auxiliary_loss_clip": 0.01059774, + "auxiliary_loss_mlp": 0.0104594, + "balance_loss_clip": 1.01663876, + "balance_loss_mlp": 1.0185287, + "epoch": 0.49463399969938376, + "flos": 26975466898560.0, + "grad_norm": 2.0779689600336533, + "language_loss": 0.7191276, + "learning_rate": 2.131737331662051e-06, + "loss": 0.74018472, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41210938, + "step": 8227, + "time_per_iteration": 2.409641981124878 + }, + { + "auxiliary_loss_clip": 0.01060873, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.01186097, + "balance_loss_mlp": 1.01938117, + "epoch": 0.49469412295205173, + "flos": 29680188059520.0, + "grad_norm": 1.6757593977383254, + "language_loss": 0.72395384, + "learning_rate": 2.131348713278718e-06, + "loss": 0.74496585, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 8228, + "time_per_iteration": 2.4470648765563965 + }, + { + "auxiliary_loss_clip": 0.01057676, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.01232314, + "balance_loss_mlp": 1.0190866, + "epoch": 0.4947542462047197, + "flos": 24130708807680.0, + "grad_norm": 2.1803024353828553, + "language_loss": 0.84825695, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86921239, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 8229, + "time_per_iteration": 2.393721580505371 + }, + { + "auxiliary_loss_clip": 0.01058424, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.01271617, + "balance_loss_mlp": 1.01841962, + "epoch": 0.49481436945738766, + "flos": 20044090189440.0, + "grad_norm": 1.7613105186129552, + "language_loss": 0.76071262, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.78169024, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40039062, + "step": 8230, + "time_per_iteration": 2.372110366821289 + }, + { + "auxiliary_loss_clip": 0.01057751, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.01432025, + "balance_loss_mlp": 1.01919556, + "epoch": 0.4948744927100556, + "flos": 15668634960000.0, + "grad_norm": 2.3042528525182586, + "language_loss": 0.80709779, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.82807767, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 8231, + "time_per_iteration": 2.343997001647949 + }, + { + "auxiliary_loss_clip": 0.01012371, + "auxiliary_loss_mlp": 0.01004054, + "balance_loss_clip": 1.0013237, + "balance_loss_mlp": 1.0041256, + "epoch": 0.4949346159627236, + "flos": 68868481478400.0, + "grad_norm": 0.7592329598511163, + "language_loss": 0.60239762, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62256187, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.02734375, + "router_z_loss_mlp": 0.08251953, + "step": 8232, + "time_per_iteration": 3.1526379585266113 + }, + { + "auxiliary_loss_clip": 0.01061074, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.01737702, + "balance_loss_mlp": 1.01851606, + "epoch": 0.49499473921539155, + "flos": 24789135738240.0, + "grad_norm": 1.6230797994646526, + "language_loss": 0.70830625, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.7293812, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 8233, + "time_per_iteration": 2.403683662414551 + }, + { + "auxiliary_loss_clip": 0.01057225, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.00985718, + "balance_loss_mlp": 1.01788843, + "epoch": 0.4950548624680595, + "flos": 32706529464960.0, + "grad_norm": 1.9456900200172516, + "language_loss": 0.68128598, + "learning_rate": 2.129016898898633e-06, + "loss": 0.7022298, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 8234, + "time_per_iteration": 2.4702811241149902 + }, + { + "auxiliary_loss_clip": 0.01010441, + "auxiliary_loss_mlp": 0.01003804, + "balance_loss_clip": 1.00137222, + "balance_loss_mlp": 1.00242031, + "epoch": 0.4951149857207275, + "flos": 50079099196800.0, + "grad_norm": 0.8018207526563954, + "language_loss": 0.58040982, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60055226, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.08007812, + "step": 8235, + "time_per_iteration": 2.961571216583252 + }, + { + "auxiliary_loss_clip": 0.01059789, + "auxiliary_loss_mlp": 0.01041601, + "balance_loss_clip": 1.01511312, + "balance_loss_mlp": 1.01846182, + "epoch": 0.49517510897339545, + "flos": 22235692965120.0, + "grad_norm": 1.6477729791355038, + "language_loss": 0.78756118, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.80857503, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.4140625, + "step": 8236, + "time_per_iteration": 2.472034215927124 + }, + { + "auxiliary_loss_clip": 0.01057163, + "auxiliary_loss_mlp": 0.010419, + "balance_loss_clip": 1.01684213, + "balance_loss_mlp": 1.01876819, + "epoch": 0.4952352322260634, + "flos": 25372953360000.0, + "grad_norm": 4.534221281418266, + "language_loss": 0.73392206, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.75491273, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 8237, + "time_per_iteration": 2.4135754108428955 + }, + { + "auxiliary_loss_clip": 0.01055241, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.01698208, + "epoch": 0.4952953554787314, + "flos": 24607552423680.0, + "grad_norm": 1.7641861101684626, + "language_loss": 0.77369076, + "learning_rate": 2.127462257935406e-06, + "loss": 0.79465616, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3828125, + "step": 8238, + "time_per_iteration": 2.4316391944885254 + }, + { + "auxiliary_loss_clip": 0.0105726, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.01472616, + "balance_loss_mlp": 1.01778364, + "epoch": 0.49535547873139935, + "flos": 17310320910720.0, + "grad_norm": 2.298793456518699, + "language_loss": 0.75660408, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.77757305, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39453125, + "step": 8239, + "time_per_iteration": 2.3445820808410645 + }, + { + "auxiliary_loss_clip": 0.01060305, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_clip": 1.01810253, + "balance_loss_mlp": 1.0190109, + "epoch": 0.4954156019840673, + "flos": 20739280648320.0, + "grad_norm": 2.2227528083099655, + "language_loss": 0.81026626, + "learning_rate": 2.126684908394552e-06, + "loss": 0.83132827, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 8240, + "time_per_iteration": 2.3913705348968506 + }, + { + "auxiliary_loss_clip": 0.01056457, + "auxiliary_loss_mlp": 0.01048919, + "balance_loss_clip": 1.02470744, + "balance_loss_mlp": 1.01866508, + "epoch": 0.49547572523673533, + "flos": 12819931885440.0, + "grad_norm": 2.14093489475366, + "language_loss": 0.87083483, + "learning_rate": 2.126296226410898e-06, + "loss": 0.89188862, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 8241, + "time_per_iteration": 2.3323981761932373 + }, + { + "auxiliary_loss_clip": 0.01056568, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.01828289, + "balance_loss_mlp": 1.01829243, + "epoch": 0.4955358484894033, + "flos": 15596120332800.0, + "grad_norm": 1.8160669957564215, + "language_loss": 0.78426272, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.8052516, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 8242, + "time_per_iteration": 2.3722546100616455 + }, + { + "auxiliary_loss_clip": 0.01056333, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.01762605, + "balance_loss_mlp": 1.018013, + "epoch": 0.49559597174207126, + "flos": 26463291120000.0, + "grad_norm": 1.5853045444858458, + "language_loss": 0.68591177, + "learning_rate": 2.125518848090833e-06, + "loss": 0.70690072, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 8243, + "time_per_iteration": 3.659604787826538 + }, + { + "auxiliary_loss_clip": 0.01056146, + "auxiliary_loss_mlp": 0.01039028, + "balance_loss_clip": 1.01479268, + "balance_loss_mlp": 1.01813197, + "epoch": 0.4956560949947392, + "flos": 23147135585280.0, + "grad_norm": 2.120463247138215, + "language_loss": 0.6966446, + "learning_rate": 2.125130151783901e-06, + "loss": 0.71759635, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 8244, + "time_per_iteration": 2.382580280303955 + }, + { + "auxiliary_loss_clip": 0.01057607, + "auxiliary_loss_mlp": 0.01040231, + "balance_loss_clip": 1.01574492, + "balance_loss_mlp": 1.01856995, + "epoch": 0.4957162182474072, + "flos": 20772518129280.0, + "grad_norm": 1.764747407804806, + "language_loss": 0.77220893, + "learning_rate": 2.12474145073202e-06, + "loss": 0.79318726, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.390625, + "step": 8245, + "time_per_iteration": 3.8329248428344727 + }, + { + "auxiliary_loss_clip": 0.01056298, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.01903617, + "epoch": 0.49577634150007516, + "flos": 18733206170880.0, + "grad_norm": 1.8820548254730949, + "language_loss": 0.83288509, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.85386395, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 8246, + "time_per_iteration": 3.8300249576568604 + }, + { + "auxiliary_loss_clip": 0.01059875, + "auxiliary_loss_mlp": 0.01048698, + "balance_loss_clip": 1.02203107, + "balance_loss_mlp": 1.01938438, + "epoch": 0.4958364647527431, + "flos": 25553070397440.0, + "grad_norm": 1.5588775827074368, + "language_loss": 0.85856116, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.8796469, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40429688, + "step": 8247, + "time_per_iteration": 2.4158501625061035 + }, + { + "auxiliary_loss_clip": 0.01057955, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.01450384, + "balance_loss_mlp": 1.01824427, + "epoch": 0.4958965880054111, + "flos": 24424188629760.0, + "grad_norm": 1.925415690534848, + "language_loss": 0.84879696, + "learning_rate": 2.123575319254087e-06, + "loss": 0.86977255, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3984375, + "step": 8248, + "time_per_iteration": 2.4266536235809326 + }, + { + "auxiliary_loss_clip": 0.01058265, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.01501215, + "balance_loss_mlp": 1.01810575, + "epoch": 0.49595671125807905, + "flos": 25082266446720.0, + "grad_norm": 1.9074328057804188, + "language_loss": 0.75068861, + "learning_rate": 2.123186599369812e-06, + "loss": 0.77168673, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40039062, + "step": 8249, + "time_per_iteration": 2.3924858570098877 + }, + { + "auxiliary_loss_clip": 0.01059689, + "auxiliary_loss_mlp": 0.0104862, + "balance_loss_clip": 1.02339578, + "balance_loss_mlp": 1.01899743, + "epoch": 0.496016834510747, + "flos": 16434943591680.0, + "grad_norm": 1.7506248804016296, + "language_loss": 0.76848024, + "learning_rate": 2.122797874814289e-06, + "loss": 0.7895633, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40820312, + "step": 8250, + "time_per_iteration": 2.346207857131958 + }, + { + "auxiliary_loss_clip": 0.0105748, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.01554108, + "balance_loss_mlp": 1.01818657, + "epoch": 0.496076957763415, + "flos": 23436879891840.0, + "grad_norm": 1.719138315727723, + "language_loss": 0.71238661, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.7333678, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39453125, + "step": 8251, + "time_per_iteration": 2.4118151664733887 + }, + { + "auxiliary_loss_clip": 0.01057241, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.01574683, + "balance_loss_mlp": 1.0191679, + "epoch": 0.49613708101608295, + "flos": 16908575362560.0, + "grad_norm": 1.8160084496475115, + "language_loss": 0.80856979, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82954013, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38085938, + "step": 8252, + "time_per_iteration": 3.8295669555664062 + }, + { + "auxiliary_loss_clip": 0.01057917, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.01193666, + "balance_loss_mlp": 1.01865101, + "epoch": 0.4961972042687509, + "flos": 16617155310720.0, + "grad_norm": 1.6782129764589253, + "language_loss": 0.82181168, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.84275758, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.39257812, + "step": 8253, + "time_per_iteration": 2.3403515815734863 + }, + { + "auxiliary_loss_clip": 0.01058111, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.01226223, + "balance_loss_mlp": 1.01938605, + "epoch": 0.49625732752141893, + "flos": 28955286167040.0, + "grad_norm": 1.4187591809414455, + "language_loss": 0.68238151, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.70332605, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38671875, + "step": 8254, + "time_per_iteration": 2.507251024246216 + }, + { + "auxiliary_loss_clip": 0.01057738, + "auxiliary_loss_mlp": 0.01039341, + "balance_loss_clip": 1.01354384, + "balance_loss_mlp": 1.01821995, + "epoch": 0.4963174507740869, + "flos": 23111244840960.0, + "grad_norm": 5.980877644029546, + "language_loss": 0.75330126, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.77427208, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 8255, + "time_per_iteration": 2.377192497253418 + }, + { + "auxiliary_loss_clip": 0.01056213, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.0124259, + "balance_loss_mlp": 1.01862264, + "epoch": 0.49637757402675486, + "flos": 13917007537920.0, + "grad_norm": 1.6599195084239557, + "language_loss": 0.82801437, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.84894049, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.375, + "step": 8256, + "time_per_iteration": 2.3584859371185303 + }, + { + "auxiliary_loss_clip": 0.01055936, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.01380849, + "balance_loss_mlp": 1.01834297, + "epoch": 0.49643769727942283, + "flos": 22307928301440.0, + "grad_norm": 1.39209644965911, + "language_loss": 0.81800514, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83893442, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 8257, + "time_per_iteration": 2.3938443660736084 + }, + { + "auxiliary_loss_clip": 0.01059225, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.01474309, + "balance_loss_mlp": 1.01797485, + "epoch": 0.4964978205320908, + "flos": 19499235511680.0, + "grad_norm": 1.7076872086477162, + "language_loss": 0.67265123, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.69367331, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 8258, + "time_per_iteration": 2.3799993991851807 + }, + { + "auxiliary_loss_clip": 0.01055057, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.01349139, + "balance_loss_mlp": 1.01776302, + "epoch": 0.49655794378475876, + "flos": 23435518348800.0, + "grad_norm": 1.7292266309515187, + "language_loss": 0.7824496, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.80336273, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.37304688, + "step": 8259, + "time_per_iteration": 2.40377140045166 + }, + { + "auxiliary_loss_clip": 0.01055037, + "auxiliary_loss_mlp": 0.01043587, + "balance_loss_clip": 1.01814818, + "balance_loss_mlp": 1.01723886, + "epoch": 0.4966180670374267, + "flos": 26829983796480.0, + "grad_norm": 1.49680573693927, + "language_loss": 0.79356432, + "learning_rate": 2.1189103755834e-06, + "loss": 0.81455064, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 8260, + "time_per_iteration": 2.430732488632202 + }, + { + "auxiliary_loss_clip": 0.01056898, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.011724, + "balance_loss_mlp": 1.01765943, + "epoch": 0.4966781902900947, + "flos": 22008478636800.0, + "grad_norm": 2.976926334183822, + "language_loss": 0.772017, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.79295695, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 8261, + "time_per_iteration": 2.350372314453125 + }, + { + "auxiliary_loss_clip": 0.01055006, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.0144366, + "balance_loss_mlp": 1.01736712, + "epoch": 0.49673831354276266, + "flos": 26212160643840.0, + "grad_norm": 2.0532473177015107, + "language_loss": 0.9041543, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.92509872, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 8262, + "time_per_iteration": 2.4180643558502197 + }, + { + "auxiliary_loss_clip": 0.01055473, + "auxiliary_loss_mlp": 0.01037725, + "balance_loss_clip": 1.01456237, + "balance_loss_mlp": 1.01785517, + "epoch": 0.4967984367954306, + "flos": 23181245850240.0, + "grad_norm": 1.4658211795261253, + "language_loss": 0.74655414, + "learning_rate": 2.11774403721606e-06, + "loss": 0.7674861, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.375, + "step": 8263, + "time_per_iteration": 2.390300750732422 + }, + { + "auxiliary_loss_clip": 0.01059138, + "auxiliary_loss_mlp": 0.010407, + "balance_loss_clip": 1.01445055, + "balance_loss_mlp": 1.01882911, + "epoch": 0.4968585600480986, + "flos": 19280434821120.0, + "grad_norm": 2.6430504878197625, + "language_loss": 0.71569145, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.73668987, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 8264, + "time_per_iteration": 2.34977650642395 + }, + { + "auxiliary_loss_clip": 0.01058082, + "auxiliary_loss_mlp": 0.01042899, + "balance_loss_clip": 1.01586223, + "balance_loss_mlp": 1.01724446, + "epoch": 0.49691868330076655, + "flos": 22527601776000.0, + "grad_norm": 1.3203638450665252, + "language_loss": 0.65867084, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67968059, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 8265, + "time_per_iteration": 2.40763521194458 + }, + { + "auxiliary_loss_clip": 0.01011844, + "auxiliary_loss_mlp": 0.01010098, + "balance_loss_clip": 1.00772548, + "balance_loss_mlp": 1.00407755, + "epoch": 0.4969788065534345, + "flos": 66573500567040.0, + "grad_norm": 0.8778604960501811, + "language_loss": 0.53602797, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55624735, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07763672, + "step": 8266, + "time_per_iteration": 3.0847082138061523 + }, + { + "auxiliary_loss_clip": 0.01054434, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.01787806, + "balance_loss_mlp": 1.01648557, + "epoch": 0.49703892980610254, + "flos": 24058403648640.0, + "grad_norm": 1.5487698452913634, + "language_loss": 0.80149132, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.8224566, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 8267, + "time_per_iteration": 2.3897550106048584 + }, + { + "auxiliary_loss_clip": 0.01057648, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.01413858, + "balance_loss_mlp": 1.01827085, + "epoch": 0.4970990530587705, + "flos": 29125069441920.0, + "grad_norm": 3.2226164881860795, + "language_loss": 0.76289195, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.78386891, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 8268, + "time_per_iteration": 2.443751573562622 + }, + { + "auxiliary_loss_clip": 0.0105687, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_clip": 1.02221107, + "balance_loss_mlp": 1.01789629, + "epoch": 0.49715917631143847, + "flos": 46024393294080.0, + "grad_norm": 2.359235837795071, + "language_loss": 0.68424904, + "learning_rate": 2.115411240328073e-06, + "loss": 0.70531821, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.390625, + "step": 8269, + "time_per_iteration": 2.5788111686706543 + }, + { + "auxiliary_loss_clip": 0.01055177, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.01614332, + "balance_loss_mlp": 1.01810014, + "epoch": 0.49721929956410643, + "flos": 20190306430080.0, + "grad_norm": 1.5999172189156035, + "language_loss": 0.86620581, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.88718486, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37109375, + "step": 8270, + "time_per_iteration": 2.3895368576049805 + }, + { + "auxiliary_loss_clip": 0.01057732, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.01428199, + "balance_loss_mlp": 1.01794052, + "epoch": 0.4972794228167744, + "flos": 21652468836480.0, + "grad_norm": 1.77996847744912, + "language_loss": 0.71331424, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73427618, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3984375, + "step": 8271, + "time_per_iteration": 2.3924336433410645 + }, + { + "auxiliary_loss_clip": 0.01058817, + "auxiliary_loss_mlp": 0.01049094, + "balance_loss_clip": 1.02270114, + "balance_loss_mlp": 1.01929998, + "epoch": 0.49733954606944236, + "flos": 24278600793600.0, + "grad_norm": 1.3922667737380563, + "language_loss": 0.79138064, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.81245983, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 8272, + "time_per_iteration": 2.4235055446624756 + }, + { + "auxiliary_loss_clip": 0.01060565, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02389014, + "balance_loss_mlp": 1.02015841, + "epoch": 0.4973996693221103, + "flos": 37851051323520.0, + "grad_norm": 2.6155117073783316, + "language_loss": 0.67831993, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.69941944, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40429688, + "step": 8273, + "time_per_iteration": 2.5213491916656494 + }, + { + "auxiliary_loss_clip": 0.01056263, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02191091, + "balance_loss_mlp": 1.01767182, + "epoch": 0.4974597925747783, + "flos": 21360350557440.0, + "grad_norm": 2.405273395234122, + "language_loss": 0.78847778, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80951434, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 8274, + "time_per_iteration": 2.3747353553771973 + }, + { + "auxiliary_loss_clip": 0.01060881, + "auxiliary_loss_mlp": 0.01043925, + "balance_loss_clip": 1.01669741, + "balance_loss_mlp": 1.01912725, + "epoch": 0.49751991582744626, + "flos": 30736799579520.0, + "grad_norm": 1.676841084617851, + "language_loss": 0.77064574, + "learning_rate": 2.113078285889493e-06, + "loss": 0.79169381, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41796875, + "step": 8275, + "time_per_iteration": 2.4686007499694824 + }, + { + "auxiliary_loss_clip": 0.01061168, + "auxiliary_loss_mlp": 0.01050159, + "balance_loss_clip": 1.02059484, + "balance_loss_mlp": 1.01823354, + "epoch": 0.4975800390801142, + "flos": 14099673104640.0, + "grad_norm": 2.0105341458290553, + "language_loss": 0.85548866, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.87660193, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 8276, + "time_per_iteration": 2.345021963119507 + }, + { + "auxiliary_loss_clip": 0.01056519, + "auxiliary_loss_mlp": 0.010399, + "balance_loss_clip": 1.01662993, + "balance_loss_mlp": 1.01854205, + "epoch": 0.4976401623327822, + "flos": 24206121077760.0, + "grad_norm": 1.3618038905344974, + "language_loss": 0.70892239, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72988653, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 8277, + "time_per_iteration": 2.406276226043701 + }, + { + "auxiliary_loss_clip": 0.01057578, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.01089907, + "balance_loss_mlp": 1.01853156, + "epoch": 0.49770028558545015, + "flos": 21135859315200.0, + "grad_norm": 1.8975617125947426, + "language_loss": 0.832546, + "learning_rate": 2.111911750583964e-06, + "loss": 0.85347533, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.390625, + "step": 8278, + "time_per_iteration": 2.3692667484283447 + }, + { + "auxiliary_loss_clip": 0.01059245, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.01287568, + "balance_loss_mlp": 1.01870084, + "epoch": 0.4977604088381181, + "flos": 16762987526400.0, + "grad_norm": 2.0557425726269143, + "language_loss": 0.6866653, + "learning_rate": 2.111522896975052e-06, + "loss": 0.70765078, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40625, + "step": 8279, + "time_per_iteration": 2.3636441230773926 + }, + { + "auxiliary_loss_clip": 0.01059165, + "auxiliary_loss_mlp": 0.01045068, + "balance_loss_clip": 1.01567125, + "balance_loss_mlp": 1.01899564, + "epoch": 0.49782053209078614, + "flos": 15702675402240.0, + "grad_norm": 2.0791328991282074, + "language_loss": 0.71145296, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73249531, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40234375, + "step": 8280, + "time_per_iteration": 2.3368418216705322 + }, + { + "auxiliary_loss_clip": 0.01057111, + "auxiliary_loss_mlp": 0.01042968, + "balance_loss_clip": 1.01709986, + "balance_loss_mlp": 1.01731038, + "epoch": 0.4978806553434541, + "flos": 24752546766720.0, + "grad_norm": 1.636481987756457, + "language_loss": 0.65314722, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67414796, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8281, + "time_per_iteration": 2.422330856323242 + }, + { + "auxiliary_loss_clip": 0.01061754, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.0150671, + "balance_loss_mlp": 1.01937366, + "epoch": 0.49794077859612207, + "flos": 13114877984640.0, + "grad_norm": 2.0180811690045632, + "language_loss": 0.74816364, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.76923561, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42382812, + "step": 8282, + "time_per_iteration": 2.340963125228882 + }, + { + "auxiliary_loss_clip": 0.01055711, + "auxiliary_loss_mlp": 0.01040721, + "balance_loss_clip": 1.01587796, + "balance_loss_mlp": 1.01791096, + "epoch": 0.49800090184879003, + "flos": 27523952357760.0, + "grad_norm": 1.8078166414828842, + "language_loss": 0.74251735, + "learning_rate": 2.109967440397263e-06, + "loss": 0.76348174, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 8283, + "time_per_iteration": 3.7654354572296143 + }, + { + "auxiliary_loss_clip": 0.01057328, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.01332927, + "balance_loss_mlp": 1.01874554, + "epoch": 0.498061025101458, + "flos": 19791458524800.0, + "grad_norm": 1.5508979820456046, + "language_loss": 0.8000958, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.8210665, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38671875, + "step": 8284, + "time_per_iteration": 3.8061962127685547 + }, + { + "auxiliary_loss_clip": 0.01062162, + "auxiliary_loss_mlp": 0.010444, + "balance_loss_clip": 1.01491976, + "balance_loss_mlp": 1.01960957, + "epoch": 0.49812114835412596, + "flos": 29892739616640.0, + "grad_norm": 1.6808502334980133, + "language_loss": 0.74209595, + "learning_rate": 2.109189687029526e-06, + "loss": 0.76316166, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42578125, + "step": 8285, + "time_per_iteration": 2.429410934448242 + }, + { + "auxiliary_loss_clip": 0.01059392, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.01347899, + "balance_loss_mlp": 1.01956034, + "epoch": 0.49818127160679393, + "flos": 23145983510400.0, + "grad_norm": 1.8468629873369473, + "language_loss": 0.75310767, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.7740978, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8286, + "time_per_iteration": 3.7117197513580322 + }, + { + "auxiliary_loss_clip": 0.01060767, + "auxiliary_loss_mlp": 0.0104992, + "balance_loss_clip": 1.02030826, + "balance_loss_mlp": 1.01998591, + "epoch": 0.4982413948594619, + "flos": 21651735697920.0, + "grad_norm": 2.2733321888744515, + "language_loss": 0.8598187, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.8809256, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40820312, + "step": 8287, + "time_per_iteration": 2.407097101211548 + }, + { + "auxiliary_loss_clip": 0.0105733, + "auxiliary_loss_mlp": 0.01038679, + "balance_loss_clip": 1.01054549, + "balance_loss_mlp": 1.01719713, + "epoch": 0.49830151811212986, + "flos": 32485669004160.0, + "grad_norm": 1.7787187692651987, + "language_loss": 0.73426986, + "learning_rate": 2.108023025961159e-06, + "loss": 0.75522995, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40039062, + "step": 8288, + "time_per_iteration": 2.4545786380767822 + }, + { + "auxiliary_loss_clip": 0.01061814, + "auxiliary_loss_mlp": 0.01043221, + "balance_loss_clip": 1.01486135, + "balance_loss_mlp": 1.0194757, + "epoch": 0.4983616413647978, + "flos": 18141603315840.0, + "grad_norm": 2.6615338640999258, + "language_loss": 0.82219923, + "learning_rate": 2.10763413072622e-06, + "loss": 0.84324962, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.42382812, + "step": 8289, + "time_per_iteration": 2.338757276535034 + }, + { + "auxiliary_loss_clip": 0.01057142, + "auxiliary_loss_mlp": 0.01042593, + "balance_loss_clip": 1.01715326, + "balance_loss_mlp": 1.01791239, + "epoch": 0.4984217646174658, + "flos": 19717826734080.0, + "grad_norm": 2.1296818544376226, + "language_loss": 0.74670351, + "learning_rate": 2.107245231409784e-06, + "loss": 0.76770091, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 8290, + "time_per_iteration": 2.38388991355896 + }, + { + "auxiliary_loss_clip": 0.01060101, + "auxiliary_loss_mlp": 0.01047502, + "balance_loss_clip": 1.01920128, + "balance_loss_mlp": 1.01945734, + "epoch": 0.49848188787013376, + "flos": 24935386890240.0, + "grad_norm": 1.5512956769544037, + "language_loss": 0.85108519, + "learning_rate": 2.106856328026598e-06, + "loss": 0.87216115, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40625, + "step": 8291, + "time_per_iteration": 2.402104377746582 + }, + { + "auxiliary_loss_clip": 0.0106208, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_clip": 1.01464415, + "balance_loss_mlp": 1.02038193, + "epoch": 0.4985420111228017, + "flos": 22381350624000.0, + "grad_norm": 1.6096658856160297, + "language_loss": 0.68583584, + "learning_rate": 2.106467420591409e-06, + "loss": 0.70690733, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41796875, + "step": 8292, + "time_per_iteration": 3.84275221824646 + }, + { + "auxiliary_loss_clip": 0.01057617, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.01366138, + "balance_loss_mlp": 1.01820469, + "epoch": 0.4986021343754697, + "flos": 16215549408000.0, + "grad_norm": 1.628685254795149, + "language_loss": 0.68466246, + "learning_rate": 2.106078509118965e-06, + "loss": 0.70565069, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39453125, + "step": 8293, + "time_per_iteration": 2.3629579544067383 + }, + { + "auxiliary_loss_clip": 0.01059958, + "auxiliary_loss_mlp": 0.01040977, + "balance_loss_clip": 1.01522756, + "balance_loss_mlp": 1.0196631, + "epoch": 0.4986622576281377, + "flos": 23402490336000.0, + "grad_norm": 1.9401691016118043, + "language_loss": 0.83782065, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.85882998, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 8294, + "time_per_iteration": 2.3775217533111572 + }, + { + "auxiliary_loss_clip": 0.01059656, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.01475835, + "balance_loss_mlp": 1.01902437, + "epoch": 0.49872238088080567, + "flos": 19973530598400.0, + "grad_norm": 2.0177060711633743, + "language_loss": 0.7408365, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.76184422, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 8295, + "time_per_iteration": 2.4274377822875977 + }, + { + "auxiliary_loss_clip": 0.01059483, + "auxiliary_loss_mlp": 0.01042315, + "balance_loss_clip": 1.0163269, + "balance_loss_mlp": 1.01978278, + "epoch": 0.49878250413347364, + "flos": 22891920480000.0, + "grad_norm": 2.0068468908620396, + "language_loss": 0.69029319, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.7113111, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 8296, + "time_per_iteration": 2.369084596633911 + }, + { + "auxiliary_loss_clip": 0.01060554, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.01928484, + "balance_loss_mlp": 1.01910615, + "epoch": 0.4988426273861416, + "flos": 32597076752640.0, + "grad_norm": 2.5595771277196526, + "language_loss": 0.6590457, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.68012893, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 8297, + "time_per_iteration": 2.510134220123291 + }, + { + "auxiliary_loss_clip": 0.01058063, + "auxiliary_loss_mlp": 0.01042746, + "balance_loss_clip": 1.01878452, + "balance_loss_mlp": 1.01926196, + "epoch": 0.49890275063880957, + "flos": 20922539708160.0, + "grad_norm": 1.7035805446901031, + "language_loss": 0.708915, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.72992313, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38867188, + "step": 8298, + "time_per_iteration": 2.379854440689087 + }, + { + "auxiliary_loss_clip": 0.01055244, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.01554489, + "balance_loss_mlp": 1.01769328, + "epoch": 0.49896287389147753, + "flos": 18623474167680.0, + "grad_norm": 1.8161767758811025, + "language_loss": 0.85744816, + "learning_rate": 2.103744956327814e-06, + "loss": 0.87840784, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 8299, + "time_per_iteration": 2.386828660964966 + }, + { + "auxiliary_loss_clip": 0.0106013, + "auxiliary_loss_mlp": 0.01048981, + "balance_loss_clip": 1.02147913, + "balance_loss_mlp": 1.01874757, + "epoch": 0.4990229971441455, + "flos": 24825410507520.0, + "grad_norm": 2.046608705948758, + "language_loss": 0.70276558, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.72385669, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.4140625, + "step": 8300, + "time_per_iteration": 2.4137816429138184 + }, + { + "auxiliary_loss_clip": 0.01013102, + "auxiliary_loss_mlp": 0.01003211, + "balance_loss_clip": 1.00083876, + "balance_loss_mlp": 1.0052166, + "epoch": 0.49908312039681346, + "flos": 71381006271360.0, + "grad_norm": 0.7630893134157045, + "language_loss": 0.51182795, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53199112, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07910156, + "step": 8301, + "time_per_iteration": 3.1143600940704346 + }, + { + "auxiliary_loss_clip": 0.01056158, + "auxiliary_loss_mlp": 0.01042166, + "balance_loss_clip": 1.01733494, + "balance_loss_mlp": 1.01793706, + "epoch": 0.4991432436494814, + "flos": 19827628560000.0, + "grad_norm": 1.7354825675275027, + "language_loss": 0.85556626, + "learning_rate": 2.102578126623879e-06, + "loss": 0.87654948, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 8302, + "time_per_iteration": 2.367746591567993 + }, + { + "auxiliary_loss_clip": 0.01057333, + "auxiliary_loss_mlp": 0.01039426, + "balance_loss_clip": 1.01447558, + "balance_loss_mlp": 1.0193429, + "epoch": 0.4992033669021494, + "flos": 15121022284800.0, + "grad_norm": 1.8064005309037934, + "language_loss": 0.70976365, + "learning_rate": 2.102189175590024e-06, + "loss": 0.73073119, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 8303, + "time_per_iteration": 2.372868537902832 + }, + { + "auxiliary_loss_clip": 0.01057974, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.017717, + "balance_loss_mlp": 1.01764441, + "epoch": 0.49926349015481736, + "flos": 31206730746240.0, + "grad_norm": 1.6156969889188246, + "language_loss": 0.73176634, + "learning_rate": 2.101800220681144e-06, + "loss": 0.75279284, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 8304, + "time_per_iteration": 2.466909646987915 + }, + { + "auxiliary_loss_clip": 0.01058342, + "auxiliary_loss_mlp": 0.01047008, + "balance_loss_clip": 1.02276146, + "balance_loss_mlp": 1.01928937, + "epoch": 0.4993236134074853, + "flos": 24899042298240.0, + "grad_norm": 1.9084815674525235, + "language_loss": 0.8269977, + "learning_rate": 2.10141126191199e-06, + "loss": 0.84805119, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.390625, + "step": 8305, + "time_per_iteration": 2.426361083984375 + }, + { + "auxiliary_loss_clip": 0.01012891, + "auxiliary_loss_mlp": 0.01006323, + "balance_loss_clip": 1.00386715, + "balance_loss_mlp": 1.00482821, + "epoch": 0.4993837366601533, + "flos": 70417508947200.0, + "grad_norm": 0.7143745862809179, + "language_loss": 0.56988788, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.59008002, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.08056641, + "step": 8306, + "time_per_iteration": 3.1281819343566895 + }, + { + "auxiliary_loss_clip": 0.01058708, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_clip": 1.01868999, + "balance_loss_mlp": 1.01986003, + "epoch": 0.4994438599128213, + "flos": 15960299391360.0, + "grad_norm": 1.8696654965247725, + "language_loss": 0.84119439, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.86224151, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 8307, + "time_per_iteration": 2.4603028297424316 + }, + { + "auxiliary_loss_clip": 0.01057134, + "auxiliary_loss_mlp": 0.01043364, + "balance_loss_clip": 1.01811576, + "balance_loss_mlp": 1.01838112, + "epoch": 0.4995039831654893, + "flos": 27927059448960.0, + "grad_norm": 1.6743250367160234, + "language_loss": 0.6223284, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.64333338, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38867188, + "step": 8308, + "time_per_iteration": 2.4142115116119385 + }, + { + "auxiliary_loss_clip": 0.01055367, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_clip": 1.01707518, + "balance_loss_mlp": 1.0168364, + "epoch": 0.49956410641815724, + "flos": 24203712193920.0, + "grad_norm": 1.503919186996274, + "language_loss": 0.75408769, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.77505749, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38671875, + "step": 8309, + "time_per_iteration": 2.410120725631714 + }, + { + "auxiliary_loss_clip": 0.01056565, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.01678014, + "balance_loss_mlp": 1.01794457, + "epoch": 0.4996242296708252, + "flos": 16179204816000.0, + "grad_norm": 2.2571429493541, + "language_loss": 0.80262285, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82359964, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.38671875, + "step": 8310, + "time_per_iteration": 2.342151403427124 + }, + { + "auxiliary_loss_clip": 0.01058953, + "auxiliary_loss_mlp": 0.01041744, + "balance_loss_clip": 1.015136, + "balance_loss_mlp": 1.01867652, + "epoch": 0.49968435292349317, + "flos": 16872579884160.0, + "grad_norm": 1.6727870335259727, + "language_loss": 0.71905828, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.74006522, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 8311, + "time_per_iteration": 2.372880220413208 + }, + { + "auxiliary_loss_clip": 0.01058838, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.01945174, + "balance_loss_mlp": 1.01904786, + "epoch": 0.49974447617616113, + "flos": 14938636008960.0, + "grad_norm": 1.8744749645010346, + "language_loss": 0.78390902, + "learning_rate": 2.098688443679187e-06, + "loss": 0.80495006, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8312, + "time_per_iteration": 2.3408703804016113 + }, + { + "auxiliary_loss_clip": 0.0105807, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.0182786, + "balance_loss_mlp": 1.01885509, + "epoch": 0.4998045994288291, + "flos": 26650320606720.0, + "grad_norm": 1.8132530784581995, + "language_loss": 0.8597635, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.88078064, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 8313, + "time_per_iteration": 2.4247639179229736 + }, + { + "auxiliary_loss_clip": 0.01058558, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.01185489, + "balance_loss_mlp": 1.01843429, + "epoch": 0.49986472268149706, + "flos": 20952879546240.0, + "grad_norm": 2.215714604499613, + "language_loss": 0.81617808, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83713984, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 8314, + "time_per_iteration": 2.3642492294311523 + }, + { + "auxiliary_loss_clip": 0.01061656, + "auxiliary_loss_mlp": 0.01050949, + "balance_loss_clip": 1.02100408, + "balance_loss_mlp": 1.0206244, + "epoch": 0.49992484593416503, + "flos": 22782781969920.0, + "grad_norm": 1.7104089272185572, + "language_loss": 0.80318224, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.82430828, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41015625, + "step": 8315, + "time_per_iteration": 2.4030568599700928 + }, + { + "auxiliary_loss_clip": 0.01058223, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.01566935, + "balance_loss_mlp": 1.0191282, + "epoch": 0.499984969186833, + "flos": 46785325576320.0, + "grad_norm": 1.6494547227148133, + "language_loss": 0.75483966, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.7758224, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.390625, + "step": 8316, + "time_per_iteration": 2.592008590698242 + }, + { + "auxiliary_loss_clip": 0.01054861, + "auxiliary_loss_mlp": 0.01038754, + "balance_loss_clip": 1.01572323, + "balance_loss_mlp": 1.01730025, + "epoch": 0.500045092439501, + "flos": 25555793483520.0, + "grad_norm": 5.8520332049472765, + "language_loss": 0.82199979, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.84293598, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.375, + "step": 8317, + "time_per_iteration": 2.4913976192474365 + }, + { + "auxiliary_loss_clip": 0.01056348, + "auxiliary_loss_mlp": 0.01047007, + "balance_loss_clip": 1.01777697, + "balance_loss_mlp": 1.01683021, + "epoch": 0.5001052156921689, + "flos": 20703704106240.0, + "grad_norm": 1.564512702709844, + "language_loss": 0.84882379, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.86985737, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.39453125, + "step": 8318, + "time_per_iteration": 2.3603978157043457 + }, + { + "auxiliary_loss_clip": 0.01057952, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.01306772, + "balance_loss_mlp": 1.01829314, + "epoch": 0.500165338944837, + "flos": 21250059972480.0, + "grad_norm": 1.825312822917236, + "language_loss": 0.83369553, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.85465014, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.39648438, + "step": 8319, + "time_per_iteration": 2.3962485790252686 + }, + { + "auxiliary_loss_clip": 0.01059142, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.01530492, + "balance_loss_mlp": 1.01930583, + "epoch": 0.5002254621975049, + "flos": 27853183278720.0, + "grad_norm": 1.5535049232676816, + "language_loss": 0.72548461, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74648321, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 8320, + "time_per_iteration": 2.4695091247558594 + }, + { + "auxiliary_loss_clip": 0.0106417, + "auxiliary_loss_mlp": 0.01054026, + "balance_loss_clip": 1.02174401, + "balance_loss_mlp": 1.01908886, + "epoch": 0.5002855854501729, + "flos": 15551257368960.0, + "grad_norm": 3.0040529036486987, + "language_loss": 0.79514688, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.81632888, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.45117188, + "step": 8321, + "time_per_iteration": 2.345817804336548 + }, + { + "auxiliary_loss_clip": 0.01056974, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.01901293, + "balance_loss_mlp": 1.0175606, + "epoch": 0.5003457087028408, + "flos": 16106480720640.0, + "grad_norm": 1.7143199501543434, + "language_loss": 0.83997178, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.8609876, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 8322, + "time_per_iteration": 2.3666889667510986 + }, + { + "auxiliary_loss_clip": 0.01059361, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_clip": 1.01639509, + "balance_loss_mlp": 1.01817095, + "epoch": 0.5004058319555088, + "flos": 22709429470080.0, + "grad_norm": 3.0828031672055465, + "language_loss": 0.75366122, + "learning_rate": 2.094409360775228e-06, + "loss": 0.77469981, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 8323, + "time_per_iteration": 5.1190526485443115 + }, + { + "auxiliary_loss_clip": 0.01059062, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.01694536, + "balance_loss_mlp": 1.01861858, + "epoch": 0.5004659552081767, + "flos": 30116637365760.0, + "grad_norm": 1.516911688379006, + "language_loss": 0.70131838, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.72235775, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 8324, + "time_per_iteration": 2.4382009506225586 + }, + { + "auxiliary_loss_clip": 0.01058243, + "auxiliary_loss_mlp": 0.01047747, + "balance_loss_clip": 1.01968527, + "balance_loss_mlp": 1.01796758, + "epoch": 0.5005260784608447, + "flos": 18623718547200.0, + "grad_norm": 1.965891776856318, + "language_loss": 0.73139483, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.7524547, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8325, + "time_per_iteration": 3.7004287242889404 + }, + { + "auxiliary_loss_clip": 0.01059873, + "auxiliary_loss_mlp": 0.01049675, + "balance_loss_clip": 1.02031422, + "balance_loss_mlp": 1.018713, + "epoch": 0.5005862017135126, + "flos": 24858927279360.0, + "grad_norm": 1.5915228718943166, + "language_loss": 0.74018496, + "learning_rate": 2.093242262158709e-06, + "loss": 0.76128042, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 8326, + "time_per_iteration": 2.4063503742218018 + }, + { + "auxiliary_loss_clip": 0.01056159, + "auxiliary_loss_mlp": 0.01040757, + "balance_loss_clip": 1.01566303, + "balance_loss_mlp": 1.01721287, + "epoch": 0.5006463249661807, + "flos": 18733380727680.0, + "grad_norm": 1.5118904681271286, + "language_loss": 0.79180264, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.81277174, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 8327, + "time_per_iteration": 2.3408899307250977 + }, + { + "auxiliary_loss_clip": 0.01061449, + "auxiliary_loss_mlp": 0.01050027, + "balance_loss_clip": 1.02238226, + "balance_loss_mlp": 1.02046132, + "epoch": 0.5007064482188487, + "flos": 13041316016640.0, + "grad_norm": 2.039895958901919, + "language_loss": 0.89281046, + "learning_rate": 2.092464178710997e-06, + "loss": 0.91392523, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 8328, + "time_per_iteration": 2.363429307937622 + }, + { + "auxiliary_loss_clip": 0.0106232, + "auxiliary_loss_mlp": 0.01044152, + "balance_loss_clip": 1.01659083, + "balance_loss_mlp": 1.01952207, + "epoch": 0.5007665714715166, + "flos": 21287591550720.0, + "grad_norm": 2.0488432667312155, + "language_loss": 0.75689113, + "learning_rate": 2.092075131720388e-06, + "loss": 0.77795577, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.42773438, + "step": 8329, + "time_per_iteration": 2.3692123889923096 + }, + { + "auxiliary_loss_clip": 0.01057404, + "auxiliary_loss_mlp": 0.01044558, + "balance_loss_clip": 1.01828396, + "balance_loss_mlp": 1.01790237, + "epoch": 0.5008266947241846, + "flos": 29753226357120.0, + "grad_norm": 2.0891404674055742, + "language_loss": 0.80400091, + "learning_rate": 2.091686081238281e-06, + "loss": 0.82502049, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 8330, + "time_per_iteration": 2.446190595626831 + }, + { + "auxiliary_loss_clip": 0.01009756, + "auxiliary_loss_mlp": 0.01002166, + "balance_loss_clip": 0.99990082, + "balance_loss_mlp": 1.00204563, + "epoch": 0.5008868179768525, + "flos": 63555050995200.0, + "grad_norm": 0.7457314419483813, + "language_loss": 0.56204665, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.5821659, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.07714844, + "step": 8331, + "time_per_iteration": 2.8145554065704346 + }, + { + "auxiliary_loss_clip": 0.01056974, + "auxiliary_loss_mlp": 0.01045296, + "balance_loss_clip": 1.01742458, + "balance_loss_mlp": 1.01809692, + "epoch": 0.5009469412295205, + "flos": 27374559183360.0, + "grad_norm": 1.89039169134968, + "language_loss": 0.66315216, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.6841749, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38867188, + "step": 8332, + "time_per_iteration": 3.8353333473205566 + }, + { + "auxiliary_loss_clip": 0.01056755, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.01163137, + "balance_loss_mlp": 1.01810098, + "epoch": 0.5010070644821885, + "flos": 27377666294400.0, + "grad_norm": 1.5861220654784587, + "language_loss": 0.75925732, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.78020763, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38671875, + "step": 8333, + "time_per_iteration": 2.407594919204712 + }, + { + "auxiliary_loss_clip": 0.01059019, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.01381743, + "balance_loss_mlp": 1.01839817, + "epoch": 0.5010671877348565, + "flos": 20661843519360.0, + "grad_norm": 1.836211192379402, + "language_loss": 0.81145251, + "learning_rate": 2.090129844689929e-06, + "loss": 0.83245468, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 8334, + "time_per_iteration": 2.37439227104187 + }, + { + "auxiliary_loss_clip": 0.01009249, + "auxiliary_loss_mlp": 0.01003346, + "balance_loss_clip": 1.00092566, + "balance_loss_mlp": 1.00144279, + "epoch": 0.5011273109875244, + "flos": 59125374691200.0, + "grad_norm": 0.8938750769283911, + "language_loss": 0.62799299, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64811891, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.078125, + "step": 8335, + "time_per_iteration": 2.988260269165039 + }, + { + "auxiliary_loss_clip": 0.01055495, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.01039982, + "balance_loss_mlp": 1.01721871, + "epoch": 0.5011874342401924, + "flos": 25335212313600.0, + "grad_norm": 1.660277987684863, + "language_loss": 0.80427837, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.82518232, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3828125, + "step": 8336, + "time_per_iteration": 2.4334375858306885 + }, + { + "auxiliary_loss_clip": 0.01058454, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.01427758, + "balance_loss_mlp": 1.01811206, + "epoch": 0.5012475574928603, + "flos": 20228920260480.0, + "grad_norm": 1.5756370082941724, + "language_loss": 0.81785685, + "learning_rate": 2.088962631340836e-06, + "loss": 0.83886671, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40234375, + "step": 8337, + "time_per_iteration": 2.3608627319335938 + }, + { + "auxiliary_loss_clip": 0.01059565, + "auxiliary_loss_mlp": 0.0104473, + "balance_loss_clip": 1.01775336, + "balance_loss_mlp": 1.01765037, + "epoch": 0.5013076807455283, + "flos": 22709045445120.0, + "grad_norm": 2.004025651779985, + "language_loss": 0.81050605, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.83154899, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 8338, + "time_per_iteration": 2.367231607437134 + }, + { + "auxiliary_loss_clip": 0.01057682, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.0142374, + "balance_loss_mlp": 1.01852334, + "epoch": 0.5013678039981962, + "flos": 24243966858240.0, + "grad_norm": 1.5628037633182916, + "language_loss": 0.85919631, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.88017523, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 8339, + "time_per_iteration": 2.4084701538085938 + }, + { + "auxiliary_loss_clip": 0.01057513, + "auxiliary_loss_mlp": 0.01044932, + "balance_loss_clip": 1.01983821, + "balance_loss_mlp": 1.01808703, + "epoch": 0.5014279272508643, + "flos": 26175501849600.0, + "grad_norm": 1.522081880523542, + "language_loss": 0.7207998, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.74182421, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 8340, + "time_per_iteration": 2.416010856628418 + }, + { + "auxiliary_loss_clip": 0.01060846, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.01896763, + "balance_loss_mlp": 1.01873374, + "epoch": 0.5014880505035323, + "flos": 21429478782720.0, + "grad_norm": 1.91958384658196, + "language_loss": 0.79822785, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.81930548, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.421875, + "step": 8341, + "time_per_iteration": 2.372584104537964 + }, + { + "auxiliary_loss_clip": 0.01061525, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.01528788, + "balance_loss_mlp": 1.01909089, + "epoch": 0.5015481737562002, + "flos": 15770058059520.0, + "grad_norm": 3.334853715037388, + "language_loss": 0.90546811, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.92652988, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 8342, + "time_per_iteration": 2.3573524951934814 + }, + { + "auxiliary_loss_clip": 0.01056393, + "auxiliary_loss_mlp": 0.01037951, + "balance_loss_clip": 1.01295233, + "balance_loss_mlp": 1.01781225, + "epoch": 0.5016082970088682, + "flos": 26829669594240.0, + "grad_norm": 1.8474312477823314, + "language_loss": 0.77369654, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.79463995, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 8343, + "time_per_iteration": 2.421483039855957 + }, + { + "auxiliary_loss_clip": 0.01057602, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.01210451, + "balance_loss_mlp": 1.01920676, + "epoch": 0.5016684202615361, + "flos": 21469523978880.0, + "grad_norm": 1.8677277571357696, + "language_loss": 0.68772542, + "learning_rate": 2.086239016143293e-06, + "loss": 0.70866913, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 8344, + "time_per_iteration": 2.4263033866882324 + }, + { + "auxiliary_loss_clip": 0.01060386, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_clip": 1.01672125, + "balance_loss_mlp": 1.01979852, + "epoch": 0.5017285435142042, + "flos": 26245712327040.0, + "grad_norm": 1.7495446699205126, + "language_loss": 0.76734805, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.78837466, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40625, + "step": 8345, + "time_per_iteration": 2.396933078765869 + }, + { + "auxiliary_loss_clip": 0.01061492, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_clip": 1.01946437, + "balance_loss_mlp": 1.02068686, + "epoch": 0.5017886667668721, + "flos": 20776498024320.0, + "grad_norm": 1.8474247265429458, + "language_loss": 0.79583383, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.81692338, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40820312, + "step": 8346, + "time_per_iteration": 2.4160497188568115 + }, + { + "auxiliary_loss_clip": 0.0105806, + "auxiliary_loss_mlp": 0.01044454, + "balance_loss_clip": 1.01863313, + "balance_loss_mlp": 1.01824558, + "epoch": 0.5018487900195401, + "flos": 20155393203840.0, + "grad_norm": 1.6752918358441549, + "language_loss": 0.7039696, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.72499472, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8347, + "time_per_iteration": 2.3659303188323975 + }, + { + "auxiliary_loss_clip": 0.01059074, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.02138233, + "balance_loss_mlp": 1.0182507, + "epoch": 0.501908913272208, + "flos": 18149702751360.0, + "grad_norm": 1.7947506721838362, + "language_loss": 0.72791791, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.74901664, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40820312, + "step": 8348, + "time_per_iteration": 2.3891489505767822 + }, + { + "auxiliary_loss_clip": 0.01056123, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.01989639, + "balance_loss_mlp": 1.01734567, + "epoch": 0.501969036524876, + "flos": 23111175018240.0, + "grad_norm": 1.6815490749823887, + "language_loss": 0.75678086, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.77779448, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 8349, + "time_per_iteration": 2.3844335079193115 + }, + { + "auxiliary_loss_clip": 0.01058349, + "auxiliary_loss_mlp": 0.01043996, + "balance_loss_clip": 1.01635194, + "balance_loss_mlp": 1.01764238, + "epoch": 0.5020291597775439, + "flos": 11362447601280.0, + "grad_norm": 2.121833215493566, + "language_loss": 0.65722823, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.67825162, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 8350, + "time_per_iteration": 2.368807315826416 + }, + { + "auxiliary_loss_clip": 0.01011409, + "auxiliary_loss_mlp": 0.01003125, + "balance_loss_clip": 1.00043142, + "balance_loss_mlp": 1.00318348, + "epoch": 0.5020892830302119, + "flos": 64007873729280.0, + "grad_norm": 0.7838515260069818, + "language_loss": 0.59882939, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61897469, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.02697754, + "router_z_loss_mlp": 0.08203125, + "step": 8351, + "time_per_iteration": 3.1352503299713135 + }, + { + "auxiliary_loss_clip": 0.01059364, + "auxiliary_loss_mlp": 0.01042251, + "balance_loss_clip": 1.0154295, + "balance_loss_mlp": 1.01793694, + "epoch": 0.5021494062828799, + "flos": 23731721256960.0, + "grad_norm": 1.7318389393488236, + "language_loss": 0.76700127, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.78801745, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.4140625, + "step": 8352, + "time_per_iteration": 2.3997390270233154 + }, + { + "auxiliary_loss_clip": 0.01058201, + "auxiliary_loss_mlp": 0.01046791, + "balance_loss_clip": 1.01850224, + "balance_loss_mlp": 1.01858771, + "epoch": 0.5022095295355479, + "flos": 21575764846080.0, + "grad_norm": 1.643710814342995, + "language_loss": 0.73015594, + "learning_rate": 2.082736990429464e-06, + "loss": 0.75120592, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.39648438, + "step": 8353, + "time_per_iteration": 2.362715482711792 + }, + { + "auxiliary_loss_clip": 0.01059223, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02033234, + "balance_loss_mlp": 1.01918805, + "epoch": 0.5022696527882159, + "flos": 21396171479040.0, + "grad_norm": 1.9580536518072933, + "language_loss": 0.75480407, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.77589536, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40039062, + "step": 8354, + "time_per_iteration": 2.3942837715148926 + }, + { + "auxiliary_loss_clip": 0.0105831, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.0171082, + "balance_loss_mlp": 1.01924944, + "epoch": 0.5023297760408838, + "flos": 27159528919680.0, + "grad_norm": 1.5011179956805172, + "language_loss": 0.73537952, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.75640452, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 8355, + "time_per_iteration": 2.4179279804229736 + }, + { + "auxiliary_loss_clip": 0.01059941, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.00885129, + "balance_loss_mlp": 1.01883245, + "epoch": 0.5023898992935518, + "flos": 26212614491520.0, + "grad_norm": 1.6912783185109805, + "language_loss": 0.82353151, + "learning_rate": 2.081569591520548e-06, + "loss": 0.8445071, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41015625, + "step": 8356, + "time_per_iteration": 2.429704427719116 + }, + { + "auxiliary_loss_clip": 0.01060105, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.01611829, + "balance_loss_mlp": 1.01715183, + "epoch": 0.5024500225462197, + "flos": 13439570428800.0, + "grad_norm": 2.4822634842450566, + "language_loss": 0.78118765, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.80224383, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 8357, + "time_per_iteration": 2.337212562561035 + }, + { + "auxiliary_loss_clip": 0.01058977, + "auxiliary_loss_mlp": 0.01045986, + "balance_loss_clip": 1.01786399, + "balance_loss_mlp": 1.01871681, + "epoch": 0.5025101457988878, + "flos": 21578522843520.0, + "grad_norm": 1.629949876756042, + "language_loss": 0.78045279, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.80150235, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8358, + "time_per_iteration": 2.4060282707214355 + }, + { + "auxiliary_loss_clip": 0.01059873, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_clip": 1.01882887, + "balance_loss_mlp": 1.01900589, + "epoch": 0.5025702690515557, + "flos": 24643966838400.0, + "grad_norm": 2.3395452063173487, + "language_loss": 0.74075127, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.76182187, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 8359, + "time_per_iteration": 2.394029378890991 + }, + { + "auxiliary_loss_clip": 0.01058968, + "auxiliary_loss_mlp": 0.01049766, + "balance_loss_clip": 1.02367115, + "balance_loss_mlp": 1.01910973, + "epoch": 0.5026303923042237, + "flos": 22089092699520.0, + "grad_norm": 1.7875931782393355, + "language_loss": 0.78354347, + "learning_rate": 2.080013016407077e-06, + "loss": 0.80463088, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8360, + "time_per_iteration": 2.4092788696289062 + }, + { + "auxiliary_loss_clip": 0.01060168, + "auxiliary_loss_mlp": 0.01045298, + "balance_loss_clip": 1.01923871, + "balance_loss_mlp": 1.01949835, + "epoch": 0.5026905155568916, + "flos": 23696039980800.0, + "grad_norm": 1.6178176276556109, + "language_loss": 0.77926683, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.80032152, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40625, + "step": 8361, + "time_per_iteration": 2.4023187160491943 + }, + { + "auxiliary_loss_clip": 0.01060137, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_clip": 1.0142585, + "balance_loss_mlp": 1.01854289, + "epoch": 0.5027506388095596, + "flos": 25811218056960.0, + "grad_norm": 1.6658925363156425, + "language_loss": 0.86697662, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.88801575, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41601562, + "step": 8362, + "time_per_iteration": 3.6526896953582764 + }, + { + "auxiliary_loss_clip": 0.01059102, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.01362479, + "balance_loss_mlp": 1.01853156, + "epoch": 0.5028107620622275, + "flos": 27525383723520.0, + "grad_norm": 1.6449063650976044, + "language_loss": 0.79787695, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.81887138, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 8363, + "time_per_iteration": 3.8421523571014404 + }, + { + "auxiliary_loss_clip": 0.010562, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.00921011, + "balance_loss_mlp": 1.01803613, + "epoch": 0.5028708853148955, + "flos": 24533152583040.0, + "grad_norm": 2.0369175491394453, + "language_loss": 0.7747606, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.79568124, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 8364, + "time_per_iteration": 3.778409719467163 + }, + { + "auxiliary_loss_clip": 0.01055881, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.01824784, + "balance_loss_mlp": 1.01725686, + "epoch": 0.5029310085675635, + "flos": 20812563325440.0, + "grad_norm": 1.7215615123326828, + "language_loss": 0.71149212, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.73248208, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 8365, + "time_per_iteration": 2.377464532852173 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01049907, + "balance_loss_clip": 1.02012837, + "balance_loss_mlp": 1.01963592, + "epoch": 0.5029911318202315, + "flos": 22341479984640.0, + "grad_norm": 1.6036553173866273, + "language_loss": 0.74149007, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.76261538, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 8366, + "time_per_iteration": 2.3989102840423584 + }, + { + "auxiliary_loss_clip": 0.01059058, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.01541841, + "balance_loss_mlp": 1.01929653, + "epoch": 0.5030512550728995, + "flos": 24351569268480.0, + "grad_norm": 1.4087794650710723, + "language_loss": 0.78995395, + "learning_rate": 2.077288893713735e-06, + "loss": 0.81096596, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3984375, + "step": 8367, + "time_per_iteration": 2.552541494369507 + }, + { + "auxiliary_loss_clip": 0.01057158, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.01398921, + "balance_loss_mlp": 1.01765752, + "epoch": 0.5031113783255674, + "flos": 18258945995520.0, + "grad_norm": 2.074677557633191, + "language_loss": 0.71073854, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.73169518, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.39453125, + "step": 8368, + "time_per_iteration": 2.337125062942505 + }, + { + "auxiliary_loss_clip": 0.0101151, + "auxiliary_loss_mlp": 0.01001976, + "balance_loss_clip": 0.99980652, + "balance_loss_mlp": 1.00351465, + "epoch": 0.5031715015782354, + "flos": 57250364924160.0, + "grad_norm": 1.3685317063773643, + "language_loss": 0.63425505, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65438992, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.08007812, + "step": 8369, + "time_per_iteration": 2.9658255577087402 + }, + { + "auxiliary_loss_clip": 0.01057935, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.01583695, + "balance_loss_mlp": 1.01918674, + "epoch": 0.5032316248309033, + "flos": 27526116862080.0, + "grad_norm": 2.035264789272788, + "language_loss": 0.6151377, + "learning_rate": 2.076121368302263e-06, + "loss": 0.63613051, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 8370, + "time_per_iteration": 2.420901298522949 + }, + { + "auxiliary_loss_clip": 0.01058672, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.01567018, + "balance_loss_mlp": 1.01819146, + "epoch": 0.5032917480835714, + "flos": 34494396744960.0, + "grad_norm": 1.7335215572404412, + "language_loss": 0.69754529, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.71856266, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 8371, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.010599, + "auxiliary_loss_mlp": 0.01043912, + "balance_loss_clip": 1.01559997, + "balance_loss_mlp": 1.01797211, + "epoch": 0.5033518713362393, + "flos": 33655364017920.0, + "grad_norm": 1.598993339088028, + "language_loss": 0.69056016, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.71159828, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 8372, + "time_per_iteration": 3.9135406017303467 + }, + { + "auxiliary_loss_clip": 0.01059169, + "auxiliary_loss_mlp": 0.01052198, + "balance_loss_clip": 1.02144241, + "balance_loss_mlp": 1.01862657, + "epoch": 0.5034119945889073, + "flos": 28184194679040.0, + "grad_norm": 1.6261061395539511, + "language_loss": 0.67846233, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.69957602, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.40625, + "step": 8373, + "time_per_iteration": 2.44113826751709 + }, + { + "auxiliary_loss_clip": 0.01056588, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_clip": 1.02029371, + "balance_loss_mlp": 1.01748586, + "epoch": 0.5034721178415752, + "flos": 21357697294080.0, + "grad_norm": 1.5261485116538334, + "language_loss": 0.75453019, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.77555704, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 8374, + "time_per_iteration": 2.3769383430480957 + }, + { + "auxiliary_loss_clip": 0.01057313, + "auxiliary_loss_mlp": 0.01045287, + "balance_loss_clip": 1.01773763, + "balance_loss_mlp": 1.01740766, + "epoch": 0.5035322410942432, + "flos": 22673713282560.0, + "grad_norm": 1.972120101952305, + "language_loss": 0.69357574, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.71460176, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.3984375, + "step": 8375, + "time_per_iteration": 2.3592889308929443 + }, + { + "auxiliary_loss_clip": 0.01061443, + "auxiliary_loss_mlp": 0.01046016, + "balance_loss_clip": 1.01428199, + "balance_loss_mlp": 1.01882601, + "epoch": 0.5035923643469111, + "flos": 19827698382720.0, + "grad_norm": 1.6917493748688763, + "language_loss": 0.80497748, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.82605207, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42578125, + "step": 8376, + "time_per_iteration": 2.4184179306030273 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.01710892, + "balance_loss_mlp": 1.01727986, + "epoch": 0.5036524875995791, + "flos": 30513425500800.0, + "grad_norm": 1.988916482082769, + "language_loss": 0.60563326, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.6266768, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41601562, + "step": 8377, + "time_per_iteration": 2.440805435180664 + }, + { + "auxiliary_loss_clip": 0.01058113, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_clip": 1.01682758, + "balance_loss_mlp": 1.01744318, + "epoch": 0.5037126108522471, + "flos": 14719695672960.0, + "grad_norm": 1.8912633971644486, + "language_loss": 0.77317178, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.79421353, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40625, + "step": 8378, + "time_per_iteration": 2.3569552898406982 + }, + { + "auxiliary_loss_clip": 0.01058657, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.00970006, + "balance_loss_mlp": 1.01832664, + "epoch": 0.5037727341049151, + "flos": 25296633394560.0, + "grad_norm": 1.5830997320019373, + "language_loss": 0.76142758, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.78238189, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 8379, + "time_per_iteration": 2.4272427558898926 + }, + { + "auxiliary_loss_clip": 0.01057157, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.01592481, + "balance_loss_mlp": 1.0179534, + "epoch": 0.5038328573575831, + "flos": 28540693238400.0, + "grad_norm": 2.537341728091919, + "language_loss": 0.68237805, + "learning_rate": 2.072229431544548e-06, + "loss": 0.70335758, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39257812, + "step": 8380, + "time_per_iteration": 2.43739914894104 + }, + { + "auxiliary_loss_clip": 0.01058015, + "auxiliary_loss_mlp": 0.0103973, + "balance_loss_clip": 1.01340866, + "balance_loss_mlp": 1.01837206, + "epoch": 0.503892980610251, + "flos": 31648521490560.0, + "grad_norm": 1.9732145070357676, + "language_loss": 0.64108217, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66205955, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39648438, + "step": 8381, + "time_per_iteration": 2.465404510498047 + }, + { + "auxiliary_loss_clip": 0.01057207, + "auxiliary_loss_mlp": 0.01040941, + "balance_loss_clip": 1.01471472, + "balance_loss_mlp": 1.01819444, + "epoch": 0.503953103862919, + "flos": 27088131456000.0, + "grad_norm": 1.434289581007736, + "language_loss": 0.68437755, + "learning_rate": 2.071451010853365e-06, + "loss": 0.70535898, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 8382, + "time_per_iteration": 2.4047036170959473 + }, + { + "auxiliary_loss_clip": 0.0106262, + "auxiliary_loss_mlp": 0.01054434, + "balance_loss_clip": 1.02477479, + "balance_loss_mlp": 1.01962352, + "epoch": 0.5040132271155869, + "flos": 15632045988480.0, + "grad_norm": 3.007779295854294, + "language_loss": 0.63624251, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.65741301, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 8383, + "time_per_iteration": 2.3579912185668945 + }, + { + "auxiliary_loss_clip": 0.01058191, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.01454258, + "balance_loss_mlp": 1.01898491, + "epoch": 0.504073350368255, + "flos": 13589242894080.0, + "grad_norm": 2.2549980618577643, + "language_loss": 0.68952703, + "learning_rate": 2.070672579324465e-06, + "loss": 0.71051162, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 8384, + "time_per_iteration": 2.3433139324188232 + }, + { + "auxiliary_loss_clip": 0.0105844, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.0190053, + "balance_loss_mlp": 1.01800227, + "epoch": 0.5041334736209229, + "flos": 29056918734720.0, + "grad_norm": 5.553742068911801, + "language_loss": 0.72495198, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.74599594, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40429688, + "step": 8385, + "time_per_iteration": 2.4376590251922607 + }, + { + "auxiliary_loss_clip": 0.01055575, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.01700616, + "balance_loss_mlp": 1.01756358, + "epoch": 0.5041935968735909, + "flos": 24607203310080.0, + "grad_norm": 1.9207196368151191, + "language_loss": 0.84418559, + "learning_rate": 2.069894137075919e-06, + "loss": 0.86515445, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38085938, + "step": 8386, + "time_per_iteration": 2.382720947265625 + }, + { + "auxiliary_loss_clip": 0.0106049, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_clip": 1.01689005, + "balance_loss_mlp": 1.01910901, + "epoch": 0.5042537201262588, + "flos": 26285722611840.0, + "grad_norm": 1.4358167674112996, + "language_loss": 0.67824709, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.6993103, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 8387, + "time_per_iteration": 2.4458038806915283 + }, + { + "auxiliary_loss_clip": 0.0105869, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.02286494, + "balance_loss_mlp": 1.01895022, + "epoch": 0.5043138433789268, + "flos": 22016298781440.0, + "grad_norm": 1.4493819455118706, + "language_loss": 0.81017447, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.83123553, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3984375, + "step": 8388, + "time_per_iteration": 2.3896632194519043 + }, + { + "auxiliary_loss_clip": 0.01057638, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.01466644, + "balance_loss_mlp": 1.01786041, + "epoch": 0.5043739666315947, + "flos": 28765847796480.0, + "grad_norm": 2.250255956982847, + "language_loss": 0.70520318, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72620225, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39648438, + "step": 8389, + "time_per_iteration": 2.4465320110321045 + }, + { + "auxiliary_loss_clip": 0.01060462, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.0144906, + "balance_loss_mlp": 1.01872516, + "epoch": 0.5044340898842627, + "flos": 27598037996160.0, + "grad_norm": 2.047799919377753, + "language_loss": 0.70631689, + "learning_rate": 2.068337220892191e-06, + "loss": 0.72732425, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41796875, + "step": 8390, + "time_per_iteration": 2.4149348735809326 + }, + { + "auxiliary_loss_clip": 0.01010674, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 0.99978912, + "balance_loss_mlp": 1.00264597, + "epoch": 0.5044942131369307, + "flos": 67455268531200.0, + "grad_norm": 0.8383855917650871, + "language_loss": 0.53014278, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55027115, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.08007812, + "step": 8391, + "time_per_iteration": 2.9352505207061768 + }, + { + "auxiliary_loss_clip": 0.01010541, + "auxiliary_loss_mlp": 0.01003591, + "balance_loss_clip": 1.00142145, + "balance_loss_mlp": 1.00248814, + "epoch": 0.5045543363895987, + "flos": 58628247575040.0, + "grad_norm": 0.8754830972818822, + "language_loss": 0.60830021, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62844157, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.08007812, + "step": 8392, + "time_per_iteration": 2.8591761589050293 + }, + { + "auxiliary_loss_clip": 0.01056774, + "auxiliary_loss_mlp": 0.01038062, + "balance_loss_clip": 1.01172829, + "balance_loss_mlp": 1.01848221, + "epoch": 0.5046144596422667, + "flos": 22525576917120.0, + "grad_norm": 2.6382121526421565, + "language_loss": 0.85658652, + "learning_rate": 2.067169506493517e-06, + "loss": 0.87753487, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 8393, + "time_per_iteration": 2.408466339111328 + }, + { + "auxiliary_loss_clip": 0.01059441, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.01450515, + "balance_loss_mlp": 1.01985455, + "epoch": 0.5046745828949346, + "flos": 27453008741760.0, + "grad_norm": 1.896565364267999, + "language_loss": 0.5280993, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.54909909, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 8394, + "time_per_iteration": 2.4049265384674072 + }, + { + "auxiliary_loss_clip": 0.01058014, + "auxiliary_loss_mlp": 0.01047741, + "balance_loss_clip": 1.01791465, + "balance_loss_mlp": 1.01792049, + "epoch": 0.5047347061476026, + "flos": 17273592293760.0, + "grad_norm": 3.364388372301817, + "language_loss": 0.7630344, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.78409195, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.40039062, + "step": 8395, + "time_per_iteration": 2.401951313018799 + }, + { + "auxiliary_loss_clip": 0.01060513, + "auxiliary_loss_mlp": 0.01041044, + "balance_loss_clip": 1.0142101, + "balance_loss_mlp": 1.01983619, + "epoch": 0.5047948294002705, + "flos": 16648717046400.0, + "grad_norm": 3.115389506048331, + "language_loss": 0.69333977, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.71435535, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 8396, + "time_per_iteration": 2.3416330814361572 + }, + { + "auxiliary_loss_clip": 0.01059804, + "auxiliary_loss_mlp": 0.01040646, + "balance_loss_clip": 1.01530254, + "balance_loss_mlp": 1.02028346, + "epoch": 0.5048549526529386, + "flos": 26864617731840.0, + "grad_norm": 1.6992748131085293, + "language_loss": 0.80077219, + "learning_rate": 2.065612518371792e-06, + "loss": 0.82177669, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39453125, + "step": 8397, + "time_per_iteration": 2.457522392272949 + }, + { + "auxiliary_loss_clip": 0.01057477, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_clip": 1.01486826, + "balance_loss_mlp": 1.01761079, + "epoch": 0.5049150759056065, + "flos": 21832900076160.0, + "grad_norm": 1.5306463405434139, + "language_loss": 0.66758406, + "learning_rate": 2.065223265084376e-06, + "loss": 0.6885649, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8398, + "time_per_iteration": 2.385115385055542 + }, + { + "auxiliary_loss_clip": 0.01058601, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_clip": 1.01760554, + "balance_loss_mlp": 1.01857507, + "epoch": 0.5049751991582745, + "flos": 21684833533440.0, + "grad_norm": 1.6146264500306389, + "language_loss": 0.73117077, + "learning_rate": 2.064834009323688e-06, + "loss": 0.75220394, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 8399, + "time_per_iteration": 2.390725612640381 + }, + { + "auxiliary_loss_clip": 0.01060565, + "auxiliary_loss_mlp": 0.0104491, + "balance_loss_clip": 1.01680088, + "balance_loss_mlp": 1.02026176, + "epoch": 0.5050353224109424, + "flos": 21358360609920.0, + "grad_norm": 1.8786033433842735, + "language_loss": 0.82951784, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.85057265, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8400, + "time_per_iteration": 2.3603641986846924 + }, + { + "auxiliary_loss_clip": 0.01058301, + "auxiliary_loss_mlp": 0.01043368, + "balance_loss_clip": 1.0171895, + "balance_loss_mlp": 1.01853502, + "epoch": 0.5050954456636104, + "flos": 22818986916480.0, + "grad_norm": 1.8926505079523883, + "language_loss": 0.79944402, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.82046074, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8401, + "time_per_iteration": 3.697805881500244 + }, + { + "auxiliary_loss_clip": 0.01059801, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.00943446, + "balance_loss_mlp": 1.01861703, + "epoch": 0.5051555689162783, + "flos": 30446845804800.0, + "grad_norm": 1.6369783480841937, + "language_loss": 0.71131104, + "learning_rate": 2.063666227349593e-06, + "loss": 0.73227835, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41210938, + "step": 8402, + "time_per_iteration": 2.4881255626678467 + }, + { + "auxiliary_loss_clip": 0.01059256, + "auxiliary_loss_mlp": 0.01040911, + "balance_loss_clip": 1.01429176, + "balance_loss_mlp": 1.01845741, + "epoch": 0.5052156921689464, + "flos": 21286893323520.0, + "grad_norm": 1.618637783982206, + "language_loss": 0.70703787, + "learning_rate": 2.063276961843422e-06, + "loss": 0.7280395, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40820312, + "step": 8403, + "time_per_iteration": 3.836982488632202 + }, + { + "auxiliary_loss_clip": 0.01057719, + "auxiliary_loss_mlp": 0.01039895, + "balance_loss_clip": 1.01370478, + "balance_loss_mlp": 1.01962328, + "epoch": 0.5052758154216143, + "flos": 25080171765120.0, + "grad_norm": 1.4165721963452325, + "language_loss": 0.86887801, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88985419, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 8404, + "time_per_iteration": 3.8394412994384766 + }, + { + "auxiliary_loss_clip": 0.01059337, + "auxiliary_loss_mlp": 0.0104985, + "balance_loss_clip": 1.02425551, + "balance_loss_mlp": 1.01914728, + "epoch": 0.5053359386742823, + "flos": 20884484459520.0, + "grad_norm": 1.5852179310111671, + "language_loss": 0.76223224, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.78332406, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 8405, + "time_per_iteration": 2.3918955326080322 + }, + { + "auxiliary_loss_clip": 0.0106019, + "auxiliary_loss_mlp": 0.01044511, + "balance_loss_clip": 1.01529288, + "balance_loss_mlp": 1.01870584, + "epoch": 0.5053960619269503, + "flos": 37741808079360.0, + "grad_norm": 2.2931961470698545, + "language_loss": 0.74241686, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.76346385, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4140625, + "step": 8406, + "time_per_iteration": 2.506713628768921 + }, + { + "auxiliary_loss_clip": 0.01056945, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.01295412, + "balance_loss_mlp": 1.01866412, + "epoch": 0.5054561851796182, + "flos": 23512711098240.0, + "grad_norm": 2.8187455463443274, + "language_loss": 0.7782886, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79923737, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 8407, + "time_per_iteration": 2.415842294692993 + }, + { + "auxiliary_loss_clip": 0.01058043, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.01539958, + "balance_loss_mlp": 1.01760864, + "epoch": 0.5055163084322862, + "flos": 30408895290240.0, + "grad_norm": 1.954245414137095, + "language_loss": 0.64672351, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.66771138, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40429688, + "step": 8408, + "time_per_iteration": 2.451498031616211 + }, + { + "auxiliary_loss_clip": 0.01058592, + "auxiliary_loss_mlp": 0.01042112, + "balance_loss_clip": 1.01402628, + "balance_loss_mlp": 1.01844406, + "epoch": 0.5055764316849541, + "flos": 20258806250880.0, + "grad_norm": 2.3468632071753084, + "language_loss": 0.65098208, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.67198914, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8409, + "time_per_iteration": 2.37862491607666 + }, + { + "auxiliary_loss_clip": 0.0105786, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.01542306, + "balance_loss_mlp": 1.01833034, + "epoch": 0.5056365549376222, + "flos": 26069610096000.0, + "grad_norm": 1.285142897054033, + "language_loss": 0.7179395, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73891681, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.39648438, + "step": 8410, + "time_per_iteration": 2.438717842102051 + }, + { + "auxiliary_loss_clip": 0.01059381, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.02582633, + "balance_loss_mlp": 1.01933265, + "epoch": 0.5056966781902901, + "flos": 19278130671360.0, + "grad_norm": 1.5948358737379504, + "language_loss": 0.80014116, + "learning_rate": 2.060162752653113e-06, + "loss": 0.82126987, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40039062, + "step": 8411, + "time_per_iteration": 3.851804733276367 + }, + { + "auxiliary_loss_clip": 0.01060291, + "auxiliary_loss_mlp": 0.01050521, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 1.01989615, + "epoch": 0.5057568014429581, + "flos": 21322295308800.0, + "grad_norm": 1.825580662684488, + "language_loss": 0.82583284, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.84694093, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40429688, + "step": 8412, + "time_per_iteration": 2.3599846363067627 + }, + { + "auxiliary_loss_clip": 0.01059418, + "auxiliary_loss_mlp": 0.01051967, + "balance_loss_clip": 1.02582419, + "balance_loss_mlp": 1.01884818, + "epoch": 0.505816924695626, + "flos": 17492637363840.0, + "grad_norm": 1.780961354408906, + "language_loss": 0.80882448, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82993829, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 8413, + "time_per_iteration": 2.401076316833496 + }, + { + "auxiliary_loss_clip": 0.0106104, + "auxiliary_loss_mlp": 0.01048025, + "balance_loss_clip": 1.0198437, + "balance_loss_mlp": 1.01851118, + "epoch": 0.505877047948294, + "flos": 21141026196480.0, + "grad_norm": 2.0133369355972817, + "language_loss": 0.82562053, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.84671116, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42578125, + "step": 8414, + "time_per_iteration": 2.362041473388672 + }, + { + "auxiliary_loss_clip": 0.01057764, + "auxiliary_loss_mlp": 0.01039555, + "balance_loss_clip": 1.01230407, + "balance_loss_mlp": 1.016909, + "epoch": 0.5059371712009619, + "flos": 36348738986880.0, + "grad_norm": 2.1815117343372763, + "language_loss": 0.64325547, + "learning_rate": 2.058605592832528e-06, + "loss": 0.66422862, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40820312, + "step": 8415, + "time_per_iteration": 2.496384382247925 + }, + { + "auxiliary_loss_clip": 0.01058126, + "auxiliary_loss_mlp": 0.01046242, + "balance_loss_clip": 1.01651168, + "balance_loss_mlp": 1.01746094, + "epoch": 0.50599729445363, + "flos": 22672316828160.0, + "grad_norm": 1.5500389344848098, + "language_loss": 0.83542609, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.85646981, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40625, + "step": 8416, + "time_per_iteration": 2.3660833835601807 + }, + { + "auxiliary_loss_clip": 0.01056948, + "auxiliary_loss_mlp": 0.01045416, + "balance_loss_clip": 1.01966655, + "balance_loss_mlp": 1.01829112, + "epoch": 0.5060574177062979, + "flos": 22746751580160.0, + "grad_norm": 1.5303912773577506, + "language_loss": 0.80296171, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.82398534, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 8417, + "time_per_iteration": 2.4141528606414795 + }, + { + "auxiliary_loss_clip": 0.01056452, + "auxiliary_loss_mlp": 0.01051028, + "balance_loss_clip": 1.02384818, + "balance_loss_mlp": 1.01809287, + "epoch": 0.5061175409589659, + "flos": 21652119722880.0, + "grad_norm": 1.8637302153441762, + "language_loss": 0.64286637, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.66394114, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3828125, + "step": 8418, + "time_per_iteration": 2.3920953273773193 + }, + { + "auxiliary_loss_clip": 0.01061362, + "auxiliary_loss_mlp": 0.01039654, + "balance_loss_clip": 1.01160407, + "balance_loss_mlp": 1.0189693, + "epoch": 0.5061776642116339, + "flos": 21615181637760.0, + "grad_norm": 2.0517525684761124, + "language_loss": 0.79033387, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.81134403, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42382812, + "step": 8419, + "time_per_iteration": 2.3916120529174805 + }, + { + "auxiliary_loss_clip": 0.01060534, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.01912773, + "epoch": 0.5062377874643018, + "flos": 24425131236480.0, + "grad_norm": 1.8822221211828354, + "language_loss": 0.79060131, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.8116616, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.4140625, + "step": 8420, + "time_per_iteration": 2.389162063598633 + }, + { + "auxiliary_loss_clip": 0.01059517, + "auxiliary_loss_mlp": 0.01045685, + "balance_loss_clip": 1.01683581, + "balance_loss_mlp": 1.01936269, + "epoch": 0.5062979107169698, + "flos": 22523447324160.0, + "grad_norm": 3.1900420878541493, + "language_loss": 0.78376848, + "learning_rate": 2.056269786726999e-06, + "loss": 0.80482048, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40234375, + "step": 8421, + "time_per_iteration": 2.424870729446411 + }, + { + "auxiliary_loss_clip": 0.01059675, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.01245189, + "balance_loss_mlp": 1.0187856, + "epoch": 0.5063580339696377, + "flos": 24570823806720.0, + "grad_norm": 1.443387530589, + "language_loss": 0.68272752, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.70372474, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 8422, + "time_per_iteration": 2.4087188243865967 + }, + { + "auxiliary_loss_clip": 0.01058224, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.01740587, + "balance_loss_mlp": 1.01832426, + "epoch": 0.5064181572223058, + "flos": 22595193901440.0, + "grad_norm": 3.50432983043544, + "language_loss": 0.82680225, + "learning_rate": 2.05549116746431e-06, + "loss": 0.84784359, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.3984375, + "step": 8423, + "time_per_iteration": 2.4642202854156494 + }, + { + "auxiliary_loss_clip": 0.0106174, + "auxiliary_loss_mlp": 0.01044068, + "balance_loss_clip": 1.0150528, + "balance_loss_mlp": 1.01997089, + "epoch": 0.5064782804749737, + "flos": 25993743978240.0, + "grad_norm": 1.873029817935076, + "language_loss": 0.76655161, + "learning_rate": 2.055101854669237e-06, + "loss": 0.7876097, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 8424, + "time_per_iteration": 2.410820245742798 + }, + { + "auxiliary_loss_clip": 0.01058411, + "auxiliary_loss_mlp": 0.01047091, + "balance_loss_clip": 1.01874256, + "balance_loss_mlp": 1.0185678, + "epoch": 0.5065384037276417, + "flos": 28551655405440.0, + "grad_norm": 1.416509653630379, + "language_loss": 0.71898234, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.74003732, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3984375, + "step": 8425, + "time_per_iteration": 2.4835565090179443 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.01758242, + "balance_loss_mlp": 1.01891637, + "epoch": 0.5065985269803096, + "flos": 22964923866240.0, + "grad_norm": 1.970271908867102, + "language_loss": 0.79876053, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.81980145, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 8426, + "time_per_iteration": 2.381798505783081 + }, + { + "auxiliary_loss_clip": 0.01061995, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.01361251, + "balance_loss_mlp": 1.02007687, + "epoch": 0.5066586502329776, + "flos": 21607710606720.0, + "grad_norm": 2.2705617730664382, + "language_loss": 0.79427338, + "learning_rate": 2.053933903806265e-06, + "loss": 0.81532043, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 8427, + "time_per_iteration": 2.401426076889038 + }, + { + "auxiliary_loss_clip": 0.01056571, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.0135262, + "balance_loss_mlp": 1.01772189, + "epoch": 0.5067187734856455, + "flos": 20338861731840.0, + "grad_norm": 1.6959665201401304, + "language_loss": 0.72757256, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.74854565, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38867188, + "step": 8428, + "time_per_iteration": 2.388887643814087 + }, + { + "auxiliary_loss_clip": 0.01057507, + "auxiliary_loss_mlp": 0.01042383, + "balance_loss_clip": 1.01646709, + "balance_loss_mlp": 1.01780117, + "epoch": 0.5067788967383136, + "flos": 28839793789440.0, + "grad_norm": 1.584197666321564, + "language_loss": 0.84232414, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.86332309, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 8429, + "time_per_iteration": 2.42390775680542 + }, + { + "auxiliary_loss_clip": 0.01062794, + "auxiliary_loss_mlp": 0.01049705, + "balance_loss_clip": 1.01992631, + "balance_loss_mlp": 1.02008057, + "epoch": 0.5068390199909815, + "flos": 32448870564480.0, + "grad_norm": 1.8701501045478275, + "language_loss": 0.7325362, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75366116, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42773438, + "step": 8430, + "time_per_iteration": 2.492708444595337 + }, + { + "auxiliary_loss_clip": 0.01060579, + "auxiliary_loss_mlp": 0.01044291, + "balance_loss_clip": 1.01709974, + "balance_loss_mlp": 1.01896358, + "epoch": 0.5068991432436495, + "flos": 23145529662720.0, + "grad_norm": 1.6053442330517715, + "language_loss": 0.77801883, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.7990675, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41601562, + "step": 8431, + "time_per_iteration": 2.377077579498291 + }, + { + "auxiliary_loss_clip": 0.01059553, + "auxiliary_loss_mlp": 0.01046219, + "balance_loss_clip": 1.01924241, + "balance_loss_mlp": 1.01975226, + "epoch": 0.5069592664963174, + "flos": 19935126236160.0, + "grad_norm": 2.6860181299786445, + "language_loss": 0.73223424, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.75329196, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 8432, + "time_per_iteration": 2.3855605125427246 + }, + { + "auxiliary_loss_clip": 0.01012398, + "auxiliary_loss_mlp": 0.01002687, + "balance_loss_clip": 1.00056541, + "balance_loss_mlp": 1.00438821, + "epoch": 0.5070193897489854, + "flos": 65790643950720.0, + "grad_norm": 0.7591033104976589, + "language_loss": 0.63798451, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65813529, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.08007812, + "step": 8433, + "time_per_iteration": 3.0705459117889404 + }, + { + "auxiliary_loss_clip": 0.01059378, + "auxiliary_loss_mlp": 0.01046967, + "balance_loss_clip": 1.01903629, + "balance_loss_mlp": 1.01870489, + "epoch": 0.5070795130016534, + "flos": 17274360343680.0, + "grad_norm": 1.6593183180098319, + "language_loss": 0.78574914, + "learning_rate": 2.051208614233681e-06, + "loss": 0.80681252, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 8434, + "time_per_iteration": 2.3654568195343018 + }, + { + "auxiliary_loss_clip": 0.01059216, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_clip": 1.0186044, + "balance_loss_mlp": 1.01781309, + "epoch": 0.5071396362543213, + "flos": 21068860682880.0, + "grad_norm": 1.5739680904640445, + "language_loss": 0.72307813, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.74413407, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 8435, + "time_per_iteration": 2.3703866004943848 + }, + { + "auxiliary_loss_clip": 0.01059993, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.01783228, + "balance_loss_mlp": 1.0187571, + "epoch": 0.5071997595069894, + "flos": 23143819006080.0, + "grad_norm": 2.683407358818919, + "language_loss": 0.73840463, + "learning_rate": 2.050429942372112e-06, + "loss": 0.75946194, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41210938, + "step": 8436, + "time_per_iteration": 2.4049243927001953 + }, + { + "auxiliary_loss_clip": 0.01059501, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.01546407, + "balance_loss_mlp": 1.01849604, + "epoch": 0.5072598827596573, + "flos": 22746088264320.0, + "grad_norm": 1.5475834419501024, + "language_loss": 0.84954441, + "learning_rate": 2.050040603565483e-06, + "loss": 0.87057942, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 8437, + "time_per_iteration": 2.375600814819336 + }, + { + "auxiliary_loss_clip": 0.01055993, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.01251292, + "balance_loss_mlp": 1.01750422, + "epoch": 0.5073200060123253, + "flos": 22565168265600.0, + "grad_norm": 1.4230738292864307, + "language_loss": 0.81930906, + "learning_rate": 2.049651262861309e-06, + "loss": 0.84025651, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 8438, + "time_per_iteration": 2.4282643795013428 + }, + { + "auxiliary_loss_clip": 0.01058333, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_clip": 1.01641345, + "balance_loss_mlp": 1.01676142, + "epoch": 0.5073801292649932, + "flos": 25805318037120.0, + "grad_norm": 2.1132935610971484, + "language_loss": 0.80684358, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.82788277, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41601562, + "step": 8439, + "time_per_iteration": 2.4377732276916504 + }, + { + "auxiliary_loss_clip": 0.01057582, + "auxiliary_loss_mlp": 0.01038082, + "balance_loss_clip": 1.01198745, + "balance_loss_mlp": 1.01762724, + "epoch": 0.5074402525176612, + "flos": 25372778803200.0, + "grad_norm": 1.637344177504828, + "language_loss": 0.71943247, + "learning_rate": 2.048872575819383e-06, + "loss": 0.74038911, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8440, + "time_per_iteration": 2.42033314704895 + }, + { + "auxiliary_loss_clip": 0.01059602, + "auxiliary_loss_mlp": 0.01040381, + "balance_loss_clip": 1.01430976, + "balance_loss_mlp": 1.01821733, + "epoch": 0.5075003757703291, + "flos": 26063326051200.0, + "grad_norm": 1.9568797808305602, + "language_loss": 0.72523022, + "learning_rate": 2.048483229511158e-06, + "loss": 0.74623007, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.4140625, + "step": 8441, + "time_per_iteration": 3.756898880004883 + }, + { + "auxiliary_loss_clip": 0.01061101, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.01898539, + "balance_loss_mlp": 1.01823747, + "epoch": 0.5075604990229972, + "flos": 21834366353280.0, + "grad_norm": 1.7994065567877748, + "language_loss": 0.65611571, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.67721188, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4296875, + "step": 8442, + "time_per_iteration": 3.7983431816101074 + }, + { + "auxiliary_loss_clip": 0.01057611, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.01083851, + "balance_loss_mlp": 1.01883388, + "epoch": 0.5076206222756651, + "flos": 31977333475200.0, + "grad_norm": 1.570117451899115, + "language_loss": 0.71624738, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73718655, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 8443, + "time_per_iteration": 3.893843412399292 + }, + { + "auxiliary_loss_clip": 0.01061157, + "auxiliary_loss_mlp": 0.01050171, + "balance_loss_clip": 1.02156067, + "balance_loss_mlp": 1.01834249, + "epoch": 0.5076807455283331, + "flos": 36902530972800.0, + "grad_norm": 1.24265769294492, + "language_loss": 0.62688887, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64800215, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 8444, + "time_per_iteration": 2.5245463848114014 + }, + { + "auxiliary_loss_clip": 0.01058011, + "auxiliary_loss_mlp": 0.01041008, + "balance_loss_clip": 1.01573515, + "balance_loss_mlp": 1.01799595, + "epoch": 0.507740868781001, + "flos": 29861108058240.0, + "grad_norm": 1.6084283402280708, + "language_loss": 0.6486457, + "learning_rate": 2.046925826041012e-06, + "loss": 0.66963589, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 8445, + "time_per_iteration": 2.42889142036438 + }, + { + "auxiliary_loss_clip": 0.01010636, + "auxiliary_loss_mlp": 0.0101144, + "balance_loss_clip": 1.00923479, + "balance_loss_mlp": 1.00266433, + "epoch": 0.507800992033669, + "flos": 61916157953280.0, + "grad_norm": 0.8420181665240871, + "language_loss": 0.62067723, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.64089799, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.08007812, + "step": 8446, + "time_per_iteration": 3.02414608001709 + }, + { + "auxiliary_loss_clip": 0.0105771, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.01447058, + "balance_loss_mlp": 1.01820731, + "epoch": 0.507861115286337, + "flos": 20699549654400.0, + "grad_norm": 1.535013310124043, + "language_loss": 0.81865871, + "learning_rate": 2.04614711357029e-06, + "loss": 0.83963859, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 8447, + "time_per_iteration": 2.3655736446380615 + }, + { + "auxiliary_loss_clip": 0.01060043, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_clip": 1.01827824, + "balance_loss_mlp": 1.02035141, + "epoch": 0.507921238539005, + "flos": 30845728621440.0, + "grad_norm": 1.3689634974061746, + "language_loss": 0.71522647, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.73626196, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3984375, + "step": 8448, + "time_per_iteration": 2.464146375656128 + }, + { + "auxiliary_loss_clip": 0.01059175, + "auxiliary_loss_mlp": 0.01042424, + "balance_loss_clip": 1.01779509, + "balance_loss_mlp": 1.01984167, + "epoch": 0.507981361791673, + "flos": 35698725694080.0, + "grad_norm": 1.399631487467386, + "language_loss": 0.72520828, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74622434, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39257812, + "step": 8449, + "time_per_iteration": 2.5183145999908447 + }, + { + "auxiliary_loss_clip": 0.01058134, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_clip": 1.02007532, + "balance_loss_mlp": 1.01855624, + "epoch": 0.5080414850443409, + "flos": 27160262058240.0, + "grad_norm": 1.5999901356977657, + "language_loss": 0.74203265, + "learning_rate": 2.044979031776844e-06, + "loss": 0.76306349, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39648438, + "step": 8450, + "time_per_iteration": 3.8643808364868164 + }, + { + "auxiliary_loss_clip": 0.01061909, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.01883161, + "balance_loss_mlp": 1.02014458, + "epoch": 0.5081016082970089, + "flos": 27084081738240.0, + "grad_norm": 1.6344379982743111, + "language_loss": 0.77664065, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79772633, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 8451, + "time_per_iteration": 2.4061996936798096 + }, + { + "auxiliary_loss_clip": 0.0105902, + "auxiliary_loss_mlp": 0.01042867, + "balance_loss_clip": 1.01680732, + "balance_loss_mlp": 1.01862383, + "epoch": 0.5081617315496768, + "flos": 22855436242560.0, + "grad_norm": 1.767493414730549, + "language_loss": 0.86563838, + "learning_rate": 2.044200302028559e-06, + "loss": 0.88665724, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 8452, + "time_per_iteration": 2.381490468978882 + }, + { + "auxiliary_loss_clip": 0.01063912, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.0199635, + "balance_loss_mlp": 1.02048802, + "epoch": 0.5082218548023448, + "flos": 16281186497280.0, + "grad_norm": 2.5835770687561976, + "language_loss": 0.79761779, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.81874704, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.43359375, + "step": 8453, + "time_per_iteration": 2.3139281272888184 + }, + { + "auxiliary_loss_clip": 0.01058732, + "auxiliary_loss_mlp": 0.01042395, + "balance_loss_clip": 1.01886272, + "balance_loss_mlp": 1.01872623, + "epoch": 0.5082819780550127, + "flos": 24459660437760.0, + "grad_norm": 1.613067809936125, + "language_loss": 0.77920878, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.80022001, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.40039062, + "step": 8454, + "time_per_iteration": 2.4301228523254395 + }, + { + "auxiliary_loss_clip": 0.01060469, + "auxiliary_loss_mlp": 0.01043311, + "balance_loss_clip": 1.01534498, + "balance_loss_mlp": 1.01992035, + "epoch": 0.5083421013076808, + "flos": 23402176133760.0, + "grad_norm": 1.9065991176185264, + "language_loss": 0.90625519, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.92729294, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 8455, + "time_per_iteration": 2.387866258621216 + }, + { + "auxiliary_loss_clip": 0.01062036, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.0170846, + "balance_loss_mlp": 1.01970124, + "epoch": 0.5084022245603487, + "flos": 23871723275520.0, + "grad_norm": 1.7268764699752623, + "language_loss": 0.63175762, + "learning_rate": 2.042642822537149e-06, + "loss": 0.6528722, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.42382812, + "step": 8456, + "time_per_iteration": 2.3955698013305664 + }, + { + "auxiliary_loss_clip": 0.01011951, + "auxiliary_loss_mlp": 0.01007527, + "balance_loss_clip": 1.00530994, + "balance_loss_mlp": 1.00404358, + "epoch": 0.5084623478130167, + "flos": 62870333944320.0, + "grad_norm": 0.8382573550432458, + "language_loss": 0.62534261, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64553738, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.07910156, + "step": 8457, + "time_per_iteration": 2.886145830154419 + }, + { + "auxiliary_loss_clip": 0.0106108, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.0258708, + "balance_loss_mlp": 1.01929855, + "epoch": 0.5085224710656846, + "flos": 22345040943360.0, + "grad_norm": 1.70646377723114, + "language_loss": 0.68850744, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.70966589, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 8458, + "time_per_iteration": 2.4077377319335938 + }, + { + "auxiliary_loss_clip": 0.01059225, + "auxiliary_loss_mlp": 0.01045053, + "balance_loss_clip": 1.01687205, + "balance_loss_mlp": 1.01771533, + "epoch": 0.5085825943183526, + "flos": 26065106530560.0, + "grad_norm": 1.6775500414284321, + "language_loss": 0.78374052, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80478323, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 8459, + "time_per_iteration": 2.406562566757202 + }, + { + "auxiliary_loss_clip": 0.01065758, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.01491594, + "balance_loss_mlp": 1.02094018, + "epoch": 0.5086427175710206, + "flos": 17419773623040.0, + "grad_norm": 2.0588581516784967, + "language_loss": 0.81104445, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83214867, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.44921875, + "step": 8460, + "time_per_iteration": 2.3602967262268066 + }, + { + "auxiliary_loss_clip": 0.01060884, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_clip": 1.01852596, + "balance_loss_mlp": 1.0195725, + "epoch": 0.5087028408236886, + "flos": 20630700720000.0, + "grad_norm": 1.4998423325380499, + "language_loss": 0.6989097, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71998715, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4140625, + "step": 8461, + "time_per_iteration": 2.353825569152832 + }, + { + "auxiliary_loss_clip": 0.0105927, + "auxiliary_loss_mlp": 0.01054555, + "balance_loss_clip": 1.02606416, + "balance_loss_mlp": 1.01849461, + "epoch": 0.5087629640763566, + "flos": 25592626834560.0, + "grad_norm": 1.9715534416134695, + "language_loss": 0.77255726, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.79369557, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 8462, + "time_per_iteration": 2.425936460494995 + }, + { + "auxiliary_loss_clip": 0.01060125, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.01603341, + "balance_loss_mlp": 1.01899862, + "epoch": 0.5088230873290245, + "flos": 13260780023040.0, + "grad_norm": 2.3832349630852483, + "language_loss": 0.82953119, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.85057431, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 8463, + "time_per_iteration": 2.3425872325897217 + }, + { + "auxiliary_loss_clip": 0.0105964, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_clip": 1.0212338, + "balance_loss_mlp": 1.01911569, + "epoch": 0.5088832105816925, + "flos": 20042554089600.0, + "grad_norm": 1.6484195477476822, + "language_loss": 0.77233231, + "learning_rate": 2.039527786882341e-06, + "loss": 0.79341376, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 8464, + "time_per_iteration": 2.3898494243621826 + }, + { + "auxiliary_loss_clip": 0.01011459, + "auxiliary_loss_mlp": 0.01009675, + "balance_loss_clip": 1.00724339, + "balance_loss_mlp": 1.00347078, + "epoch": 0.5089433338343604, + "flos": 67418363491200.0, + "grad_norm": 0.694206941864317, + "language_loss": 0.59482944, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61504078, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.08007812, + "step": 8465, + "time_per_iteration": 3.1264331340789795 + }, + { + "auxiliary_loss_clip": 0.01056907, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_clip": 1.01691413, + "balance_loss_mlp": 1.01749182, + "epoch": 0.5090034570870284, + "flos": 22709254913280.0, + "grad_norm": 1.7616177920521146, + "language_loss": 0.81471777, + "learning_rate": 2.038749012684354e-06, + "loss": 0.83573043, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 8466, + "time_per_iteration": 2.405600070953369 + }, + { + "auxiliary_loss_clip": 0.01056475, + "auxiliary_loss_mlp": 0.010461, + "balance_loss_clip": 1.01830077, + "balance_loss_mlp": 1.01689303, + "epoch": 0.5090635803396963, + "flos": 20444858219520.0, + "grad_norm": 1.5334302282990855, + "language_loss": 0.7872715, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80829728, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.39453125, + "step": 8467, + "time_per_iteration": 2.364964723587036 + }, + { + "auxiliary_loss_clip": 0.01057327, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.0178895, + "balance_loss_mlp": 1.01806641, + "epoch": 0.5091237035923644, + "flos": 23767751646720.0, + "grad_norm": 1.6490274474095241, + "language_loss": 0.75406432, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.77506399, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39257812, + "step": 8468, + "time_per_iteration": 2.421795129776001 + }, + { + "auxiliary_loss_clip": 0.01058273, + "auxiliary_loss_mlp": 0.01046941, + "balance_loss_clip": 1.0206672, + "balance_loss_mlp": 1.01886284, + "epoch": 0.5091838268450323, + "flos": 18327061791360.0, + "grad_norm": 1.8848212811085376, + "language_loss": 0.79052973, + "learning_rate": 2.03758084040404e-06, + "loss": 0.81158185, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 8469, + "time_per_iteration": 2.4092581272125244 + }, + { + "auxiliary_loss_clip": 0.01060927, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.01732397, + "balance_loss_mlp": 1.02089882, + "epoch": 0.5092439500977003, + "flos": 29056395064320.0, + "grad_norm": 1.6848038935870577, + "language_loss": 0.69951981, + "learning_rate": 2.037191446774109e-06, + "loss": 0.72058165, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40039062, + "step": 8470, + "time_per_iteration": 2.453277826309204 + }, + { + "auxiliary_loss_clip": 0.01061868, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.02325177, + "balance_loss_mlp": 1.01988184, + "epoch": 0.5093040733503682, + "flos": 13553037947520.0, + "grad_norm": 1.8593007499251606, + "language_loss": 0.74907631, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.77020204, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41992188, + "step": 8471, + "time_per_iteration": 2.3704445362091064 + }, + { + "auxiliary_loss_clip": 0.01011306, + "auxiliary_loss_mlp": 0.01012123, + "balance_loss_clip": 1.00950074, + "balance_loss_mlp": 1.00329638, + "epoch": 0.5093641966030362, + "flos": 68903080502400.0, + "grad_norm": 0.7537130859706597, + "language_loss": 0.58227253, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60250676, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.08007812, + "step": 8472, + "time_per_iteration": 3.016374349594116 + }, + { + "auxiliary_loss_clip": 0.01057877, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.01727557, + "balance_loss_mlp": 1.01789689, + "epoch": 0.5094243198557042, + "flos": 21579849475200.0, + "grad_norm": 1.7610066679645031, + "language_loss": 0.70315015, + "learning_rate": 2.03602325748156e-06, + "loss": 0.72414231, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3984375, + "step": 8473, + "time_per_iteration": 2.375530481338501 + }, + { + "auxiliary_loss_clip": 0.01057618, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.01838565, + "balance_loss_mlp": 1.01825333, + "epoch": 0.5094844431083722, + "flos": 28839444675840.0, + "grad_norm": 2.1591280606595733, + "language_loss": 0.85956568, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.88057792, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 8474, + "time_per_iteration": 2.432929515838623 + }, + { + "auxiliary_loss_clip": 0.01058455, + "auxiliary_loss_mlp": 0.01043776, + "balance_loss_clip": 1.01962423, + "balance_loss_mlp": 1.01874661, + "epoch": 0.5095445663610402, + "flos": 14975224980480.0, + "grad_norm": 1.816493254569138, + "language_loss": 0.65369725, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67471951, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3984375, + "step": 8475, + "time_per_iteration": 2.346534013748169 + }, + { + "auxiliary_loss_clip": 0.01063368, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_clip": 1.02077329, + "balance_loss_mlp": 1.02053189, + "epoch": 0.5096046896137081, + "flos": 20776044176640.0, + "grad_norm": 2.556273002101492, + "language_loss": 0.82963926, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.85076082, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4296875, + "step": 8476, + "time_per_iteration": 2.408928632736206 + }, + { + "auxiliary_loss_clip": 0.0106154, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_clip": 1.01667666, + "balance_loss_mlp": 1.01975107, + "epoch": 0.5096648128663761, + "flos": 23183968936320.0, + "grad_norm": 2.2272393654219704, + "language_loss": 0.82817686, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.8492583, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41796875, + "step": 8477, + "time_per_iteration": 2.378952980041504 + }, + { + "auxiliary_loss_clip": 0.01063991, + "auxiliary_loss_mlp": 0.01042736, + "balance_loss_clip": 1.0150677, + "balance_loss_mlp": 1.02228308, + "epoch": 0.509724936119044, + "flos": 22308347237760.0, + "grad_norm": 17.59995275052481, + "language_loss": 0.63340479, + "learning_rate": 2.034076248204082e-06, + "loss": 0.65447211, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 8478, + "time_per_iteration": 2.3954272270202637 + }, + { + "auxiliary_loss_clip": 0.01059507, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_clip": 1.02327335, + "balance_loss_mlp": 1.02026474, + "epoch": 0.509785059371712, + "flos": 26285862257280.0, + "grad_norm": 1.7890720293344577, + "language_loss": 0.67571807, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.6968047, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 8479, + "time_per_iteration": 2.416754961013794 + }, + { + "auxiliary_loss_clip": 0.01060609, + "auxiliary_loss_mlp": 0.01043257, + "balance_loss_clip": 1.01860452, + "balance_loss_mlp": 1.02118158, + "epoch": 0.50984518262438, + "flos": 22963527411840.0, + "grad_norm": 1.5243540715815629, + "language_loss": 0.7058692, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.72690785, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.39453125, + "step": 8480, + "time_per_iteration": 3.6331043243408203 + }, + { + "auxiliary_loss_clip": 0.01060678, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.01294613, + "balance_loss_mlp": 1.01873326, + "epoch": 0.509905305877048, + "flos": 26212195555200.0, + "grad_norm": 1.748521003081691, + "language_loss": 0.80091715, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.82193589, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 8481, + "time_per_iteration": 2.4144697189331055 + }, + { + "auxiliary_loss_clip": 0.01057467, + "auxiliary_loss_mlp": 0.010499, + "balance_loss_clip": 1.02528358, + "balance_loss_mlp": 1.01885128, + "epoch": 0.5099654291297159, + "flos": 20339001377280.0, + "grad_norm": 1.5321760857809004, + "language_loss": 0.84126312, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.86233681, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38671875, + "step": 8482, + "time_per_iteration": 3.8280582427978516 + }, + { + "auxiliary_loss_clip": 0.01062976, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.01730359, + "balance_loss_mlp": 1.01997113, + "epoch": 0.5100255523823839, + "flos": 29053671978240.0, + "grad_norm": 1.5992475498017331, + "language_loss": 0.86747789, + "learning_rate": 2.032129206622238e-06, + "loss": 0.88854992, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4296875, + "step": 8483, + "time_per_iteration": 3.7988643646240234 + }, + { + "auxiliary_loss_clip": 0.01059816, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.01806378, + "balance_loss_mlp": 1.01899791, + "epoch": 0.5100856756350518, + "flos": 22454807857920.0, + "grad_norm": 1.7898355895582827, + "language_loss": 0.84272349, + "learning_rate": 2.031739794591775e-06, + "loss": 0.86376464, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40820312, + "step": 8484, + "time_per_iteration": 2.385152578353882 + }, + { + "auxiliary_loss_clip": 0.01060972, + "auxiliary_loss_mlp": 0.01045525, + "balance_loss_clip": 1.01714146, + "balance_loss_mlp": 1.01933277, + "epoch": 0.5101457988877198, + "flos": 19170074413440.0, + "grad_norm": 1.8987637941413014, + "language_loss": 0.82798123, + "learning_rate": 2.031350381357736e-06, + "loss": 0.84904623, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 8485, + "time_per_iteration": 2.3719170093536377 + }, + { + "auxiliary_loss_clip": 0.01057456, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_clip": 1.02002668, + "balance_loss_mlp": 1.01869547, + "epoch": 0.5102059221403878, + "flos": 14865492977280.0, + "grad_norm": 2.1843209048065253, + "language_loss": 0.74839187, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76941442, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 8486, + "time_per_iteration": 2.3604555130004883 + }, + { + "auxiliary_loss_clip": 0.01060394, + "auxiliary_loss_mlp": 0.01041241, + "balance_loss_clip": 1.01514649, + "balance_loss_mlp": 1.01943445, + "epoch": 0.5102660453930558, + "flos": 22960141009920.0, + "grad_norm": 1.5276396309678737, + "language_loss": 0.71432161, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.73533797, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41015625, + "step": 8487, + "time_per_iteration": 2.4232871532440186 + }, + { + "auxiliary_loss_clip": 0.01060502, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.01598072, + "balance_loss_mlp": 1.02015007, + "epoch": 0.5103261686457238, + "flos": 23148182926080.0, + "grad_norm": 2.089991253011674, + "language_loss": 0.73613989, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75717735, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 8488, + "time_per_iteration": 2.381349802017212 + }, + { + "auxiliary_loss_clip": 0.01059971, + "auxiliary_loss_mlp": 0.01054579, + "balance_loss_clip": 1.02772164, + "balance_loss_mlp": 1.01851821, + "epoch": 0.5103862918983917, + "flos": 14318369061120.0, + "grad_norm": 2.0826718493094716, + "language_loss": 0.71753764, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.73868316, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 8489, + "time_per_iteration": 2.3843705654144287 + }, + { + "auxiliary_loss_clip": 0.01058268, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.01876593, + "balance_loss_mlp": 1.0176003, + "epoch": 0.5104464151510597, + "flos": 25847353180800.0, + "grad_norm": 2.216725040649054, + "language_loss": 0.73679549, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75782347, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40625, + "step": 8490, + "time_per_iteration": 3.8420891761779785 + }, + { + "auxiliary_loss_clip": 0.01056948, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.01577675, + "balance_loss_mlp": 1.01737738, + "epoch": 0.5105065384037276, + "flos": 21651840432000.0, + "grad_norm": 1.5838017644755435, + "language_loss": 0.81245548, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.83342302, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.39453125, + "step": 8491, + "time_per_iteration": 2.3823461532592773 + }, + { + "auxiliary_loss_clip": 0.01055452, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.0183413, + "balance_loss_mlp": 1.01721275, + "epoch": 0.5105666616563956, + "flos": 22490489134080.0, + "grad_norm": 3.869944346241888, + "language_loss": 0.80442715, + "learning_rate": 2.028624456259728e-06, + "loss": 0.82540572, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 8492, + "time_per_iteration": 2.407033920288086 + }, + { + "auxiliary_loss_clip": 0.01061291, + "auxiliary_loss_mlp": 0.01049074, + "balance_loss_clip": 1.02054679, + "balance_loss_mlp": 1.02007675, + "epoch": 0.5106267849090635, + "flos": 22454668212480.0, + "grad_norm": 2.334088030719718, + "language_loss": 0.79114056, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.81224424, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41210938, + "step": 8493, + "time_per_iteration": 2.380302906036377 + }, + { + "auxiliary_loss_clip": 0.01058824, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.01478708, + "balance_loss_mlp": 1.01898789, + "epoch": 0.5106869081617316, + "flos": 23546053313280.0, + "grad_norm": 1.682354192991298, + "language_loss": 0.84550291, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.86651027, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 8494, + "time_per_iteration": 2.4185791015625 + }, + { + "auxiliary_loss_clip": 0.01061125, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02203953, + "balance_loss_mlp": 1.01969254, + "epoch": 0.5107470314143995, + "flos": 26791893636480.0, + "grad_norm": 2.064172699552442, + "language_loss": 0.8042196, + "learning_rate": 2.027456186069326e-06, + "loss": 0.8253051, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.4140625, + "step": 8495, + "time_per_iteration": 2.4141385555267334 + }, + { + "auxiliary_loss_clip": 0.01059475, + "auxiliary_loss_mlp": 0.01043149, + "balance_loss_clip": 1.01762617, + "balance_loss_mlp": 1.01934302, + "epoch": 0.5108071546670675, + "flos": 25738493961600.0, + "grad_norm": 1.5548976243058288, + "language_loss": 0.79839039, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.81941664, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40039062, + "step": 8496, + "time_per_iteration": 2.4466471672058105 + }, + { + "auxiliary_loss_clip": 0.01056311, + "auxiliary_loss_mlp": 0.01041605, + "balance_loss_clip": 1.01649904, + "balance_loss_mlp": 1.01703012, + "epoch": 0.5108672779197354, + "flos": 18696547376640.0, + "grad_norm": 2.7798772706910135, + "language_loss": 0.80509442, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.82607359, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39257812, + "step": 8497, + "time_per_iteration": 2.352236032485962 + }, + { + "auxiliary_loss_clip": 0.01057798, + "auxiliary_loss_mlp": 0.01045407, + "balance_loss_clip": 1.01896644, + "balance_loss_mlp": 1.01850057, + "epoch": 0.5109274011724034, + "flos": 26686944489600.0, + "grad_norm": 1.667747516552524, + "language_loss": 0.83095706, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.85198909, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 8498, + "time_per_iteration": 2.4277453422546387 + }, + { + "auxiliary_loss_clip": 0.01058752, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.01128626, + "balance_loss_mlp": 1.01893854, + "epoch": 0.5109875244250714, + "flos": 22782921615360.0, + "grad_norm": 2.514960657591234, + "language_loss": 0.72058791, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.74154603, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8499, + "time_per_iteration": 2.399409055709839 + }, + { + "auxiliary_loss_clip": 0.01058069, + "auxiliary_loss_mlp": 0.01043451, + "balance_loss_clip": 1.0177381, + "balance_loss_mlp": 1.01813507, + "epoch": 0.5110476476777394, + "flos": 35587108477440.0, + "grad_norm": 1.476868399800931, + "language_loss": 0.73334008, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.75435525, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 8500, + "time_per_iteration": 2.4770984649658203 + }, + { + "auxiliary_loss_clip": 0.01062264, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0178858, + "balance_loss_mlp": 1.01851737, + "epoch": 0.5111077709304074, + "flos": 19279806416640.0, + "grad_norm": 3.014503508323248, + "language_loss": 0.65995121, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.68105829, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.4375, + "step": 8501, + "time_per_iteration": 2.3783631324768066 + }, + { + "auxiliary_loss_clip": 0.01060002, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02015555, + "balance_loss_mlp": 1.01900148, + "epoch": 0.5111678941830753, + "flos": 20667150046080.0, + "grad_norm": 1.6954590496319137, + "language_loss": 0.89045727, + "learning_rate": 2.024730186540907e-06, + "loss": 0.91153169, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 8502, + "time_per_iteration": 2.3655779361724854 + }, + { + "auxiliary_loss_clip": 0.01056431, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.01503325, + "balance_loss_mlp": 1.01724267, + "epoch": 0.5112280174357433, + "flos": 26286665218560.0, + "grad_norm": 1.3775320038525007, + "language_loss": 0.83224136, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.85320693, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 8503, + "time_per_iteration": 2.444091796875 + }, + { + "auxiliary_loss_clip": 0.01014819, + "auxiliary_loss_mlp": 0.01003948, + "balance_loss_clip": 1.00173104, + "balance_loss_mlp": 1.0068028, + "epoch": 0.5112881406884112, + "flos": 59471504576640.0, + "grad_norm": 0.8576582947972992, + "language_loss": 0.63844234, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65863007, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.08007812, + "step": 8504, + "time_per_iteration": 3.0700438022613525 + }, + { + "auxiliary_loss_clip": 0.01058297, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.01341724, + "balance_loss_mlp": 1.0190208, + "epoch": 0.5113482639410792, + "flos": 26467655040000.0, + "grad_norm": 2.053833425561365, + "language_loss": 0.85326159, + "learning_rate": 2.023561886666816e-06, + "loss": 0.87424076, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39257812, + "step": 8505, + "time_per_iteration": 2.4076192378997803 + }, + { + "auxiliary_loss_clip": 0.01058524, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.01120996, + "balance_loss_mlp": 1.01938999, + "epoch": 0.5114083871937471, + "flos": 29894624830080.0, + "grad_norm": 2.0328849133639477, + "language_loss": 0.75993967, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.78089476, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 8506, + "time_per_iteration": 2.4578373432159424 + }, + { + "auxiliary_loss_clip": 0.0106049, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.01475239, + "balance_loss_mlp": 1.0194155, + "epoch": 0.5114685104464152, + "flos": 24313479108480.0, + "grad_norm": 1.760485764810966, + "language_loss": 0.59581649, + "learning_rate": 2.022783015592131e-06, + "loss": 0.61683589, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 8507, + "time_per_iteration": 2.416443347930908 + }, + { + "auxiliary_loss_clip": 0.01059894, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02147877, + "balance_loss_mlp": 1.01947165, + "epoch": 0.5115286336990831, + "flos": 17018342277120.0, + "grad_norm": 2.336910049019614, + "language_loss": 0.86441511, + "learning_rate": 2.022393578751503e-06, + "loss": 0.88550526, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40234375, + "step": 8508, + "time_per_iteration": 2.3512864112854004 + }, + { + "auxiliary_loss_clip": 0.01060449, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.0150665, + "balance_loss_mlp": 1.01868606, + "epoch": 0.5115887569517511, + "flos": 23658264023040.0, + "grad_norm": 1.6904228890094024, + "language_loss": 0.73971701, + "learning_rate": 2.022004141061709e-06, + "loss": 0.7607584, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 8509, + "time_per_iteration": 2.4185376167297363 + }, + { + "auxiliary_loss_clip": 0.01057824, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.01319075, + "balance_loss_mlp": 1.01842356, + "epoch": 0.511648880204419, + "flos": 16106271252480.0, + "grad_norm": 1.6631376835743572, + "language_loss": 0.77526665, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.79622382, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.39453125, + "step": 8510, + "time_per_iteration": 2.345179796218872 + }, + { + "auxiliary_loss_clip": 0.01058569, + "auxiliary_loss_mlp": 0.01041064, + "balance_loss_clip": 1.01629245, + "balance_loss_mlp": 1.01950943, + "epoch": 0.511709003457087, + "flos": 32633595901440.0, + "grad_norm": 1.699063497637329, + "language_loss": 0.71924716, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.74024349, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 8511, + "time_per_iteration": 2.50089430809021 + }, + { + "auxiliary_loss_clip": 0.01058123, + "auxiliary_loss_mlp": 0.01042847, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.01980734, + "epoch": 0.511769126709755, + "flos": 21761013853440.0, + "grad_norm": 1.8578425715498674, + "language_loss": 0.68383574, + "learning_rate": 2.020835823045001e-06, + "loss": 0.70484543, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 8512, + "time_per_iteration": 2.374147653579712 + }, + { + "auxiliary_loss_clip": 0.01059672, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.01619208, + "balance_loss_mlp": 1.01849604, + "epoch": 0.511829249962423, + "flos": 23914212266880.0, + "grad_norm": 1.6643768919269346, + "language_loss": 0.67855716, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.69959831, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 8513, + "time_per_iteration": 2.4140875339508057 + }, + { + "auxiliary_loss_clip": 0.01057349, + "auxiliary_loss_mlp": 0.01042241, + "balance_loss_clip": 1.01545501, + "balance_loss_mlp": 1.01858091, + "epoch": 0.511889373215091, + "flos": 23726030705280.0, + "grad_norm": 1.864333754801022, + "language_loss": 0.69601905, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71701491, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38671875, + "step": 8514, + "time_per_iteration": 2.4115965366363525 + }, + { + "auxiliary_loss_clip": 0.01056221, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.01858675, + "balance_loss_mlp": 1.01722324, + "epoch": 0.5119494964677589, + "flos": 28110248686080.0, + "grad_norm": 1.5249478632116205, + "language_loss": 0.66780084, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68880415, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 8515, + "time_per_iteration": 2.4685540199279785 + }, + { + "auxiliary_loss_clip": 0.0105529, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.0141902, + "balance_loss_mlp": 1.01648593, + "epoch": 0.5120096197204269, + "flos": 24972045684480.0, + "grad_norm": 3.556507713972009, + "language_loss": 0.76318705, + "learning_rate": 2.019278054696955e-06, + "loss": 0.78413141, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 8516, + "time_per_iteration": 2.4189131259918213 + }, + { + "auxiliary_loss_clip": 0.01060893, + "auxiliary_loss_mlp": 0.01045879, + "balance_loss_clip": 1.01811516, + "balance_loss_mlp": 1.01941478, + "epoch": 0.5120697429730948, + "flos": 17967037184640.0, + "grad_norm": 1.8483468226846396, + "language_loss": 0.79111099, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.81217873, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 8517, + "time_per_iteration": 2.3737893104553223 + }, + { + "auxiliary_loss_clip": 0.01059808, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.01319313, + "balance_loss_mlp": 1.01838803, + "epoch": 0.5121298662257628, + "flos": 23291292055680.0, + "grad_norm": 1.8013719480212882, + "language_loss": 0.74703163, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76804233, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 8518, + "time_per_iteration": 2.3857932090759277 + }, + { + "auxiliary_loss_clip": 0.01057995, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.01911998, + "balance_loss_mlp": 1.01823592, + "epoch": 0.5121899894784308, + "flos": 17310111442560.0, + "grad_norm": 1.7248289437768196, + "language_loss": 0.80002332, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.82106149, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 8519, + "time_per_iteration": 3.699230909347534 + }, + { + "auxiliary_loss_clip": 0.01058744, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.01741982, + "balance_loss_mlp": 1.01894784, + "epoch": 0.5122501127310988, + "flos": 24929102845440.0, + "grad_norm": 1.477083357572077, + "language_loss": 0.80575848, + "learning_rate": 2.017720274652497e-06, + "loss": 0.82679331, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3984375, + "step": 8520, + "time_per_iteration": 2.456456422805786 + }, + { + "auxiliary_loss_clip": 0.01062348, + "auxiliary_loss_mlp": 0.01047017, + "balance_loss_clip": 1.01721489, + "balance_loss_mlp": 1.0191437, + "epoch": 0.5123102359837667, + "flos": 18441855941760.0, + "grad_norm": 1.874234480973075, + "language_loss": 0.83050859, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.8516022, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.43359375, + "step": 8521, + "time_per_iteration": 3.871715784072876 + }, + { + "auxiliary_loss_clip": 0.01056885, + "auxiliary_loss_mlp": 0.01044704, + "balance_loss_clip": 1.01654708, + "balance_loss_mlp": 1.01741266, + "epoch": 0.5123703592364347, + "flos": 26683732644480.0, + "grad_norm": 1.7442068212686215, + "language_loss": 0.68872917, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70974499, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.39453125, + "step": 8522, + "time_per_iteration": 2.4884464740753174 + }, + { + "auxiliary_loss_clip": 0.01063591, + "auxiliary_loss_mlp": 0.01053523, + "balance_loss_clip": 1.02177787, + "balance_loss_mlp": 1.01963902, + "epoch": 0.5124304824891026, + "flos": 28802681147520.0, + "grad_norm": 1.6992335897159507, + "language_loss": 0.63180339, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.65297455, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.43945312, + "step": 8523, + "time_per_iteration": 3.864572048187256 + }, + { + "auxiliary_loss_clip": 0.01061057, + "auxiliary_loss_mlp": 0.01042228, + "balance_loss_clip": 1.01858926, + "balance_loss_mlp": 1.01980662, + "epoch": 0.5124906057417706, + "flos": 21760769473920.0, + "grad_norm": 1.9323552634249836, + "language_loss": 0.78889459, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80992746, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.41210938, + "step": 8524, + "time_per_iteration": 2.394404411315918 + }, + { + "auxiliary_loss_clip": 0.01059274, + "auxiliary_loss_mlp": 0.01042931, + "balance_loss_clip": 1.0172174, + "balance_loss_mlp": 1.01962483, + "epoch": 0.5125507289944387, + "flos": 18879527145600.0, + "grad_norm": 2.321488004421083, + "language_loss": 0.75879574, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77981782, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39648438, + "step": 8525, + "time_per_iteration": 2.375128746032715 + }, + { + "auxiliary_loss_clip": 0.01059437, + "auxiliary_loss_mlp": 0.01046505, + "balance_loss_clip": 1.01815677, + "balance_loss_mlp": 1.01899898, + "epoch": 0.5126108522471066, + "flos": 35626350712320.0, + "grad_norm": 1.796641649184217, + "language_loss": 0.76034749, + "learning_rate": 2.015383584722531e-06, + "loss": 0.78140688, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40429688, + "step": 8526, + "time_per_iteration": 2.5429975986480713 + }, + { + "auxiliary_loss_clip": 0.01059844, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.01478779, + "balance_loss_mlp": 1.0197283, + "epoch": 0.5126709754997746, + "flos": 20189957316480.0, + "grad_norm": 1.581574957714515, + "language_loss": 0.66615528, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.68717593, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 8527, + "time_per_iteration": 2.4000167846679688 + }, + { + "auxiliary_loss_clip": 0.01057617, + "auxiliary_loss_mlp": 0.01042144, + "balance_loss_clip": 1.01813579, + "balance_loss_mlp": 1.01966739, + "epoch": 0.5127310987524425, + "flos": 18587548512000.0, + "grad_norm": 1.6287403683324753, + "language_loss": 0.74970263, + "learning_rate": 2.014604683254908e-06, + "loss": 0.77070028, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 8528, + "time_per_iteration": 3.8061838150024414 + }, + { + "auxiliary_loss_clip": 0.01056732, + "auxiliary_loss_mlp": 0.01039818, + "balance_loss_clip": 1.014081, + "balance_loss_mlp": 1.01746714, + "epoch": 0.5127912220051105, + "flos": 22453620871680.0, + "grad_norm": 2.509196433939289, + "language_loss": 0.84685808, + "learning_rate": 2.014215231682995e-06, + "loss": 0.8678236, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 8529, + "time_per_iteration": 2.409266233444214 + }, + { + "auxiliary_loss_clip": 0.01056588, + "auxiliary_loss_mlp": 0.01037106, + "balance_loss_clip": 1.01246595, + "balance_loss_mlp": 1.01928341, + "epoch": 0.5128513452577784, + "flos": 19092846752640.0, + "grad_norm": 2.0000810773624718, + "language_loss": 0.75220764, + "learning_rate": 2.01382577957204e-06, + "loss": 0.7731446, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 8530, + "time_per_iteration": 2.359086275100708 + }, + { + "auxiliary_loss_clip": 0.01012736, + "auxiliary_loss_mlp": 0.01005158, + "balance_loss_clip": 1.00270212, + "balance_loss_mlp": 1.00495255, + "epoch": 0.5129114685104464, + "flos": 67888573948800.0, + "grad_norm": 0.7594127512124409, + "language_loss": 0.60880721, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62898612, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.078125, + "step": 8531, + "time_per_iteration": 3.1339187622070312 + }, + { + "auxiliary_loss_clip": 0.01060237, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.01408291, + "balance_loss_mlp": 1.02011704, + "epoch": 0.5129715917631144, + "flos": 20448104976000.0, + "grad_norm": 1.7031259136727874, + "language_loss": 0.78163862, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.80264515, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40039062, + "step": 8532, + "time_per_iteration": 2.370124578475952 + }, + { + "auxiliary_loss_clip": 0.01058608, + "auxiliary_loss_mlp": 0.01041531, + "balance_loss_clip": 1.01494718, + "balance_loss_mlp": 1.02009034, + "epoch": 0.5130317150157824, + "flos": 35114698604160.0, + "grad_norm": 1.9622905596979503, + "language_loss": 0.68789613, + "learning_rate": 2.012657420152597e-06, + "loss": 0.70889747, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 8533, + "time_per_iteration": 2.5079092979431152 + }, + { + "auxiliary_loss_clip": 0.01061739, + "auxiliary_loss_mlp": 0.01043778, + "balance_loss_clip": 1.01625228, + "balance_loss_mlp": 1.01979756, + "epoch": 0.5130918382684503, + "flos": 19790620652160.0, + "grad_norm": 2.1797980166786894, + "language_loss": 0.82926643, + "learning_rate": 2.01226796603315e-06, + "loss": 0.85032159, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 8534, + "time_per_iteration": 2.3801212310791016 + }, + { + "auxiliary_loss_clip": 0.0106127, + "auxiliary_loss_mlp": 0.0104886, + "balance_loss_clip": 1.020679, + "balance_loss_mlp": 1.01872575, + "epoch": 0.5131519615211183, + "flos": 26321892647040.0, + "grad_norm": 1.5122338337526506, + "language_loss": 0.64694142, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.66804266, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42578125, + "step": 8535, + "time_per_iteration": 2.457638740539551 + }, + { + "auxiliary_loss_clip": 0.01058907, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.01539493, + "balance_loss_mlp": 1.01911974, + "epoch": 0.5132120847737862, + "flos": 19170912286080.0, + "grad_norm": 1.5659440175933066, + "language_loss": 0.70472276, + "learning_rate": 2.011489056413418e-06, + "loss": 0.72572815, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8536, + "time_per_iteration": 2.391397476196289 + }, + { + "auxiliary_loss_clip": 0.01057714, + "auxiliary_loss_mlp": 0.01048324, + "balance_loss_clip": 1.02029729, + "balance_loss_mlp": 1.01747215, + "epoch": 0.5132722080264542, + "flos": 20229374108160.0, + "grad_norm": 2.5257121268655816, + "language_loss": 0.73083436, + "learning_rate": 2.011099600942669e-06, + "loss": 0.75189471, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8537, + "time_per_iteration": 2.40474009513855 + }, + { + "auxiliary_loss_clip": 0.01060192, + "auxiliary_loss_mlp": 0.0104289, + "balance_loss_clip": 1.01717663, + "balance_loss_mlp": 1.01930642, + "epoch": 0.5133323312791223, + "flos": 16468600008960.0, + "grad_norm": 1.8775366628472996, + "language_loss": 0.81298035, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.8340112, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 8538, + "time_per_iteration": 2.335294723510742 + }, + { + "auxiliary_loss_clip": 0.01055263, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.02087438, + "balance_loss_mlp": 1.0164206, + "epoch": 0.5133924545317902, + "flos": 26066887009920.0, + "grad_norm": 1.9134185357710018, + "language_loss": 0.79800117, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.81900239, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38867188, + "step": 8539, + "time_per_iteration": 2.4937570095062256 + }, + { + "auxiliary_loss_clip": 0.01057287, + "auxiliary_loss_mlp": 0.01042902, + "balance_loss_clip": 1.01618707, + "balance_loss_mlp": 1.01655531, + "epoch": 0.5134525777844582, + "flos": 29129782475520.0, + "grad_norm": 1.607513036844469, + "language_loss": 0.77288437, + "learning_rate": 2.009931232064105e-06, + "loss": 0.79388624, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40820312, + "step": 8540, + "time_per_iteration": 2.4277966022491455 + }, + { + "auxiliary_loss_clip": 0.01059583, + "auxiliary_loss_mlp": 0.01042349, + "balance_loss_clip": 1.01280928, + "balance_loss_mlp": 1.01816845, + "epoch": 0.5135127010371261, + "flos": 17453883888000.0, + "grad_norm": 1.5935132697257863, + "language_loss": 0.76410198, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.78512126, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.4140625, + "step": 8541, + "time_per_iteration": 2.3637542724609375 + }, + { + "auxiliary_loss_clip": 0.01056626, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_clip": 1.01572585, + "balance_loss_mlp": 1.01730943, + "epoch": 0.5135728242897941, + "flos": 21943888888320.0, + "grad_norm": 1.607914710467613, + "language_loss": 0.7132597, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.73424691, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 8542, + "time_per_iteration": 2.369668483734131 + }, + { + "auxiliary_loss_clip": 0.01057592, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.01574659, + "balance_loss_mlp": 1.01750219, + "epoch": 0.513632947542462, + "flos": 22673748193920.0, + "grad_norm": 2.1970596189589746, + "language_loss": 0.80488658, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.82587433, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 8543, + "time_per_iteration": 2.4077601432800293 + }, + { + "auxiliary_loss_clip": 0.01058955, + "auxiliary_loss_mlp": 0.01043055, + "balance_loss_clip": 1.01693606, + "balance_loss_mlp": 1.0184989, + "epoch": 0.51369307079513, + "flos": 29455976108160.0, + "grad_norm": 1.7246015241783519, + "language_loss": 0.68473911, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70575923, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 8544, + "time_per_iteration": 2.4298627376556396 + }, + { + "auxiliary_loss_clip": 0.01058953, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.01920354, + "balance_loss_mlp": 1.01738095, + "epoch": 0.513753194047798, + "flos": 18988351453440.0, + "grad_norm": 2.325405412244088, + "language_loss": 0.73436403, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.75540221, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41601562, + "step": 8545, + "time_per_iteration": 2.3820533752441406 + }, + { + "auxiliary_loss_clip": 0.01060412, + "auxiliary_loss_mlp": 0.01047416, + "balance_loss_clip": 1.02063, + "balance_loss_mlp": 1.01878595, + "epoch": 0.513813317300466, + "flos": 17820890766720.0, + "grad_norm": 1.9552740963922353, + "language_loss": 0.83562577, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.85670406, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41601562, + "step": 8546, + "time_per_iteration": 2.3604159355163574 + }, + { + "auxiliary_loss_clip": 0.01061178, + "auxiliary_loss_mlp": 0.01047925, + "balance_loss_clip": 1.02025652, + "balance_loss_mlp": 1.01956832, + "epoch": 0.5138734405531339, + "flos": 24060044482560.0, + "grad_norm": 1.647217616233915, + "language_loss": 0.74095666, + "learning_rate": 2.007205025522544e-06, + "loss": 0.76204765, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41601562, + "step": 8547, + "time_per_iteration": 2.409750461578369 + }, + { + "auxiliary_loss_clip": 0.01058657, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_clip": 1.02223051, + "balance_loss_mlp": 1.01816273, + "epoch": 0.5139335638058019, + "flos": 26096249329920.0, + "grad_norm": 1.5985372163596159, + "language_loss": 0.7434026, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.76448858, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40429688, + "step": 8548, + "time_per_iteration": 2.4135282039642334 + }, + { + "auxiliary_loss_clip": 0.0105931, + "auxiliary_loss_mlp": 0.01049431, + "balance_loss_clip": 1.02324033, + "balance_loss_mlp": 1.01806366, + "epoch": 0.5139936870584698, + "flos": 18916081205760.0, + "grad_norm": 1.6837182871620853, + "language_loss": 0.82636821, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84745562, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41210938, + "step": 8549, + "time_per_iteration": 2.3858635425567627 + }, + { + "auxiliary_loss_clip": 0.0105719, + "auxiliary_loss_mlp": 0.01037812, + "balance_loss_clip": 1.01313519, + "balance_loss_mlp": 1.0186168, + "epoch": 0.5140538103111378, + "flos": 16143069692160.0, + "grad_norm": 2.242911491877206, + "language_loss": 0.73030663, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.75125659, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38671875, + "step": 8550, + "time_per_iteration": 2.363816976547241 + }, + { + "auxiliary_loss_clip": 0.01061718, + "auxiliary_loss_mlp": 0.01042742, + "balance_loss_clip": 1.01578939, + "balance_loss_mlp": 1.01983464, + "epoch": 0.5141139335638057, + "flos": 22419021847680.0, + "grad_norm": 1.4737718457205646, + "language_loss": 0.76223755, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.78328216, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 8551, + "time_per_iteration": 2.3895111083984375 + }, + { + "auxiliary_loss_clip": 0.01056834, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.01419604, + "balance_loss_mlp": 1.01801467, + "epoch": 0.5141740568164738, + "flos": 27088410746880.0, + "grad_norm": 1.5897108466378804, + "language_loss": 0.69814861, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71910679, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 8552, + "time_per_iteration": 2.4332399368286133 + }, + { + "auxiliary_loss_clip": 0.01061004, + "auxiliary_loss_mlp": 0.01043483, + "balance_loss_clip": 1.01571941, + "balance_loss_mlp": 1.0193789, + "epoch": 0.5142341800691418, + "flos": 24972080595840.0, + "grad_norm": 1.7512039116179945, + "language_loss": 0.76116419, + "learning_rate": 2.004868266210965e-06, + "loss": 0.78220904, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41796875, + "step": 8553, + "time_per_iteration": 2.4096620082855225 + }, + { + "auxiliary_loss_clip": 0.01059734, + "auxiliary_loss_mlp": 0.01045661, + "balance_loss_clip": 1.01864815, + "balance_loss_mlp": 1.01879573, + "epoch": 0.5142943033218097, + "flos": 20703459726720.0, + "grad_norm": 6.749646297211825, + "language_loss": 0.69132477, + "learning_rate": 2.004478805593435e-06, + "loss": 0.71237868, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41015625, + "step": 8554, + "time_per_iteration": 2.3876938819885254 + }, + { + "auxiliary_loss_clip": 0.01063667, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_clip": 1.01610601, + "balance_loss_mlp": 1.01991987, + "epoch": 0.5143544265744777, + "flos": 22924494645120.0, + "grad_norm": 1.9151407693552793, + "language_loss": 0.74876618, + "learning_rate": 2.004089344806068e-06, + "loss": 0.76988411, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.4375, + "step": 8555, + "time_per_iteration": 2.410733699798584 + }, + { + "auxiliary_loss_clip": 0.01060476, + "auxiliary_loss_mlp": 0.01045435, + "balance_loss_clip": 1.01790953, + "balance_loss_mlp": 1.01979208, + "epoch": 0.5144145498271456, + "flos": 15920568397440.0, + "grad_norm": 2.350433633296237, + "language_loss": 0.76730627, + "learning_rate": 2.003699883863633e-06, + "loss": 0.78836536, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 8556, + "time_per_iteration": 2.3820340633392334 + }, + { + "auxiliary_loss_clip": 0.01058616, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.01703119, + "balance_loss_mlp": 1.01863348, + "epoch": 0.5144746730798136, + "flos": 19680260244480.0, + "grad_norm": 1.7634303935278353, + "language_loss": 0.87140256, + "learning_rate": 2.003310422780898e-06, + "loss": 0.89242351, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40039062, + "step": 8557, + "time_per_iteration": 2.357302188873291 + }, + { + "auxiliary_loss_clip": 0.01056751, + "auxiliary_loss_mlp": 0.01044283, + "balance_loss_clip": 1.01985669, + "balance_loss_mlp": 1.01747596, + "epoch": 0.5145347963324816, + "flos": 23913583862400.0, + "grad_norm": 1.5110000307812326, + "language_loss": 0.90199411, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.92300445, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.39257812, + "step": 8558, + "time_per_iteration": 2.4706225395202637 + }, + { + "auxiliary_loss_clip": 0.01057935, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.011343, + "balance_loss_mlp": 1.01849222, + "epoch": 0.5145949195851496, + "flos": 18259015818240.0, + "grad_norm": 1.8009705831426672, + "language_loss": 0.66473305, + "learning_rate": 2.002531500253602e-06, + "loss": 0.68569469, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 8559, + "time_per_iteration": 3.6178627014160156 + }, + { + "auxiliary_loss_clip": 0.01059488, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02025557, + "balance_loss_mlp": 1.01933765, + "epoch": 0.5146550428378175, + "flos": 26212230466560.0, + "grad_norm": 1.6620697012226853, + "language_loss": 0.63709444, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65815377, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 8560, + "time_per_iteration": 3.7992730140686035 + }, + { + "auxiliary_loss_clip": 0.01058364, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.01703095, + "balance_loss_mlp": 1.0180068, + "epoch": 0.5147151660904855, + "flos": 22673084878080.0, + "grad_norm": 1.5875965150638116, + "language_loss": 0.71861267, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.73963022, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40429688, + "step": 8561, + "time_per_iteration": 2.391887664794922 + }, + { + "auxiliary_loss_clip": 0.01060806, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.0161562, + "balance_loss_mlp": 1.01846671, + "epoch": 0.5147752893431534, + "flos": 24971242723200.0, + "grad_norm": 1.7470901945565382, + "language_loss": 0.67721188, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.69824636, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.42382812, + "step": 8562, + "time_per_iteration": 3.8853988647460938 + }, + { + "auxiliary_loss_clip": 0.01062283, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.01424885, + "balance_loss_mlp": 1.01963782, + "epoch": 0.5148354125958214, + "flos": 22743644469120.0, + "grad_norm": 1.8427452313219466, + "language_loss": 0.7853775, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.80643463, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 8563, + "time_per_iteration": 2.368241786956787 + }, + { + "auxiliary_loss_clip": 0.01062118, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.01846468, + "balance_loss_mlp": 1.01773167, + "epoch": 0.5148955358484893, + "flos": 23067848154240.0, + "grad_norm": 6.524064763970968, + "language_loss": 0.83945417, + "learning_rate": 2.0005841925139e-06, + "loss": 0.86057329, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.4453125, + "step": 8564, + "time_per_iteration": 2.3760836124420166 + }, + { + "auxiliary_loss_clip": 0.01061297, + "auxiliary_loss_mlp": 0.01052346, + "balance_loss_clip": 1.02296066, + "balance_loss_mlp": 1.01785111, + "epoch": 0.5149556591011574, + "flos": 20339071200000.0, + "grad_norm": 1.8315308913860093, + "language_loss": 0.7454375, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.76657391, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43359375, + "step": 8565, + "time_per_iteration": 2.3717117309570312 + }, + { + "auxiliary_loss_clip": 0.01063844, + "auxiliary_loss_mlp": 0.01049344, + "balance_loss_clip": 1.01663268, + "balance_loss_mlp": 1.01961493, + "epoch": 0.5150157823538254, + "flos": 22637124311040.0, + "grad_norm": 2.1602763130196347, + "language_loss": 0.6954385, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.71657032, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.44140625, + "step": 8566, + "time_per_iteration": 2.3834266662597656 + }, + { + "auxiliary_loss_clip": 0.01059045, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.01449037, + "balance_loss_mlp": 1.01641607, + "epoch": 0.5150759056064933, + "flos": 26066433162240.0, + "grad_norm": 1.662051601037073, + "language_loss": 0.79545146, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.81647646, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.42578125, + "step": 8567, + "time_per_iteration": 2.413003921508789 + }, + { + "auxiliary_loss_clip": 0.01061679, + "auxiliary_loss_mlp": 0.01043528, + "balance_loss_clip": 1.01390433, + "balance_loss_mlp": 1.01863122, + "epoch": 0.5151360288591613, + "flos": 25951604100480.0, + "grad_norm": 1.9148150324777176, + "language_loss": 0.80531043, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.82636249, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 8568, + "time_per_iteration": 2.393540382385254 + }, + { + "auxiliary_loss_clip": 0.01057539, + "auxiliary_loss_mlp": 0.01044566, + "balance_loss_clip": 1.01755309, + "balance_loss_mlp": 1.01650798, + "epoch": 0.5151961521118292, + "flos": 18506480601600.0, + "grad_norm": 2.1178754305934997, + "language_loss": 0.91916156, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.94018257, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 8569, + "time_per_iteration": 3.8019347190856934 + }, + { + "auxiliary_loss_clip": 0.01059762, + "auxiliary_loss_mlp": 0.01050604, + "balance_loss_clip": 1.02144575, + "balance_loss_mlp": 1.01834488, + "epoch": 0.5152562753644973, + "flos": 22232690588160.0, + "grad_norm": 1.5478096321504302, + "language_loss": 0.77427965, + "learning_rate": 1.998247422657674e-06, + "loss": 0.79538333, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4140625, + "step": 8570, + "time_per_iteration": 2.498631477355957 + }, + { + "auxiliary_loss_clip": 0.01058393, + "auxiliary_loss_mlp": 0.01046704, + "balance_loss_clip": 1.0141958, + "balance_loss_mlp": 1.01655126, + "epoch": 0.5153163986171652, + "flos": 38435008590720.0, + "grad_norm": 1.5287740385288529, + "language_loss": 0.75072974, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.77178073, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.41796875, + "step": 8571, + "time_per_iteration": 2.516692638397217 + }, + { + "auxiliary_loss_clip": 0.01010196, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 0.99984848, + "balance_loss_mlp": 1.00197172, + "epoch": 0.5153765218698332, + "flos": 66381164553600.0, + "grad_norm": 0.8123311864854141, + "language_loss": 0.52918267, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54930747, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.08203125, + "step": 8572, + "time_per_iteration": 3.129420042037964 + }, + { + "auxiliary_loss_clip": 0.01058841, + "auxiliary_loss_mlp": 0.01050864, + "balance_loss_clip": 1.02344584, + "balance_loss_mlp": 1.01880717, + "epoch": 0.5154366451225011, + "flos": 24023525333760.0, + "grad_norm": 2.1386004569655697, + "language_loss": 0.78012699, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.80122411, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 8573, + "time_per_iteration": 2.411212205886841 + }, + { + "auxiliary_loss_clip": 0.01058262, + "auxiliary_loss_mlp": 0.01042096, + "balance_loss_clip": 1.0133909, + "balance_loss_mlp": 1.01800108, + "epoch": 0.5154967683751691, + "flos": 23467952868480.0, + "grad_norm": 1.7008904820878583, + "language_loss": 0.78410989, + "learning_rate": 1.996689577219102e-06, + "loss": 0.80511343, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40234375, + "step": 8574, + "time_per_iteration": 2.3866829872131348 + }, + { + "auxiliary_loss_clip": 0.0106036, + "auxiliary_loss_mlp": 0.01049741, + "balance_loss_clip": 1.02026057, + "balance_loss_mlp": 1.01810384, + "epoch": 0.515556891627837, + "flos": 23804515175040.0, + "grad_norm": 1.931526776000156, + "language_loss": 0.86812443, + "learning_rate": 1.996300116136367e-06, + "loss": 0.88922548, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42382812, + "step": 8575, + "time_per_iteration": 2.387206792831421 + }, + { + "auxiliary_loss_clip": 0.01060631, + "auxiliary_loss_mlp": 0.01045358, + "balance_loss_clip": 1.01553154, + "balance_loss_mlp": 1.01793671, + "epoch": 0.515617014880505, + "flos": 19827523825920.0, + "grad_norm": 1.5673130672295255, + "language_loss": 0.78034395, + "learning_rate": 1.995910655193932e-06, + "loss": 0.80140388, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42578125, + "step": 8576, + "time_per_iteration": 2.4093997478485107 + }, + { + "auxiliary_loss_clip": 0.01062992, + "auxiliary_loss_mlp": 0.010441, + "balance_loss_clip": 1.01285493, + "balance_loss_mlp": 1.0194211, + "epoch": 0.515677138133173, + "flos": 14245051472640.0, + "grad_norm": 2.302916897952558, + "language_loss": 0.77984273, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.80091369, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43554688, + "step": 8577, + "time_per_iteration": 2.337428331375122 + }, + { + "auxiliary_loss_clip": 0.01064019, + "auxiliary_loss_mlp": 0.01051004, + "balance_loss_clip": 1.01876998, + "balance_loss_mlp": 1.01950645, + "epoch": 0.515737261385841, + "flos": 28288550332800.0, + "grad_norm": 1.933229087051479, + "language_loss": 0.81537855, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83652878, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4453125, + "step": 8578, + "time_per_iteration": 2.4498679637908936 + }, + { + "auxiliary_loss_clip": 0.01058587, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_clip": 1.01696229, + "balance_loss_mlp": 1.01748025, + "epoch": 0.515797384638509, + "flos": 27890679945600.0, + "grad_norm": 1.8834053740500836, + "language_loss": 0.77837729, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.79942608, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 8579, + "time_per_iteration": 2.436156988143921 + }, + { + "auxiliary_loss_clip": 0.01062693, + "auxiliary_loss_mlp": 0.01046409, + "balance_loss_clip": 1.0167495, + "balance_loss_mlp": 1.01931286, + "epoch": 0.5158575078911769, + "flos": 23038939681920.0, + "grad_norm": 1.6255394186297516, + "language_loss": 0.7984978, + "learning_rate": 1.994352813122559e-06, + "loss": 0.8195889, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43359375, + "step": 8580, + "time_per_iteration": 2.4161672592163086 + }, + { + "auxiliary_loss_clip": 0.01062796, + "auxiliary_loss_mlp": 0.01055921, + "balance_loss_clip": 1.0217557, + "balance_loss_mlp": 1.01877809, + "epoch": 0.5159176311438449, + "flos": 12640513075200.0, + "grad_norm": 2.2127985690468877, + "language_loss": 0.73839819, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75958526, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.44140625, + "step": 8581, + "time_per_iteration": 2.32454252243042 + }, + { + "auxiliary_loss_clip": 0.01062304, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_clip": 1.01384664, + "balance_loss_mlp": 1.01973271, + "epoch": 0.5159777543965128, + "flos": 15557297034240.0, + "grad_norm": 2.0793928200909724, + "language_loss": 0.75807214, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.77912426, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42578125, + "step": 8582, + "time_per_iteration": 2.361762285232544 + }, + { + "auxiliary_loss_clip": 0.01059982, + "auxiliary_loss_mlp": 0.01044297, + "balance_loss_clip": 1.01492345, + "balance_loss_mlp": 1.01814723, + "epoch": 0.5160378776491809, + "flos": 23220557907840.0, + "grad_norm": 2.098882416396348, + "language_loss": 0.67370284, + "learning_rate": 1.99318443376583e-06, + "loss": 0.6947456, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41796875, + "step": 8583, + "time_per_iteration": 2.372183322906494 + }, + { + "auxiliary_loss_clip": 0.01060391, + "auxiliary_loss_mlp": 0.01046568, + "balance_loss_clip": 1.01734972, + "balance_loss_mlp": 1.01735163, + "epoch": 0.5160980009018488, + "flos": 21943539774720.0, + "grad_norm": 1.4590965056469798, + "language_loss": 0.76671237, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78778189, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 8584, + "time_per_iteration": 2.382758140563965 + }, + { + "auxiliary_loss_clip": 0.01061992, + "auxiliary_loss_mlp": 0.01051749, + "balance_loss_clip": 1.02089715, + "balance_loss_mlp": 1.01842034, + "epoch": 0.5161581241545168, + "flos": 22782956526720.0, + "grad_norm": 1.9578348203755775, + "language_loss": 0.80701542, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.82815284, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43554688, + "step": 8585, + "time_per_iteration": 2.369828224182129 + }, + { + "auxiliary_loss_clip": 0.0105635, + "auxiliary_loss_mlp": 0.01046214, + "balance_loss_clip": 1.01916552, + "balance_loss_mlp": 1.01625085, + "epoch": 0.5162182474071847, + "flos": 19674569692800.0, + "grad_norm": 2.2724647409532857, + "language_loss": 0.82233346, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.84335911, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40039062, + "step": 8586, + "time_per_iteration": 2.360952854156494 + }, + { + "auxiliary_loss_clip": 0.01061711, + "auxiliary_loss_mlp": 0.01048473, + "balance_loss_clip": 1.01762116, + "balance_loss_mlp": 1.01852536, + "epoch": 0.5162783706598527, + "flos": 20045207352960.0, + "grad_norm": 1.6808443091618486, + "language_loss": 0.73047125, + "learning_rate": 1.991626598310701e-06, + "loss": 0.75157309, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43359375, + "step": 8587, + "time_per_iteration": 2.375197410583496 + }, + { + "auxiliary_loss_clip": 0.01010938, + "auxiliary_loss_mlp": 0.01014878, + "balance_loss_clip": 1.01272047, + "balance_loss_mlp": 1.00272107, + "epoch": 0.5163384939125206, + "flos": 69956131063680.0, + "grad_norm": 0.7351708166100714, + "language_loss": 0.57927597, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59953403, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.08203125, + "step": 8588, + "time_per_iteration": 3.007497787475586 + }, + { + "auxiliary_loss_clip": 0.01061926, + "auxiliary_loss_mlp": 0.01053173, + "balance_loss_clip": 1.01965189, + "balance_loss_mlp": 1.01955438, + "epoch": 0.5163986171651886, + "flos": 17416177752960.0, + "grad_norm": 1.6402533081369923, + "language_loss": 0.76984012, + "learning_rate": 1.990847682429185e-06, + "loss": 0.79099119, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.42382812, + "step": 8589, + "time_per_iteration": 2.348696231842041 + }, + { + "auxiliary_loss_clip": 0.01061118, + "auxiliary_loss_mlp": 0.01048701, + "balance_loss_clip": 1.01976871, + "balance_loss_mlp": 1.01760769, + "epoch": 0.5164587404178566, + "flos": 21321666904320.0, + "grad_norm": 1.648656135254223, + "language_loss": 0.68291318, + "learning_rate": 1.990458225001627e-06, + "loss": 0.70401132, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.43554688, + "step": 8590, + "time_per_iteration": 2.381047487258911 + }, + { + "auxiliary_loss_clip": 0.01010516, + "auxiliary_loss_mlp": 0.01005311, + "balance_loss_clip": 1.00286746, + "balance_loss_mlp": 1.00218463, + "epoch": 0.5165188636705246, + "flos": 68053923480960.0, + "grad_norm": 0.7859134665215488, + "language_loss": 0.55969584, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57985413, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.08300781, + "step": 8591, + "time_per_iteration": 2.9582712650299072 + }, + { + "auxiliary_loss_clip": 0.01054743, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.00897634, + "balance_loss_mlp": 1.01641023, + "epoch": 0.5165789869231926, + "flos": 19384790474880.0, + "grad_norm": 1.4683031394764285, + "language_loss": 0.82357979, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.84448642, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3828125, + "step": 8592, + "time_per_iteration": 2.382112503051758 + }, + { + "auxiliary_loss_clip": 0.01059314, + "auxiliary_loss_mlp": 0.01046296, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.01877415, + "epoch": 0.5166391101758605, + "flos": 20959128679680.0, + "grad_norm": 1.980675360231778, + "language_loss": 0.84381473, + "learning_rate": 1.989289854948979e-06, + "loss": 0.86487079, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40429688, + "step": 8593, + "time_per_iteration": 2.360713005065918 + }, + { + "auxiliary_loss_clip": 0.01061229, + "auxiliary_loss_mlp": 0.01045943, + "balance_loss_clip": 1.01695108, + "balance_loss_mlp": 1.01892781, + "epoch": 0.5166992334285285, + "flos": 29461073166720.0, + "grad_norm": 1.5395400526224643, + "language_loss": 0.70806122, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.72913301, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 8594, + "time_per_iteration": 2.4555654525756836 + }, + { + "auxiliary_loss_clip": 0.01060685, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.0114001, + "balance_loss_mlp": 1.01834357, + "epoch": 0.5167593566811964, + "flos": 20303285189760.0, + "grad_norm": 1.5562643547494142, + "language_loss": 0.78764117, + "learning_rate": 1.988510943586582e-06, + "loss": 0.80864143, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.42382812, + "step": 8595, + "time_per_iteration": 2.3777198791503906 + }, + { + "auxiliary_loss_clip": 0.01059762, + "auxiliary_loss_mlp": 0.01043915, + "balance_loss_clip": 1.01666403, + "balance_loss_mlp": 1.01815987, + "epoch": 0.5168194799338645, + "flos": 14610487340160.0, + "grad_norm": 1.5404259727031036, + "language_loss": 0.66485023, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.68588698, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41601562, + "step": 8596, + "time_per_iteration": 2.374009370803833 + }, + { + "auxiliary_loss_clip": 0.01061141, + "auxiliary_loss_mlp": 0.01044321, + "balance_loss_clip": 1.01208687, + "balance_loss_mlp": 1.0198307, + "epoch": 0.5168796031865324, + "flos": 25006155949440.0, + "grad_norm": 1.5338561177284706, + "language_loss": 0.76517105, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.78622562, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.4140625, + "step": 8597, + "time_per_iteration": 2.415248394012451 + }, + { + "auxiliary_loss_clip": 0.01058816, + "auxiliary_loss_mlp": 0.01040037, + "balance_loss_clip": 1.01142693, + "balance_loss_mlp": 1.01705825, + "epoch": 0.5169397264392004, + "flos": 26938843015680.0, + "grad_norm": 1.6693355946433313, + "language_loss": 0.82123262, + "learning_rate": 1.987342579847403e-06, + "loss": 0.84222114, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 8598, + "time_per_iteration": 2.4188878536224365 + }, + { + "auxiliary_loss_clip": 0.01060183, + "auxiliary_loss_mlp": 0.01040839, + "balance_loss_clip": 1.01257467, + "balance_loss_mlp": 1.0181303, + "epoch": 0.5169998496918683, + "flos": 25406714511360.0, + "grad_norm": 1.5680566398432116, + "language_loss": 0.76565331, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.78666353, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 8599, + "time_per_iteration": 3.73745059967041 + }, + { + "auxiliary_loss_clip": 0.01058784, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.01719785, + "balance_loss_mlp": 1.0190618, + "epoch": 0.5170599729445363, + "flos": 24679648114560.0, + "grad_norm": 2.146859537403998, + "language_loss": 0.73982966, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.76086032, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 8600, + "time_per_iteration": 3.8174214363098145 + }, + { + "auxiliary_loss_clip": 0.01059659, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.01044393, + "balance_loss_mlp": 1.01945448, + "epoch": 0.5171200961972042, + "flos": 20993448412800.0, + "grad_norm": 1.4877499752672034, + "language_loss": 0.75800622, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.77899545, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40234375, + "step": 8601, + "time_per_iteration": 2.3712313175201416 + }, + { + "auxiliary_loss_clip": 0.01061157, + "auxiliary_loss_mlp": 0.01046135, + "balance_loss_clip": 1.01589131, + "balance_loss_mlp": 1.01881099, + "epoch": 0.5171802194498722, + "flos": 22744587075840.0, + "grad_norm": 2.0114687556491577, + "language_loss": 0.85934794, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.8804208, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42382812, + "step": 8602, + "time_per_iteration": 3.7635622024536133 + }, + { + "auxiliary_loss_clip": 0.01060312, + "auxiliary_loss_mlp": 0.01045269, + "balance_loss_clip": 1.017398, + "balance_loss_mlp": 1.01904058, + "epoch": 0.5172403427025402, + "flos": 28175676307200.0, + "grad_norm": 1.7114024685601863, + "language_loss": 0.75494182, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.77599764, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41210938, + "step": 8603, + "time_per_iteration": 2.4482667446136475 + }, + { + "auxiliary_loss_clip": 0.01064174, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.0166831, + "balance_loss_mlp": 1.0213474, + "epoch": 0.5173004659552082, + "flos": 20336836872960.0, + "grad_norm": 2.0725355079841377, + "language_loss": 0.74246591, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.76354653, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4296875, + "step": 8604, + "time_per_iteration": 2.3572020530700684 + }, + { + "auxiliary_loss_clip": 0.01064344, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.01733029, + "balance_loss_mlp": 1.01919973, + "epoch": 0.5173605892078762, + "flos": 19062297446400.0, + "grad_norm": 1.9033405662604428, + "language_loss": 0.86225927, + "learning_rate": 1.984616415277469e-06, + "loss": 0.8833766, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.45117188, + "step": 8605, + "time_per_iteration": 2.3699514865875244 + }, + { + "auxiliary_loss_clip": 0.01060555, + "auxiliary_loss_mlp": 0.01042096, + "balance_loss_clip": 1.01498771, + "balance_loss_mlp": 1.01950324, + "epoch": 0.5174207124605441, + "flos": 27994092992640.0, + "grad_norm": 2.8597008826402166, + "language_loss": 0.66001898, + "learning_rate": 1.984226965411294e-06, + "loss": 0.68104547, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41015625, + "step": 8606, + "time_per_iteration": 2.438121795654297 + }, + { + "auxiliary_loss_clip": 0.0105966, + "auxiliary_loss_mlp": 0.01043378, + "balance_loss_clip": 1.01590037, + "balance_loss_mlp": 1.02001345, + "epoch": 0.5174808357132121, + "flos": 19495744375680.0, + "grad_norm": 1.4697328839526682, + "language_loss": 0.78558415, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80661452, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39648438, + "step": 8607, + "time_per_iteration": 2.3781893253326416 + }, + { + "auxiliary_loss_clip": 0.0106147, + "auxiliary_loss_mlp": 0.01049888, + "balance_loss_clip": 1.02089584, + "balance_loss_mlp": 1.01915407, + "epoch": 0.51754095896588, + "flos": 22783061260800.0, + "grad_norm": 1.6809778746127417, + "language_loss": 0.73249513, + "learning_rate": 1.983448067488057e-06, + "loss": 0.7536087, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 8608, + "time_per_iteration": 2.3808720111846924 + }, + { + "auxiliary_loss_clip": 0.01062599, + "auxiliary_loss_mlp": 0.01048427, + "balance_loss_clip": 1.01888728, + "balance_loss_mlp": 1.01896667, + "epoch": 0.5176010822185481, + "flos": 22668302021760.0, + "grad_norm": 1.8911582526905595, + "language_loss": 0.87418115, + "learning_rate": 1.983058619460531e-06, + "loss": 0.89529145, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.43554688, + "step": 8609, + "time_per_iteration": 3.8297431468963623 + }, + { + "auxiliary_loss_clip": 0.01058297, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_clip": 1.01709425, + "balance_loss_mlp": 1.01814342, + "epoch": 0.517661205471216, + "flos": 23950068099840.0, + "grad_norm": 1.5009472415083918, + "language_loss": 0.75016081, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.77117187, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 8610, + "time_per_iteration": 2.392219305038452 + }, + { + "auxiliary_loss_clip": 0.01061235, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.01801848, + "balance_loss_mlp": 1.01901984, + "epoch": 0.517721328723884, + "flos": 15595177726080.0, + "grad_norm": 2.478073767645993, + "language_loss": 0.69364512, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.71473527, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.421875, + "step": 8611, + "time_per_iteration": 2.3727009296417236 + }, + { + "auxiliary_loss_clip": 0.01058041, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.01543725, + "balance_loss_mlp": 1.01791, + "epoch": 0.5177814519765519, + "flos": 20959128679680.0, + "grad_norm": 9.703014121955546, + "language_loss": 0.78096437, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.80195653, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 8612, + "time_per_iteration": 2.391763210296631 + }, + { + "auxiliary_loss_clip": 0.01058977, + "auxiliary_loss_mlp": 0.0105484, + "balance_loss_clip": 1.02842307, + "balance_loss_mlp": 1.018942, + "epoch": 0.5178415752292199, + "flos": 17966862627840.0, + "grad_norm": 2.021707187755749, + "language_loss": 0.83566517, + "learning_rate": 1.981500833922294e-06, + "loss": 0.8568033, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40039062, + "step": 8613, + "time_per_iteration": 2.376495838165283 + }, + { + "auxiliary_loss_clip": 0.01058631, + "auxiliary_loss_mlp": 0.01044126, + "balance_loss_clip": 1.01537275, + "balance_loss_mlp": 1.01854587, + "epoch": 0.5179016984818878, + "flos": 17820541653120.0, + "grad_norm": 2.311871852045684, + "language_loss": 0.68980074, + "learning_rate": 1.981111389254541e-06, + "loss": 0.7108283, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40234375, + "step": 8614, + "time_per_iteration": 2.3408281803131104 + }, + { + "auxiliary_loss_clip": 0.01058661, + "auxiliary_loss_mlp": 0.0104118, + "balance_loss_clip": 1.01404786, + "balance_loss_mlp": 1.0180366, + "epoch": 0.5179618217345558, + "flos": 17819529223680.0, + "grad_norm": 2.0292789222500316, + "language_loss": 0.87569779, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.89669621, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 8615, + "time_per_iteration": 2.3525636196136475 + }, + { + "auxiliary_loss_clip": 0.01059354, + "auxiliary_loss_mlp": 0.01048393, + "balance_loss_clip": 1.02188098, + "balance_loss_mlp": 1.01920331, + "epoch": 0.5180219449872238, + "flos": 22521212997120.0, + "grad_norm": 1.7204634791384206, + "language_loss": 0.81922895, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.8403064, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 8616, + "time_per_iteration": 2.380617380142212 + }, + { + "auxiliary_loss_clip": 0.01063663, + "auxiliary_loss_mlp": 0.01058206, + "balance_loss_clip": 1.02785563, + "balance_loss_mlp": 1.02088475, + "epoch": 0.5180820682398918, + "flos": 23914317000960.0, + "grad_norm": 1.6937763438070625, + "language_loss": 0.75727648, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77849519, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.42773438, + "step": 8617, + "time_per_iteration": 2.4198520183563232 + }, + { + "auxiliary_loss_clip": 0.01060585, + "auxiliary_loss_mlp": 0.01053713, + "balance_loss_clip": 1.02381563, + "balance_loss_mlp": 1.0191133, + "epoch": 0.5181421914925598, + "flos": 16979065130880.0, + "grad_norm": 1.7920525594260173, + "language_loss": 0.7125017, + "learning_rate": 1.979553617893785e-06, + "loss": 0.73364472, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4140625, + "step": 8618, + "time_per_iteration": 2.3416073322296143 + }, + { + "auxiliary_loss_clip": 0.01010763, + "auxiliary_loss_mlp": 0.01002552, + "balance_loss_clip": 1.00019133, + "balance_loss_mlp": 1.00283694, + "epoch": 0.5182023147452277, + "flos": 66056437198080.0, + "grad_norm": 0.9470492992878874, + "language_loss": 0.67377687, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69391, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07910156, + "step": 8619, + "time_per_iteration": 2.997868776321411 + }, + { + "auxiliary_loss_clip": 0.01057541, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.01794243, + "epoch": 0.5182624379978957, + "flos": 18186745570560.0, + "grad_norm": 3.7902197319946205, + "language_loss": 0.8117466, + "learning_rate": 1.97877473680631e-06, + "loss": 0.8327828, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 8620, + "time_per_iteration": 2.3587841987609863 + }, + { + "auxiliary_loss_clip": 0.01057144, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.01953733, + "balance_loss_mlp": 1.0183351, + "epoch": 0.5183225612505636, + "flos": 14025866757120.0, + "grad_norm": 2.246061975055125, + "language_loss": 0.82992184, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.85093933, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 8621, + "time_per_iteration": 2.4415416717529297 + }, + { + "auxiliary_loss_clip": 0.01057971, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_clip": 1.01943731, + "balance_loss_mlp": 1.01894379, + "epoch": 0.5183826845032317, + "flos": 23658648048000.0, + "grad_norm": 1.8930811762836135, + "language_loss": 0.66553462, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68655419, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.390625, + "step": 8622, + "time_per_iteration": 2.3828036785125732 + }, + { + "auxiliary_loss_clip": 0.01059927, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02095795, + "balance_loss_mlp": 1.01846552, + "epoch": 0.5184428077558996, + "flos": 15887680030080.0, + "grad_norm": 2.3024508892326434, + "language_loss": 0.62506688, + "learning_rate": 1.977606421248497e-06, + "loss": 0.6461513, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 8623, + "time_per_iteration": 2.360011100769043 + }, + { + "auxiliary_loss_clip": 0.0105789, + "auxiliary_loss_mlp": 0.01040368, + "balance_loss_clip": 1.01444018, + "balance_loss_mlp": 1.01759219, + "epoch": 0.5185029310085676, + "flos": 21029827916160.0, + "grad_norm": 1.7550176946988412, + "language_loss": 0.77462006, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.79560256, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 8624, + "time_per_iteration": 2.3688676357269287 + }, + { + "auxiliary_loss_clip": 0.01056723, + "auxiliary_loss_mlp": 0.01042082, + "balance_loss_clip": 1.0172987, + "balance_loss_mlp": 1.01771331, + "epoch": 0.5185630542612355, + "flos": 26541461387520.0, + "grad_norm": 1.8416599404667129, + "language_loss": 0.72529542, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.74628341, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 8625, + "time_per_iteration": 2.437434673309326 + }, + { + "auxiliary_loss_clip": 0.01057649, + "auxiliary_loss_mlp": 0.01039848, + "balance_loss_clip": 1.01426601, + "balance_loss_mlp": 1.01779628, + "epoch": 0.5186231775139035, + "flos": 20667359514240.0, + "grad_norm": 1.7430011445362872, + "language_loss": 0.69202864, + "learning_rate": 1.976438113333184e-06, + "loss": 0.71300358, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3984375, + "step": 8626, + "time_per_iteration": 2.356048345565796 + }, + { + "auxiliary_loss_clip": 0.0105703, + "auxiliary_loss_mlp": 0.01038156, + "balance_loss_clip": 1.01274085, + "balance_loss_mlp": 1.01831663, + "epoch": 0.5186833007665714, + "flos": 20884484459520.0, + "grad_norm": 2.208856285894059, + "language_loss": 0.71613133, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.7370832, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 8627, + "time_per_iteration": 2.36867356300354 + }, + { + "auxiliary_loss_clip": 0.01062084, + "auxiliary_loss_mlp": 0.0104921, + "balance_loss_clip": 1.02073085, + "balance_loss_mlp": 1.01970041, + "epoch": 0.5187434240192395, + "flos": 20885846002560.0, + "grad_norm": 1.7450518068090621, + "language_loss": 0.74209583, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.76320881, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42382812, + "step": 8628, + "time_per_iteration": 2.368021249771118 + }, + { + "auxiliary_loss_clip": 0.01058328, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.01801038, + "balance_loss_mlp": 1.01934719, + "epoch": 0.5188035472719074, + "flos": 19859050650240.0, + "grad_norm": 1.6456687569439328, + "language_loss": 0.78179717, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.80280209, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.390625, + "step": 8629, + "time_per_iteration": 2.371837854385376 + }, + { + "auxiliary_loss_clip": 0.01059167, + "auxiliary_loss_mlp": 0.01041889, + "balance_loss_clip": 1.01687872, + "balance_loss_mlp": 1.01887751, + "epoch": 0.5188636705245754, + "flos": 21137360503680.0, + "grad_norm": 2.0199238800565364, + "language_loss": 0.75623369, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.77724421, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.40234375, + "step": 8630, + "time_per_iteration": 2.379490852355957 + }, + { + "auxiliary_loss_clip": 0.01058046, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.0116148, + "balance_loss_mlp": 1.01736426, + "epoch": 0.5189237937772434, + "flos": 22418672734080.0, + "grad_norm": 2.1980064583612475, + "language_loss": 0.81801069, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.8389715, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 8631, + "time_per_iteration": 2.384612560272217 + }, + { + "auxiliary_loss_clip": 0.01059566, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.01787949, + "balance_loss_mlp": 1.01920044, + "epoch": 0.5189839170299113, + "flos": 25445537809920.0, + "grad_norm": 1.4743549493245562, + "language_loss": 0.76430452, + "learning_rate": 1.974101522024942e-06, + "loss": 0.78533214, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40429688, + "step": 8632, + "time_per_iteration": 2.4333744049072266 + }, + { + "auxiliary_loss_clip": 0.01054669, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.0105679, + "balance_loss_mlp": 1.01720583, + "epoch": 0.5190440402825793, + "flos": 18586745550720.0, + "grad_norm": 1.8694246518284945, + "language_loss": 0.79923856, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.8201251, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 8633, + "time_per_iteration": 2.3420188426971436 + }, + { + "auxiliary_loss_clip": 0.01057577, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.01235223, + "balance_loss_mlp": 1.01840198, + "epoch": 0.5191041635352472, + "flos": 21907544296320.0, + "grad_norm": 1.720547105511141, + "language_loss": 0.81460118, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.83555627, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39257812, + "step": 8634, + "time_per_iteration": 2.3956730365753174 + }, + { + "auxiliary_loss_clip": 0.01057606, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.01089275, + "balance_loss_mlp": 1.01864803, + "epoch": 0.5191642867879153, + "flos": 27526710355200.0, + "grad_norm": 1.456809717712187, + "language_loss": 0.7001794, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.72110647, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.390625, + "step": 8635, + "time_per_iteration": 2.432401418685913 + }, + { + "auxiliary_loss_clip": 0.01060392, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.01574516, + "balance_loss_mlp": 1.0193032, + "epoch": 0.5192244100405832, + "flos": 15705084286080.0, + "grad_norm": 1.513113110721494, + "language_loss": 0.78763294, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.80864596, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.41015625, + "step": 8636, + "time_per_iteration": 2.3771419525146484 + }, + { + "auxiliary_loss_clip": 0.01059269, + "auxiliary_loss_mlp": 0.01044053, + "balance_loss_clip": 1.0178746, + "balance_loss_mlp": 1.01825464, + "epoch": 0.5192845332932512, + "flos": 12056276517120.0, + "grad_norm": 2.033632827937796, + "language_loss": 0.73011541, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.75114858, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41015625, + "step": 8637, + "time_per_iteration": 2.337094783782959 + }, + { + "auxiliary_loss_clip": 0.01056853, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.01377773, + "balance_loss_mlp": 1.01784885, + "epoch": 0.5193446565459191, + "flos": 18952181418240.0, + "grad_norm": 1.7543237595746102, + "language_loss": 0.76828229, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78923863, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 8638, + "time_per_iteration": 3.646576404571533 + }, + { + "auxiliary_loss_clip": 0.01056219, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.01174474, + "balance_loss_mlp": 1.01822472, + "epoch": 0.5194047797985871, + "flos": 20373949514880.0, + "grad_norm": 2.143636135548527, + "language_loss": 0.75782728, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77875894, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 8639, + "time_per_iteration": 2.3743813037872314 + }, + { + "auxiliary_loss_clip": 0.01056388, + "auxiliary_loss_mlp": 0.01041553, + "balance_loss_clip": 1.01659024, + "balance_loss_mlp": 1.01738381, + "epoch": 0.519464903051255, + "flos": 24351848559360.0, + "grad_norm": 1.6953667481796126, + "language_loss": 0.78977895, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.81075835, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 8640, + "time_per_iteration": 3.8895034790039062 + }, + { + "auxiliary_loss_clip": 0.01056702, + "auxiliary_loss_mlp": 0.01043245, + "balance_loss_clip": 1.0177815, + "balance_loss_mlp": 1.01745439, + "epoch": 0.519525026303923, + "flos": 14061024362880.0, + "grad_norm": 2.044175130160542, + "language_loss": 0.67116296, + "learning_rate": 1.97059670234927e-06, + "loss": 0.69216239, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 8641, + "time_per_iteration": 2.3712549209594727 + }, + { + "auxiliary_loss_clip": 0.01056977, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.01557457, + "balance_loss_mlp": 1.01782191, + "epoch": 0.519585149556591, + "flos": 28834731642240.0, + "grad_norm": 1.9689268059163851, + "language_loss": 0.76775724, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78873557, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39257812, + "step": 8642, + "time_per_iteration": 3.7600438594818115 + }, + { + "auxiliary_loss_clip": 0.01054425, + "auxiliary_loss_mlp": 0.01038518, + "balance_loss_clip": 1.01527262, + "balance_loss_mlp": 1.01672995, + "epoch": 0.519645272809259, + "flos": 25371871107840.0, + "grad_norm": 1.5518916478325302, + "language_loss": 0.8395499, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.86047935, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37695312, + "step": 8643, + "time_per_iteration": 2.410273313522339 + }, + { + "auxiliary_loss_clip": 0.01059906, + "auxiliary_loss_mlp": 0.01047641, + "balance_loss_clip": 1.02034211, + "balance_loss_mlp": 1.01919997, + "epoch": 0.519705396061927, + "flos": 25371731462400.0, + "grad_norm": 1.5030227893070505, + "language_loss": 0.71464056, + "learning_rate": 1.969428448662004e-06, + "loss": 0.73571604, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40820312, + "step": 8644, + "time_per_iteration": 2.424198627471924 + }, + { + "auxiliary_loss_clip": 0.01057804, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.01354384, + "balance_loss_mlp": 1.01777649, + "epoch": 0.5197655193145949, + "flos": 28474951415040.0, + "grad_norm": 1.6026946721507562, + "language_loss": 0.81300634, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.83396447, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.40039062, + "step": 8645, + "time_per_iteration": 2.4513957500457764 + }, + { + "auxiliary_loss_clip": 0.01055286, + "auxiliary_loss_mlp": 0.01040646, + "balance_loss_clip": 1.01457453, + "balance_loss_mlp": 1.0165, + "epoch": 0.5198256425672629, + "flos": 20008164533760.0, + "grad_norm": 1.9861857122117847, + "language_loss": 0.79330468, + "learning_rate": 1.968649618642264e-06, + "loss": 0.814264, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 8646, + "time_per_iteration": 2.3493642807006836 + }, + { + "auxiliary_loss_clip": 0.01058101, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.01559329, + "balance_loss_mlp": 1.01887846, + "epoch": 0.5198857658199308, + "flos": 19827838028160.0, + "grad_norm": 2.182512432837295, + "language_loss": 0.66838747, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68937033, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39257812, + "step": 8647, + "time_per_iteration": 2.368234157562256 + }, + { + "auxiliary_loss_clip": 0.01057993, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.01829898, + "balance_loss_mlp": 1.0175395, + "epoch": 0.5199458890725989, + "flos": 24460777601280.0, + "grad_norm": 2.026021143728582, + "language_loss": 0.72862148, + "learning_rate": 1.967870793377763e-06, + "loss": 0.74966431, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 8648, + "time_per_iteration": 3.804499626159668 + }, + { + "auxiliary_loss_clip": 0.01059365, + "auxiliary_loss_mlp": 0.01041386, + "balance_loss_clip": 1.0141232, + "balance_loss_mlp": 1.01848817, + "epoch": 0.5200060123252668, + "flos": 23403642410880.0, + "grad_norm": 1.655959108293007, + "language_loss": 0.6578145, + "learning_rate": 1.967481382565642e-06, + "loss": 0.6788221, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40820312, + "step": 8649, + "time_per_iteration": 2.3656485080718994 + }, + { + "auxiliary_loss_clip": 0.010587, + "auxiliary_loss_mlp": 0.01047145, + "balance_loss_clip": 1.01947689, + "balance_loss_mlp": 1.01749313, + "epoch": 0.5200661355779348, + "flos": 17200414350720.0, + "grad_norm": 1.7855501267155942, + "language_loss": 0.71916592, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.74022436, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41210938, + "step": 8650, + "time_per_iteration": 2.3648929595947266 + }, + { + "auxiliary_loss_clip": 0.01054738, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.01539707, + "balance_loss_mlp": 1.01687396, + "epoch": 0.5201262588306027, + "flos": 18514091278080.0, + "grad_norm": 1.7067828181771945, + "language_loss": 0.79375482, + "learning_rate": 1.966702564655496e-06, + "loss": 0.8146975, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37890625, + "step": 8651, + "time_per_iteration": 2.3546831607818604 + }, + { + "auxiliary_loss_clip": 0.01058339, + "auxiliary_loss_mlp": 0.0104246, + "balance_loss_clip": 1.01618648, + "balance_loss_mlp": 1.01894593, + "epoch": 0.5201863820832707, + "flos": 18618551665920.0, + "grad_norm": 1.724081196901747, + "language_loss": 0.80076015, + "learning_rate": 1.966313157587003e-06, + "loss": 0.82176816, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 8652, + "time_per_iteration": 2.392047166824341 + }, + { + "auxiliary_loss_clip": 0.01058429, + "auxiliary_loss_mlp": 0.01043678, + "balance_loss_clip": 1.01658177, + "balance_loss_mlp": 1.01853395, + "epoch": 0.5202465053359386, + "flos": 22856029735680.0, + "grad_norm": 1.8830591023143755, + "language_loss": 0.71400857, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.73502964, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 8653, + "time_per_iteration": 2.36360764503479 + }, + { + "auxiliary_loss_clip": 0.01058808, + "auxiliary_loss_mlp": 0.01052096, + "balance_loss_clip": 1.02497625, + "balance_loss_mlp": 1.0180645, + "epoch": 0.5203066285886067, + "flos": 21980442948480.0, + "grad_norm": 1.5492832562629386, + "language_loss": 0.78948599, + "learning_rate": 1.965534347297008e-06, + "loss": 0.81059504, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40820312, + "step": 8654, + "time_per_iteration": 2.399876117706299 + }, + { + "auxiliary_loss_clip": 0.01059661, + "auxiliary_loss_mlp": 0.01049271, + "balance_loss_clip": 1.02175784, + "balance_loss_mlp": 1.01783538, + "epoch": 0.5203667518412746, + "flos": 20232201928320.0, + "grad_norm": 1.9402366763628847, + "language_loss": 0.85397243, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.87506175, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 8655, + "time_per_iteration": 2.4194746017456055 + }, + { + "auxiliary_loss_clip": 0.01054995, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_clip": 1.0176959, + "balance_loss_mlp": 1.01719058, + "epoch": 0.5204268750939426, + "flos": 15704560615680.0, + "grad_norm": 2.195797486812235, + "language_loss": 0.67257971, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.69356537, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 8656, + "time_per_iteration": 2.361248731613159 + }, + { + "auxiliary_loss_clip": 0.01058111, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.01223779, + "balance_loss_mlp": 1.01851368, + "epoch": 0.5204869983466105, + "flos": 27448365530880.0, + "grad_norm": 1.8466992860615747, + "language_loss": 0.7442261, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.76519132, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 8657, + "time_per_iteration": 2.4133663177490234 + }, + { + "auxiliary_loss_clip": 0.01057473, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.01232123, + "balance_loss_mlp": 1.01815796, + "epoch": 0.5205471215992785, + "flos": 20594391039360.0, + "grad_norm": 1.8935632879023778, + "language_loss": 0.72271514, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.74367768, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 8658, + "time_per_iteration": 2.374812602996826 + }, + { + "auxiliary_loss_clip": 0.01055064, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.01677799, + "balance_loss_mlp": 1.01631439, + "epoch": 0.5206072448519465, + "flos": 22126798834560.0, + "grad_norm": 1.6594102980886032, + "language_loss": 0.85135216, + "learning_rate": 1.963587344701897e-06, + "loss": 0.87232184, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 8659, + "time_per_iteration": 2.3967931270599365 + }, + { + "auxiliary_loss_clip": 0.01061159, + "auxiliary_loss_mlp": 0.0105116, + "balance_loss_clip": 1.02319312, + "balance_loss_mlp": 1.01910973, + "epoch": 0.5206673681046144, + "flos": 18329505586560.0, + "grad_norm": 41.71607671674362, + "language_loss": 0.76919484, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.79031801, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41992188, + "step": 8660, + "time_per_iteration": 2.3444137573242188 + }, + { + "auxiliary_loss_clip": 0.01055599, + "auxiliary_loss_mlp": 0.0104087, + "balance_loss_clip": 1.01571703, + "balance_loss_mlp": 1.01754546, + "epoch": 0.5207274913572825, + "flos": 20229199551360.0, + "grad_norm": 1.7635166035786252, + "language_loss": 0.78894216, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80990684, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 8661, + "time_per_iteration": 2.376197576522827 + }, + { + "auxiliary_loss_clip": 0.01058799, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.01569223, + "balance_loss_mlp": 1.0185008, + "epoch": 0.5207876146099504, + "flos": 22125960961920.0, + "grad_norm": 1.7858020366396843, + "language_loss": 0.71590358, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.73689443, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.40234375, + "step": 8662, + "time_per_iteration": 2.3949086666107178 + }, + { + "auxiliary_loss_clip": 0.01056098, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.01386666, + "balance_loss_mlp": 1.01815522, + "epoch": 0.5208477378626184, + "flos": 23877762940800.0, + "grad_norm": 1.8697759171122728, + "language_loss": 0.70388722, + "learning_rate": 1.962029767391098e-06, + "loss": 0.72482365, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 8663, + "time_per_iteration": 2.451535940170288 + }, + { + "auxiliary_loss_clip": 0.01057545, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_clip": 1.01686275, + "balance_loss_mlp": 1.01859665, + "epoch": 0.5209078611152863, + "flos": 20960420400000.0, + "grad_norm": 2.80791360781465, + "language_loss": 0.7781893, + "learning_rate": 1.961640376626072e-06, + "loss": 0.79918277, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 8664, + "time_per_iteration": 2.392019033432007 + }, + { + "auxiliary_loss_clip": 0.01056844, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.01278996, + "balance_loss_mlp": 1.01801038, + "epoch": 0.5209679843679543, + "flos": 20666696198400.0, + "grad_norm": 1.9135389256522037, + "language_loss": 0.77326179, + "learning_rate": 1.961250987315646e-06, + "loss": 0.7942006, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38867188, + "step": 8665, + "time_per_iteration": 2.3781957626342773 + }, + { + "auxiliary_loss_clip": 0.01056736, + "auxiliary_loss_mlp": 0.01044221, + "balance_loss_clip": 1.02023578, + "balance_loss_mlp": 1.01810145, + "epoch": 0.5210281076206222, + "flos": 20226336819840.0, + "grad_norm": 1.7688649962937042, + "language_loss": 0.73275435, + "learning_rate": 1.960861599474586e-06, + "loss": 0.75376385, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38671875, + "step": 8666, + "time_per_iteration": 2.3667633533477783 + }, + { + "auxiliary_loss_clip": 0.01059395, + "auxiliary_loss_mlp": 0.0104481, + "balance_loss_clip": 1.01648629, + "balance_loss_mlp": 1.01760554, + "epoch": 0.5210882308732903, + "flos": 16069088787840.0, + "grad_norm": 2.047104075556123, + "language_loss": 0.70703954, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.72808158, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 8667, + "time_per_iteration": 2.3887619972229004 + }, + { + "auxiliary_loss_clip": 0.01054107, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.01567101, + "balance_loss_mlp": 1.01699197, + "epoch": 0.5211483541259582, + "flos": 24824188609920.0, + "grad_norm": 1.4301182139453286, + "language_loss": 0.81824553, + "learning_rate": 1.960082828259629e-06, + "loss": 0.8391726, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.37109375, + "step": 8668, + "time_per_iteration": 2.408695697784424 + }, + { + "auxiliary_loss_clip": 0.01056026, + "auxiliary_loss_mlp": 0.010365, + "balance_loss_clip": 1.01253855, + "balance_loss_mlp": 1.01752687, + "epoch": 0.5212084773786262, + "flos": 20369760151680.0, + "grad_norm": 2.025802953213226, + "language_loss": 0.65592206, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.67684728, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38476562, + "step": 8669, + "time_per_iteration": 2.3873908519744873 + }, + { + "auxiliary_loss_clip": 0.01056576, + "auxiliary_loss_mlp": 0.01041058, + "balance_loss_clip": 1.01638162, + "balance_loss_mlp": 1.01889777, + "epoch": 0.5212686006312941, + "flos": 23144447410560.0, + "grad_norm": 1.542324685123087, + "language_loss": 0.67225313, + "learning_rate": 1.959304063099325e-06, + "loss": 0.69322944, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37695312, + "step": 8670, + "time_per_iteration": 2.3840346336364746 + }, + { + "auxiliary_loss_clip": 0.01052771, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.01298606, + "balance_loss_mlp": 1.01663911, + "epoch": 0.5213287238839621, + "flos": 27773023063680.0, + "grad_norm": 2.0609820593283157, + "language_loss": 0.77284163, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.79372901, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 8671, + "time_per_iteration": 2.4253759384155273 + }, + { + "auxiliary_loss_clip": 0.01060616, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02465987, + "balance_loss_mlp": 1.02070892, + "epoch": 0.5213888471366301, + "flos": 19936662336000.0, + "grad_norm": 2.016256790974508, + "language_loss": 0.80003715, + "learning_rate": 1.958525304111796e-06, + "loss": 0.82115155, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8672, + "time_per_iteration": 2.3904175758361816 + }, + { + "auxiliary_loss_clip": 0.01052667, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.01463771, + "balance_loss_mlp": 1.01618886, + "epoch": 0.521448970389298, + "flos": 16981788216960.0, + "grad_norm": 2.1274131667437195, + "language_loss": 0.73971999, + "learning_rate": 1.958135926969736e-06, + "loss": 0.76061118, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36523438, + "step": 8673, + "time_per_iteration": 2.3626186847686768 + }, + { + "auxiliary_loss_clip": 0.01056605, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.0151062, + "balance_loss_mlp": 1.01747024, + "epoch": 0.5215090936419661, + "flos": 18988700567040.0, + "grad_norm": 1.5500978301247603, + "language_loss": 0.76374114, + "learning_rate": 1.957746551415166e-06, + "loss": 0.7847048, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.390625, + "step": 8674, + "time_per_iteration": 2.3947479724884033 + }, + { + "auxiliary_loss_clip": 0.01055741, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.01398504, + "balance_loss_mlp": 1.01615405, + "epoch": 0.521569216894634, + "flos": 16142511110400.0, + "grad_norm": 2.3486525877211966, + "language_loss": 0.86855829, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88950467, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39648438, + "step": 8675, + "time_per_iteration": 2.369410276412964 + }, + { + "auxiliary_loss_clip": 0.01010522, + "auxiliary_loss_mlp": 0.01006657, + "balance_loss_clip": 1.00415313, + "balance_loss_mlp": 1.00278497, + "epoch": 0.521629340147302, + "flos": 57576733113600.0, + "grad_norm": 0.880460868265128, + "language_loss": 0.6326282, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65280002, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.07714844, + "step": 8676, + "time_per_iteration": 2.965484857559204 + }, + { + "auxiliary_loss_clip": 0.01054425, + "auxiliary_loss_mlp": 0.01040803, + "balance_loss_clip": 1.01694942, + "balance_loss_mlp": 1.01682806, + "epoch": 0.5216894633999699, + "flos": 26795698974720.0, + "grad_norm": 1.4937920120220312, + "language_loss": 0.7000879, + "learning_rate": 1.956578434424046e-06, + "loss": 0.72104019, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37695312, + "step": 8677, + "time_per_iteration": 2.454317331314087 + }, + { + "auxiliary_loss_clip": 0.01055104, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.01612973, + "balance_loss_mlp": 1.01666474, + "epoch": 0.5217495866526379, + "flos": 26357539011840.0, + "grad_norm": 1.568851285072069, + "language_loss": 0.66670763, + "learning_rate": 1.956189065367086e-06, + "loss": 0.68765277, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3828125, + "step": 8678, + "time_per_iteration": 3.6005992889404297 + }, + { + "auxiliary_loss_clip": 0.01056691, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.01568627, + "balance_loss_mlp": 1.01740086, + "epoch": 0.5218097099053058, + "flos": 23582956487040.0, + "grad_norm": 2.1999663297170997, + "language_loss": 0.70739549, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.72838485, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 8679, + "time_per_iteration": 3.789944648742676 + }, + { + "auxiliary_loss_clip": 0.01058405, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02158999, + "balance_loss_mlp": 1.01898336, + "epoch": 0.5218698331579739, + "flos": 18076420074240.0, + "grad_norm": 1.7376325251658797, + "language_loss": 0.67851698, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69956177, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.39453125, + "step": 8680, + "time_per_iteration": 2.397831916809082 + }, + { + "auxiliary_loss_clip": 0.01057584, + "auxiliary_loss_mlp": 0.01041728, + "balance_loss_clip": 1.01574016, + "balance_loss_mlp": 1.01833081, + "epoch": 0.5219299564106418, + "flos": 19280120618880.0, + "grad_norm": 1.872790665761548, + "language_loss": 0.84427702, + "learning_rate": 1.955020968223156e-06, + "loss": 0.86527026, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39257812, + "step": 8681, + "time_per_iteration": 3.745870351791382 + }, + { + "auxiliary_loss_clip": 0.01056102, + "auxiliary_loss_mlp": 0.01038927, + "balance_loss_clip": 1.01400042, + "balance_loss_mlp": 1.01753974, + "epoch": 0.5219900796633098, + "flos": 26650146049920.0, + "grad_norm": 1.774921802068531, + "language_loss": 0.78890586, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.80985612, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38476562, + "step": 8682, + "time_per_iteration": 2.4392757415771484 + }, + { + "auxiliary_loss_clip": 0.01058631, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.0170126, + "balance_loss_mlp": 1.01936913, + "epoch": 0.5220502029159777, + "flos": 34311312241920.0, + "grad_norm": 2.066926550116428, + "language_loss": 0.70765734, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.7286545, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.39257812, + "step": 8683, + "time_per_iteration": 2.487595319747925 + }, + { + "auxiliary_loss_clip": 0.01058148, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_clip": 1.01872349, + "balance_loss_mlp": 1.01811981, + "epoch": 0.5221103261686457, + "flos": 22155602572800.0, + "grad_norm": 1.5941627395400075, + "language_loss": 0.77391988, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.79495776, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40039062, + "step": 8684, + "time_per_iteration": 2.4316630363464355 + }, + { + "auxiliary_loss_clip": 0.01055622, + "auxiliary_loss_mlp": 0.01039792, + "balance_loss_clip": 1.0143404, + "balance_loss_mlp": 1.01739573, + "epoch": 0.5221704494213137, + "flos": 19207396523520.0, + "grad_norm": 1.6527356822890986, + "language_loss": 0.77366984, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.79462397, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 8685, + "time_per_iteration": 2.3657829761505127 + }, + { + "auxiliary_loss_clip": 0.01060325, + "auxiliary_loss_mlp": 0.01046015, + "balance_loss_clip": 1.01832247, + "balance_loss_mlp": 1.01989663, + "epoch": 0.5222305726739817, + "flos": 19353054182400.0, + "grad_norm": 1.6907191965042778, + "language_loss": 0.81427276, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83533615, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40429688, + "step": 8686, + "time_per_iteration": 2.3812577724456787 + }, + { + "auxiliary_loss_clip": 0.01054245, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.01235187, + "balance_loss_mlp": 1.01727962, + "epoch": 0.5222906959266497, + "flos": 27813661752960.0, + "grad_norm": 1.5345942980041698, + "language_loss": 0.70987964, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.73079097, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36914062, + "step": 8687, + "time_per_iteration": 2.422388792037964 + }, + { + "auxiliary_loss_clip": 0.01055815, + "auxiliary_loss_mlp": 0.01040858, + "balance_loss_clip": 1.01764786, + "balance_loss_mlp": 1.01816583, + "epoch": 0.5223508191793176, + "flos": 12712189829760.0, + "grad_norm": 2.114508202058387, + "language_loss": 0.83506423, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85603094, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 8688, + "time_per_iteration": 3.7761032581329346 + }, + { + "auxiliary_loss_clip": 0.01058306, + "auxiliary_loss_mlp": 0.0103881, + "balance_loss_clip": 1.01294196, + "balance_loss_mlp": 1.01936483, + "epoch": 0.5224109424319856, + "flos": 15631347761280.0, + "grad_norm": 2.329607447682385, + "language_loss": 0.75480485, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.77577603, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 8689, + "time_per_iteration": 2.347965717315674 + }, + { + "auxiliary_loss_clip": 0.01055641, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.01420069, + "balance_loss_mlp": 1.01770663, + "epoch": 0.5224710656846535, + "flos": 15741324144000.0, + "grad_norm": 1.8728909652525534, + "language_loss": 0.83874685, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85969186, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37890625, + "step": 8690, + "time_per_iteration": 2.3429031372070312 + }, + { + "auxiliary_loss_clip": 0.01058858, + "auxiliary_loss_mlp": 0.01045466, + "balance_loss_clip": 1.01740396, + "balance_loss_mlp": 1.01884937, + "epoch": 0.5225311889373215, + "flos": 26029809279360.0, + "grad_norm": 2.147571767602476, + "language_loss": 0.79388702, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.8149302, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40039062, + "step": 8691, + "time_per_iteration": 2.4180705547332764 + }, + { + "auxiliary_loss_clip": 0.01060014, + "auxiliary_loss_mlp": 0.0104668, + "balance_loss_clip": 1.0172236, + "balance_loss_mlp": 1.01905251, + "epoch": 0.5225913121899894, + "flos": 18368293973760.0, + "grad_norm": 2.031352265679352, + "language_loss": 0.78017777, + "learning_rate": 1.950738079725646e-06, + "loss": 0.80124474, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41015625, + "step": 8692, + "time_per_iteration": 2.3377621173858643 + }, + { + "auxiliary_loss_clip": 0.01054945, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.01649904, + "balance_loss_mlp": 1.01729131, + "epoch": 0.5226514354426575, + "flos": 29272367934720.0, + "grad_norm": 1.7154108708746885, + "language_loss": 0.73036647, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75131488, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 8693, + "time_per_iteration": 2.421260118484497 + }, + { + "auxiliary_loss_clip": 0.01061455, + "auxiliary_loss_mlp": 0.01047297, + "balance_loss_clip": 1.01798308, + "balance_loss_mlp": 1.0196054, + "epoch": 0.5227115586953254, + "flos": 22852294220160.0, + "grad_norm": 2.181704162236289, + "language_loss": 0.83275342, + "learning_rate": 1.949959396434517e-06, + "loss": 0.85384089, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41796875, + "step": 8694, + "time_per_iteration": 2.4089062213897705 + }, + { + "auxiliary_loss_clip": 0.01012283, + "auxiliary_loss_mlp": 0.0100378, + "balance_loss_clip": 1.00120533, + "balance_loss_mlp": 1.00424361, + "epoch": 0.5227716819479934, + "flos": 57471539587200.0, + "grad_norm": 0.7572355846963392, + "language_loss": 0.55743909, + "learning_rate": 1.949570057627888e-06, + "loss": 0.5775997, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.08007812, + "step": 8695, + "time_per_iteration": 3.06791615486145 + }, + { + "auxiliary_loss_clip": 0.01056979, + "auxiliary_loss_mlp": 0.01040619, + "balance_loss_clip": 1.01379716, + "balance_loss_mlp": 1.01793647, + "epoch": 0.5228318052006613, + "flos": 13807415180160.0, + "grad_norm": 1.8468687083546864, + "language_loss": 0.74761295, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.76858902, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 8696, + "time_per_iteration": 2.3486523628234863 + }, + { + "auxiliary_loss_clip": 0.01057762, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.0193069, + "balance_loss_mlp": 1.01829433, + "epoch": 0.5228919284533293, + "flos": 15595282460160.0, + "grad_norm": 1.524417154723536, + "language_loss": 0.72870672, + "learning_rate": 1.948791385766319e-06, + "loss": 0.74972749, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39453125, + "step": 8697, + "time_per_iteration": 2.359988212585449 + }, + { + "auxiliary_loss_clip": 0.01054828, + "auxiliary_loss_mlp": 0.01039477, + "balance_loss_clip": 1.01559937, + "balance_loss_mlp": 1.01767457, + "epoch": 0.5229520517059973, + "flos": 22490419311360.0, + "grad_norm": 2.3251333518047406, + "language_loss": 0.82130867, + "learning_rate": 1.948402052740906e-06, + "loss": 0.84225178, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 8698, + "time_per_iteration": 2.4124057292938232 + }, + { + "auxiliary_loss_clip": 0.01055742, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.01748991, + "balance_loss_mlp": 1.01819336, + "epoch": 0.5230121749586653, + "flos": 22089790926720.0, + "grad_norm": 1.6961092317347655, + "language_loss": 0.7534464, + "learning_rate": 1.948012721672093e-06, + "loss": 0.77441722, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 8699, + "time_per_iteration": 2.377918004989624 + }, + { + "auxiliary_loss_clip": 0.01058924, + "auxiliary_loss_mlp": 0.0104254, + "balance_loss_clip": 1.01528835, + "balance_loss_mlp": 1.01754665, + "epoch": 0.5230722982113333, + "flos": 22126065696000.0, + "grad_norm": 1.773525905427694, + "language_loss": 0.74855703, + "learning_rate": 1.947623392574642e-06, + "loss": 0.76957172, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 8700, + "time_per_iteration": 2.4055604934692383 + }, + { + "auxiliary_loss_clip": 0.01059298, + "auxiliary_loss_mlp": 0.01048571, + "balance_loss_clip": 1.02111685, + "balance_loss_mlp": 1.01854074, + "epoch": 0.5231324214640012, + "flos": 25008110985600.0, + "grad_norm": 1.6412549369581064, + "language_loss": 0.69239759, + "learning_rate": 1.947234065463318e-06, + "loss": 0.7134763, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 8701, + "time_per_iteration": 2.4089200496673584 + }, + { + "auxiliary_loss_clip": 0.01054605, + "auxiliary_loss_mlp": 0.01041757, + "balance_loss_clip": 1.01653254, + "balance_loss_mlp": 1.01683521, + "epoch": 0.5231925447166692, + "flos": 25739296922880.0, + "grad_norm": 2.8840722567661725, + "language_loss": 0.6749472, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.69591081, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 8702, + "time_per_iteration": 2.4184858798980713 + }, + { + "auxiliary_loss_clip": 0.01055387, + "auxiliary_loss_mlp": 0.01038258, + "balance_loss_clip": 1.01316404, + "balance_loss_mlp": 1.01784325, + "epoch": 0.5232526679693371, + "flos": 21432865184640.0, + "grad_norm": 1.8347647522178292, + "language_loss": 0.77607536, + "learning_rate": 1.946455417258101e-06, + "loss": 0.79701185, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 8703, + "time_per_iteration": 2.367572069168091 + }, + { + "auxiliary_loss_clip": 0.01061003, + "auxiliary_loss_mlp": 0.01048823, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.01948404, + "epoch": 0.5233127912220051, + "flos": 35296945234560.0, + "grad_norm": 2.120800386725718, + "language_loss": 0.78304303, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.80414128, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.41601562, + "step": 8704, + "time_per_iteration": 2.5978171825408936 + }, + { + "auxiliary_loss_clip": 0.01054666, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.01428986, + "balance_loss_mlp": 1.01776814, + "epoch": 0.523372914474673, + "flos": 17050497505920.0, + "grad_norm": 2.191921423827496, + "language_loss": 0.78889239, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80982959, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 8705, + "time_per_iteration": 2.348168134689331 + }, + { + "auxiliary_loss_clip": 0.01057959, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.0144161, + "balance_loss_mlp": 1.01822066, + "epoch": 0.5234330377273411, + "flos": 18405301881600.0, + "grad_norm": 1.8954177601980944, + "language_loss": 0.70698822, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72797239, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3984375, + "step": 8706, + "time_per_iteration": 2.381861686706543 + }, + { + "auxiliary_loss_clip": 0.01011697, + "auxiliary_loss_mlp": 0.01007064, + "balance_loss_clip": 1.004632, + "balance_loss_mlp": 1.00390267, + "epoch": 0.523493160980009, + "flos": 65846608727040.0, + "grad_norm": 0.6831141602233511, + "language_loss": 0.52601075, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54619837, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.078125, + "step": 8707, + "time_per_iteration": 3.065253734588623 + }, + { + "auxiliary_loss_clip": 0.01056141, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_clip": 1.01829123, + "balance_loss_mlp": 1.01763892, + "epoch": 0.523553284232677, + "flos": 21870990236160.0, + "grad_norm": 1.640904618811821, + "language_loss": 0.76192904, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.78291833, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38476562, + "step": 8708, + "time_per_iteration": 2.4026901721954346 + }, + { + "auxiliary_loss_clip": 0.010567, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.01254058, + "balance_loss_mlp": 1.01959705, + "epoch": 0.5236134074853449, + "flos": 20847197260800.0, + "grad_norm": 1.627696789144538, + "language_loss": 0.78402996, + "learning_rate": 1.944119521844849e-06, + "loss": 0.80495447, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 8709, + "time_per_iteration": 2.3709378242492676 + }, + { + "auxiliary_loss_clip": 0.01058698, + "auxiliary_loss_mlp": 0.01043133, + "balance_loss_clip": 1.01572728, + "balance_loss_mlp": 1.01763427, + "epoch": 0.5236735307380129, + "flos": 25519239423360.0, + "grad_norm": 1.9144941932729982, + "language_loss": 0.85177374, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.87279207, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 8710, + "time_per_iteration": 2.402303695678711 + }, + { + "auxiliary_loss_clip": 0.01054936, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.01128197, + "balance_loss_mlp": 1.01835024, + "epoch": 0.523733653990681, + "flos": 23582083703040.0, + "grad_norm": 1.9349589243215275, + "language_loss": 0.71063024, + "learning_rate": 1.943340906834908e-06, + "loss": 0.73152655, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36523438, + "step": 8711, + "time_per_iteration": 2.3834891319274902 + }, + { + "auxiliary_loss_clip": 0.01056826, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.01486659, + "balance_loss_mlp": 1.01876092, + "epoch": 0.5237937772433489, + "flos": 21105170363520.0, + "grad_norm": 1.6964714420456806, + "language_loss": 0.84106719, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.8620159, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 8712, + "time_per_iteration": 2.3998725414276123 + }, + { + "auxiliary_loss_clip": 0.01058687, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_clip": 1.01970983, + "balance_loss_mlp": 1.01960504, + "epoch": 0.5238539004960169, + "flos": 19171854892800.0, + "grad_norm": 3.5023494652383147, + "language_loss": 0.7064721, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72751522, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 8713, + "time_per_iteration": 2.388051748275757 + }, + { + "auxiliary_loss_clip": 0.0105999, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.01843083, + "balance_loss_mlp": 1.01939476, + "epoch": 0.5239140237486848, + "flos": 17887435551360.0, + "grad_norm": 2.963735677675801, + "language_loss": 0.79046351, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.81152177, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 8714, + "time_per_iteration": 2.3725767135620117 + }, + { + "auxiliary_loss_clip": 0.0105969, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.00872815, + "balance_loss_mlp": 1.02109134, + "epoch": 0.5239741470013528, + "flos": 17929470695040.0, + "grad_norm": 1.8576815591205822, + "language_loss": 0.77284169, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.79377598, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 8715, + "time_per_iteration": 2.354696273803711 + }, + { + "auxiliary_loss_clip": 0.01056281, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.01369488, + "balance_loss_mlp": 1.01938128, + "epoch": 0.5240342702540207, + "flos": 30992049596160.0, + "grad_norm": 1.5690556272342817, + "language_loss": 0.71948993, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.74041891, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36914062, + "step": 8716, + "time_per_iteration": 2.485448122024536 + }, + { + "auxiliary_loss_clip": 0.01056286, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.01477277, + "balance_loss_mlp": 1.01900887, + "epoch": 0.5240943935066887, + "flos": 25004026356480.0, + "grad_norm": 1.7689617245951055, + "language_loss": 0.87473756, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89567763, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.37304688, + "step": 8717, + "time_per_iteration": 3.622506618499756 + }, + { + "auxiliary_loss_clip": 0.01056914, + "auxiliary_loss_mlp": 0.01037624, + "balance_loss_clip": 1.01481926, + "balance_loss_mlp": 1.01910019, + "epoch": 0.5241545167593566, + "flos": 23657984732160.0, + "grad_norm": 2.4078394416600593, + "language_loss": 0.62718987, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.64813524, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.37890625, + "step": 8718, + "time_per_iteration": 2.4041476249694824 + }, + { + "auxiliary_loss_clip": 0.01058592, + "auxiliary_loss_mlp": 0.01048287, + "balance_loss_clip": 1.0211792, + "balance_loss_mlp": 1.01899672, + "epoch": 0.5242146400120247, + "flos": 23399383224960.0, + "grad_norm": 1.8191480582373898, + "language_loss": 0.73149961, + "learning_rate": 1.940226533916872e-06, + "loss": 0.75256848, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 8719, + "time_per_iteration": 3.829763174057007 + }, + { + "auxiliary_loss_clip": 0.01053965, + "auxiliary_loss_mlp": 0.01040217, + "balance_loss_clip": 1.01992726, + "balance_loss_mlp": 1.01805866, + "epoch": 0.5242747632646926, + "flos": 17748096848640.0, + "grad_norm": 1.8177636261561498, + "language_loss": 0.74276817, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.7637099, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.359375, + "step": 8720, + "time_per_iteration": 2.3425824642181396 + }, + { + "auxiliary_loss_clip": 0.01056746, + "auxiliary_loss_mlp": 0.01042746, + "balance_loss_clip": 1.01826036, + "balance_loss_mlp": 1.01898074, + "epoch": 0.5243348865173606, + "flos": 32596378525440.0, + "grad_norm": 1.6075355060439904, + "language_loss": 0.71082437, + "learning_rate": 1.939447963058281e-06, + "loss": 0.73181927, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 8721, + "time_per_iteration": 3.890228271484375 + }, + { + "auxiliary_loss_clip": 0.01054142, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_clip": 1.02323627, + "balance_loss_mlp": 1.01734602, + "epoch": 0.5243950097700285, + "flos": 25482929742720.0, + "grad_norm": 1.5899296682751258, + "language_loss": 0.87446201, + "learning_rate": 1.939058681065813e-06, + "loss": 0.89546371, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36914062, + "step": 8722, + "time_per_iteration": 2.3921303749084473 + }, + { + "auxiliary_loss_clip": 0.01056507, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.01345551, + "balance_loss_mlp": 1.01989698, + "epoch": 0.5244551330226965, + "flos": 15267482904960.0, + "grad_norm": 1.8859182991268613, + "language_loss": 0.80674177, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82767421, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 8723, + "time_per_iteration": 2.4017889499664307 + }, + { + "auxiliary_loss_clip": 0.0105832, + "auxiliary_loss_mlp": 0.01052425, + "balance_loss_clip": 1.02509058, + "balance_loss_mlp": 1.01953804, + "epoch": 0.5245152562753645, + "flos": 22236007167360.0, + "grad_norm": 1.7878959975678363, + "language_loss": 0.76610428, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.78721172, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 8724, + "time_per_iteration": 2.3943753242492676 + }, + { + "auxiliary_loss_clip": 0.01057443, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.01763058, + "epoch": 0.5245753795280325, + "flos": 29425112599680.0, + "grad_norm": 1.5237493746677904, + "language_loss": 0.71331334, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.73432279, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 8725, + "time_per_iteration": 2.4625654220581055 + }, + { + "auxiliary_loss_clip": 0.01012062, + "auxiliary_loss_mlp": 0.01008948, + "balance_loss_clip": 1.00565839, + "balance_loss_mlp": 1.00400853, + "epoch": 0.5246355027807005, + "flos": 58831196641920.0, + "grad_norm": 0.7689618128825894, + "language_loss": 0.55715448, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57736456, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.03295898, + "router_z_loss_mlp": 0.08056641, + "step": 8726, + "time_per_iteration": 3.042745590209961 + }, + { + "auxiliary_loss_clip": 0.01010862, + "auxiliary_loss_mlp": 0.01014743, + "balance_loss_clip": 1.0117749, + "balance_loss_mlp": 1.00303054, + "epoch": 0.5246956260333684, + "flos": 64523226441600.0, + "grad_norm": 0.8081085402575653, + "language_loss": 0.58449149, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60474759, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.078125, + "step": 8727, + "time_per_iteration": 4.44676947593689 + }, + { + "auxiliary_loss_clip": 0.01056891, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_clip": 1.0174737, + "balance_loss_mlp": 1.01760566, + "epoch": 0.5247557492860364, + "flos": 24532524178560.0, + "grad_norm": 1.264401217922183, + "language_loss": 0.71490341, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.73588955, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.39257812, + "step": 8728, + "time_per_iteration": 2.425903558731079 + }, + { + "auxiliary_loss_clip": 0.01054135, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.01898408, + "balance_loss_mlp": 1.01787412, + "epoch": 0.5248158725387043, + "flos": 18805162216320.0, + "grad_norm": 1.4406759006257879, + "language_loss": 0.7011348, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.72209644, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 8729, + "time_per_iteration": 2.3614766597747803 + }, + { + "auxiliary_loss_clip": 0.01057091, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.0237124, + "balance_loss_mlp": 1.01896799, + "epoch": 0.5248759957913723, + "flos": 20954904405120.0, + "grad_norm": 1.7599484946186146, + "language_loss": 0.84838784, + "learning_rate": 1.935944509558464e-06, + "loss": 0.86944485, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38085938, + "step": 8730, + "time_per_iteration": 2.3882665634155273 + }, + { + "auxiliary_loss_clip": 0.01058447, + "auxiliary_loss_mlp": 0.01043617, + "balance_loss_clip": 1.01908422, + "balance_loss_mlp": 1.02000403, + "epoch": 0.5249361190440403, + "flos": 18659993316480.0, + "grad_norm": 1.9941384161614533, + "language_loss": 0.8063423, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.8273629, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38671875, + "step": 8731, + "time_per_iteration": 2.360391855239868 + }, + { + "auxiliary_loss_clip": 0.01053243, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01608753, + "balance_loss_mlp": 1.01767635, + "epoch": 0.5249962422967083, + "flos": 24862174035840.0, + "grad_norm": 1.9409676628356936, + "language_loss": 0.84366351, + "learning_rate": 1.935165990676312e-06, + "loss": 0.86456627, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.35546875, + "step": 8732, + "time_per_iteration": 2.4193758964538574 + }, + { + "auxiliary_loss_clip": 0.01058635, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.01841354, + "balance_loss_mlp": 1.02113891, + "epoch": 0.5250563655493762, + "flos": 15261931998720.0, + "grad_norm": 1.515051279518933, + "language_loss": 0.78927875, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.81027472, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.375, + "step": 8733, + "time_per_iteration": 2.349703550338745 + }, + { + "auxiliary_loss_clip": 0.01060225, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.01605535, + "balance_loss_mlp": 1.02103329, + "epoch": 0.5251164888020442, + "flos": 18624172394880.0, + "grad_norm": 1.9932740130071283, + "language_loss": 0.82879496, + "learning_rate": 1.934387481628208e-06, + "loss": 0.84978878, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.39257812, + "step": 8734, + "time_per_iteration": 2.378478527069092 + }, + { + "auxiliary_loss_clip": 0.010596, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.01664221, + "balance_loss_mlp": 1.02262902, + "epoch": 0.5251766120547121, + "flos": 29709620202240.0, + "grad_norm": 1.3360558111626202, + "language_loss": 0.77340525, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79440135, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 8735, + "time_per_iteration": 2.4504826068878174 + }, + { + "auxiliary_loss_clip": 0.01060773, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.02056956, + "balance_loss_mlp": 1.02180052, + "epoch": 0.5252367353073801, + "flos": 23439184041600.0, + "grad_norm": 1.60824494173692, + "language_loss": 0.81755507, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.83858979, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.390625, + "step": 8736, + "time_per_iteration": 2.4420132637023926 + }, + { + "auxiliary_loss_clip": 0.01060529, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02022648, + "balance_loss_mlp": 1.02254438, + "epoch": 0.5252968585600482, + "flos": 30809384029440.0, + "grad_norm": 2.2306287375429674, + "language_loss": 0.71547711, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.73652458, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 8737, + "time_per_iteration": 2.45532488822937 + }, + { + "auxiliary_loss_clip": 0.01060789, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.01971257, + "balance_loss_mlp": 1.02290571, + "epoch": 0.5253569818127161, + "flos": 20627314318080.0, + "grad_norm": 1.4964947883761246, + "language_loss": 0.78298426, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.8040216, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 8738, + "time_per_iteration": 2.4596011638641357 + }, + { + "auxiliary_loss_clip": 0.01020948, + "auxiliary_loss_mlp": 0.01004385, + "balance_loss_clip": 1.00170302, + "balance_loss_mlp": 1.0132798, + "epoch": 0.5254171050653841, + "flos": 63425452561920.0, + "grad_norm": 0.7369338861809099, + "language_loss": 0.54468799, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56494135, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.07666016, + "step": 8739, + "time_per_iteration": 2.995771884918213 + }, + { + "auxiliary_loss_clip": 0.01057626, + "auxiliary_loss_mlp": 0.01045663, + "balance_loss_clip": 1.02393115, + "balance_loss_mlp": 1.02084923, + "epoch": 0.525477228318052, + "flos": 34669556369280.0, + "grad_norm": 1.6292068314972095, + "language_loss": 0.8557179, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.87675071, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3671875, + "step": 8740, + "time_per_iteration": 2.4990029335021973 + }, + { + "auxiliary_loss_clip": 0.01059315, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.02365172, + "balance_loss_mlp": 1.02172267, + "epoch": 0.52553735157072, + "flos": 17929889631360.0, + "grad_norm": 2.2247570088756525, + "language_loss": 0.70634121, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.72740841, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37695312, + "step": 8741, + "time_per_iteration": 2.371375322341919 + }, + { + "auxiliary_loss_clip": 0.01061868, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.02222764, + "balance_loss_mlp": 1.02226734, + "epoch": 0.5255974748233879, + "flos": 9940120922880.0, + "grad_norm": 2.282515297047375, + "language_loss": 0.67687726, + "learning_rate": 1.931273546137947e-06, + "loss": 0.69796002, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.39648438, + "step": 8742, + "time_per_iteration": 2.393458366394043 + }, + { + "auxiliary_loss_clip": 0.01060959, + "auxiliary_loss_mlp": 0.01049807, + "balance_loss_clip": 1.02387881, + "balance_loss_mlp": 1.0211519, + "epoch": 0.5256575980760559, + "flos": 16867622471040.0, + "grad_norm": 3.8463710977476433, + "language_loss": 0.64764762, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.66875529, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3984375, + "step": 8743, + "time_per_iteration": 2.3419594764709473 + }, + { + "auxiliary_loss_clip": 0.01016438, + "auxiliary_loss_mlp": 0.01005366, + "balance_loss_clip": 1.00317216, + "balance_loss_mlp": 1.00875866, + "epoch": 0.5257177213287239, + "flos": 62382561206400.0, + "grad_norm": 0.7874550288716761, + "language_loss": 0.54152644, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56174445, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.07666016, + "step": 8744, + "time_per_iteration": 3.142137050628662 + }, + { + "auxiliary_loss_clip": 0.01060769, + "auxiliary_loss_mlp": 0.0105516, + "balance_loss_clip": 1.02827835, + "balance_loss_mlp": 1.02035952, + "epoch": 0.5257778445813919, + "flos": 20775869619840.0, + "grad_norm": 2.0409896473815468, + "language_loss": 0.77576262, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.79692191, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40429688, + "step": 8745, + "time_per_iteration": 2.378455638885498 + }, + { + "auxiliary_loss_clip": 0.01057233, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02348781, + "balance_loss_mlp": 1.01977539, + "epoch": 0.5258379678340598, + "flos": 17017678961280.0, + "grad_norm": 2.154127393503611, + "language_loss": 0.83249569, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.85353571, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 8746, + "time_per_iteration": 2.375123977661133 + }, + { + "auxiliary_loss_clip": 0.01056128, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02482855, + "balance_loss_mlp": 1.01883245, + "epoch": 0.5258980910867278, + "flos": 21067708608000.0, + "grad_norm": 1.7890727202434968, + "language_loss": 0.76738036, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.78842831, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37304688, + "step": 8747, + "time_per_iteration": 2.384676694869995 + }, + { + "auxiliary_loss_clip": 0.01051973, + "auxiliary_loss_mlp": 0.01049705, + "balance_loss_clip": 1.02455163, + "balance_loss_mlp": 1.01661134, + "epoch": 0.5259582143393957, + "flos": 18003486510720.0, + "grad_norm": 1.7655506844856546, + "language_loss": 0.83793688, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.8589536, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35351562, + "step": 8748, + "time_per_iteration": 2.3717947006225586 + }, + { + "auxiliary_loss_clip": 0.0105523, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_clip": 1.02343512, + "balance_loss_mlp": 1.017308, + "epoch": 0.5260183375920637, + "flos": 22782747058560.0, + "grad_norm": 2.0777534307082606, + "language_loss": 0.82299793, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.84404296, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 8749, + "time_per_iteration": 2.385683059692383 + }, + { + "auxiliary_loss_clip": 0.01056037, + "auxiliary_loss_mlp": 0.01053717, + "balance_loss_clip": 1.03044772, + "balance_loss_mlp": 1.01872849, + "epoch": 0.5260784608447318, + "flos": 27051193370880.0, + "grad_norm": 2.0238846471825673, + "language_loss": 0.74046791, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.76156545, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37304688, + "step": 8750, + "time_per_iteration": 2.45823073387146 + }, + { + "auxiliary_loss_clip": 0.01055878, + "auxiliary_loss_mlp": 0.01043941, + "balance_loss_clip": 1.02052844, + "balance_loss_mlp": 1.01765072, + "epoch": 0.5261385840973997, + "flos": 20661913342080.0, + "grad_norm": 1.314274524467246, + "language_loss": 0.7751323, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.79613042, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3828125, + "step": 8751, + "time_per_iteration": 2.38356614112854 + }, + { + "auxiliary_loss_clip": 0.01054179, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.0217315, + "balance_loss_mlp": 1.01815772, + "epoch": 0.5261987073500677, + "flos": 23621535406080.0, + "grad_norm": 1.3845837319638385, + "language_loss": 0.77462542, + "learning_rate": 1.927381362210902e-06, + "loss": 0.79560602, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 8752, + "time_per_iteration": 2.432647466659546 + }, + { + "auxiliary_loss_clip": 0.01058172, + "auxiliary_loss_mlp": 0.01045501, + "balance_loss_clip": 1.01976347, + "balance_loss_mlp": 1.01915622, + "epoch": 0.5262588306027356, + "flos": 27635010992640.0, + "grad_norm": 1.4733096638464633, + "language_loss": 0.68351376, + "learning_rate": 1.926992158720058e-06, + "loss": 0.7045505, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 8753, + "time_per_iteration": 2.4235689640045166 + }, + { + "auxiliary_loss_clip": 0.01058512, + "auxiliary_loss_mlp": 0.0104702, + "balance_loss_clip": 1.02397704, + "balance_loss_mlp": 1.02122831, + "epoch": 0.5263189538554036, + "flos": 21758709703680.0, + "grad_norm": 1.4941498728070772, + "language_loss": 0.84941691, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.87047231, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37304688, + "step": 8754, + "time_per_iteration": 2.404838800430298 + }, + { + "auxiliary_loss_clip": 0.01060328, + "auxiliary_loss_mlp": 0.01043791, + "balance_loss_clip": 1.01973426, + "balance_loss_mlp": 1.0212115, + "epoch": 0.5263790771080715, + "flos": 14275670601600.0, + "grad_norm": 1.8314818885106496, + "language_loss": 0.88412648, + "learning_rate": 1.926213760058522e-06, + "loss": 0.90516764, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.390625, + "step": 8755, + "time_per_iteration": 2.351964235305786 + }, + { + "auxiliary_loss_clip": 0.01017915, + "auxiliary_loss_mlp": 0.01007243, + "balance_loss_clip": 1.00488257, + "balance_loss_mlp": 1.01041031, + "epoch": 0.5264392003607395, + "flos": 65802932749440.0, + "grad_norm": 1.0386365409873706, + "language_loss": 0.58940661, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60965812, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07519531, + "step": 8756, + "time_per_iteration": 3.092277765274048 + }, + { + "auxiliary_loss_clip": 0.01062733, + "auxiliary_loss_mlp": 0.01039129, + "balance_loss_clip": 1.01444101, + "balance_loss_mlp": 1.02263546, + "epoch": 0.5264993236134075, + "flos": 21031364016000.0, + "grad_norm": 1.7845334963678385, + "language_loss": 0.7136035, + "learning_rate": 1.925435372588913e-06, + "loss": 0.73462212, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.40039062, + "step": 8757, + "time_per_iteration": 3.681122064590454 + }, + { + "auxiliary_loss_clip": 0.01063399, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_clip": 1.01976359, + "balance_loss_mlp": 1.02334428, + "epoch": 0.5265594468660755, + "flos": 16617260044800.0, + "grad_norm": 1.5289496053062177, + "language_loss": 0.88919771, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.91029066, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 8758, + "time_per_iteration": 2.380053758621216 + }, + { + "auxiliary_loss_clip": 0.0106221, + "auxiliary_loss_mlp": 0.01043696, + "balance_loss_clip": 1.01650417, + "balance_loss_mlp": 1.02185965, + "epoch": 0.5266195701187434, + "flos": 24132978046080.0, + "grad_norm": 2.781284707831141, + "language_loss": 0.77075654, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.79181564, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 8759, + "time_per_iteration": 3.9452333450317383 + }, + { + "auxiliary_loss_clip": 0.01060951, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.01611876, + "balance_loss_mlp": 1.02289939, + "epoch": 0.5266796933714114, + "flos": 15843410559360.0, + "grad_norm": 2.7964519017675427, + "language_loss": 0.72968918, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.75070101, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38085938, + "step": 8760, + "time_per_iteration": 3.7207367420196533 + }, + { + "auxiliary_loss_clip": 0.01065175, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_clip": 1.01930261, + "balance_loss_mlp": 1.02468121, + "epoch": 0.5267398166240793, + "flos": 20950610307840.0, + "grad_norm": 2.1157488582224486, + "language_loss": 0.78108966, + "learning_rate": 1.923878631697736e-06, + "loss": 0.80219352, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40625, + "step": 8761, + "time_per_iteration": 2.3809332847595215 + }, + { + "auxiliary_loss_clip": 0.01064633, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.01916623, + "balance_loss_mlp": 1.02669811, + "epoch": 0.5267999398767473, + "flos": 20995333626240.0, + "grad_norm": 1.7358435903627882, + "language_loss": 0.71560919, + "learning_rate": 1.923489453654373e-06, + "loss": 0.73668027, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 8762, + "time_per_iteration": 2.4480535984039307 + }, + { + "auxiliary_loss_clip": 0.01025891, + "auxiliary_loss_mlp": 0.01024549, + "balance_loss_clip": 1.0218668, + "balance_loss_mlp": 1.01834786, + "epoch": 0.5268600631294152, + "flos": 66846312864000.0, + "grad_norm": 0.9306984605231466, + "language_loss": 0.65718877, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67769313, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.07519531, + "step": 8763, + "time_per_iteration": 2.926093339920044 + }, + { + "auxiliary_loss_clip": 0.0106348, + "auxiliary_loss_mlp": 0.01043581, + "balance_loss_clip": 1.01758146, + "balance_loss_mlp": 1.02470195, + "epoch": 0.5269201863820833, + "flos": 17164593429120.0, + "grad_norm": 2.0015222711613245, + "language_loss": 0.72594279, + "learning_rate": 1.922711106286265e-06, + "loss": 0.74701345, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 8764, + "time_per_iteration": 2.400773525238037 + }, + { + "auxiliary_loss_clip": 0.01064299, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.01645756, + "balance_loss_mlp": 1.0248183, + "epoch": 0.5269803096347513, + "flos": 20521527298560.0, + "grad_norm": 1.8224268175574079, + "language_loss": 0.7546376, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.77571309, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 8765, + "time_per_iteration": 2.4014012813568115 + }, + { + "auxiliary_loss_clip": 0.01063035, + "auxiliary_loss_mlp": 0.01053533, + "balance_loss_clip": 1.02570939, + "balance_loss_mlp": 1.02259564, + "epoch": 0.5270404328874192, + "flos": 27229879042560.0, + "grad_norm": 1.4477945558839704, + "language_loss": 0.86199421, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.88315988, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40429688, + "step": 8766, + "time_per_iteration": 2.4741854667663574 + }, + { + "auxiliary_loss_clip": 0.01062726, + "auxiliary_loss_mlp": 0.01056814, + "balance_loss_clip": 1.0302304, + "balance_loss_mlp": 1.02284098, + "epoch": 0.5271005561400872, + "flos": 23109429450240.0, + "grad_norm": 1.665646772679221, + "language_loss": 0.80079776, + "learning_rate": 1.921543607252017e-06, + "loss": 0.82199311, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 8767, + "time_per_iteration": 3.841486930847168 + }, + { + "auxiliary_loss_clip": 0.01065866, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.02527404, + "epoch": 0.5271606793927551, + "flos": 22563701988480.0, + "grad_norm": 1.9369669914269416, + "language_loss": 0.74536908, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.76657015, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40625, + "step": 8768, + "time_per_iteration": 2.41845703125 + }, + { + "auxiliary_loss_clip": 0.01060383, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02457309, + "balance_loss_mlp": 1.02237427, + "epoch": 0.5272208026454231, + "flos": 18763441274880.0, + "grad_norm": 1.933059860605097, + "language_loss": 0.75451422, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.77559453, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37890625, + "step": 8769, + "time_per_iteration": 2.366391658782959 + }, + { + "auxiliary_loss_clip": 0.0105946, + "auxiliary_loss_mlp": 0.01057508, + "balance_loss_clip": 1.03345132, + "balance_loss_mlp": 1.02111757, + "epoch": 0.5272809258980911, + "flos": 20411131979520.0, + "grad_norm": 1.6141404870747005, + "language_loss": 0.75011218, + "learning_rate": 1.920376134993436e-06, + "loss": 0.77128184, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 8770, + "time_per_iteration": 2.4181125164031982 + }, + { + "auxiliary_loss_clip": 0.01061141, + "auxiliary_loss_mlp": 0.01059539, + "balance_loss_clip": 1.03221619, + "balance_loss_mlp": 1.02106392, + "epoch": 0.5273410491507591, + "flos": 28255487408640.0, + "grad_norm": 1.6926271326134057, + "language_loss": 0.70028627, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.72149301, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 8771, + "time_per_iteration": 2.4243364334106445 + }, + { + "auxiliary_loss_clip": 0.01056843, + "auxiliary_loss_mlp": 0.01058989, + "balance_loss_clip": 1.03421688, + "balance_loss_mlp": 1.01862288, + "epoch": 0.527401172403427, + "flos": 22454074719360.0, + "grad_norm": 1.919680573082745, + "language_loss": 0.77613837, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.79729676, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 8772, + "time_per_iteration": 2.4343321323394775 + }, + { + "auxiliary_loss_clip": 0.01058977, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03250158, + "balance_loss_mlp": 1.01935434, + "epoch": 0.527461295656095, + "flos": 21030072295680.0, + "grad_norm": 1.8530392064491314, + "language_loss": 0.67408526, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.69527978, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.39648438, + "step": 8773, + "time_per_iteration": 2.398852825164795 + }, + { + "auxiliary_loss_clip": 0.01059005, + "auxiliary_loss_mlp": 0.01055863, + "balance_loss_clip": 1.02879083, + "balance_loss_mlp": 1.01836252, + "epoch": 0.5275214189087629, + "flos": 26320845306240.0, + "grad_norm": 1.7256670890012678, + "language_loss": 0.86780375, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88895243, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 8774, + "time_per_iteration": 2.5173864364624023 + }, + { + "auxiliary_loss_clip": 0.01058144, + "auxiliary_loss_mlp": 0.01059481, + "balance_loss_clip": 1.03369617, + "balance_loss_mlp": 1.01859272, + "epoch": 0.5275815421614309, + "flos": 20046010314240.0, + "grad_norm": 1.4886581558670002, + "language_loss": 0.80844235, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82961857, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 8775, + "time_per_iteration": 2.415989398956299 + }, + { + "auxiliary_loss_clip": 0.01056733, + "auxiliary_loss_mlp": 0.01060291, + "balance_loss_clip": 1.0339458, + "balance_loss_mlp": 1.01850724, + "epoch": 0.5276416654140988, + "flos": 21431189439360.0, + "grad_norm": 1.6657083598587137, + "language_loss": 0.84585285, + "learning_rate": 1.918041272397012e-06, + "loss": 0.86702305, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 8776, + "time_per_iteration": 2.4803123474121094 + }, + { + "auxiliary_loss_clip": 0.01059194, + "auxiliary_loss_mlp": 0.01057592, + "balance_loss_clip": 1.03149748, + "balance_loss_mlp": 1.01932907, + "epoch": 0.5277017886667669, + "flos": 17164139581440.0, + "grad_norm": 1.7829563817489542, + "language_loss": 0.68662083, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70778871, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 8777, + "time_per_iteration": 2.3876562118530273 + }, + { + "auxiliary_loss_clip": 0.01059721, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.03155899, + "balance_loss_mlp": 1.02093685, + "epoch": 0.5277619119194349, + "flos": 20447127457920.0, + "grad_norm": 1.5425136144291716, + "language_loss": 0.83212143, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.85329342, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38867188, + "step": 8778, + "time_per_iteration": 2.5151069164276123 + }, + { + "auxiliary_loss_clip": 0.01061193, + "auxiliary_loss_mlp": 0.01055674, + "balance_loss_clip": 1.02839887, + "balance_loss_mlp": 1.02014101, + "epoch": 0.5278220351721028, + "flos": 24059939748480.0, + "grad_norm": 2.008598959560973, + "language_loss": 0.80970883, + "learning_rate": 1.916873882856013e-06, + "loss": 0.83087754, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 8779, + "time_per_iteration": 2.415189027786255 + }, + { + "auxiliary_loss_clip": 0.01059032, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.02509284, + "balance_loss_mlp": 1.02140379, + "epoch": 0.5278821584247708, + "flos": 24641802334080.0, + "grad_norm": 2.6453383344017025, + "language_loss": 0.7830928, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.8041743, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 8780, + "time_per_iteration": 2.4948935508728027 + }, + { + "auxiliary_loss_clip": 0.01066832, + "auxiliary_loss_mlp": 0.01045883, + "balance_loss_clip": 1.0187037, + "balance_loss_mlp": 1.02548337, + "epoch": 0.5279422816774387, + "flos": 35406781971840.0, + "grad_norm": 2.484989147992843, + "language_loss": 0.70239067, + "learning_rate": 1.916095638898174e-06, + "loss": 0.72351784, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.4140625, + "step": 8781, + "time_per_iteration": 2.526517391204834 + }, + { + "auxiliary_loss_clip": 0.01060478, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_clip": 1.02423549, + "balance_loss_mlp": 1.0225687, + "epoch": 0.5280024049301068, + "flos": 22965901384320.0, + "grad_norm": 1.5604329440408664, + "language_loss": 0.73176873, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.75284803, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 8782, + "time_per_iteration": 2.4905261993408203 + }, + { + "auxiliary_loss_clip": 0.01062411, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.01349688, + "balance_loss_mlp": 1.02382493, + "epoch": 0.5280625281827747, + "flos": 21506531886720.0, + "grad_norm": 1.7833002331495102, + "language_loss": 0.69886035, + "learning_rate": 1.915317407666982e-06, + "loss": 0.71988136, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 8783, + "time_per_iteration": 2.402932643890381 + }, + { + "auxiliary_loss_clip": 0.01069091, + "auxiliary_loss_mlp": 0.01048868, + "balance_loss_clip": 1.0157752, + "balance_loss_mlp": 1.02481556, + "epoch": 0.5281226514354427, + "flos": 31206940214400.0, + "grad_norm": 2.110705560758405, + "language_loss": 0.70799989, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.7291795, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.44140625, + "step": 8784, + "time_per_iteration": 2.5252246856689453 + }, + { + "auxiliary_loss_clip": 0.01068306, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.01758683, + "balance_loss_mlp": 1.02483833, + "epoch": 0.5281827746881106, + "flos": 25076785363200.0, + "grad_norm": 2.324072504388123, + "language_loss": 0.77062678, + "learning_rate": 1.91453918928048e-06, + "loss": 0.79179251, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.43359375, + "step": 8785, + "time_per_iteration": 2.419907808303833 + }, + { + "auxiliary_loss_clip": 0.01065551, + "auxiliary_loss_mlp": 0.01042038, + "balance_loss_clip": 1.01470351, + "balance_loss_mlp": 1.02554309, + "epoch": 0.5282428979407786, + "flos": 20630211960960.0, + "grad_norm": 3.2768154774097216, + "language_loss": 0.84453517, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.86561108, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 8786, + "time_per_iteration": 2.442213296890259 + }, + { + "auxiliary_loss_clip": 0.01064467, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.01308596, + "balance_loss_mlp": 1.02704775, + "epoch": 0.5283030211934465, + "flos": 22418288709120.0, + "grad_norm": 2.128652958646841, + "language_loss": 0.83907872, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.86007059, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.375, + "step": 8787, + "time_per_iteration": 2.4233736991882324 + }, + { + "auxiliary_loss_clip": 0.01065757, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.01061606, + "balance_loss_mlp": 1.02687979, + "epoch": 0.5283631444461145, + "flos": 23614553134080.0, + "grad_norm": 1.6641471404157684, + "language_loss": 0.84220195, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.86319351, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.38867188, + "step": 8788, + "time_per_iteration": 2.42002010345459 + }, + { + "auxiliary_loss_clip": 0.01067266, + "auxiliary_loss_mlp": 0.01045869, + "balance_loss_clip": 1.01980996, + "balance_loss_mlp": 1.02822781, + "epoch": 0.5284232676987825, + "flos": 32670603809280.0, + "grad_norm": 1.5966751274371047, + "language_loss": 0.75749522, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77862656, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 8789, + "time_per_iteration": 2.5190012454986572 + }, + { + "auxiliary_loss_clip": 0.01067266, + "auxiliary_loss_mlp": 0.01047504, + "balance_loss_clip": 1.02337623, + "balance_loss_mlp": 1.02636707, + "epoch": 0.5284833909514505, + "flos": 26759703496320.0, + "grad_norm": 1.5060037095994725, + "language_loss": 0.70959944, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.73074716, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.41015625, + "step": 8790, + "time_per_iteration": 2.484835624694824 + }, + { + "auxiliary_loss_clip": 0.0106468, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.01433587, + "balance_loss_mlp": 1.02648902, + "epoch": 0.5285435142041185, + "flos": 22089616369920.0, + "grad_norm": 1.4734888707081581, + "language_loss": 0.79399848, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81501043, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3828125, + "step": 8791, + "time_per_iteration": 2.407970666885376 + }, + { + "auxiliary_loss_clip": 0.0106361, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.01660633, + "balance_loss_mlp": 1.02611017, + "epoch": 0.5286036374567864, + "flos": 20374438273920.0, + "grad_norm": 2.0906532939480527, + "language_loss": 0.67280078, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.69383144, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.375, + "step": 8792, + "time_per_iteration": 2.42397403717041 + }, + { + "auxiliary_loss_clip": 0.01060102, + "auxiliary_loss_mlp": 0.01042982, + "balance_loss_clip": 1.01862729, + "balance_loss_mlp": 1.02271152, + "epoch": 0.5286637607094544, + "flos": 24351045598080.0, + "grad_norm": 1.920259381284063, + "language_loss": 0.80866086, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82969165, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 8793, + "time_per_iteration": 2.4125218391418457 + }, + { + "auxiliary_loss_clip": 0.01064488, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02037024, + "balance_loss_mlp": 1.02479768, + "epoch": 0.5287238839621223, + "flos": 17270310625920.0, + "grad_norm": 2.0510743220721874, + "language_loss": 0.85873467, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.87983882, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39648438, + "step": 8794, + "time_per_iteration": 2.3777575492858887 + }, + { + "auxiliary_loss_clip": 0.0106552, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.02631629, + "balance_loss_mlp": 1.02322435, + "epoch": 0.5287840072147904, + "flos": 17565920040960.0, + "grad_norm": 2.3136085647910094, + "language_loss": 0.69693053, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.71812248, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.421875, + "step": 8795, + "time_per_iteration": 2.376643180847168 + }, + { + "auxiliary_loss_clip": 0.01062919, + "auxiliary_loss_mlp": 0.01049026, + "balance_loss_clip": 1.02363467, + "balance_loss_mlp": 1.02226162, + "epoch": 0.5288441304674583, + "flos": 18551099185920.0, + "grad_norm": 1.9631779135278333, + "language_loss": 0.81858176, + "learning_rate": 1.910259223028374e-06, + "loss": 0.83970118, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40625, + "step": 8796, + "time_per_iteration": 2.364229917526245 + }, + { + "auxiliary_loss_clip": 0.01063045, + "auxiliary_loss_mlp": 0.01051225, + "balance_loss_clip": 1.02541661, + "balance_loss_mlp": 1.02232206, + "epoch": 0.5289042537201263, + "flos": 20813436109440.0, + "grad_norm": 1.792741703003455, + "language_loss": 0.70644498, + "learning_rate": 1.909870155310071e-06, + "loss": 0.7275877, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40625, + "step": 8797, + "time_per_iteration": 3.6795458793640137 + }, + { + "auxiliary_loss_clip": 0.01058874, + "auxiliary_loss_mlp": 0.01041476, + "balance_loss_clip": 1.01792026, + "balance_loss_mlp": 1.02183175, + "epoch": 0.5289643769727942, + "flos": 15734551340160.0, + "grad_norm": 1.480876460202256, + "language_loss": 0.82950515, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.85050869, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37109375, + "step": 8798, + "time_per_iteration": 3.802950382232666 + }, + { + "auxiliary_loss_clip": 0.01062459, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_clip": 1.02051377, + "balance_loss_mlp": 1.02078056, + "epoch": 0.5290245002254622, + "flos": 19536278330880.0, + "grad_norm": 1.6816718788712157, + "language_loss": 0.71895194, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.74004734, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.41601562, + "step": 8799, + "time_per_iteration": 2.371108055114746 + }, + { + "auxiliary_loss_clip": 0.01057304, + "auxiliary_loss_mlp": 0.01043979, + "balance_loss_clip": 1.02066159, + "balance_loss_mlp": 1.02023578, + "epoch": 0.5290846234781301, + "flos": 15814222796160.0, + "grad_norm": 1.7925059576958258, + "language_loss": 0.70702004, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.72803289, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 8800, + "time_per_iteration": 3.880500555038452 + }, + { + "auxiliary_loss_clip": 0.01015275, + "auxiliary_loss_mlp": 0.01006245, + "balance_loss_clip": 1.00408745, + "balance_loss_mlp": 1.0080241, + "epoch": 0.5291447467307981, + "flos": 70054516874880.0, + "grad_norm": 0.9608730887301785, + "language_loss": 0.57017988, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59039509, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.07226562, + "step": 8801, + "time_per_iteration": 2.938629627227783 + }, + { + "auxiliary_loss_clip": 0.0105947, + "auxiliary_loss_mlp": 0.01055472, + "balance_loss_clip": 1.02814889, + "balance_loss_mlp": 1.02024174, + "epoch": 0.529204869983466, + "flos": 28362985084800.0, + "grad_norm": 1.7299710673553594, + "language_loss": 0.65102643, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.67217582, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39257812, + "step": 8802, + "time_per_iteration": 2.440952777862549 + }, + { + "auxiliary_loss_clip": 0.01056511, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_clip": 1.01933849, + "balance_loss_mlp": 1.0180068, + "epoch": 0.5292649932361341, + "flos": 33757624990080.0, + "grad_norm": 1.6432917605781632, + "language_loss": 0.70227224, + "learning_rate": 1.907535821289003e-06, + "loss": 0.7232815, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 8803, + "time_per_iteration": 2.556483268737793 + }, + { + "auxiliary_loss_clip": 0.01056463, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.01506233, + "balance_loss_mlp": 1.01854479, + "epoch": 0.5293251164888021, + "flos": 20446673610240.0, + "grad_norm": 1.6514442618010206, + "language_loss": 0.77543199, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.79640281, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 8804, + "time_per_iteration": 2.3862838745117188 + }, + { + "auxiliary_loss_clip": 0.01012112, + "auxiliary_loss_mlp": 0.01002135, + "balance_loss_clip": 0.99945235, + "balance_loss_mlp": 1.00482976, + "epoch": 0.52938523974147, + "flos": 66541554806400.0, + "grad_norm": 0.7588570171223493, + "language_loss": 0.53083634, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55097878, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.07275391, + "step": 8805, + "time_per_iteration": 3.136951208114624 + }, + { + "auxiliary_loss_clip": 0.01011555, + "auxiliary_loss_mlp": 0.01003059, + "balance_loss_clip": 1.00044847, + "balance_loss_mlp": 1.00426817, + "epoch": 0.529445362994138, + "flos": 67148345969280.0, + "grad_norm": 0.7431702372449289, + "language_loss": 0.6392709, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65941703, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.07275391, + "step": 8806, + "time_per_iteration": 3.064457416534424 + }, + { + "auxiliary_loss_clip": 0.01060845, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.01584101, + "balance_loss_mlp": 1.01929927, + "epoch": 0.5295054862468059, + "flos": 17748341228160.0, + "grad_norm": 1.477192008012668, + "language_loss": 0.73326373, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.75428826, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41601562, + "step": 8807, + "time_per_iteration": 3.864272117614746 + }, + { + "auxiliary_loss_clip": 0.01057113, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_clip": 1.02161813, + "balance_loss_mlp": 1.01923561, + "epoch": 0.529565609499474, + "flos": 11396697511680.0, + "grad_norm": 2.218606984838343, + "language_loss": 0.71464831, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.73567796, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 8808, + "time_per_iteration": 2.394089698791504 + }, + { + "auxiliary_loss_clip": 0.01057632, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.02007771, + "balance_loss_mlp": 1.01961803, + "epoch": 0.5296257327521419, + "flos": 17195561671680.0, + "grad_norm": 1.7140598009302366, + "language_loss": 0.88453537, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.90553808, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.37890625, + "step": 8809, + "time_per_iteration": 2.4058609008789062 + }, + { + "auxiliary_loss_clip": 0.01062807, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_clip": 1.02422369, + "balance_loss_mlp": 1.02155709, + "epoch": 0.5296858560048099, + "flos": 39962633529600.0, + "grad_norm": 1.6740658952566916, + "language_loss": 0.64925444, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.67041427, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 8810, + "time_per_iteration": 2.5711207389831543 + }, + { + "auxiliary_loss_clip": 0.01057191, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.01825166, + "balance_loss_mlp": 1.02004576, + "epoch": 0.5297459792574778, + "flos": 20960315665920.0, + "grad_norm": 1.5661132725050455, + "language_loss": 0.68302178, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.7040149, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 8811, + "time_per_iteration": 2.438230514526367 + }, + { + "auxiliary_loss_clip": 0.01016896, + "auxiliary_loss_mlp": 0.01009305, + "balance_loss_clip": 1.00670671, + "balance_loss_mlp": 1.00965142, + "epoch": 0.5298061025101458, + "flos": 66520468344960.0, + "grad_norm": 0.6638883443666355, + "language_loss": 0.53449357, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55475557, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.07226562, + "step": 8812, + "time_per_iteration": 3.1377737522125244 + }, + { + "auxiliary_loss_clip": 0.01017433, + "auxiliary_loss_mlp": 0.01017581, + "balance_loss_clip": 1.01482737, + "balance_loss_mlp": 1.01027989, + "epoch": 0.5298662257628137, + "flos": 67659579141120.0, + "grad_norm": 0.735726061196352, + "language_loss": 0.56359839, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58394861, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.07128906, + "step": 8813, + "time_per_iteration": 3.1136279106140137 + }, + { + "auxiliary_loss_clip": 0.01058098, + "auxiliary_loss_mlp": 0.01049, + "balance_loss_clip": 1.02568269, + "balance_loss_mlp": 1.0221976, + "epoch": 0.5299263490154817, + "flos": 19645381929600.0, + "grad_norm": 1.5638958517568933, + "language_loss": 0.82964516, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.85071611, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 8814, + "time_per_iteration": 2.4017255306243896 + }, + { + "auxiliary_loss_clip": 0.01064433, + "auxiliary_loss_mlp": 0.01044311, + "balance_loss_clip": 1.01913381, + "balance_loss_mlp": 1.02474284, + "epoch": 0.5299864722681497, + "flos": 22053900182400.0, + "grad_norm": 1.5213610480665616, + "language_loss": 0.86071992, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.88180739, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39648438, + "step": 8815, + "time_per_iteration": 2.407411575317383 + }, + { + "auxiliary_loss_clip": 0.01060621, + "auxiliary_loss_mlp": 0.0104284, + "balance_loss_clip": 1.01918876, + "balance_loss_mlp": 1.02317059, + "epoch": 0.5300465955208177, + "flos": 21762584864640.0, + "grad_norm": 1.9172934324757265, + "language_loss": 0.67519611, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.69623065, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 8816, + "time_per_iteration": 2.450045108795166 + }, + { + "auxiliary_loss_clip": 0.01065068, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.01661301, + "balance_loss_mlp": 1.02574062, + "epoch": 0.5301067187734857, + "flos": 42994840043520.0, + "grad_norm": 1.5416065044464464, + "language_loss": 0.73338866, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.75445294, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39453125, + "step": 8817, + "time_per_iteration": 2.583049774169922 + }, + { + "auxiliary_loss_clip": 0.01063232, + "auxiliary_loss_mlp": 0.01042677, + "balance_loss_clip": 1.01764309, + "balance_loss_mlp": 1.02422059, + "epoch": 0.5301668420261536, + "flos": 20553368325120.0, + "grad_norm": 1.7391512623309993, + "language_loss": 0.65849602, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67955518, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 8818, + "time_per_iteration": 2.460122585296631 + }, + { + "auxiliary_loss_clip": 0.01063157, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.01244056, + "balance_loss_mlp": 1.02454138, + "epoch": 0.5302269652788216, + "flos": 17485899471360.0, + "grad_norm": 1.9157406818381337, + "language_loss": 0.75755465, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77857172, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 8819, + "time_per_iteration": 2.3741085529327393 + }, + { + "auxiliary_loss_clip": 0.01064575, + "auxiliary_loss_mlp": 0.01049372, + "balance_loss_clip": 1.02311039, + "balance_loss_mlp": 1.02395272, + "epoch": 0.5302870885314895, + "flos": 14573339786880.0, + "grad_norm": 1.9928780209130403, + "language_loss": 0.83864629, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.85978574, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 8820, + "time_per_iteration": 2.3901596069335938 + }, + { + "auxiliary_loss_clip": 0.01060951, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.01663184, + "balance_loss_mlp": 1.02257657, + "epoch": 0.5303472117841576, + "flos": 23436984625920.0, + "grad_norm": 2.4774293917992427, + "language_loss": 0.72966605, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.75067735, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3828125, + "step": 8821, + "time_per_iteration": 2.4542064666748047 + }, + { + "auxiliary_loss_clip": 0.01059494, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.01049733, + "balance_loss_mlp": 1.02293777, + "epoch": 0.5304073350368255, + "flos": 22707963192960.0, + "grad_norm": 1.4568779073272584, + "language_loss": 0.75382578, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.77474934, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36523438, + "step": 8822, + "time_per_iteration": 2.435206890106201 + }, + { + "auxiliary_loss_clip": 0.01060444, + "auxiliary_loss_mlp": 0.0104355, + "balance_loss_clip": 1.01807475, + "balance_loss_mlp": 1.02248263, + "epoch": 0.5304674582894935, + "flos": 27927303828480.0, + "grad_norm": 1.5885946231415418, + "language_loss": 0.68194342, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.70298338, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 8823, + "time_per_iteration": 2.4676249027252197 + }, + { + "auxiliary_loss_clip": 0.01063256, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.01289701, + "balance_loss_mlp": 1.02334809, + "epoch": 0.5305275815421614, + "flos": 21249606124800.0, + "grad_norm": 1.6231188707075235, + "language_loss": 0.70528579, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.72631955, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3984375, + "step": 8824, + "time_per_iteration": 2.451826572418213 + }, + { + "auxiliary_loss_clip": 0.01059364, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_clip": 1.02136147, + "balance_loss_mlp": 1.02277458, + "epoch": 0.5305877047948294, + "flos": 17602124987520.0, + "grad_norm": 3.7497419075933385, + "language_loss": 0.77654326, + "learning_rate": 1.898977700702689e-06, + "loss": 0.79758084, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 8825, + "time_per_iteration": 2.3647689819335938 + }, + { + "auxiliary_loss_clip": 0.0106036, + "auxiliary_loss_mlp": 0.01047551, + "balance_loss_clip": 1.02258897, + "balance_loss_mlp": 1.02248597, + "epoch": 0.5306478280474973, + "flos": 15194584252800.0, + "grad_norm": 1.7732730682069018, + "language_loss": 0.86219126, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.88327038, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37890625, + "step": 8826, + "time_per_iteration": 2.4161453247070312 + }, + { + "auxiliary_loss_clip": 0.01057324, + "auxiliary_loss_mlp": 0.0104152, + "balance_loss_clip": 1.01996744, + "balance_loss_mlp": 1.02096653, + "epoch": 0.5307079513001653, + "flos": 15340311734400.0, + "grad_norm": 1.3985904453167153, + "language_loss": 0.65555811, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.67654657, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36328125, + "step": 8827, + "time_per_iteration": 2.3964104652404785 + }, + { + "auxiliary_loss_clip": 0.01060131, + "auxiliary_loss_mlp": 0.01048351, + "balance_loss_clip": 1.02412796, + "balance_loss_mlp": 1.02124083, + "epoch": 0.5307680745528333, + "flos": 43542766920960.0, + "grad_norm": 1.547113097641632, + "language_loss": 0.61446357, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.63554835, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38867188, + "step": 8828, + "time_per_iteration": 2.6076481342315674 + }, + { + "auxiliary_loss_clip": 0.01058252, + "auxiliary_loss_mlp": 0.01050409, + "balance_loss_clip": 1.02624512, + "balance_loss_mlp": 1.01976371, + "epoch": 0.5308281978055013, + "flos": 20047860616320.0, + "grad_norm": 1.5613820157540044, + "language_loss": 0.8223874, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.84347397, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38476562, + "step": 8829, + "time_per_iteration": 2.4185941219329834 + }, + { + "auxiliary_loss_clip": 0.01054938, + "auxiliary_loss_mlp": 0.01049376, + "balance_loss_clip": 1.02677357, + "balance_loss_mlp": 1.01920879, + "epoch": 0.5308883210581693, + "flos": 20702901144960.0, + "grad_norm": 1.4946404527831225, + "language_loss": 0.79079181, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.81183493, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 8830, + "time_per_iteration": 2.443601131439209 + }, + { + "auxiliary_loss_clip": 0.01055691, + "auxiliary_loss_mlp": 0.0104716, + "balance_loss_clip": 1.02342534, + "balance_loss_mlp": 1.0191803, + "epoch": 0.5309484443108372, + "flos": 14354643830400.0, + "grad_norm": 1.9143193050059826, + "language_loss": 0.81340587, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83443439, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 8831, + "time_per_iteration": 2.3900814056396484 + }, + { + "auxiliary_loss_clip": 0.01055632, + "auxiliary_loss_mlp": 0.01054774, + "balance_loss_clip": 1.03000259, + "balance_loss_mlp": 1.01835346, + "epoch": 0.5310085675635052, + "flos": 20009491165440.0, + "grad_norm": 1.8916516066050082, + "language_loss": 0.74651122, + "learning_rate": 1.896255043672186e-06, + "loss": 0.76761532, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 8832, + "time_per_iteration": 2.4469454288482666 + }, + { + "auxiliary_loss_clip": 0.01059842, + "auxiliary_loss_mlp": 0.01058549, + "balance_loss_clip": 1.03340805, + "balance_loss_mlp": 1.02107048, + "epoch": 0.5310686908161731, + "flos": 22126205341440.0, + "grad_norm": 2.094123761504719, + "language_loss": 0.76921493, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.79039884, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38867188, + "step": 8833, + "time_per_iteration": 2.3871819972991943 + }, + { + "auxiliary_loss_clip": 0.01059713, + "auxiliary_loss_mlp": 0.01049213, + "balance_loss_clip": 1.02380967, + "balance_loss_mlp": 1.02042353, + "epoch": 0.5311288140688412, + "flos": 24716725845120.0, + "grad_norm": 1.6433434598810113, + "language_loss": 0.74611098, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.76720023, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 8834, + "time_per_iteration": 2.4571337699890137 + }, + { + "auxiliary_loss_clip": 0.01062626, + "auxiliary_loss_mlp": 0.01049154, + "balance_loss_clip": 1.02252269, + "balance_loss_mlp": 1.02049017, + "epoch": 0.5311889373215091, + "flos": 24096563631360.0, + "grad_norm": 1.7558438479762275, + "language_loss": 0.79309857, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.81421632, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 8835, + "time_per_iteration": 2.4141416549682617 + }, + { + "auxiliary_loss_clip": 0.01057825, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_clip": 1.01859128, + "balance_loss_mlp": 1.01966023, + "epoch": 0.5312490605741771, + "flos": 22015949667840.0, + "grad_norm": 1.6332604849994858, + "language_loss": 0.73422229, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.75523627, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 8836, + "time_per_iteration": 3.7075467109680176 + }, + { + "auxiliary_loss_clip": 0.0106039, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.02059531, + "epoch": 0.531309183826845, + "flos": 19389538419840.0, + "grad_norm": 1.8177015382597919, + "language_loss": 0.81933516, + "learning_rate": 1.894310406375987e-06, + "loss": 0.84039825, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 8837, + "time_per_iteration": 2.3757455348968506 + }, + { + "auxiliary_loss_clip": 0.01058942, + "auxiliary_loss_mlp": 0.01045184, + "balance_loss_clip": 1.01824284, + "balance_loss_mlp": 1.02108049, + "epoch": 0.531369307079513, + "flos": 20189119443840.0, + "grad_norm": 2.0824916080175124, + "language_loss": 0.8668617, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88790298, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 8838, + "time_per_iteration": 3.782651662826538 + }, + { + "auxiliary_loss_clip": 0.01059842, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.01656222, + "balance_loss_mlp": 1.02237415, + "epoch": 0.5314294303321809, + "flos": 18879143120640.0, + "grad_norm": 1.87373645089244, + "language_loss": 0.7378245, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75882745, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 8839, + "time_per_iteration": 3.7373855113983154 + }, + { + "auxiliary_loss_clip": 0.01061048, + "auxiliary_loss_mlp": 0.0104345, + "balance_loss_clip": 1.0188098, + "balance_loss_mlp": 1.02147341, + "epoch": 0.531489553584849, + "flos": 23038904770560.0, + "grad_norm": 1.5604271507126428, + "language_loss": 0.77475286, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79579788, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39648438, + "step": 8840, + "time_per_iteration": 2.3892595767974854 + }, + { + "auxiliary_loss_clip": 0.01062779, + "auxiliary_loss_mlp": 0.01044289, + "balance_loss_clip": 1.01759768, + "balance_loss_mlp": 1.02286577, + "epoch": 0.5315496768375169, + "flos": 19789503488640.0, + "grad_norm": 1.9139129823169478, + "language_loss": 0.7857036, + "learning_rate": 1.892754768590216e-06, + "loss": 0.80677426, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3984375, + "step": 8841, + "time_per_iteration": 2.3851101398468018 + }, + { + "auxiliary_loss_clip": 0.01026071, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.0322485, + "balance_loss_mlp": 1.01837087, + "epoch": 0.5316098000901849, + "flos": 71019620121600.0, + "grad_norm": 0.7128305122710047, + "language_loss": 0.56920904, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58981621, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.07714844, + "step": 8842, + "time_per_iteration": 3.1683902740478516 + }, + { + "auxiliary_loss_clip": 0.01065711, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.01336336, + "balance_loss_mlp": 1.02540922, + "epoch": 0.5316699233428529, + "flos": 16434629389440.0, + "grad_norm": 1.9438791093520686, + "language_loss": 0.75480783, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.77586198, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 8843, + "time_per_iteration": 2.367952823638916 + }, + { + "auxiliary_loss_clip": 0.01028046, + "auxiliary_loss_mlp": 0.01012362, + "balance_loss_clip": 1.0097754, + "balance_loss_mlp": 1.02005267, + "epoch": 0.5317300465955208, + "flos": 67417036859520.0, + "grad_norm": 0.8853029183172776, + "language_loss": 0.61115479, + "learning_rate": 1.891588082900145e-06, + "loss": 0.6315589, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.08007812, + "step": 8844, + "time_per_iteration": 3.1023476123809814 + }, + { + "auxiliary_loss_clip": 0.01028287, + "auxiliary_loss_mlp": 0.01005563, + "balance_loss_clip": 1.00328577, + "balance_loss_mlp": 1.02052307, + "epoch": 0.5317901698481888, + "flos": 59505405373440.0, + "grad_norm": 0.8491527716064996, + "language_loss": 0.62294441, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64328289, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.07763672, + "step": 8845, + "time_per_iteration": 3.0294365882873535 + }, + { + "auxiliary_loss_clip": 0.01064986, + "auxiliary_loss_mlp": 0.01050081, + "balance_loss_clip": 1.02374732, + "balance_loss_mlp": 1.02453005, + "epoch": 0.5318502931008567, + "flos": 19128388383360.0, + "grad_norm": 1.8837555701047806, + "language_loss": 0.7772972, + "learning_rate": 1.890810312970474e-06, + "loss": 0.79844785, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40429688, + "step": 8846, + "time_per_iteration": 3.8905065059661865 + }, + { + "auxiliary_loss_clip": 0.01063652, + "auxiliary_loss_mlp": 0.01056518, + "balance_loss_clip": 1.03148437, + "balance_loss_mlp": 1.02415895, + "epoch": 0.5319104163535248, + "flos": 24679892494080.0, + "grad_norm": 1.6153446697775842, + "language_loss": 0.76336938, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.78457105, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39453125, + "step": 8847, + "time_per_iteration": 2.451061725616455 + }, + { + "auxiliary_loss_clip": 0.01063547, + "auxiliary_loss_mlp": 0.01057418, + "balance_loss_clip": 1.03375459, + "balance_loss_mlp": 1.02407646, + "epoch": 0.5319705396061927, + "flos": 19384650829440.0, + "grad_norm": 1.6000816393709836, + "language_loss": 0.88560414, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.9068138, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.39453125, + "step": 8848, + "time_per_iteration": 2.377861499786377 + }, + { + "auxiliary_loss_clip": 0.01064002, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.0368613, + "balance_loss_mlp": 1.02381003, + "epoch": 0.5320306628588607, + "flos": 18258352502400.0, + "grad_norm": 2.4054855453451673, + "language_loss": 0.75952625, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.7808181, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40234375, + "step": 8849, + "time_per_iteration": 2.367537498474121 + }, + { + "auxiliary_loss_clip": 0.01065327, + "auxiliary_loss_mlp": 0.01072916, + "balance_loss_clip": 1.04558146, + "balance_loss_mlp": 1.02288961, + "epoch": 0.5320907861115286, + "flos": 23731197586560.0, + "grad_norm": 1.8491787911045516, + "language_loss": 0.81118202, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.83256447, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.42382812, + "step": 8850, + "time_per_iteration": 2.45816707611084 + }, + { + "auxiliary_loss_clip": 0.01061957, + "auxiliary_loss_mlp": 0.0108418, + "balance_loss_clip": 1.05982566, + "balance_loss_mlp": 1.02199626, + "epoch": 0.5321509093641966, + "flos": 34493838163200.0, + "grad_norm": 1.413812688483363, + "language_loss": 0.55373549, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57519686, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.40039062, + "step": 8851, + "time_per_iteration": 2.52815580368042 + }, + { + "auxiliary_loss_clip": 0.01063191, + "auxiliary_loss_mlp": 0.01071621, + "balance_loss_clip": 1.04625344, + "balance_loss_mlp": 1.02245498, + "epoch": 0.5322110326168645, + "flos": 20009910101760.0, + "grad_norm": 1.6223615300649814, + "language_loss": 0.69457793, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.71592605, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40625, + "step": 8852, + "time_per_iteration": 2.4178266525268555 + }, + { + "auxiliary_loss_clip": 0.01018948, + "auxiliary_loss_mlp": 0.0105745, + "balance_loss_clip": 1.05455351, + "balance_loss_mlp": 1.01103234, + "epoch": 0.5322711558695326, + "flos": 64627931208960.0, + "grad_norm": 0.8501394306061668, + "language_loss": 0.63128453, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.65204853, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.07910156, + "step": 8853, + "time_per_iteration": 2.980318546295166 + }, + { + "auxiliary_loss_clip": 0.01061991, + "auxiliary_loss_mlp": 0.01070419, + "balance_loss_clip": 1.04420471, + "balance_loss_mlp": 1.02035654, + "epoch": 0.5323312791222005, + "flos": 14938461452160.0, + "grad_norm": 2.124012533662979, + "language_loss": 0.80760837, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.82893252, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41601562, + "step": 8854, + "time_per_iteration": 2.3557727336883545 + }, + { + "auxiliary_loss_clip": 0.01056003, + "auxiliary_loss_mlp": 0.01056646, + "balance_loss_clip": 1.03293478, + "balance_loss_mlp": 1.01897204, + "epoch": 0.5323914023748685, + "flos": 23439707712000.0, + "grad_norm": 1.8799614699362066, + "language_loss": 0.75067246, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.77179885, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 8855, + "time_per_iteration": 2.4229915142059326 + }, + { + "auxiliary_loss_clip": 0.01058597, + "auxiliary_loss_mlp": 0.01054917, + "balance_loss_clip": 1.03065777, + "balance_loss_mlp": 1.01924467, + "epoch": 0.5324515256275365, + "flos": 26284989473280.0, + "grad_norm": 1.9966086426987635, + "language_loss": 0.66545069, + "learning_rate": 1.886921714110507e-06, + "loss": 0.68658584, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.39453125, + "step": 8856, + "time_per_iteration": 2.469897508621216 + }, + { + "auxiliary_loss_clip": 0.0106229, + "auxiliary_loss_mlp": 0.01060219, + "balance_loss_clip": 1.03112006, + "balance_loss_mlp": 1.02074814, + "epoch": 0.5325116488802044, + "flos": 26869679879040.0, + "grad_norm": 2.953929306947869, + "language_loss": 0.78457052, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.80579561, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41601562, + "step": 8857, + "time_per_iteration": 2.4262306690216064 + }, + { + "auxiliary_loss_clip": 0.01061951, + "auxiliary_loss_mlp": 0.01049219, + "balance_loss_clip": 1.02344561, + "balance_loss_mlp": 1.02198148, + "epoch": 0.5325717721328724, + "flos": 25883558127360.0, + "grad_norm": 2.006948592875613, + "language_loss": 0.71515667, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.7362684, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40039062, + "step": 8858, + "time_per_iteration": 2.452889919281006 + }, + { + "auxiliary_loss_clip": 0.01062825, + "auxiliary_loss_mlp": 0.01051456, + "balance_loss_clip": 1.02281034, + "balance_loss_mlp": 1.02274477, + "epoch": 0.5326318953855403, + "flos": 21798231229440.0, + "grad_norm": 1.589773861662681, + "language_loss": 0.70252377, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.72366655, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40039062, + "step": 8859, + "time_per_iteration": 2.4315295219421387 + }, + { + "auxiliary_loss_clip": 0.01060615, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.01590705, + "balance_loss_mlp": 1.02306974, + "epoch": 0.5326920186382084, + "flos": 20921876392320.0, + "grad_norm": 1.4240415589334594, + "language_loss": 0.70712042, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.7281248, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.375, + "step": 8860, + "time_per_iteration": 2.4265458583831787 + }, + { + "auxiliary_loss_clip": 0.01065429, + "auxiliary_loss_mlp": 0.01044272, + "balance_loss_clip": 1.01795053, + "balance_loss_mlp": 1.02629876, + "epoch": 0.5327521418908763, + "flos": 21432376425600.0, + "grad_norm": 1.9121229789274412, + "language_loss": 0.78604925, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80714625, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.390625, + "step": 8861, + "time_per_iteration": 2.4094181060791016 + }, + { + "auxiliary_loss_clip": 0.01065091, + "auxiliary_loss_mlp": 0.0105033, + "balance_loss_clip": 1.02241087, + "balance_loss_mlp": 1.02487087, + "epoch": 0.5328122651435443, + "flos": 21759233374080.0, + "grad_norm": 1.4817956821496452, + "language_loss": 0.86503243, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.8861866, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40234375, + "step": 8862, + "time_per_iteration": 2.4504189491271973 + }, + { + "auxiliary_loss_clip": 0.01067723, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.01829052, + "balance_loss_mlp": 1.02553236, + "epoch": 0.5328723883962122, + "flos": 18295500055680.0, + "grad_norm": 2.1632944180812705, + "language_loss": 0.63500017, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.65615237, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.421875, + "step": 8863, + "time_per_iteration": 2.3840513229370117 + }, + { + "auxiliary_loss_clip": 0.01070303, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.01616716, + "balance_loss_mlp": 1.03008246, + "epoch": 0.5329325116488802, + "flos": 25373721409920.0, + "grad_norm": 1.7982023549850212, + "language_loss": 0.74842638, + "learning_rate": 1.883811143046377e-06, + "loss": 0.76954991, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 8864, + "time_per_iteration": 2.4624276161193848 + }, + { + "auxiliary_loss_clip": 0.01068525, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.01681733, + "balance_loss_mlp": 1.02813613, + "epoch": 0.5329926349015481, + "flos": 25590951089280.0, + "grad_norm": 2.2968095804498603, + "language_loss": 0.65510702, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.6762234, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40429688, + "step": 8865, + "time_per_iteration": 2.448385715484619 + }, + { + "auxiliary_loss_clip": 0.01066121, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.01233876, + "balance_loss_mlp": 1.0261364, + "epoch": 0.5330527581542162, + "flos": 22888603900800.0, + "grad_norm": 1.9387653326293621, + "language_loss": 0.79912567, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.82016397, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 8866, + "time_per_iteration": 2.428471803665161 + }, + { + "auxiliary_loss_clip": 0.0107026, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.0146122, + "balance_loss_mlp": 1.02997971, + "epoch": 0.5331128814068841, + "flos": 16026041214720.0, + "grad_norm": 2.022874823435687, + "language_loss": 0.75490224, + "learning_rate": 1.882644751189108e-06, + "loss": 0.77601397, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 8867, + "time_per_iteration": 2.3903987407684326 + }, + { + "auxiliary_loss_clip": 0.01072026, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.01276159, + "balance_loss_mlp": 1.02989876, + "epoch": 0.5331730046595521, + "flos": 39343239365760.0, + "grad_norm": 1.5402991963139867, + "language_loss": 0.72724938, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74838406, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.421875, + "step": 8868, + "time_per_iteration": 2.559777021408081 + }, + { + "auxiliary_loss_clip": 0.01071582, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.01219249, + "balance_loss_mlp": 1.03151023, + "epoch": 0.5332331279122201, + "flos": 24023246042880.0, + "grad_norm": 1.803047915036845, + "language_loss": 0.79577821, + "learning_rate": 1.881867178843637e-06, + "loss": 0.81686449, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.40234375, + "step": 8869, + "time_per_iteration": 2.4375386238098145 + }, + { + "auxiliary_loss_clip": 0.01071599, + "auxiliary_loss_mlp": 0.01049441, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.0286721, + "epoch": 0.533293251164888, + "flos": 17128353571200.0, + "grad_norm": 1.7476242435888947, + "language_loss": 0.77265304, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.79386342, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.4296875, + "step": 8870, + "time_per_iteration": 2.428163528442383 + }, + { + "auxiliary_loss_clip": 0.01071555, + "auxiliary_loss_mlp": 0.01052794, + "balance_loss_clip": 1.02243185, + "balance_loss_mlp": 1.02826035, + "epoch": 0.533353374417556, + "flos": 22125297646080.0, + "grad_norm": 1.8266985496272372, + "language_loss": 0.76380825, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.78505176, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43359375, + "step": 8871, + "time_per_iteration": 2.4485087394714355 + }, + { + "auxiliary_loss_clip": 0.01071414, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_clip": 1.01744938, + "balance_loss_mlp": 1.02932751, + "epoch": 0.533413497670224, + "flos": 15010242940800.0, + "grad_norm": 2.308582605268064, + "language_loss": 0.73747301, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.75862771, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.421875, + "step": 8872, + "time_per_iteration": 2.4122579097747803 + }, + { + "auxiliary_loss_clip": 0.01069784, + "auxiliary_loss_mlp": 0.01047787, + "balance_loss_clip": 1.02046418, + "balance_loss_mlp": 1.02992606, + "epoch": 0.533473620922892, + "flos": 19608932603520.0, + "grad_norm": 1.5827071001111013, + "language_loss": 0.65873134, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67990708, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3984375, + "step": 8873, + "time_per_iteration": 2.3884077072143555 + }, + { + "auxiliary_loss_clip": 0.01069114, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.0191524, + "balance_loss_mlp": 1.02778769, + "epoch": 0.5335337441755599, + "flos": 14281780089600.0, + "grad_norm": 3.0232932357567015, + "language_loss": 0.81635195, + "learning_rate": 1.879923326631099e-06, + "loss": 0.83750701, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.4140625, + "step": 8874, + "time_per_iteration": 2.426406145095825 + }, + { + "auxiliary_loss_clip": 0.01066643, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.02570438, + "epoch": 0.5335938674282279, + "flos": 20813750311680.0, + "grad_norm": 1.8159115886153605, + "language_loss": 0.71033031, + "learning_rate": 1.879534569789582e-06, + "loss": 0.73146635, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41015625, + "step": 8875, + "time_per_iteration": 2.3906476497650146 + }, + { + "auxiliary_loss_clip": 0.01029383, + "auxiliary_loss_mlp": 0.01002804, + "balance_loss_clip": 1.00019372, + "balance_loss_mlp": 1.0216701, + "epoch": 0.5336539906808958, + "flos": 71392596842880.0, + "grad_norm": 0.7239415715877608, + "language_loss": 0.5973134, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61763531, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.07714844, + "step": 8876, + "time_per_iteration": 4.360065698623657 + }, + { + "auxiliary_loss_clip": 0.01063285, + "auxiliary_loss_mlp": 0.01048926, + "balance_loss_clip": 1.02215147, + "balance_loss_mlp": 1.02249122, + "epoch": 0.5337141139335638, + "flos": 20152076624640.0, + "grad_norm": 5.097698883494553, + "language_loss": 0.7586025, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.7797246, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 8877, + "time_per_iteration": 2.431522846221924 + }, + { + "auxiliary_loss_clip": 0.0102167, + "auxiliary_loss_mlp": 0.01006494, + "balance_loss_clip": 1.00355005, + "balance_loss_mlp": 1.0139122, + "epoch": 0.5337742371862317, + "flos": 67725181319040.0, + "grad_norm": 0.7660830315031517, + "language_loss": 0.57327259, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59355426, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.02941895, + "router_z_loss_mlp": 0.07714844, + "step": 8878, + "time_per_iteration": 4.378432989120483 + }, + { + "auxiliary_loss_clip": 0.01064334, + "auxiliary_loss_mlp": 0.01074067, + "balance_loss_clip": 1.04300117, + "balance_loss_mlp": 1.02121174, + "epoch": 0.5338343604388998, + "flos": 25007761872000.0, + "grad_norm": 1.6162311876035236, + "language_loss": 0.73438466, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.75576866, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.43164062, + "step": 8879, + "time_per_iteration": 3.745462417602539 + }, + { + "auxiliary_loss_clip": 0.0106212, + "auxiliary_loss_mlp": 0.0107164, + "balance_loss_clip": 1.04164672, + "balance_loss_mlp": 1.02051866, + "epoch": 0.5338944836915677, + "flos": 17600344508160.0, + "grad_norm": 2.2149738614853476, + "language_loss": 0.84270668, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.86404431, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41601562, + "step": 8880, + "time_per_iteration": 2.388364553451538 + }, + { + "auxiliary_loss_clip": 0.0105886, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.0339036, + "balance_loss_mlp": 1.0191474, + "epoch": 0.5339546069442357, + "flos": 21723098250240.0, + "grad_norm": 1.4240444263016985, + "language_loss": 0.80492198, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.82612884, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.3984375, + "step": 8881, + "time_per_iteration": 2.3901116847991943 + }, + { + "auxiliary_loss_clip": 0.01010673, + "auxiliary_loss_mlp": 0.01038322, + "balance_loss_clip": 1.03555655, + "balance_loss_mlp": 1.00304937, + "epoch": 0.5340147301969036, + "flos": 69720642743040.0, + "grad_norm": 0.8228580020017534, + "language_loss": 0.59327441, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61376441, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.02770996, + "router_z_loss_mlp": 0.07617188, + "step": 8882, + "time_per_iteration": 2.9947543144226074 + }, + { + "auxiliary_loss_clip": 0.01011024, + "auxiliary_loss_mlp": 0.01041214, + "balance_loss_clip": 1.03872287, + "balance_loss_mlp": 1.00315666, + "epoch": 0.5340748534495716, + "flos": 63878067694080.0, + "grad_norm": 0.9773490547480955, + "language_loss": 0.63756943, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65809184, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.07861328, + "step": 8883, + "time_per_iteration": 2.825155735015869 + }, + { + "auxiliary_loss_clip": 0.01063402, + "auxiliary_loss_mlp": 0.0105799, + "balance_loss_clip": 1.02824736, + "balance_loss_mlp": 1.02087343, + "epoch": 0.5341349767022396, + "flos": 28693053878400.0, + "grad_norm": 2.202805627840597, + "language_loss": 0.83480918, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.85602313, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42578125, + "step": 8884, + "time_per_iteration": 2.4382941722869873 + }, + { + "auxiliary_loss_clip": 0.01061153, + "auxiliary_loss_mlp": 0.01054613, + "balance_loss_clip": 1.02602673, + "balance_loss_mlp": 1.02173042, + "epoch": 0.5341950999549075, + "flos": 16288762262400.0, + "grad_norm": 1.7152142221216333, + "language_loss": 0.72637749, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74753517, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39453125, + "step": 8885, + "time_per_iteration": 2.373394727706909 + }, + { + "auxiliary_loss_clip": 0.01067503, + "auxiliary_loss_mlp": 0.01048063, + "balance_loss_clip": 1.01842749, + "balance_loss_mlp": 1.02426291, + "epoch": 0.5342552232075756, + "flos": 14354783475840.0, + "grad_norm": 1.914030344860083, + "language_loss": 0.79961038, + "learning_rate": 1.87525854926798e-06, + "loss": 0.82076609, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.43164062, + "step": 8886, + "time_per_iteration": 3.8442862033843994 + }, + { + "auxiliary_loss_clip": 0.01066852, + "auxiliary_loss_mlp": 0.01053243, + "balance_loss_clip": 1.0221293, + "balance_loss_mlp": 1.02473044, + "epoch": 0.5343153464602435, + "flos": 30296719491840.0, + "grad_norm": 1.593547363668411, + "language_loss": 0.75513285, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.77633381, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.421875, + "step": 8887, + "time_per_iteration": 2.513382911682129 + }, + { + "auxiliary_loss_clip": 0.01065697, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_clip": 1.01595831, + "balance_loss_mlp": 1.02530527, + "epoch": 0.5343754697129115, + "flos": 15595387194240.0, + "grad_norm": 2.2184434876933463, + "language_loss": 0.71537983, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.73647594, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 8888, + "time_per_iteration": 2.385096549987793 + }, + { + "auxiliary_loss_clip": 0.01075961, + "auxiliary_loss_mlp": 0.01054517, + "balance_loss_clip": 1.02256858, + "balance_loss_mlp": 1.02951789, + "epoch": 0.5344355929655794, + "flos": 16908680096640.0, + "grad_norm": 2.011258751882969, + "language_loss": 0.78616232, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.8074671, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.46484375, + "step": 8889, + "time_per_iteration": 2.4146435260772705 + }, + { + "auxiliary_loss_clip": 0.0107231, + "auxiliary_loss_mlp": 0.01053564, + "balance_loss_clip": 1.02183008, + "balance_loss_mlp": 1.02991343, + "epoch": 0.5344957162182474, + "flos": 16797307259520.0, + "grad_norm": 1.9295596384091895, + "language_loss": 0.70762539, + "learning_rate": 1.873703773589102e-06, + "loss": 0.7288841, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.42382812, + "step": 8890, + "time_per_iteration": 2.384955883026123 + }, + { + "auxiliary_loss_clip": 0.01077029, + "auxiliary_loss_mlp": 0.0104806, + "balance_loss_clip": 1.01658857, + "balance_loss_mlp": 1.03237569, + "epoch": 0.5345558394709153, + "flos": 12704998089600.0, + "grad_norm": 2.382859091124453, + "language_loss": 0.79682755, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.81807852, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 8891, + "time_per_iteration": 2.4077796936035156 + }, + { + "auxiliary_loss_clip": 0.01069439, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.01788688, + "balance_loss_mlp": 1.0292294, + "epoch": 0.5346159627235834, + "flos": 22453969985280.0, + "grad_norm": 1.7376531924742231, + "language_loss": 0.76005864, + "learning_rate": 1.872926414425699e-06, + "loss": 0.78119588, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 8892, + "time_per_iteration": 2.4401135444641113 + }, + { + "auxiliary_loss_clip": 0.01075915, + "auxiliary_loss_mlp": 0.0105406, + "balance_loss_clip": 1.02654696, + "balance_loss_mlp": 1.03265631, + "epoch": 0.5346760859762513, + "flos": 22414762661760.0, + "grad_norm": 1.7297153268963914, + "language_loss": 0.89013106, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.91143084, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.43164062, + "step": 8893, + "time_per_iteration": 2.4406864643096924 + }, + { + "auxiliary_loss_clip": 0.01073125, + "auxiliary_loss_mlp": 0.01049624, + "balance_loss_clip": 1.02203941, + "balance_loss_mlp": 1.03128338, + "epoch": 0.5347362092289193, + "flos": 22815146666880.0, + "grad_norm": 1.74171596092802, + "language_loss": 0.73870063, + "learning_rate": 1.872149074536869e-06, + "loss": 0.75992811, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 8894, + "time_per_iteration": 2.4509146213531494 + }, + { + "auxiliary_loss_clip": 0.01076504, + "auxiliary_loss_mlp": 0.01058843, + "balance_loss_clip": 1.02915967, + "balance_loss_mlp": 1.03374577, + "epoch": 0.5347963324815872, + "flos": 23218428314880.0, + "grad_norm": 1.5000022437689657, + "language_loss": 0.75850934, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77986282, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42773438, + "step": 8895, + "time_per_iteration": 2.423671007156372 + }, + { + "auxiliary_loss_clip": 0.01073373, + "auxiliary_loss_mlp": 0.01046, + "balance_loss_clip": 1.01759219, + "balance_loss_mlp": 1.03143585, + "epoch": 0.5348564557342552, + "flos": 22600256048640.0, + "grad_norm": 1.5996643824935284, + "language_loss": 0.78036553, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.80155921, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41992188, + "step": 8896, + "time_per_iteration": 2.4725427627563477 + }, + { + "auxiliary_loss_clip": 0.01073017, + "auxiliary_loss_mlp": 0.01049554, + "balance_loss_clip": 1.02113438, + "balance_loss_mlp": 1.03266931, + "epoch": 0.5349165789869232, + "flos": 18001461651840.0, + "grad_norm": 1.6579895812082601, + "language_loss": 0.79489326, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.81611896, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40429688, + "step": 8897, + "time_per_iteration": 2.3870222568511963 + }, + { + "auxiliary_loss_clip": 0.01072928, + "auxiliary_loss_mlp": 0.01054936, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.03006744, + "epoch": 0.5349767022395912, + "flos": 17158972700160.0, + "grad_norm": 1.853986619594393, + "language_loss": 0.77072966, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.79200828, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.4296875, + "step": 8898, + "time_per_iteration": 2.392573356628418 + }, + { + "auxiliary_loss_clip": 0.0104644, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.04523051, + "balance_loss_mlp": 1.03651476, + "epoch": 0.5350368254922592, + "flos": 70988302765440.0, + "grad_norm": 0.8914213708600844, + "language_loss": 0.58073032, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60167176, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.0246582, + "router_z_loss_mlp": 0.09960938, + "step": 8899, + "time_per_iteration": 3.2076053619384766 + }, + { + "auxiliary_loss_clip": 0.01067996, + "auxiliary_loss_mlp": 0.01056608, + "balance_loss_clip": 1.02878428, + "balance_loss_mlp": 1.02760029, + "epoch": 0.5350969487449271, + "flos": 27416594327040.0, + "grad_norm": 1.5508776547425214, + "language_loss": 0.71142709, + "learning_rate": 1.869817171696868e-06, + "loss": 0.73267317, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40429688, + "step": 8900, + "time_per_iteration": 2.4627418518066406 + }, + { + "auxiliary_loss_clip": 0.01069403, + "auxiliary_loss_mlp": 0.01042836, + "balance_loss_clip": 1.0142498, + "balance_loss_mlp": 1.0265485, + "epoch": 0.5351570719975951, + "flos": 19315173490560.0, + "grad_norm": 1.7897480741791756, + "language_loss": 0.72815812, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.74928057, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4296875, + "step": 8901, + "time_per_iteration": 2.4339797496795654 + }, + { + "auxiliary_loss_clip": 0.01067898, + "auxiliary_loss_mlp": 0.01048836, + "balance_loss_clip": 1.01965368, + "balance_loss_mlp": 1.02506447, + "epoch": 0.535217195250263, + "flos": 19827558737280.0, + "grad_norm": 1.9213024303147992, + "language_loss": 0.78756297, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.80873024, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 8902, + "time_per_iteration": 2.3829216957092285 + }, + { + "auxiliary_loss_clip": 0.01063051, + "auxiliary_loss_mlp": 0.01047291, + "balance_loss_clip": 1.02174425, + "balance_loss_mlp": 1.02379465, + "epoch": 0.535277318502931, + "flos": 22126763923200.0, + "grad_norm": 1.4526084444728036, + "language_loss": 0.70623171, + "learning_rate": 1.868651286721281e-06, + "loss": 0.7273351, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39257812, + "step": 8903, + "time_per_iteration": 2.4510228633880615 + }, + { + "auxiliary_loss_clip": 0.01063821, + "auxiliary_loss_mlp": 0.01051648, + "balance_loss_clip": 1.02049828, + "balance_loss_mlp": 1.02136683, + "epoch": 0.5353374417555989, + "flos": 25044734868480.0, + "grad_norm": 1.5721375966744437, + "language_loss": 0.73643661, + "learning_rate": 1.86826266833795e-06, + "loss": 0.75759137, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.42578125, + "step": 8904, + "time_per_iteration": 2.4856839179992676 + }, + { + "auxiliary_loss_clip": 0.01063687, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.01869118, + "balance_loss_mlp": 1.02245307, + "epoch": 0.535397565008267, + "flos": 19387757940480.0, + "grad_norm": 2.00494889914011, + "language_loss": 0.74273026, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.76383775, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41210938, + "step": 8905, + "time_per_iteration": 2.3885810375213623 + }, + { + "auxiliary_loss_clip": 0.01056087, + "auxiliary_loss_mlp": 0.01040577, + "balance_loss_clip": 1.01464868, + "balance_loss_mlp": 1.01894772, + "epoch": 0.5354576882609349, + "flos": 21470117472000.0, + "grad_norm": 1.5119191310032039, + "language_loss": 0.84734917, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.86831582, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37109375, + "step": 8906, + "time_per_iteration": 2.4262685775756836 + }, + { + "auxiliary_loss_clip": 0.01061227, + "auxiliary_loss_mlp": 0.01050648, + "balance_loss_clip": 1.01734066, + "balance_loss_mlp": 1.01943147, + "epoch": 0.5355178115136029, + "flos": 20776463112960.0, + "grad_norm": 1.988213604635427, + "language_loss": 0.74982154, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.7709403, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.41796875, + "step": 8907, + "time_per_iteration": 2.3803696632385254 + }, + { + "auxiliary_loss_clip": 0.01059859, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.01930213, + "balance_loss_mlp": 1.01949835, + "epoch": 0.5355779347662708, + "flos": 23512885655040.0, + "grad_norm": 1.901327394138041, + "language_loss": 0.77715266, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79822034, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40234375, + "step": 8908, + "time_per_iteration": 2.4347689151763916 + }, + { + "auxiliary_loss_clip": 0.01062941, + "auxiliary_loss_mlp": 0.01049264, + "balance_loss_clip": 1.02014148, + "balance_loss_mlp": 1.0206486, + "epoch": 0.5356380580189388, + "flos": 20302168026240.0, + "grad_norm": 2.035663742641289, + "language_loss": 0.75531268, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.77643466, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.421875, + "step": 8909, + "time_per_iteration": 2.370511293411255 + }, + { + "auxiliary_loss_clip": 0.01059988, + "auxiliary_loss_mlp": 0.01050983, + "balance_loss_clip": 1.02401829, + "balance_loss_mlp": 1.01988292, + "epoch": 0.5356981812716068, + "flos": 21360560025600.0, + "grad_norm": 2.002884545803835, + "language_loss": 0.85591042, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.87702012, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40039062, + "step": 8910, + "time_per_iteration": 2.4434726238250732 + }, + { + "auxiliary_loss_clip": 0.01060415, + "auxiliary_loss_mlp": 0.01041084, + "balance_loss_clip": 1.01209211, + "balance_loss_mlp": 1.01856661, + "epoch": 0.5357583045242748, + "flos": 23110162588800.0, + "grad_norm": 1.4727261373466636, + "language_loss": 0.82746887, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84848392, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41796875, + "step": 8911, + "time_per_iteration": 2.4135525226593018 + }, + { + "auxiliary_loss_clip": 0.01060349, + "auxiliary_loss_mlp": 0.01040149, + "balance_loss_clip": 1.01333928, + "balance_loss_mlp": 1.01986504, + "epoch": 0.5358184277769428, + "flos": 21140711994240.0, + "grad_norm": 2.7258566904273196, + "language_loss": 0.70417905, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.72518408, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 8912, + "time_per_iteration": 2.4221699237823486 + }, + { + "auxiliary_loss_clip": 0.01061747, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_clip": 1.02188802, + "balance_loss_mlp": 1.02104568, + "epoch": 0.5358785510296107, + "flos": 16281675256320.0, + "grad_norm": 1.932447615197762, + "language_loss": 0.72819209, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.74929237, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 8913, + "time_per_iteration": 2.3562655448913574 + }, + { + "auxiliary_loss_clip": 0.01062682, + "auxiliary_loss_mlp": 0.01041091, + "balance_loss_clip": 1.01319635, + "balance_loss_mlp": 1.02065992, + "epoch": 0.5359386742822787, + "flos": 16976097665280.0, + "grad_norm": 1.5969317224802104, + "language_loss": 0.7265929, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7476306, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41992188, + "step": 8914, + "time_per_iteration": 2.4059157371520996 + }, + { + "auxiliary_loss_clip": 0.01063633, + "auxiliary_loss_mlp": 0.01046583, + "balance_loss_clip": 1.01613724, + "balance_loss_mlp": 1.02039218, + "epoch": 0.5359987975349466, + "flos": 20811900009600.0, + "grad_norm": 1.9155088400901905, + "language_loss": 0.71881568, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.73991787, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.43164062, + "step": 8915, + "time_per_iteration": 3.643850803375244 + }, + { + "auxiliary_loss_clip": 0.01061087, + "auxiliary_loss_mlp": 0.01045207, + "balance_loss_clip": 1.0162394, + "balance_loss_mlp": 1.02081382, + "epoch": 0.5360589207876146, + "flos": 22198859614080.0, + "grad_norm": 1.732350776190522, + "language_loss": 0.769216, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.79027897, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40234375, + "step": 8916, + "time_per_iteration": 2.396199941635132 + }, + { + "auxiliary_loss_clip": 0.01063134, + "auxiliary_loss_mlp": 0.01044788, + "balance_loss_clip": 1.01642776, + "balance_loss_mlp": 1.02132297, + "epoch": 0.5361190440402825, + "flos": 31393027094400.0, + "grad_norm": 8.614904049543634, + "language_loss": 0.73389709, + "learning_rate": 1.863211089308289e-06, + "loss": 0.75497633, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41796875, + "step": 8917, + "time_per_iteration": 3.8461360931396484 + }, + { + "auxiliary_loss_clip": 0.01062627, + "auxiliary_loss_mlp": 0.01049861, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.02211475, + "epoch": 0.5361791672929506, + "flos": 16068984053760.0, + "grad_norm": 2.159824558487007, + "language_loss": 0.72848344, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.74960834, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 8918, + "time_per_iteration": 3.8624536991119385 + }, + { + "auxiliary_loss_clip": 0.01064461, + "auxiliary_loss_mlp": 0.01043696, + "balance_loss_clip": 1.01679039, + "balance_loss_mlp": 1.02293301, + "epoch": 0.5362392905456185, + "flos": 20739874141440.0, + "grad_norm": 1.4154404216487477, + "language_loss": 0.75639176, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77747333, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.4140625, + "step": 8919, + "time_per_iteration": 2.3792266845703125 + }, + { + "auxiliary_loss_clip": 0.01062174, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_clip": 1.02142859, + "balance_loss_mlp": 1.02101207, + "epoch": 0.5362994137982865, + "flos": 17339334117120.0, + "grad_norm": 1.81102550749132, + "language_loss": 0.73100811, + "learning_rate": 1.862045463611864e-06, + "loss": 0.75210309, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41210938, + "step": 8920, + "time_per_iteration": 2.380683422088623 + }, + { + "auxiliary_loss_clip": 0.01061143, + "auxiliary_loss_mlp": 0.01044706, + "balance_loss_clip": 1.01651287, + "balance_loss_mlp": 1.02094889, + "epoch": 0.5363595370509544, + "flos": 42812314122240.0, + "grad_norm": 1.698400698304111, + "language_loss": 0.69555867, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.71661723, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 8921, + "time_per_iteration": 2.5705759525299072 + }, + { + "auxiliary_loss_clip": 0.01065236, + "auxiliary_loss_mlp": 0.01042895, + "balance_loss_clip": 1.01427317, + "balance_loss_mlp": 1.02408421, + "epoch": 0.5364196603036224, + "flos": 19170947197440.0, + "grad_norm": 1.79979168271998, + "language_loss": 0.82978195, + "learning_rate": 1.86126840594594e-06, + "loss": 0.85086322, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41210938, + "step": 8922, + "time_per_iteration": 2.4011123180389404 + }, + { + "auxiliary_loss_clip": 0.0106221, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.0146687, + "balance_loss_mlp": 1.02110839, + "epoch": 0.5364797835562904, + "flos": 17930099099520.0, + "grad_norm": 2.281972116780324, + "language_loss": 0.77675593, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79779208, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 8923, + "time_per_iteration": 2.4456005096435547 + }, + { + "auxiliary_loss_clip": 0.01065447, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_clip": 1.0193634, + "balance_loss_mlp": 1.02280951, + "epoch": 0.5365399068089584, + "flos": 30226718482560.0, + "grad_norm": 1.5464700502781064, + "language_loss": 0.71232474, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.73345351, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.42578125, + "step": 8924, + "time_per_iteration": 2.482163190841675 + }, + { + "auxiliary_loss_clip": 0.01065051, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_clip": 1.01760125, + "balance_loss_mlp": 1.02306151, + "epoch": 0.5366000300616264, + "flos": 24890768305920.0, + "grad_norm": 1.8043655231516553, + "language_loss": 0.88353157, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.90466917, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.41992188, + "step": 8925, + "time_per_iteration": 2.422456741333008 + }, + { + "auxiliary_loss_clip": 0.01063248, + "auxiliary_loss_mlp": 0.01041822, + "balance_loss_clip": 1.0154767, + "balance_loss_mlp": 1.02147818, + "epoch": 0.5366601533142943, + "flos": 29825322048000.0, + "grad_norm": 1.5371652857121307, + "language_loss": 0.78513134, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80618203, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41796875, + "step": 8926, + "time_per_iteration": 4.001922130584717 + }, + { + "auxiliary_loss_clip": 0.01060921, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.01407409, + "balance_loss_mlp": 1.02237046, + "epoch": 0.5367202765669623, + "flos": 27198107838720.0, + "grad_norm": 1.3834995510839738, + "language_loss": 0.67799723, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69899845, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 8927, + "time_per_iteration": 2.4455482959747314 + }, + { + "auxiliary_loss_clip": 0.01060971, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.01331377, + "balance_loss_mlp": 1.02010584, + "epoch": 0.5367803998196302, + "flos": 20228920260480.0, + "grad_norm": 1.7501057741621895, + "language_loss": 0.75113106, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.77213061, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40820312, + "step": 8928, + "time_per_iteration": 2.424966335296631 + }, + { + "auxiliary_loss_clip": 0.01060328, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.01158547, + "balance_loss_mlp": 1.02106953, + "epoch": 0.5368405230722982, + "flos": 32153435706240.0, + "grad_norm": 2.03069349144544, + "language_loss": 0.64169168, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.66264516, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.39257812, + "step": 8929, + "time_per_iteration": 2.471461772918701 + }, + { + "auxiliary_loss_clip": 0.01060655, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.01983595, + "balance_loss_mlp": 1.0205375, + "epoch": 0.5369006463249661, + "flos": 26246794579200.0, + "grad_norm": 1.6783940610683814, + "language_loss": 0.6731981, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.69425082, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.40039062, + "step": 8930, + "time_per_iteration": 2.453794002532959 + }, + { + "auxiliary_loss_clip": 0.01060031, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.01163232, + "balance_loss_mlp": 1.01999247, + "epoch": 0.5369607695776342, + "flos": 26210170696320.0, + "grad_norm": 1.519035641934808, + "language_loss": 0.68409687, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.70508569, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 8931, + "time_per_iteration": 2.416675090789795 + }, + { + "auxiliary_loss_clip": 0.0106145, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.01747727, + "balance_loss_mlp": 1.02184939, + "epoch": 0.5370208928303021, + "flos": 25007866606080.0, + "grad_norm": 1.614254314826076, + "language_loss": 0.76886606, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.78990686, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39648438, + "step": 8932, + "time_per_iteration": 2.4670121669769287 + }, + { + "auxiliary_loss_clip": 0.01059796, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.01225448, + "balance_loss_mlp": 1.01968837, + "epoch": 0.5370810160829701, + "flos": 31790897481600.0, + "grad_norm": 2.015454669579529, + "language_loss": 0.67043734, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.6914115, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40234375, + "step": 8933, + "time_per_iteration": 2.48069429397583 + }, + { + "auxiliary_loss_clip": 0.01057378, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.01481557, + "balance_loss_mlp": 1.01921618, + "epoch": 0.537141139335638, + "flos": 23841453260160.0, + "grad_norm": 1.644395959033918, + "language_loss": 0.83793044, + "learning_rate": 1.856606505975565e-06, + "loss": 0.8589046, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 8934, + "time_per_iteration": 2.4439468383789062 + }, + { + "auxiliary_loss_clip": 0.01056457, + "auxiliary_loss_mlp": 0.01044267, + "balance_loss_clip": 1.01789832, + "balance_loss_mlp": 1.01879072, + "epoch": 0.537201262588306, + "flos": 18508016701440.0, + "grad_norm": 1.9821992931733459, + "language_loss": 0.80805588, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82906306, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37695312, + "step": 8935, + "time_per_iteration": 2.3380212783813477 + }, + { + "auxiliary_loss_clip": 0.010578, + "auxiliary_loss_mlp": 0.01043129, + "balance_loss_clip": 1.01723635, + "balance_loss_mlp": 1.01845884, + "epoch": 0.537261385840974, + "flos": 25661859793920.0, + "grad_norm": 10.030755595023885, + "language_loss": 0.8467598, + "learning_rate": 1.855829598084659e-06, + "loss": 0.86776906, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 8936, + "time_per_iteration": 2.4430556297302246 + }, + { + "auxiliary_loss_clip": 0.01055912, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.01547027, + "balance_loss_mlp": 1.01791883, + "epoch": 0.537321509093642, + "flos": 40733410815360.0, + "grad_norm": 1.2860961306905667, + "language_loss": 0.73651946, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.75747693, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37890625, + "step": 8937, + "time_per_iteration": 2.5403835773468018 + }, + { + "auxiliary_loss_clip": 0.01057225, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.01350975, + "balance_loss_mlp": 1.01719332, + "epoch": 0.53738163234631, + "flos": 17237526992640.0, + "grad_norm": 2.3331083130226755, + "language_loss": 0.82505441, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.84601092, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.40039062, + "step": 8938, + "time_per_iteration": 2.3777172565460205 + }, + { + "auxiliary_loss_clip": 0.01061157, + "auxiliary_loss_mlp": 0.01045818, + "balance_loss_clip": 1.01911533, + "balance_loss_mlp": 1.02012837, + "epoch": 0.5374417555989779, + "flos": 12821188694400.0, + "grad_norm": 2.3610462377042185, + "language_loss": 0.81864643, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.8397162, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41015625, + "step": 8939, + "time_per_iteration": 2.3595237731933594 + }, + { + "auxiliary_loss_clip": 0.01010822, + "auxiliary_loss_mlp": 0.01002732, + "balance_loss_clip": 1.000229, + "balance_loss_mlp": 1.00365186, + "epoch": 0.5375018788516459, + "flos": 67252771445760.0, + "grad_norm": 0.7078105291319816, + "language_loss": 0.52529275, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54542834, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.07128906, + "step": 8940, + "time_per_iteration": 3.0362002849578857 + }, + { + "auxiliary_loss_clip": 0.01055438, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.01290321, + "balance_loss_mlp": 1.01785064, + "epoch": 0.5375620021043138, + "flos": 18113183602560.0, + "grad_norm": 1.6973258593163194, + "language_loss": 0.73459923, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.75552058, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 8941, + "time_per_iteration": 2.35707688331604 + }, + { + "auxiliary_loss_clip": 0.01054146, + "auxiliary_loss_mlp": 0.01038855, + "balance_loss_clip": 1.0161221, + "balance_loss_mlp": 1.01740837, + "epoch": 0.5376221253569818, + "flos": 23148252748800.0, + "grad_norm": 1.6732213765550117, + "language_loss": 0.80721354, + "learning_rate": 1.853499006090237e-06, + "loss": 0.8281436, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 8942, + "time_per_iteration": 2.432715654373169 + }, + { + "auxiliary_loss_clip": 0.01058069, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0140388, + "balance_loss_mlp": 1.018085, + "epoch": 0.5376822486096497, + "flos": 29970979706880.0, + "grad_norm": 1.9204127927109742, + "language_loss": 0.71803719, + "learning_rate": 1.853110593448911e-06, + "loss": 0.73902243, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 8943, + "time_per_iteration": 2.436572313308716 + }, + { + "auxiliary_loss_clip": 0.01009241, + "auxiliary_loss_mlp": 0.01002972, + "balance_loss_clip": 1.00082672, + "balance_loss_mlp": 1.00205779, + "epoch": 0.5377423718623178, + "flos": 54165752726400.0, + "grad_norm": 0.8367067700047615, + "language_loss": 0.59753829, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61766046, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.07177734, + "step": 8944, + "time_per_iteration": 3.037797451019287 + }, + { + "auxiliary_loss_clip": 0.01061648, + "auxiliary_loss_mlp": 0.01045077, + "balance_loss_clip": 1.01649046, + "balance_loss_mlp": 1.01888394, + "epoch": 0.5378024951149857, + "flos": 23255994804480.0, + "grad_norm": 3.0749352379848665, + "language_loss": 0.78804541, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80911267, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.42773438, + "step": 8945, + "time_per_iteration": 2.3829212188720703 + }, + { + "auxiliary_loss_clip": 0.01056655, + "auxiliary_loss_mlp": 0.01036239, + "balance_loss_clip": 1.01201558, + "balance_loss_mlp": 1.01705623, + "epoch": 0.5378626183676537, + "flos": 24022966752000.0, + "grad_norm": 1.7580223898635348, + "language_loss": 0.69726533, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.71819425, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.39648438, + "step": 8946, + "time_per_iteration": 2.442201614379883 + }, + { + "auxiliary_loss_clip": 0.01055776, + "auxiliary_loss_mlp": 0.01043655, + "balance_loss_clip": 1.02043331, + "balance_loss_mlp": 1.01800525, + "epoch": 0.5379227416203216, + "flos": 27160576260480.0, + "grad_norm": 1.510240069492998, + "language_loss": 0.78098416, + "learning_rate": 1.851556998731498e-06, + "loss": 0.80197847, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 8947, + "time_per_iteration": 2.4441444873809814 + }, + { + "auxiliary_loss_clip": 0.0105712, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.01393962, + "balance_loss_mlp": 1.0179261, + "epoch": 0.5379828648729896, + "flos": 24680451075840.0, + "grad_norm": 1.7497469169330293, + "language_loss": 0.6138885, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.63484848, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 8948, + "time_per_iteration": 2.4484026432037354 + }, + { + "auxiliary_loss_clip": 0.0105775, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.01834655, + "balance_loss_mlp": 1.01910567, + "epoch": 0.5380429881256577, + "flos": 22522330160640.0, + "grad_norm": 1.815220120212949, + "language_loss": 0.80242872, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.82341307, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.38671875, + "step": 8949, + "time_per_iteration": 2.390437126159668 + }, + { + "auxiliary_loss_clip": 0.01054392, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.0167737, + "balance_loss_mlp": 1.01757836, + "epoch": 0.5381031113783256, + "flos": 26978329630080.0, + "grad_norm": 1.6309538217010142, + "language_loss": 0.79235423, + "learning_rate": 1.850391861746111e-06, + "loss": 0.81329799, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 8950, + "time_per_iteration": 2.483891248703003 + }, + { + "auxiliary_loss_clip": 0.01055683, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.01493907, + "balance_loss_mlp": 1.01814222, + "epoch": 0.5381632346309936, + "flos": 24752930791680.0, + "grad_norm": 1.5781124079692752, + "language_loss": 0.73622531, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.75716949, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 8951, + "time_per_iteration": 2.4104650020599365 + }, + { + "auxiliary_loss_clip": 0.01056404, + "auxiliary_loss_mlp": 0.01037683, + "balance_loss_clip": 1.01247025, + "balance_loss_mlp": 1.01691258, + "epoch": 0.5382233578836615, + "flos": 15559147336320.0, + "grad_norm": 2.2400974920651584, + "language_loss": 0.764476, + "learning_rate": 1.849615132097085e-06, + "loss": 0.7854169, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 8952, + "time_per_iteration": 2.409898519515991 + }, + { + "auxiliary_loss_clip": 0.01058143, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.01094496, + "balance_loss_mlp": 1.01887238, + "epoch": 0.5382834811363295, + "flos": 25083278876160.0, + "grad_norm": 1.472357334311386, + "language_loss": 0.80165809, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.82261932, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 8953, + "time_per_iteration": 2.422800064086914 + }, + { + "auxiliary_loss_clip": 0.01055782, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.0137893, + "balance_loss_mlp": 1.01808465, + "epoch": 0.5383436043889974, + "flos": 13297054792320.0, + "grad_norm": 1.8543790006106247, + "language_loss": 0.82048732, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.84142548, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37695312, + "step": 8954, + "time_per_iteration": 2.3814477920532227 + }, + { + "auxiliary_loss_clip": 0.01057086, + "auxiliary_loss_mlp": 0.01039816, + "balance_loss_clip": 1.01475799, + "balance_loss_mlp": 1.01823997, + "epoch": 0.5384037276416654, + "flos": 23038276366080.0, + "grad_norm": 1.857722653483034, + "language_loss": 0.78084671, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.80181575, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 8955, + "time_per_iteration": 3.631215810775757 + }, + { + "auxiliary_loss_clip": 0.01056768, + "auxiliary_loss_mlp": 0.01041021, + "balance_loss_clip": 1.01500928, + "balance_loss_mlp": 1.01889789, + "epoch": 0.5384638508943334, + "flos": 20630107226880.0, + "grad_norm": 1.5954811077793964, + "language_loss": 0.79965377, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.82063162, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 8956, + "time_per_iteration": 3.8077375888824463 + }, + { + "auxiliary_loss_clip": 0.01011196, + "auxiliary_loss_mlp": 0.01007055, + "balance_loss_clip": 1.00439668, + "balance_loss_mlp": 1.00440657, + "epoch": 0.5385239741470014, + "flos": 66734660736000.0, + "grad_norm": 0.8608436064406197, + "language_loss": 0.63602746, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65620995, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.06787109, + "step": 8957, + "time_per_iteration": 2.9715232849121094 + }, + { + "auxiliary_loss_clip": 0.01010713, + "auxiliary_loss_mlp": 0.0100368, + "balance_loss_clip": 1.00136757, + "balance_loss_mlp": 1.00385141, + "epoch": 0.5385840973996693, + "flos": 64712490255360.0, + "grad_norm": 0.7096124123491183, + "language_loss": 0.51777202, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.537916, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.06835938, + "step": 8958, + "time_per_iteration": 4.472727298736572 + }, + { + "auxiliary_loss_clip": 0.01058434, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.01422727, + "balance_loss_mlp": 1.01836693, + "epoch": 0.5386442206523373, + "flos": 26140553712000.0, + "grad_norm": 1.5589692663596597, + "language_loss": 0.78063565, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.80161583, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40234375, + "step": 8959, + "time_per_iteration": 2.430861473083496 + }, + { + "auxiliary_loss_clip": 0.01058341, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.01409721, + "balance_loss_mlp": 1.01832962, + "epoch": 0.5387043439050052, + "flos": 18251090939520.0, + "grad_norm": 2.310057288415181, + "language_loss": 0.84586954, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.86683899, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.40039062, + "step": 8960, + "time_per_iteration": 2.3853800296783447 + }, + { + "auxiliary_loss_clip": 0.01059604, + "auxiliary_loss_mlp": 0.01038375, + "balance_loss_clip": 1.01330543, + "balance_loss_mlp": 1.01970553, + "epoch": 0.5387644671576732, + "flos": 29787022419840.0, + "grad_norm": 1.4471658114563881, + "language_loss": 0.79901946, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.81999922, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3984375, + "step": 8961, + "time_per_iteration": 2.435152769088745 + }, + { + "auxiliary_loss_clip": 0.01059181, + "auxiliary_loss_mlp": 0.01048647, + "balance_loss_clip": 1.02301741, + "balance_loss_mlp": 1.01931548, + "epoch": 0.5388245904103413, + "flos": 22373600302080.0, + "grad_norm": 1.7904018516199225, + "language_loss": 0.85113066, + "learning_rate": 1.845731828364681e-06, + "loss": 0.87220895, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3984375, + "step": 8962, + "time_per_iteration": 2.4178013801574707 + }, + { + "auxiliary_loss_clip": 0.01013907, + "auxiliary_loss_mlp": 0.01011693, + "balance_loss_clip": 1.00952375, + "balance_loss_mlp": 1.00663519, + "epoch": 0.5388847136630092, + "flos": 69804538473600.0, + "grad_norm": 0.7358616053125006, + "language_loss": 0.54220402, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56246006, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.07275391, + "step": 8963, + "time_per_iteration": 2.953850746154785 + }, + { + "auxiliary_loss_clip": 0.01011479, + "auxiliary_loss_mlp": 0.01008706, + "balance_loss_clip": 1.00636983, + "balance_loss_mlp": 1.00427961, + "epoch": 0.5389448369156772, + "flos": 69818642663040.0, + "grad_norm": 0.8405623080527205, + "language_loss": 0.63612169, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65632361, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.07177734, + "step": 8964, + "time_per_iteration": 3.1220006942749023 + }, + { + "auxiliary_loss_clip": 0.01059031, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_clip": 1.02021682, + "balance_loss_mlp": 1.01766276, + "epoch": 0.5390049601683451, + "flos": 31721105940480.0, + "grad_norm": 1.4758028928091504, + "language_loss": 0.7089926, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.73006678, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 8965, + "time_per_iteration": 2.4459476470947266 + }, + { + "auxiliary_loss_clip": 0.01059777, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.01539755, + "balance_loss_mlp": 1.01841962, + "epoch": 0.5390650834210131, + "flos": 18112520286720.0, + "grad_norm": 2.076461512921895, + "language_loss": 0.83626616, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.85729551, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.4140625, + "step": 8966, + "time_per_iteration": 3.8342537879943848 + }, + { + "auxiliary_loss_clip": 0.01057914, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.01946676, + "balance_loss_mlp": 1.01850271, + "epoch": 0.539125206673681, + "flos": 17415863550720.0, + "grad_norm": 2.3315091395767267, + "language_loss": 0.7388761, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.7599113, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 8967, + "time_per_iteration": 2.3663673400878906 + }, + { + "auxiliary_loss_clip": 0.01058863, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.01834643, + "epoch": 0.539185329926349, + "flos": 22197882096000.0, + "grad_norm": 1.6398824528864735, + "language_loss": 0.83116883, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.8521809, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40625, + "step": 8968, + "time_per_iteration": 2.4037864208221436 + }, + { + "auxiliary_loss_clip": 0.01058233, + "auxiliary_loss_mlp": 0.01043357, + "balance_loss_clip": 1.01481867, + "balance_loss_mlp": 1.01843548, + "epoch": 0.539245453179017, + "flos": 21433319032320.0, + "grad_norm": 1.6166746259951301, + "language_loss": 0.74946511, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.77048099, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.3984375, + "step": 8969, + "time_per_iteration": 2.403905153274536 + }, + { + "auxiliary_loss_clip": 0.01061107, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.01594973, + "balance_loss_mlp": 1.01872611, + "epoch": 0.539305576431685, + "flos": 20734113767040.0, + "grad_norm": 1.9428928612055358, + "language_loss": 0.83089077, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.85194671, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.421875, + "step": 8970, + "time_per_iteration": 2.401843309402466 + }, + { + "auxiliary_loss_clip": 0.01057607, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.02035439, + "balance_loss_mlp": 1.01804173, + "epoch": 0.5393656996843529, + "flos": 30919116032640.0, + "grad_norm": 1.3666022681546044, + "language_loss": 0.76155531, + "learning_rate": 1.842237354749146e-06, + "loss": 0.78258121, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39453125, + "step": 8971, + "time_per_iteration": 2.4846408367156982 + }, + { + "auxiliary_loss_clip": 0.01010692, + "auxiliary_loss_mlp": 0.01024174, + "balance_loss_clip": 1.02190936, + "balance_loss_mlp": 1.00369453, + "epoch": 0.5394258229370209, + "flos": 50315252699520.0, + "grad_norm": 0.8917501742508555, + "language_loss": 0.60393536, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62428403, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.0703125, + "step": 8972, + "time_per_iteration": 3.0506210327148438 + }, + { + "auxiliary_loss_clip": 0.01059305, + "auxiliary_loss_mlp": 0.01043468, + "balance_loss_clip": 1.01787424, + "balance_loss_mlp": 1.01899874, + "epoch": 0.5394859461896888, + "flos": 25410729317760.0, + "grad_norm": 1.4545777137239582, + "language_loss": 0.79535711, + "learning_rate": 1.841460870485045e-06, + "loss": 0.81638491, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 8973, + "time_per_iteration": 2.463809013366699 + }, + { + "auxiliary_loss_clip": 0.01063968, + "auxiliary_loss_mlp": 0.01049678, + "balance_loss_clip": 1.01815927, + "balance_loss_mlp": 1.01955462, + "epoch": 0.5395460694423568, + "flos": 25477448659200.0, + "grad_norm": 2.5273185583683877, + "language_loss": 0.7508719, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.7720083, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 8974, + "time_per_iteration": 2.4357120990753174 + }, + { + "auxiliary_loss_clip": 0.01014775, + "auxiliary_loss_mlp": 0.01010143, + "balance_loss_clip": 1.00778222, + "balance_loss_mlp": 1.00765443, + "epoch": 0.5396061926950249, + "flos": 53246524872960.0, + "grad_norm": 0.734340002377585, + "language_loss": 0.51125991, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53150904, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07128906, + "step": 8975, + "time_per_iteration": 3.0572211742401123 + }, + { + "auxiliary_loss_clip": 0.01059818, + "auxiliary_loss_mlp": 0.01043163, + "balance_loss_clip": 1.01642489, + "balance_loss_mlp": 1.02067208, + "epoch": 0.5396663159476928, + "flos": 26723847663360.0, + "grad_norm": 1.5587239840296387, + "language_loss": 0.73087239, + "learning_rate": 1.840296189214344e-06, + "loss": 0.75190222, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 8976, + "time_per_iteration": 2.448598623275757 + }, + { + "auxiliary_loss_clip": 0.01061, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.01599395, + "balance_loss_mlp": 1.02124524, + "epoch": 0.5397264392003608, + "flos": 23252398934400.0, + "grad_norm": 1.7167900138141599, + "language_loss": 0.7147845, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.73582113, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39648438, + "step": 8977, + "time_per_iteration": 2.4112281799316406 + }, + { + "auxiliary_loss_clip": 0.01061746, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.01149499, + "balance_loss_mlp": 1.02172613, + "epoch": 0.5397865624530287, + "flos": 18293265728640.0, + "grad_norm": 1.6486295666862683, + "language_loss": 0.73721743, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.75821996, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40039062, + "step": 8978, + "time_per_iteration": 2.3499584197998047 + }, + { + "auxiliary_loss_clip": 0.01065171, + "auxiliary_loss_mlp": 0.01044442, + "balance_loss_clip": 1.01356697, + "balance_loss_mlp": 1.02182221, + "epoch": 0.5398466857056967, + "flos": 15296810313600.0, + "grad_norm": 1.9610702643816238, + "language_loss": 0.755826, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.77692211, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43359375, + "step": 8979, + "time_per_iteration": 2.380565881729126 + }, + { + "auxiliary_loss_clip": 0.01066956, + "auxiliary_loss_mlp": 0.01053353, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.02393889, + "epoch": 0.5399068089583646, + "flos": 17820786032640.0, + "grad_norm": 1.9055526557088598, + "language_loss": 0.77763009, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79883313, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 8980, + "time_per_iteration": 2.37343430519104 + }, + { + "auxiliary_loss_clip": 0.01062933, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.01379442, + "balance_loss_mlp": 1.02177286, + "epoch": 0.5399669322110326, + "flos": 27380389380480.0, + "grad_norm": 2.4123427923894485, + "language_loss": 0.83400095, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.85502386, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.41210938, + "step": 8981, + "time_per_iteration": 2.4467689990997314 + }, + { + "auxiliary_loss_clip": 0.01064948, + "auxiliary_loss_mlp": 0.01043723, + "balance_loss_clip": 1.01597071, + "balance_loss_mlp": 1.02251744, + "epoch": 0.5400270554637006, + "flos": 20448070064640.0, + "grad_norm": 1.9947635355718314, + "language_loss": 0.67855889, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69964564, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42382812, + "step": 8982, + "time_per_iteration": 2.3779404163360596 + }, + { + "auxiliary_loss_clip": 0.0106243, + "auxiliary_loss_mlp": 0.01049405, + "balance_loss_clip": 1.02234459, + "balance_loss_mlp": 1.02275705, + "epoch": 0.5400871787163686, + "flos": 21688499226240.0, + "grad_norm": 1.4867207218214564, + "language_loss": 0.84106934, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.86218768, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 8983, + "time_per_iteration": 2.393679618835449 + }, + { + "auxiliary_loss_clip": 0.01062137, + "auxiliary_loss_mlp": 0.01045951, + "balance_loss_clip": 1.01642323, + "balance_loss_mlp": 1.02227402, + "epoch": 0.5401473019690365, + "flos": 19203835564800.0, + "grad_norm": 1.861331151480305, + "language_loss": 0.71988642, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.74096733, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.3984375, + "step": 8984, + "time_per_iteration": 2.396200656890869 + }, + { + "auxiliary_loss_clip": 0.01064089, + "auxiliary_loss_mlp": 0.01048806, + "balance_loss_clip": 1.02078032, + "balance_loss_mlp": 1.02291405, + "epoch": 0.5402074252217045, + "flos": 20626441534080.0, + "grad_norm": 1.935681432914346, + "language_loss": 0.81596434, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.83709329, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 8985, + "time_per_iteration": 2.395359516143799 + }, + { + "auxiliary_loss_clip": 0.0105837, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.0146513, + "balance_loss_mlp": 1.02084017, + "epoch": 0.5402675484743724, + "flos": 24972290064000.0, + "grad_norm": 1.4581307226063513, + "language_loss": 0.79929209, + "learning_rate": 1.83641431418363e-06, + "loss": 0.82027662, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 8986, + "time_per_iteration": 2.4424142837524414 + }, + { + "auxiliary_loss_clip": 0.01060759, + "auxiliary_loss_mlp": 0.01047344, + "balance_loss_clip": 1.02178597, + "balance_loss_mlp": 1.02067447, + "epoch": 0.5403276717270404, + "flos": 19458142974720.0, + "grad_norm": 1.5989405020568415, + "language_loss": 0.78253478, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.80361587, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 8987, + "time_per_iteration": 2.4046552181243896 + }, + { + "auxiliary_loss_clip": 0.01059715, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.02473998, + "balance_loss_mlp": 1.02029669, + "epoch": 0.5403877949797083, + "flos": 18441157714560.0, + "grad_norm": 1.7717174374703635, + "language_loss": 0.72350985, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.74461377, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 8988, + "time_per_iteration": 2.3936660289764404 + }, + { + "auxiliary_loss_clip": 0.01059741, + "auxiliary_loss_mlp": 0.01046483, + "balance_loss_clip": 1.01848066, + "balance_loss_mlp": 1.01922727, + "epoch": 0.5404479182323764, + "flos": 28291622532480.0, + "grad_norm": 2.31919775424533, + "language_loss": 0.6898632, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.7109254, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 8989, + "time_per_iteration": 2.4300780296325684 + }, + { + "auxiliary_loss_clip": 0.0106002, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02255774, + "balance_loss_mlp": 1.01850891, + "epoch": 0.5405080414850444, + "flos": 23366215566720.0, + "grad_norm": 1.4090341533778252, + "language_loss": 0.78394675, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.80505866, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 8990, + "time_per_iteration": 2.426922082901001 + }, + { + "auxiliary_loss_clip": 0.01058001, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.01815677, + "epoch": 0.5405681647377123, + "flos": 21105344920320.0, + "grad_norm": 1.753818868406025, + "language_loss": 0.70264602, + "learning_rate": 1.834473608367745e-06, + "loss": 0.72369778, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3984375, + "step": 8991, + "time_per_iteration": 2.372962713241577 + }, + { + "auxiliary_loss_clip": 0.01058298, + "auxiliary_loss_mlp": 0.01058244, + "balance_loss_clip": 1.03059959, + "balance_loss_mlp": 1.01753485, + "epoch": 0.5406282879903803, + "flos": 20448139887360.0, + "grad_norm": 1.7247015679145334, + "language_loss": 0.7786051, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.79977047, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 8992, + "time_per_iteration": 2.3975396156311035 + }, + { + "auxiliary_loss_clip": 0.01058848, + "auxiliary_loss_mlp": 0.01051674, + "balance_loss_clip": 1.02456617, + "balance_loss_mlp": 1.01787519, + "epoch": 0.5406884112430482, + "flos": 14208637057920.0, + "grad_norm": 2.37075758450615, + "language_loss": 0.78151548, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.80262077, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41015625, + "step": 8993, + "time_per_iteration": 2.3424320220947266 + }, + { + "auxiliary_loss_clip": 0.01055387, + "auxiliary_loss_mlp": 0.01046805, + "balance_loss_clip": 1.02091265, + "balance_loss_mlp": 1.01706314, + "epoch": 0.5407485344957162, + "flos": 23874516184320.0, + "grad_norm": 1.6973041111487555, + "language_loss": 0.70863628, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72965825, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 8994, + "time_per_iteration": 3.70424747467041 + }, + { + "auxiliary_loss_clip": 0.01059238, + "auxiliary_loss_mlp": 0.0105201, + "balance_loss_clip": 1.0233283, + "balance_loss_mlp": 1.01786327, + "epoch": 0.5408086577483842, + "flos": 23147379964800.0, + "grad_norm": 2.491419053312409, + "language_loss": 0.76449114, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.78560364, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4140625, + "step": 8995, + "time_per_iteration": 3.7523932456970215 + }, + { + "auxiliary_loss_clip": 0.01056811, + "auxiliary_loss_mlp": 0.01042638, + "balance_loss_clip": 1.01572061, + "balance_loss_mlp": 1.01795602, + "epoch": 0.5408687810010522, + "flos": 18770039521920.0, + "grad_norm": 1.9015684050157444, + "language_loss": 0.74601519, + "learning_rate": 1.832533059471282e-06, + "loss": 0.76700962, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38867188, + "step": 8996, + "time_per_iteration": 2.3674206733703613 + }, + { + "auxiliary_loss_clip": 0.01056012, + "auxiliary_loss_mlp": 0.01046712, + "balance_loss_clip": 1.02129626, + "balance_loss_mlp": 1.0178709, + "epoch": 0.5409289042537201, + "flos": 13880697857280.0, + "grad_norm": 1.8456103430696942, + "language_loss": 0.74739599, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.7684232, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 8997, + "time_per_iteration": 2.375185489654541 + }, + { + "auxiliary_loss_clip": 0.0105908, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.01876044, + "balance_loss_mlp": 1.01901495, + "epoch": 0.5409890275063881, + "flos": 14464480567680.0, + "grad_norm": 2.2164161532583826, + "language_loss": 0.73034811, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.75139749, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 8998, + "time_per_iteration": 3.674541711807251 + }, + { + "auxiliary_loss_clip": 0.01057384, + "auxiliary_loss_mlp": 0.01039049, + "balance_loss_clip": 1.01389563, + "balance_loss_mlp": 1.01794624, + "epoch": 0.541049150759056, + "flos": 48975706454400.0, + "grad_norm": 1.4637379598060303, + "language_loss": 0.71076822, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.73173249, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 8999, + "time_per_iteration": 2.637082576751709 + }, + { + "auxiliary_loss_clip": 0.01056786, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_clip": 1.02212119, + "balance_loss_mlp": 1.01861525, + "epoch": 0.541109274011724, + "flos": 18146700374400.0, + "grad_norm": 2.258126134273791, + "language_loss": 0.82385063, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.84490699, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3828125, + "step": 9000, + "time_per_iteration": 2.3275983333587646 + }, + { + "auxiliary_loss_clip": 0.01057739, + "auxiliary_loss_mlp": 0.01045158, + "balance_loss_clip": 1.01696467, + "balance_loss_mlp": 1.0198704, + "epoch": 0.541169397264392, + "flos": 20521492387200.0, + "grad_norm": 2.0428991569937223, + "language_loss": 0.74478143, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.76581037, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.37890625, + "step": 9001, + "time_per_iteration": 2.3980133533477783 + }, + { + "auxiliary_loss_clip": 0.01060872, + "auxiliary_loss_mlp": 0.01044522, + "balance_loss_clip": 1.0159955, + "balance_loss_mlp": 1.01910973, + "epoch": 0.54122952051706, + "flos": 20043112671360.0, + "grad_norm": 2.374825741441473, + "language_loss": 0.87238884, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.89344275, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41796875, + "step": 9002, + "time_per_iteration": 2.3451480865478516 + }, + { + "auxiliary_loss_clip": 0.01058118, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.01422012, + "balance_loss_mlp": 1.01970601, + "epoch": 0.541289643769728, + "flos": 19061250105600.0, + "grad_norm": 1.9461522351018676, + "language_loss": 0.79572153, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.81667662, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38476562, + "step": 9003, + "time_per_iteration": 2.377026081085205 + }, + { + "auxiliary_loss_clip": 0.01059758, + "auxiliary_loss_mlp": 0.01042404, + "balance_loss_clip": 1.01512861, + "balance_loss_mlp": 1.02079535, + "epoch": 0.5413497670223959, + "flos": 22381210978560.0, + "grad_norm": 2.0004025544164254, + "language_loss": 0.71071774, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.73173934, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 9004, + "time_per_iteration": 2.389606237411499 + }, + { + "auxiliary_loss_clip": 0.01012111, + "auxiliary_loss_mlp": 0.010256, + "balance_loss_clip": 1.02302504, + "balance_loss_mlp": 1.00480962, + "epoch": 0.5414098902750639, + "flos": 70028331488640.0, + "grad_norm": 0.9729350548688701, + "language_loss": 0.59331584, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.613693, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.07324219, + "step": 9005, + "time_per_iteration": 4.5205957889556885 + }, + { + "auxiliary_loss_clip": 0.01061705, + "auxiliary_loss_mlp": 0.01045404, + "balance_loss_clip": 1.0195955, + "balance_loss_mlp": 1.02079582, + "epoch": 0.5414700135277318, + "flos": 21797882115840.0, + "grad_norm": 1.8178538627835596, + "language_loss": 0.80296308, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.82403409, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40820312, + "step": 9006, + "time_per_iteration": 2.40203857421875 + }, + { + "auxiliary_loss_clip": 0.01057371, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.0153321, + "balance_loss_mlp": 1.01975799, + "epoch": 0.5415301367803999, + "flos": 16907039262720.0, + "grad_norm": 2.249568031199077, + "language_loss": 0.84355617, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.86450994, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.375, + "step": 9007, + "time_per_iteration": 2.3698244094848633 + }, + { + "auxiliary_loss_clip": 0.01061981, + "auxiliary_loss_mlp": 0.01041259, + "balance_loss_clip": 1.01416326, + "balance_loss_mlp": 1.02195454, + "epoch": 0.5415902600330678, + "flos": 25702952330880.0, + "grad_norm": 1.950225815954539, + "language_loss": 0.67885053, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69988298, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 9008, + "time_per_iteration": 2.4727377891540527 + }, + { + "auxiliary_loss_clip": 0.01064084, + "auxiliary_loss_mlp": 0.01046062, + "balance_loss_clip": 1.01754737, + "balance_loss_mlp": 1.02193022, + "epoch": 0.5416503832857358, + "flos": 19207152144000.0, + "grad_norm": 2.509320249086475, + "language_loss": 0.75636637, + "learning_rate": 1.827488379924234e-06, + "loss": 0.77746785, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.421875, + "step": 9009, + "time_per_iteration": 2.3743577003479004 + }, + { + "auxiliary_loss_clip": 0.0106322, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.01933932, + "balance_loss_mlp": 1.02208197, + "epoch": 0.5417105065384037, + "flos": 12712888056960.0, + "grad_norm": 1.935199508487144, + "language_loss": 0.8962605, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.91737771, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41210938, + "step": 9010, + "time_per_iteration": 2.3980228900909424 + }, + { + "auxiliary_loss_clip": 0.01059963, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.01856911, + "balance_loss_mlp": 1.02083111, + "epoch": 0.5417706297910717, + "flos": 30334635095040.0, + "grad_norm": 1.7943799969821448, + "language_loss": 0.66993344, + "learning_rate": 1.826712372694122e-06, + "loss": 0.69096839, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 9011, + "time_per_iteration": 2.447270393371582 + }, + { + "auxiliary_loss_clip": 0.01061599, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_clip": 1.01603103, + "balance_loss_mlp": 1.02191806, + "epoch": 0.5418307530437396, + "flos": 29019771181440.0, + "grad_norm": 2.0449270734460376, + "language_loss": 0.80795693, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.82898295, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39648438, + "step": 9012, + "time_per_iteration": 2.4754998683929443 + }, + { + "auxiliary_loss_clip": 0.01060309, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_clip": 1.02090609, + "balance_loss_mlp": 1.02084887, + "epoch": 0.5418908762964076, + "flos": 16872510061440.0, + "grad_norm": 1.8757561546603014, + "language_loss": 0.76193905, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.78299415, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.39453125, + "step": 9013, + "time_per_iteration": 2.3597066402435303 + }, + { + "auxiliary_loss_clip": 0.01062558, + "auxiliary_loss_mlp": 0.01050776, + "balance_loss_clip": 1.02317941, + "balance_loss_mlp": 1.02081013, + "epoch": 0.5419509995490756, + "flos": 18948795016320.0, + "grad_norm": 1.974864113811349, + "language_loss": 0.73032314, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.75145644, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 9014, + "time_per_iteration": 2.3981499671936035 + }, + { + "auxiliary_loss_clip": 0.01061494, + "auxiliary_loss_mlp": 0.0105499, + "balance_loss_clip": 1.0285852, + "balance_loss_mlp": 1.02150881, + "epoch": 0.5420111228017436, + "flos": 18076734276480.0, + "grad_norm": 1.4777247001602036, + "language_loss": 0.81576651, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.83693141, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40039062, + "step": 9015, + "time_per_iteration": 2.3688113689422607 + }, + { + "auxiliary_loss_clip": 0.01061985, + "auxiliary_loss_mlp": 0.01054927, + "balance_loss_clip": 1.02592325, + "balance_loss_mlp": 1.0204612, + "epoch": 0.5420712460544116, + "flos": 19060796257920.0, + "grad_norm": 2.245958185716074, + "language_loss": 0.82187343, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.84304249, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4140625, + "step": 9016, + "time_per_iteration": 2.409641742706299 + }, + { + "auxiliary_loss_clip": 0.01059075, + "auxiliary_loss_mlp": 0.01044416, + "balance_loss_clip": 1.01900089, + "balance_loss_mlp": 1.0196172, + "epoch": 0.5421313693070795, + "flos": 18186117166080.0, + "grad_norm": 1.6622537856888557, + "language_loss": 0.82373416, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.84476912, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39453125, + "step": 9017, + "time_per_iteration": 2.396249771118164 + }, + { + "auxiliary_loss_clip": 0.01056014, + "auxiliary_loss_mlp": 0.01044802, + "balance_loss_clip": 1.01998281, + "balance_loss_mlp": 1.01834893, + "epoch": 0.5421914925597475, + "flos": 13005111070080.0, + "grad_norm": 1.9589094103525435, + "language_loss": 0.79151911, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.8125273, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 9018, + "time_per_iteration": 2.3783037662506104 + }, + { + "auxiliary_loss_clip": 0.01059489, + "auxiliary_loss_mlp": 0.01048949, + "balance_loss_clip": 1.02057695, + "balance_loss_mlp": 1.01819682, + "epoch": 0.5422516158124154, + "flos": 46756591660800.0, + "grad_norm": 1.6955387125928798, + "language_loss": 0.67519015, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.69627452, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 9019, + "time_per_iteration": 2.6251652240753174 + }, + { + "auxiliary_loss_clip": 0.01055669, + "auxiliary_loss_mlp": 0.01041498, + "balance_loss_clip": 1.01894343, + "balance_loss_mlp": 1.01807308, + "epoch": 0.5423117390650835, + "flos": 31757310887040.0, + "grad_norm": 1.811198562467502, + "language_loss": 0.70897895, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72995055, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.375, + "step": 9020, + "time_per_iteration": 2.48136305809021 + }, + { + "auxiliary_loss_clip": 0.01055018, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.0191642, + "balance_loss_mlp": 1.01767564, + "epoch": 0.5423718623177514, + "flos": 27200656368000.0, + "grad_norm": 1.7940169446069285, + "language_loss": 0.80727732, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82825994, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37304688, + "step": 9021, + "time_per_iteration": 2.4461467266082764 + }, + { + "auxiliary_loss_clip": 0.01058422, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.02068448, + "balance_loss_mlp": 1.01955914, + "epoch": 0.5424319855704194, + "flos": 23545424908800.0, + "grad_norm": 2.029745206896073, + "language_loss": 0.79678822, + "learning_rate": 1.822444805916788e-06, + "loss": 0.81783134, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38867188, + "step": 9022, + "time_per_iteration": 2.4300336837768555 + }, + { + "auxiliary_loss_clip": 0.01056649, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.01723957, + "balance_loss_mlp": 1.0175575, + "epoch": 0.5424921088230873, + "flos": 26614394951040.0, + "grad_norm": 1.672323156821775, + "language_loss": 0.83438879, + "learning_rate": 1.822056885403915e-06, + "loss": 0.85537934, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 9023, + "time_per_iteration": 2.40997576713562 + }, + { + "auxiliary_loss_clip": 0.01056093, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.01485729, + "balance_loss_mlp": 1.01730251, + "epoch": 0.5425522320757553, + "flos": 23585679573120.0, + "grad_norm": 1.6788339390128948, + "language_loss": 0.72745001, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.74840409, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 9024, + "time_per_iteration": 2.433309316635132 + }, + { + "auxiliary_loss_clip": 0.01057034, + "auxiliary_loss_mlp": 0.0104201, + "balance_loss_clip": 1.01614118, + "balance_loss_mlp": 1.01740849, + "epoch": 0.5426123553284232, + "flos": 30590932452480.0, + "grad_norm": 1.5624531973643847, + "language_loss": 0.66219771, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.68318808, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39648438, + "step": 9025, + "time_per_iteration": 2.4732935428619385 + }, + { + "auxiliary_loss_clip": 0.01059305, + "auxiliary_loss_mlp": 0.01044837, + "balance_loss_clip": 1.01960087, + "balance_loss_mlp": 1.01925826, + "epoch": 0.5426724785810912, + "flos": 12494296834560.0, + "grad_norm": 1.9017165047952214, + "language_loss": 0.74200058, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76304197, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40039062, + "step": 9026, + "time_per_iteration": 2.3399176597595215 + }, + { + "auxiliary_loss_clip": 0.01058637, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.0134809, + "balance_loss_mlp": 1.01818252, + "epoch": 0.5427326018337592, + "flos": 26063500608000.0, + "grad_norm": 1.8259223815879184, + "language_loss": 0.79725444, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.81826639, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40429688, + "step": 9027, + "time_per_iteration": 2.443751573562622 + }, + { + "auxiliary_loss_clip": 0.01011653, + "auxiliary_loss_mlp": 0.01002156, + "balance_loss_clip": 0.99973613, + "balance_loss_mlp": 1.00441337, + "epoch": 0.5427927250864272, + "flos": 65981374041600.0, + "grad_norm": 0.7442778816826011, + "language_loss": 0.56616509, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58630323, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.07226562, + "step": 9028, + "time_per_iteration": 3.05761456489563 + }, + { + "auxiliary_loss_clip": 0.01059637, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.01527393, + "balance_loss_mlp": 1.01893997, + "epoch": 0.5428528483390952, + "flos": 19974333559680.0, + "grad_norm": 2.143365225552012, + "language_loss": 0.79777998, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.81882066, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40625, + "step": 9029, + "time_per_iteration": 2.372952938079834 + }, + { + "auxiliary_loss_clip": 0.01055827, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.00933278, + "balance_loss_mlp": 1.01771736, + "epoch": 0.5429129715917631, + "flos": 21831329064960.0, + "grad_norm": 1.4213641814921112, + "language_loss": 0.83825862, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85917109, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 9030, + "time_per_iteration": 2.4428200721740723 + }, + { + "auxiliary_loss_clip": 0.01056462, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.01617944, + "balance_loss_mlp": 1.01809311, + "epoch": 0.5429730948444311, + "flos": 27781436701440.0, + "grad_norm": 1.7163038713351189, + "language_loss": 0.75879824, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77977717, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38476562, + "step": 9031, + "time_per_iteration": 2.419968605041504 + }, + { + "auxiliary_loss_clip": 0.01054955, + "auxiliary_loss_mlp": 0.01037526, + "balance_loss_clip": 1.01585364, + "balance_loss_mlp": 1.01806211, + "epoch": 0.543033218097099, + "flos": 26759249648640.0, + "grad_norm": 2.1053979154442377, + "language_loss": 0.86058158, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.88150644, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36914062, + "step": 9032, + "time_per_iteration": 2.4241983890533447 + }, + { + "auxiliary_loss_clip": 0.01060876, + "auxiliary_loss_mlp": 0.01046955, + "balance_loss_clip": 1.0174861, + "balance_loss_mlp": 1.01911294, + "epoch": 0.5430933413497671, + "flos": 22674132218880.0, + "grad_norm": 1.8537280352805028, + "language_loss": 0.7478106, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76888895, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41796875, + "step": 9033, + "time_per_iteration": 3.7659451961517334 + }, + { + "auxiliary_loss_clip": 0.01055965, + "auxiliary_loss_mlp": 0.01042192, + "balance_loss_clip": 1.01680052, + "balance_loss_mlp": 1.01665318, + "epoch": 0.543153464602435, + "flos": 24606365437440.0, + "grad_norm": 1.6516160195770817, + "language_loss": 0.77413392, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.79511547, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 9034, + "time_per_iteration": 2.4380745887756348 + }, + { + "auxiliary_loss_clip": 0.01056679, + "auxiliary_loss_mlp": 0.01046697, + "balance_loss_clip": 1.0220331, + "balance_loss_mlp": 1.01847863, + "epoch": 0.543213587855103, + "flos": 19024730956800.0, + "grad_norm": 1.7384672178748555, + "language_loss": 0.853266, + "learning_rate": 1.817402369770655e-06, + "loss": 0.87429976, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3828125, + "step": 9035, + "time_per_iteration": 3.786783456802368 + }, + { + "auxiliary_loss_clip": 0.01014882, + "auxiliary_loss_mlp": 0.01003294, + "balance_loss_clip": 1.00087392, + "balance_loss_mlp": 1.00749063, + "epoch": 0.5432737111077709, + "flos": 65683251008640.0, + "grad_norm": 0.7305709731943748, + "language_loss": 0.56051302, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.58069479, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.07373047, + "step": 9036, + "time_per_iteration": 2.9957268238067627 + }, + { + "auxiliary_loss_clip": 0.01059588, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.01657248, + "balance_loss_mlp": 1.01951742, + "epoch": 0.5433338343604389, + "flos": 22090558976640.0, + "grad_norm": 1.6428754783239325, + "language_loss": 0.76165104, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.78268802, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 9037, + "time_per_iteration": 3.8074212074279785 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.02489209, + "balance_loss_mlp": 1.01887214, + "epoch": 0.5433939576131068, + "flos": 34671371760000.0, + "grad_norm": 1.7527397128064792, + "language_loss": 0.67730588, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.69841921, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 9038, + "time_per_iteration": 2.517861843109131 + }, + { + "auxiliary_loss_clip": 0.01055571, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.01552153, + "balance_loss_mlp": 1.01778579, + "epoch": 0.5434540808657748, + "flos": 20302307671680.0, + "grad_norm": 2.425787863563316, + "language_loss": 0.79687029, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.81782311, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37695312, + "step": 9039, + "time_per_iteration": 2.39618182182312 + }, + { + "auxiliary_loss_clip": 0.0105772, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.01917827, + "balance_loss_mlp": 1.01872134, + "epoch": 0.5435142041184428, + "flos": 23111663777280.0, + "grad_norm": 1.936345146366669, + "language_loss": 0.77743638, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.79845297, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 9040, + "time_per_iteration": 2.3906710147857666 + }, + { + "auxiliary_loss_clip": 0.01013408, + "auxiliary_loss_mlp": 0.01002224, + "balance_loss_clip": 1.00011444, + "balance_loss_mlp": 1.00613451, + "epoch": 0.5435743273711108, + "flos": 64009479651840.0, + "grad_norm": 0.6630439675029843, + "language_loss": 0.52525878, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54541516, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.07275391, + "step": 9041, + "time_per_iteration": 3.0061399936676025 + }, + { + "auxiliary_loss_clip": 0.01059877, + "auxiliary_loss_mlp": 0.0104653, + "balance_loss_clip": 1.0195055, + "balance_loss_mlp": 1.02010858, + "epoch": 0.5436344506237788, + "flos": 25117738254720.0, + "grad_norm": 1.7887217616826259, + "language_loss": 0.7783438, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.79940784, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 9042, + "time_per_iteration": 2.411620616912842 + }, + { + "auxiliary_loss_clip": 0.01056717, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.01270223, + "balance_loss_mlp": 1.01762748, + "epoch": 0.5436945738764467, + "flos": 19571959607040.0, + "grad_norm": 1.6724308970173312, + "language_loss": 0.68378264, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.70472205, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.390625, + "step": 9043, + "time_per_iteration": 2.385835647583008 + }, + { + "auxiliary_loss_clip": 0.0105515, + "auxiliary_loss_mlp": 0.01039795, + "balance_loss_clip": 1.01646543, + "balance_loss_mlp": 1.01798427, + "epoch": 0.5437546971291147, + "flos": 21141445132800.0, + "grad_norm": 1.6768725016350807, + "language_loss": 0.85946238, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.88041186, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 9044, + "time_per_iteration": 3.8827810287475586 + }, + { + "auxiliary_loss_clip": 0.01059681, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.01893795, + "balance_loss_mlp": 1.01818919, + "epoch": 0.5438148203817826, + "flos": 25117528786560.0, + "grad_norm": 2.5803185596156504, + "language_loss": 0.6302526, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.65132588, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4140625, + "step": 9045, + "time_per_iteration": 2.3982245922088623 + }, + { + "auxiliary_loss_clip": 0.01058662, + "auxiliary_loss_mlp": 0.01043754, + "balance_loss_clip": 1.01838672, + "balance_loss_mlp": 1.01891708, + "epoch": 0.5438749436344507, + "flos": 23001827040000.0, + "grad_norm": 1.4712084884167334, + "language_loss": 0.71204925, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.73307335, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 9046, + "time_per_iteration": 2.445315361022949 + }, + { + "auxiliary_loss_clip": 0.01056287, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_clip": 1.01396275, + "balance_loss_mlp": 1.01809633, + "epoch": 0.5439350668871186, + "flos": 15486109038720.0, + "grad_norm": 1.5465169320588217, + "language_loss": 0.78237331, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.80334604, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3828125, + "step": 9047, + "time_per_iteration": 2.3545403480529785 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.01044949, + "balance_loss_clip": 1.01779294, + "balance_loss_mlp": 1.01801372, + "epoch": 0.5439951901397866, + "flos": 17237457169920.0, + "grad_norm": 1.7731542647759895, + "language_loss": 0.74016345, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.76117849, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38476562, + "step": 9048, + "time_per_iteration": 2.3727214336395264 + }, + { + "auxiliary_loss_clip": 0.01058725, + "auxiliary_loss_mlp": 0.01044026, + "balance_loss_clip": 1.01580942, + "balance_loss_mlp": 1.01965606, + "epoch": 0.5440553133924545, + "flos": 18660028227840.0, + "grad_norm": 2.1578440972650963, + "language_loss": 0.94451684, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.96554434, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.390625, + "step": 9049, + "time_per_iteration": 2.3596088886260986 + }, + { + "auxiliary_loss_clip": 0.01055475, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.01731825, + "balance_loss_mlp": 1.01648462, + "epoch": 0.5441154366451225, + "flos": 27121787873280.0, + "grad_norm": 1.7969916840490479, + "language_loss": 0.74946129, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.77044261, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 9050, + "time_per_iteration": 2.453023672103882 + }, + { + "auxiliary_loss_clip": 0.01058283, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.01430154, + "balance_loss_mlp": 1.01848006, + "epoch": 0.5441755598977904, + "flos": 25992696637440.0, + "grad_norm": 1.756650253366105, + "language_loss": 0.69444597, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.71544123, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 9051, + "time_per_iteration": 2.418930768966675 + }, + { + "auxiliary_loss_clip": 0.01056819, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.016904, + "balance_loss_mlp": 1.01815462, + "epoch": 0.5442356831504584, + "flos": 32378660087040.0, + "grad_norm": 1.6585199862090179, + "language_loss": 0.6877389, + "learning_rate": 1.810810185460011e-06, + "loss": 0.70872915, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 9052, + "time_per_iteration": 2.485384464263916 + }, + { + "auxiliary_loss_clip": 0.01059764, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.01525521, + "balance_loss_mlp": 1.01955175, + "epoch": 0.5442958064031264, + "flos": 24163317884160.0, + "grad_norm": 1.7106228395563101, + "language_loss": 0.93766999, + "learning_rate": 1.810422473773436e-06, + "loss": 0.95868838, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 9053, + "time_per_iteration": 2.4144763946533203 + }, + { + "auxiliary_loss_clip": 0.01058876, + "auxiliary_loss_mlp": 0.0104566, + "balance_loss_clip": 1.01796818, + "balance_loss_mlp": 1.0188812, + "epoch": 0.5443559296557944, + "flos": 18763860211200.0, + "grad_norm": 1.849519385746669, + "language_loss": 0.84817815, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.86922354, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40039062, + "step": 9054, + "time_per_iteration": 2.359365940093994 + }, + { + "auxiliary_loss_clip": 0.01058843, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.01943111, + "epoch": 0.5444160529084624, + "flos": 22631608316160.0, + "grad_norm": 1.9640408192433894, + "language_loss": 0.70381314, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.72487473, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 9055, + "time_per_iteration": 2.4353179931640625 + }, + { + "auxiliary_loss_clip": 0.0101025, + "auxiliary_loss_mlp": 0.01011342, + "balance_loss_clip": 1.008636, + "balance_loss_mlp": 1.00335169, + "epoch": 0.5444761761611303, + "flos": 69668376704640.0, + "grad_norm": 0.7338477128881574, + "language_loss": 0.57765657, + "learning_rate": 1.80925938190531e-06, + "loss": 0.5978725, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.02709961, + "router_z_loss_mlp": 0.06884766, + "step": 9056, + "time_per_iteration": 3.027512550354004 + }, + { + "auxiliary_loss_clip": 0.0105844, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.01450729, + "balance_loss_mlp": 1.01849353, + "epoch": 0.5445362994137983, + "flos": 14277695460480.0, + "grad_norm": 3.297214325210018, + "language_loss": 0.71262908, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.73363632, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 9057, + "time_per_iteration": 2.3674979209899902 + }, + { + "auxiliary_loss_clip": 0.01056728, + "auxiliary_loss_mlp": 0.01041818, + "balance_loss_clip": 1.017344, + "balance_loss_mlp": 1.01950347, + "epoch": 0.5445964226664662, + "flos": 28984927777920.0, + "grad_norm": 1.9426110025911156, + "language_loss": 0.76045471, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.7814402, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37109375, + "step": 9058, + "time_per_iteration": 2.467259407043457 + }, + { + "auxiliary_loss_clip": 0.01011074, + "auxiliary_loss_mlp": 0.01008864, + "balance_loss_clip": 1.00656343, + "balance_loss_mlp": 1.00409472, + "epoch": 0.5446565459191343, + "flos": 68616548040960.0, + "grad_norm": 0.808543220785197, + "language_loss": 0.62751162, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64771092, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.06982422, + "step": 9059, + "time_per_iteration": 3.1290409564971924 + }, + { + "auxiliary_loss_clip": 0.01055083, + "auxiliary_loss_mlp": 0.01045065, + "balance_loss_clip": 1.02055573, + "balance_loss_mlp": 1.01742005, + "epoch": 0.5447166691718022, + "flos": 16215549408000.0, + "grad_norm": 1.7115498003963618, + "language_loss": 0.8074283, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.82842982, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 9060, + "time_per_iteration": 2.390475034713745 + }, + { + "auxiliary_loss_clip": 0.01057722, + "auxiliary_loss_mlp": 0.01044311, + "balance_loss_clip": 1.01900315, + "balance_loss_mlp": 1.01949477, + "epoch": 0.5447767924244702, + "flos": 25847841939840.0, + "grad_norm": 2.1661163492544406, + "language_loss": 0.80990827, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.83092862, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9061, + "time_per_iteration": 2.4083383083343506 + }, + { + "auxiliary_loss_clip": 0.01057697, + "auxiliary_loss_mlp": 0.0104155, + "balance_loss_clip": 1.01617002, + "balance_loss_mlp": 1.01922059, + "epoch": 0.5448369156771381, + "flos": 19676838931200.0, + "grad_norm": 1.7657601749236296, + "language_loss": 0.87870806, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89970052, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 9062, + "time_per_iteration": 2.3853800296783447 + }, + { + "auxiliary_loss_clip": 0.01061386, + "auxiliary_loss_mlp": 0.01049293, + "balance_loss_clip": 1.02161312, + "balance_loss_mlp": 1.01974452, + "epoch": 0.5448970389298061, + "flos": 19280783934720.0, + "grad_norm": 2.0243415416616455, + "language_loss": 0.83890337, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.86001015, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41601562, + "step": 9063, + "time_per_iteration": 2.3655283451080322 + }, + { + "auxiliary_loss_clip": 0.0105773, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.02241397, + "balance_loss_mlp": 1.01749837, + "epoch": 0.544957162182474, + "flos": 20990760238080.0, + "grad_norm": 1.6991257408756613, + "language_loss": 0.64819884, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.66925704, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 9064, + "time_per_iteration": 2.406205177307129 + }, + { + "auxiliary_loss_clip": 0.01058422, + "auxiliary_loss_mlp": 0.01050351, + "balance_loss_clip": 1.02224183, + "balance_loss_mlp": 1.01843405, + "epoch": 0.545017285435142, + "flos": 25373407207680.0, + "grad_norm": 1.6492213959280924, + "language_loss": 0.80796456, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82905233, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40039062, + "step": 9065, + "time_per_iteration": 2.414774179458618 + }, + { + "auxiliary_loss_clip": 0.01055712, + "auxiliary_loss_mlp": 0.01044683, + "balance_loss_clip": 1.02055502, + "balance_loss_mlp": 1.01817989, + "epoch": 0.54507740868781, + "flos": 19133764732800.0, + "grad_norm": 1.8932295716576684, + "language_loss": 0.79279208, + "learning_rate": 1.805382881379827e-06, + "loss": 0.81379604, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37695312, + "step": 9066, + "time_per_iteration": 2.3795690536499023 + }, + { + "auxiliary_loss_clip": 0.01059763, + "auxiliary_loss_mlp": 0.01051599, + "balance_loss_clip": 1.02415705, + "balance_loss_mlp": 1.01855993, + "epoch": 0.545137531940478, + "flos": 26248609969920.0, + "grad_norm": 1.6187287960385142, + "language_loss": 0.76905054, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.79016417, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41210938, + "step": 9067, + "time_per_iteration": 2.415585517883301 + }, + { + "auxiliary_loss_clip": 0.01062184, + "auxiliary_loss_mlp": 0.01054935, + "balance_loss_clip": 1.02380896, + "balance_loss_mlp": 1.01986396, + "epoch": 0.545197655193146, + "flos": 37554254922240.0, + "grad_norm": 2.009421348764547, + "language_loss": 0.64262295, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.6637941, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.42382812, + "step": 9068, + "time_per_iteration": 2.528505802154541 + }, + { + "auxiliary_loss_clip": 0.01057167, + "auxiliary_loss_mlp": 0.01057328, + "balance_loss_clip": 1.03130496, + "balance_loss_mlp": 1.0185945, + "epoch": 0.5452577784458139, + "flos": 26030053658880.0, + "grad_norm": 1.5305675024301644, + "language_loss": 0.73225141, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.75339639, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9069, + "time_per_iteration": 2.435340404510498 + }, + { + "auxiliary_loss_clip": 0.01055682, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.0192616, + "balance_loss_mlp": 1.0190897, + "epoch": 0.5453179016984819, + "flos": 17638085554560.0, + "grad_norm": 1.6532645044201513, + "language_loss": 0.75292534, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.77391875, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 9070, + "time_per_iteration": 2.3401498794555664 + }, + { + "auxiliary_loss_clip": 0.01057634, + "auxiliary_loss_mlp": 0.0105093, + "balance_loss_clip": 1.02637339, + "balance_loss_mlp": 1.01812756, + "epoch": 0.5453780249511498, + "flos": 23215705228800.0, + "grad_norm": 2.50302580000591, + "language_loss": 0.62152565, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.64261127, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39453125, + "step": 9071, + "time_per_iteration": 2.4144999980926514 + }, + { + "auxiliary_loss_clip": 0.01009875, + "auxiliary_loss_mlp": 0.01004611, + "balance_loss_clip": 1.00244141, + "balance_loss_mlp": 1.00274849, + "epoch": 0.5454381482038179, + "flos": 68692728360960.0, + "grad_norm": 0.7003940584138308, + "language_loss": 0.57189238, + "learning_rate": 1.80305733435899e-06, + "loss": 0.5920372, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.07128906, + "step": 9072, + "time_per_iteration": 3.1287894248962402 + }, + { + "auxiliary_loss_clip": 0.01054369, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.01827073, + "balance_loss_mlp": 1.01694596, + "epoch": 0.5454982714564858, + "flos": 13259802504960.0, + "grad_norm": 1.7849556806745304, + "language_loss": 0.71198654, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.73295903, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 9073, + "time_per_iteration": 3.548638343811035 + }, + { + "auxiliary_loss_clip": 0.01055031, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.02060819, + "balance_loss_mlp": 1.01800632, + "epoch": 0.5455583947091538, + "flos": 21834785289600.0, + "grad_norm": 1.769839850691107, + "language_loss": 0.72245842, + "learning_rate": 1.802282211606627e-06, + "loss": 0.74346447, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 9074, + "time_per_iteration": 3.9618887901306152 + }, + { + "auxiliary_loss_clip": 0.01054941, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.0215348, + "balance_loss_mlp": 1.01682413, + "epoch": 0.5456185179618217, + "flos": 17816596669440.0, + "grad_norm": 2.0372779810032076, + "language_loss": 0.70494938, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.72596377, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 9075, + "time_per_iteration": 2.4025380611419678 + }, + { + "auxiliary_loss_clip": 0.01055341, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.02083969, + "balance_loss_mlp": 1.0186727, + "epoch": 0.5456786412144897, + "flos": 21068337012480.0, + "grad_norm": 1.630491771771455, + "language_loss": 0.82153249, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.84252554, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 9076, + "time_per_iteration": 2.3757872581481934 + }, + { + "auxiliary_loss_clip": 0.0105496, + "auxiliary_loss_mlp": 0.01040224, + "balance_loss_clip": 1.01563132, + "balance_loss_mlp": 1.01661229, + "epoch": 0.5457387644671576, + "flos": 23293840584960.0, + "grad_norm": 1.7440947756358194, + "language_loss": 0.815027, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.83597887, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 9077, + "time_per_iteration": 3.861351251602173 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01040948, + "balance_loss_clip": 1.01645088, + "balance_loss_mlp": 1.0181489, + "epoch": 0.5457988877198257, + "flos": 21615949687680.0, + "grad_norm": 1.9450405511709188, + "language_loss": 0.69382048, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.71479756, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 9078, + "time_per_iteration": 2.3997509479522705 + }, + { + "auxiliary_loss_clip": 0.01057581, + "auxiliary_loss_mlp": 0.01047052, + "balance_loss_clip": 1.02051544, + "balance_loss_mlp": 1.01772714, + "epoch": 0.5458590109724936, + "flos": 23761537424640.0, + "grad_norm": 1.7305129958128598, + "language_loss": 0.81537807, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83642447, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 9079, + "time_per_iteration": 2.4489526748657227 + }, + { + "auxiliary_loss_clip": 0.01058803, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_clip": 1.01748967, + "balance_loss_mlp": 1.01814556, + "epoch": 0.5459191342251616, + "flos": 24423176200320.0, + "grad_norm": 1.8586634212553703, + "language_loss": 0.77055687, + "learning_rate": 1.799957023759277e-06, + "loss": 0.79159677, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 9080, + "time_per_iteration": 2.4146924018859863 + }, + { + "auxiliary_loss_clip": 0.01058776, + "auxiliary_loss_mlp": 0.01048129, + "balance_loss_clip": 1.02032924, + "balance_loss_mlp": 1.01926124, + "epoch": 0.5459792574778296, + "flos": 23621884519680.0, + "grad_norm": 2.1254029987814937, + "language_loss": 0.85092694, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.87199599, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39648438, + "step": 9081, + "time_per_iteration": 2.453568935394287 + }, + { + "auxiliary_loss_clip": 0.01059474, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_clip": 1.01924896, + "balance_loss_mlp": 1.01894546, + "epoch": 0.5460393807304975, + "flos": 19134532782720.0, + "grad_norm": 1.504669813873107, + "language_loss": 0.71161693, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.73267734, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 9082, + "time_per_iteration": 2.3735578060150146 + }, + { + "auxiliary_loss_clip": 0.01054918, + "auxiliary_loss_mlp": 0.01035352, + "balance_loss_clip": 1.01037776, + "balance_loss_mlp": 1.01720166, + "epoch": 0.5460995039831655, + "flos": 35917072536960.0, + "grad_norm": 1.5452739085026146, + "language_loss": 0.67266226, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.69356495, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 9083, + "time_per_iteration": 2.564061403274536 + }, + { + "auxiliary_loss_clip": 0.0105598, + "auxiliary_loss_mlp": 0.01040735, + "balance_loss_clip": 1.01597559, + "balance_loss_mlp": 1.01849961, + "epoch": 0.5461596272358334, + "flos": 26758062662400.0, + "grad_norm": 2.370125658442472, + "language_loss": 0.80986738, + "learning_rate": 1.798407050044766e-06, + "loss": 0.83083451, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 9084, + "time_per_iteration": 3.876662492752075 + }, + { + "auxiliary_loss_clip": 0.01059754, + "auxiliary_loss_mlp": 0.01043495, + "balance_loss_clip": 1.01714993, + "balance_loss_mlp": 1.01994848, + "epoch": 0.5462197504885015, + "flos": 20885531800320.0, + "grad_norm": 1.7604703283909713, + "language_loss": 0.76402938, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.78506184, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 9085, + "time_per_iteration": 2.4194583892822266 + }, + { + "auxiliary_loss_clip": 0.01058594, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.01474071, + "balance_loss_mlp": 1.02012062, + "epoch": 0.5462798737411694, + "flos": 25803991405440.0, + "grad_norm": 1.840647445605235, + "language_loss": 0.75356686, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.77454853, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38476562, + "step": 9086, + "time_per_iteration": 2.5095789432525635 + }, + { + "auxiliary_loss_clip": 0.01056973, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.01680779, + "balance_loss_mlp": 1.01875019, + "epoch": 0.5463399969938374, + "flos": 25773861035520.0, + "grad_norm": 1.7431648957505663, + "language_loss": 0.78161323, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.80258954, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 9087, + "time_per_iteration": 2.4573400020599365 + }, + { + "auxiliary_loss_clip": 0.01059844, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_clip": 1.01575887, + "balance_loss_mlp": 1.02024388, + "epoch": 0.5464001202465053, + "flos": 18842309769600.0, + "grad_norm": 1.8553745117706277, + "language_loss": 0.79202592, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.8130542, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39453125, + "step": 9088, + "time_per_iteration": 2.361616373062134 + }, + { + "auxiliary_loss_clip": 0.01014764, + "auxiliary_loss_mlp": 0.01004959, + "balance_loss_clip": 1.00274217, + "balance_loss_mlp": 1.00765014, + "epoch": 0.5464602434991733, + "flos": 69046084897920.0, + "grad_norm": 0.8686643638952096, + "language_loss": 0.57721162, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59740883, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.07128906, + "step": 9089, + "time_per_iteration": 3.085942268371582 + }, + { + "auxiliary_loss_clip": 0.01060177, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.01139545, + "balance_loss_mlp": 1.02088404, + "epoch": 0.5465203667518412, + "flos": 27558900495360.0, + "grad_norm": 1.7527443894645216, + "language_loss": 0.78339219, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.80435592, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39257812, + "step": 9090, + "time_per_iteration": 2.4447054862976074 + }, + { + "auxiliary_loss_clip": 0.01062489, + "auxiliary_loss_mlp": 0.01045298, + "balance_loss_clip": 1.01640189, + "balance_loss_mlp": 1.02019429, + "epoch": 0.5465804900045093, + "flos": 21209281637760.0, + "grad_norm": 1.8353209137931406, + "language_loss": 0.74716306, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.76824093, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 9091, + "time_per_iteration": 2.4215595722198486 + }, + { + "auxiliary_loss_clip": 0.0105973, + "auxiliary_loss_mlp": 0.01044555, + "balance_loss_clip": 1.01820946, + "balance_loss_mlp": 1.01971078, + "epoch": 0.5466406132571772, + "flos": 22487940604800.0, + "grad_norm": 1.7411741473210525, + "language_loss": 0.7912339, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.81227678, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40039062, + "step": 9092, + "time_per_iteration": 2.4153802394866943 + }, + { + "auxiliary_loss_clip": 0.0106211, + "auxiliary_loss_mlp": 0.01047296, + "balance_loss_clip": 1.01934135, + "balance_loss_mlp": 1.02169061, + "epoch": 0.5467007365098452, + "flos": 17674883994240.0, + "grad_norm": 2.5320627347006983, + "language_loss": 0.7656368, + "learning_rate": 1.794920057818476e-06, + "loss": 0.78673083, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 9093, + "time_per_iteration": 2.3647782802581787 + }, + { + "auxiliary_loss_clip": 0.01059182, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.01819468, + "balance_loss_mlp": 1.01908255, + "epoch": 0.5467608597625132, + "flos": 15698136925440.0, + "grad_norm": 2.1316505229713507, + "language_loss": 0.7074858, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.72853994, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40039062, + "step": 9094, + "time_per_iteration": 2.4637675285339355 + }, + { + "auxiliary_loss_clip": 0.01059276, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02317381, + "balance_loss_mlp": 1.02088583, + "epoch": 0.5468209830151811, + "flos": 24311768451840.0, + "grad_norm": 3.2070443733822, + "language_loss": 0.69727045, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.71835291, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38476562, + "step": 9095, + "time_per_iteration": 2.4257540702819824 + }, + { + "auxiliary_loss_clip": 0.01058623, + "auxiliary_loss_mlp": 0.0104012, + "balance_loss_clip": 1.0158608, + "balance_loss_mlp": 1.02027047, + "epoch": 0.5468811062678491, + "flos": 29165114638080.0, + "grad_norm": 1.4886080606432184, + "language_loss": 0.67555988, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.69654733, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3828125, + "step": 9096, + "time_per_iteration": 2.4329347610473633 + }, + { + "auxiliary_loss_clip": 0.01011873, + "auxiliary_loss_mlp": 0.01002845, + "balance_loss_clip": 1.00001979, + "balance_loss_mlp": 1.00490308, + "epoch": 0.546941229520517, + "flos": 67864031262720.0, + "grad_norm": 0.7469479413703247, + "language_loss": 0.57640612, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59655321, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.06982422, + "step": 9097, + "time_per_iteration": 3.1705825328826904 + }, + { + "auxiliary_loss_clip": 0.01010673, + "auxiliary_loss_mlp": 0.0100359, + "balance_loss_clip": 1.0009675, + "balance_loss_mlp": 1.00363612, + "epoch": 0.5470013527731851, + "flos": 58267594563840.0, + "grad_norm": 0.9149350828114573, + "language_loss": 0.64953506, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66967767, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.0703125, + "step": 9098, + "time_per_iteration": 2.979968547821045 + }, + { + "auxiliary_loss_clip": 0.01057928, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_clip": 1.02067637, + "balance_loss_mlp": 1.01852584, + "epoch": 0.547061476025853, + "flos": 22964819132160.0, + "grad_norm": 1.5894528429345922, + "language_loss": 0.74389493, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.76494068, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 9099, + "time_per_iteration": 2.4378929138183594 + }, + { + "auxiliary_loss_clip": 0.01055169, + "auxiliary_loss_mlp": 0.01037864, + "balance_loss_clip": 1.01429605, + "balance_loss_mlp": 1.01673758, + "epoch": 0.547121599278521, + "flos": 29967034723200.0, + "grad_norm": 1.9386559693545249, + "language_loss": 0.74160415, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.7625345, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.38476562, + "step": 9100, + "time_per_iteration": 2.513603687286377 + }, + { + "auxiliary_loss_clip": 0.01055692, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.01351047, + "balance_loss_mlp": 1.01794147, + "epoch": 0.5471817225311889, + "flos": 36534057816960.0, + "grad_norm": 1.675797508750771, + "language_loss": 0.68811095, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.70906639, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37695312, + "step": 9101, + "time_per_iteration": 2.5238590240478516 + }, + { + "auxiliary_loss_clip": 0.0105625, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.01494861, + "balance_loss_mlp": 1.01781082, + "epoch": 0.5472418457838569, + "flos": 25774070503680.0, + "grad_norm": 1.8344269243276754, + "language_loss": 0.79140317, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.81236291, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 9102, + "time_per_iteration": 2.442070245742798 + }, + { + "auxiliary_loss_clip": 0.01056554, + "auxiliary_loss_mlp": 0.01049568, + "balance_loss_clip": 1.02520192, + "balance_loss_mlp": 1.01861191, + "epoch": 0.5473019690365248, + "flos": 27886560405120.0, + "grad_norm": 2.4063789105401265, + "language_loss": 0.73228234, + "learning_rate": 1.791046361258413e-06, + "loss": 0.75334352, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 9103, + "time_per_iteration": 2.4447858333587646 + }, + { + "auxiliary_loss_clip": 0.01056168, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.01649714, + "balance_loss_mlp": 1.01859212, + "epoch": 0.5473620922891929, + "flos": 57629313354240.0, + "grad_norm": 1.3878463714227298, + "language_loss": 0.65671688, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67768699, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.375, + "step": 9104, + "time_per_iteration": 2.7730796337127686 + }, + { + "auxiliary_loss_clip": 0.01061809, + "auxiliary_loss_mlp": 0.01041501, + "balance_loss_clip": 1.01327181, + "balance_loss_mlp": 1.02094364, + "epoch": 0.5474222155418608, + "flos": 19353054182400.0, + "grad_norm": 1.8794357176388636, + "language_loss": 0.83077937, + "learning_rate": 1.790271716558888e-06, + "loss": 0.85181242, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40820312, + "step": 9105, + "time_per_iteration": 2.390254259109497 + }, + { + "auxiliary_loss_clip": 0.01056568, + "auxiliary_loss_mlp": 0.01039438, + "balance_loss_clip": 1.01629972, + "balance_loss_mlp": 1.01821017, + "epoch": 0.5474823387945288, + "flos": 25119239443200.0, + "grad_norm": 3.204498211599707, + "language_loss": 0.81005132, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.83101141, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3828125, + "step": 9106, + "time_per_iteration": 2.4481019973754883 + }, + { + "auxiliary_loss_clip": 0.01056917, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02140689, + "balance_loss_mlp": 1.01932883, + "epoch": 0.5475424620471967, + "flos": 18003207219840.0, + "grad_norm": 1.705114293923401, + "language_loss": 0.70554245, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.72655737, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.375, + "step": 9107, + "time_per_iteration": 2.386042833328247 + }, + { + "auxiliary_loss_clip": 0.01057644, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.01370645, + "balance_loss_mlp": 1.01868069, + "epoch": 0.5476025852998647, + "flos": 22308242503680.0, + "grad_norm": 1.5664655245451466, + "language_loss": 0.64725584, + "learning_rate": 1.789109809193197e-06, + "loss": 0.66821182, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.390625, + "step": 9108, + "time_per_iteration": 2.4163575172424316 + }, + { + "auxiliary_loss_clip": 0.01057324, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.01492906, + "balance_loss_mlp": 1.01880455, + "epoch": 0.5476627085525327, + "flos": 20119467548160.0, + "grad_norm": 1.680457114895876, + "language_loss": 0.75850153, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77946758, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38476562, + "step": 9109, + "time_per_iteration": 2.405853509902954 + }, + { + "auxiliary_loss_clip": 0.01058008, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.01543808, + "balance_loss_mlp": 1.02104759, + "epoch": 0.5477228318052006, + "flos": 17711612611200.0, + "grad_norm": 1.8772983344192442, + "language_loss": 0.78879553, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.80977654, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 9110, + "time_per_iteration": 2.396664619445801 + }, + { + "auxiliary_loss_clip": 0.01058381, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.01290345, + "balance_loss_mlp": 1.02123523, + "epoch": 0.5477829550578687, + "flos": 25847702294400.0, + "grad_norm": 1.4757584353713669, + "language_loss": 0.72065884, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.74160099, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.37109375, + "step": 9111, + "time_per_iteration": 2.4412314891815186 + }, + { + "auxiliary_loss_clip": 0.01059541, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.01580083, + "balance_loss_mlp": 1.02045894, + "epoch": 0.5478430783105366, + "flos": 23038555656960.0, + "grad_norm": 1.5422860223946497, + "language_loss": 0.71650338, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73751539, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 9112, + "time_per_iteration": 2.4126458168029785 + }, + { + "auxiliary_loss_clip": 0.01061836, + "auxiliary_loss_mlp": 0.01042224, + "balance_loss_clip": 1.01608098, + "balance_loss_mlp": 1.02076626, + "epoch": 0.5479032015632046, + "flos": 16070275774080.0, + "grad_norm": 1.8573697805886338, + "language_loss": 0.89720356, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.91824412, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.41210938, + "step": 9113, + "time_per_iteration": 3.609755039215088 + }, + { + "auxiliary_loss_clip": 0.01060921, + "auxiliary_loss_mlp": 0.01043192, + "balance_loss_clip": 1.01696599, + "balance_loss_mlp": 1.02116871, + "epoch": 0.5479633248158725, + "flos": 24277588364160.0, + "grad_norm": 1.5948466480736014, + "language_loss": 0.74089706, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.76193821, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 9114, + "time_per_iteration": 3.9397969245910645 + }, + { + "auxiliary_loss_clip": 0.01057103, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.01348174, + "balance_loss_mlp": 1.01996183, + "epoch": 0.5480234480685405, + "flos": 26357050252800.0, + "grad_norm": 1.4588372593868142, + "language_loss": 0.72803497, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74897897, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 9115, + "time_per_iteration": 2.4961321353912354 + }, + { + "auxiliary_loss_clip": 0.01058967, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_clip": 1.01743054, + "balance_loss_mlp": 1.01976871, + "epoch": 0.5480835713212084, + "flos": 22053970005120.0, + "grad_norm": 1.757304810601309, + "language_loss": 0.73384494, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.75487691, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 9116, + "time_per_iteration": 3.723086357116699 + }, + { + "auxiliary_loss_clip": 0.01059432, + "auxiliary_loss_mlp": 0.01046944, + "balance_loss_clip": 1.02065873, + "balance_loss_mlp": 1.02144825, + "epoch": 0.5481436945738765, + "flos": 25299880151040.0, + "grad_norm": 1.8292617755087848, + "language_loss": 0.78270292, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.80376673, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 9117, + "time_per_iteration": 2.4519572257995605 + }, + { + "auxiliary_loss_clip": 0.01056168, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.01497889, + "balance_loss_mlp": 1.01943874, + "epoch": 0.5482038178265444, + "flos": 33579532811520.0, + "grad_norm": 1.5642591156223873, + "language_loss": 0.63661695, + "learning_rate": 1.785237306671674e-06, + "loss": 0.65755963, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 9118, + "time_per_iteration": 2.479496717453003 + }, + { + "auxiliary_loss_clip": 0.01059246, + "auxiliary_loss_mlp": 0.0104145, + "balance_loss_clip": 1.01346004, + "balance_loss_mlp": 1.0200057, + "epoch": 0.5482639410792124, + "flos": 19025184804480.0, + "grad_norm": 2.437196049113347, + "language_loss": 0.79900599, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.82001299, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.39257812, + "step": 9119, + "time_per_iteration": 2.4355697631835938 + }, + { + "auxiliary_loss_clip": 0.01057214, + "auxiliary_loss_mlp": 0.0103578, + "balance_loss_clip": 1.01316595, + "balance_loss_mlp": 1.02003741, + "epoch": 0.5483240643318803, + "flos": 25409158306560.0, + "grad_norm": 1.5927542802251113, + "language_loss": 0.82988691, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.85081685, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 9120, + "time_per_iteration": 2.4370572566986084 + }, + { + "auxiliary_loss_clip": 0.0106128, + "auxiliary_loss_mlp": 0.01042177, + "balance_loss_clip": 1.01632047, + "balance_loss_mlp": 1.0203383, + "epoch": 0.5483841875845483, + "flos": 21465928108800.0, + "grad_norm": 1.6575122046839157, + "language_loss": 0.8156057, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.83664024, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40820312, + "step": 9121, + "time_per_iteration": 2.4428751468658447 + }, + { + "auxiliary_loss_clip": 0.01058995, + "auxiliary_loss_mlp": 0.01046764, + "balance_loss_clip": 1.01826072, + "balance_loss_mlp": 1.01870871, + "epoch": 0.5484443108372163, + "flos": 24746297633280.0, + "grad_norm": 3.0112534090175087, + "language_loss": 0.62547362, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.64653122, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40234375, + "step": 9122, + "time_per_iteration": 2.3998727798461914 + }, + { + "auxiliary_loss_clip": 0.01057774, + "auxiliary_loss_mlp": 0.01041566, + "balance_loss_clip": 1.01858246, + "balance_loss_mlp": 1.01998746, + "epoch": 0.5485044340898843, + "flos": 25374175257600.0, + "grad_norm": 1.6011034039766179, + "language_loss": 0.72441429, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.7454077, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37695312, + "step": 9123, + "time_per_iteration": 3.8539044857025146 + }, + { + "auxiliary_loss_clip": 0.01057338, + "auxiliary_loss_mlp": 0.01040612, + "balance_loss_clip": 1.01630533, + "balance_loss_mlp": 1.01843381, + "epoch": 0.5485645573425523, + "flos": 12640338518400.0, + "grad_norm": 1.83592829153088, + "language_loss": 0.85070956, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.87168908, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.38867188, + "step": 9124, + "time_per_iteration": 2.3644046783447266 + }, + { + "auxiliary_loss_clip": 0.0105814, + "auxiliary_loss_mlp": 0.01041352, + "balance_loss_clip": 1.01629412, + "balance_loss_mlp": 1.01996672, + "epoch": 0.5486246805952202, + "flos": 28328176592640.0, + "grad_norm": 2.0022509465836213, + "language_loss": 0.81611747, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.83711237, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 9125, + "time_per_iteration": 2.4593863487243652 + }, + { + "auxiliary_loss_clip": 0.01058825, + "auxiliary_loss_mlp": 0.01044182, + "balance_loss_clip": 1.01813483, + "balance_loss_mlp": 1.01937294, + "epoch": 0.5486848038478882, + "flos": 16799087738880.0, + "grad_norm": 2.677761396135111, + "language_loss": 0.75152779, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77255785, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 9126, + "time_per_iteration": 2.3763856887817383 + }, + { + "auxiliary_loss_clip": 0.01060757, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_clip": 1.01321626, + "balance_loss_mlp": 1.01959276, + "epoch": 0.5487449271005561, + "flos": 17235327576960.0, + "grad_norm": 2.7724145172170562, + "language_loss": 0.69432455, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.71535766, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41210938, + "step": 9127, + "time_per_iteration": 2.4257378578186035 + }, + { + "auxiliary_loss_clip": 0.01057798, + "auxiliary_loss_mlp": 0.01045756, + "balance_loss_clip": 1.01744425, + "balance_loss_mlp": 1.01910233, + "epoch": 0.5488050503532241, + "flos": 17340102167040.0, + "grad_norm": 1.813821031854537, + "language_loss": 0.84516877, + "learning_rate": 1.781365618532181e-06, + "loss": 0.86620432, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.38671875, + "step": 9128, + "time_per_iteration": 2.3799655437469482 + }, + { + "auxiliary_loss_clip": 0.01059307, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02130389, + "balance_loss_mlp": 1.01984167, + "epoch": 0.548865173605892, + "flos": 17238190308480.0, + "grad_norm": 1.8378892621440883, + "language_loss": 0.75626445, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.777336, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 9129, + "time_per_iteration": 2.42330002784729 + }, + { + "auxiliary_loss_clip": 0.01061106, + "auxiliary_loss_mlp": 0.01044205, + "balance_loss_clip": 1.0147723, + "balance_loss_mlp": 1.02044392, + "epoch": 0.5489252968585601, + "flos": 17455769101440.0, + "grad_norm": 2.5365159810898232, + "language_loss": 0.6503489, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.67140198, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40625, + "step": 9130, + "time_per_iteration": 2.36358380317688 + }, + { + "auxiliary_loss_clip": 0.01059274, + "auxiliary_loss_mlp": 0.01045499, + "balance_loss_clip": 1.01930881, + "balance_loss_mlp": 1.0192281, + "epoch": 0.548985420111228, + "flos": 26322171937920.0, + "grad_norm": 1.7023299433605819, + "language_loss": 0.64812708, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.66917479, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 9131, + "time_per_iteration": 2.482997179031372 + }, + { + "auxiliary_loss_clip": 0.01058823, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.01460278, + "balance_loss_mlp": 1.01849937, + "epoch": 0.549045543363896, + "flos": 18692846772480.0, + "grad_norm": 1.8080689508585666, + "language_loss": 0.75938559, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.78039742, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40234375, + "step": 9132, + "time_per_iteration": 2.348004102706909 + }, + { + "auxiliary_loss_clip": 0.01056829, + "auxiliary_loss_mlp": 0.01040741, + "balance_loss_clip": 1.0157783, + "balance_loss_mlp": 1.0173707, + "epoch": 0.5491056666165639, + "flos": 24716237086080.0, + "grad_norm": 1.5645339539347949, + "language_loss": 0.82409668, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.84507239, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39453125, + "step": 9133, + "time_per_iteration": 2.434020757675171 + }, + { + "auxiliary_loss_clip": 0.01055777, + "auxiliary_loss_mlp": 0.01051665, + "balance_loss_clip": 1.02601123, + "balance_loss_mlp": 1.01753724, + "epoch": 0.5491657898692319, + "flos": 21575939402880.0, + "grad_norm": 1.8929803192733115, + "language_loss": 0.71067679, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.7317512, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 9134, + "time_per_iteration": 2.3680477142333984 + }, + { + "auxiliary_loss_clip": 0.010598, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.01835537, + "balance_loss_mlp": 1.01895714, + "epoch": 0.5492259131219, + "flos": 50474562566400.0, + "grad_norm": 1.7410897079528607, + "language_loss": 0.62225509, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.64329892, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40820312, + "step": 9135, + "time_per_iteration": 2.660022497177124 + }, + { + "auxiliary_loss_clip": 0.01060233, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.0178988, + "balance_loss_mlp": 1.01868749, + "epoch": 0.5492860363745679, + "flos": 25118087368320.0, + "grad_norm": 1.7998948514213355, + "language_loss": 0.7357502, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.75683051, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41601562, + "step": 9136, + "time_per_iteration": 2.3954274654388428 + }, + { + "auxiliary_loss_clip": 0.01061349, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.0164659, + "balance_loss_mlp": 1.01851749, + "epoch": 0.5493461596272359, + "flos": 22632795302400.0, + "grad_norm": 2.4501265862613857, + "language_loss": 0.69885302, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.71991915, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.4296875, + "step": 9137, + "time_per_iteration": 2.4319875240325928 + }, + { + "auxiliary_loss_clip": 0.01010287, + "auxiliary_loss_mlp": 0.01006758, + "balance_loss_clip": 1.00445724, + "balance_loss_mlp": 1.00325394, + "epoch": 0.5494062828799038, + "flos": 66148853166720.0, + "grad_norm": 0.7614728242226136, + "language_loss": 0.65465063, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67482108, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.0703125, + "step": 9138, + "time_per_iteration": 3.0608794689178467 + }, + { + "auxiliary_loss_clip": 0.01058883, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.01390481, + "balance_loss_mlp": 1.01878309, + "epoch": 0.5494664061325718, + "flos": 21104891072640.0, + "grad_norm": 1.9903706062729905, + "language_loss": 0.76111007, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.78210342, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 9139, + "time_per_iteration": 2.4276340007781982 + }, + { + "auxiliary_loss_clip": 0.01057916, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.01618242, + "balance_loss_mlp": 1.018224, + "epoch": 0.5495265293852397, + "flos": 14391686649600.0, + "grad_norm": 1.9922199902026902, + "language_loss": 0.73040128, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.75139672, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 9140, + "time_per_iteration": 2.3565568923950195 + }, + { + "auxiliary_loss_clip": 0.01056359, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.01297116, + "balance_loss_mlp": 1.01720381, + "epoch": 0.5495866526379077, + "flos": 25548182807040.0, + "grad_norm": 1.9294143340894114, + "language_loss": 0.77730656, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.7982648, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 9141, + "time_per_iteration": 2.46065616607666 + }, + { + "auxiliary_loss_clip": 0.01057258, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.01641035, + "balance_loss_mlp": 1.01941705, + "epoch": 0.5496467758905756, + "flos": 21316395288960.0, + "grad_norm": 2.012828809143257, + "language_loss": 0.76231593, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.78329229, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37890625, + "step": 9142, + "time_per_iteration": 2.3724138736724854 + }, + { + "auxiliary_loss_clip": 0.0106103, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.01498246, + "balance_loss_mlp": 1.02071309, + "epoch": 0.5497068991432437, + "flos": 22232097095040.0, + "grad_norm": 1.8764464890788948, + "language_loss": 0.78219438, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.80323309, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40234375, + "step": 9143, + "time_per_iteration": 2.4085166454315186 + }, + { + "auxiliary_loss_clip": 0.01056793, + "auxiliary_loss_mlp": 0.01041002, + "balance_loss_clip": 1.0144068, + "balance_loss_mlp": 1.01811051, + "epoch": 0.5497670223959116, + "flos": 18478095799680.0, + "grad_norm": 2.359969146439659, + "language_loss": 0.8166585, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.83763641, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 9144, + "time_per_iteration": 2.3414220809936523 + }, + { + "auxiliary_loss_clip": 0.01060302, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.0175401, + "balance_loss_mlp": 1.02054191, + "epoch": 0.5498271456485796, + "flos": 29203833202560.0, + "grad_norm": 1.79061867070704, + "language_loss": 0.72018629, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.74122602, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 9145, + "time_per_iteration": 2.4653677940368652 + }, + { + "auxiliary_loss_clip": 0.01057707, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.01858437, + "balance_loss_mlp": 1.01884604, + "epoch": 0.5498872689012475, + "flos": 34822929438720.0, + "grad_norm": 1.6445743225522298, + "language_loss": 0.71706307, + "learning_rate": 1.774398678985076e-06, + "loss": 0.73808444, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 9146, + "time_per_iteration": 2.478318691253662 + }, + { + "auxiliary_loss_clip": 0.01055288, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.01337981, + "balance_loss_mlp": 1.01894283, + "epoch": 0.5499473921539155, + "flos": 25920740592000.0, + "grad_norm": 1.818373377820676, + "language_loss": 0.65076184, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.67167181, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 9147, + "time_per_iteration": 2.442758798599243 + }, + { + "auxiliary_loss_clip": 0.01057737, + "auxiliary_loss_mlp": 0.01043291, + "balance_loss_clip": 1.01696908, + "balance_loss_mlp": 1.0186944, + "epoch": 0.5500075154065835, + "flos": 22272596138880.0, + "grad_norm": 1.9667471021979908, + "language_loss": 0.82788002, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.8488903, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.390625, + "step": 9148, + "time_per_iteration": 2.3792545795440674 + }, + { + "auxiliary_loss_clip": 0.01057844, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.01833153, + "balance_loss_mlp": 1.01813698, + "epoch": 0.5500676386592515, + "flos": 28036267781760.0, + "grad_norm": 1.651629966173748, + "language_loss": 0.80273116, + "learning_rate": 1.773237789559453e-06, + "loss": 0.82375538, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 9149, + "time_per_iteration": 2.489269971847534 + }, + { + "auxiliary_loss_clip": 0.01058571, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.01268196, + "balance_loss_mlp": 1.01903105, + "epoch": 0.5501277619119195, + "flos": 23913688596480.0, + "grad_norm": 1.8980725538064207, + "language_loss": 0.7345587, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.75551587, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.39453125, + "step": 9150, + "time_per_iteration": 2.3829362392425537 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.01476526, + "balance_loss_mlp": 1.01850092, + "epoch": 0.5501878851645874, + "flos": 20922714264960.0, + "grad_norm": 1.943021795179864, + "language_loss": 0.76154155, + "learning_rate": 1.772463906245477e-06, + "loss": 0.78257966, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.4140625, + "step": 9151, + "time_per_iteration": 2.4296443462371826 + }, + { + "auxiliary_loss_clip": 0.01058362, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.01549184, + "balance_loss_mlp": 1.01897454, + "epoch": 0.5502480084172554, + "flos": 20664322225920.0, + "grad_norm": 2.2818107367432394, + "language_loss": 0.76577061, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78676689, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 9152, + "time_per_iteration": 3.7227983474731445 + }, + { + "auxiliary_loss_clip": 0.01057719, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.0139401, + "balance_loss_mlp": 1.01881444, + "epoch": 0.5503081316699233, + "flos": 26431345359360.0, + "grad_norm": 1.8805064295393548, + "language_loss": 0.83817899, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.85913396, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38867188, + "step": 9153, + "time_per_iteration": 2.4264204502105713 + }, + { + "auxiliary_loss_clip": 0.01057937, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.01628625, + "balance_loss_mlp": 1.01885462, + "epoch": 0.5503682549225913, + "flos": 30627800714880.0, + "grad_norm": 1.8086797563121977, + "language_loss": 0.75392109, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.77493596, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 9154, + "time_per_iteration": 3.8839645385742188 + }, + { + "auxiliary_loss_clip": 0.01061868, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.01759696, + "balance_loss_mlp": 1.02007222, + "epoch": 0.5504283781752592, + "flos": 22564330392960.0, + "grad_norm": 1.8243345205920185, + "language_loss": 0.73644543, + "learning_rate": 1.770916243273199e-06, + "loss": 0.75751066, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41796875, + "step": 9155, + "time_per_iteration": 2.3925323486328125 + }, + { + "auxiliary_loss_clip": 0.01010598, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00118196, + "balance_loss_mlp": 1.00365508, + "epoch": 0.5504885014279273, + "flos": 67898071704960.0, + "grad_norm": 0.7483967329612776, + "language_loss": 0.55426359, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.5744046, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.06933594, + "step": 9156, + "time_per_iteration": 4.5518903732299805 + }, + { + "auxiliary_loss_clip": 0.01056535, + "auxiliary_loss_mlp": 0.01037358, + "balance_loss_clip": 1.01349211, + "balance_loss_mlp": 1.01811743, + "epoch": 0.5505486246805952, + "flos": 22449117306240.0, + "grad_norm": 1.6590720663489695, + "language_loss": 0.83128595, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85222483, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38476562, + "step": 9157, + "time_per_iteration": 2.4331817626953125 + }, + { + "auxiliary_loss_clip": 0.01060699, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.01497984, + "balance_loss_mlp": 1.0191648, + "epoch": 0.5506087479332632, + "flos": 26905675357440.0, + "grad_norm": 2.1230303098158467, + "language_loss": 0.77217239, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.79322898, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.4140625, + "step": 9158, + "time_per_iteration": 2.4065189361572266 + }, + { + "auxiliary_loss_clip": 0.01056571, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.01658714, + "balance_loss_mlp": 1.01893508, + "epoch": 0.5506688711859311, + "flos": 22929137856000.0, + "grad_norm": 1.603470520352, + "language_loss": 0.71154547, + "learning_rate": 1.769368719290979e-06, + "loss": 0.73251241, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.375, + "step": 9159, + "time_per_iteration": 2.4332242012023926 + }, + { + "auxiliary_loss_clip": 0.01058774, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.01721215, + "balance_loss_mlp": 1.01881242, + "epoch": 0.5507289944385991, + "flos": 29605124903040.0, + "grad_norm": 1.6310180404094303, + "language_loss": 0.69669431, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.71772587, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 9160, + "time_per_iteration": 2.4329280853271484 + }, + { + "auxiliary_loss_clip": 0.01055675, + "auxiliary_loss_mlp": 0.01039526, + "balance_loss_clip": 1.01589859, + "balance_loss_mlp": 1.01898551, + "epoch": 0.5507891176912671, + "flos": 15333713487360.0, + "grad_norm": 1.911008898251302, + "language_loss": 0.73328185, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.75423384, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 9161, + "time_per_iteration": 2.358996629714966 + }, + { + "auxiliary_loss_clip": 0.01058389, + "auxiliary_loss_mlp": 0.01044513, + "balance_loss_clip": 1.01671302, + "balance_loss_mlp": 1.01892507, + "epoch": 0.5508492409439351, + "flos": 26577107752320.0, + "grad_norm": 1.5852653899863, + "language_loss": 0.70630443, + "learning_rate": 1.768208168081359e-06, + "loss": 0.72733349, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39453125, + "step": 9162, + "time_per_iteration": 2.454695701599121 + }, + { + "auxiliary_loss_clip": 0.01056287, + "auxiliary_loss_mlp": 0.01044026, + "balance_loss_clip": 1.01770473, + "balance_loss_mlp": 1.01784992, + "epoch": 0.5509093641966031, + "flos": 25442360876160.0, + "grad_norm": 1.8403649382133367, + "language_loss": 0.86780512, + "learning_rate": 1.767821335237733e-06, + "loss": 0.88880825, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 9163, + "time_per_iteration": 3.812758684158325 + }, + { + "auxiliary_loss_clip": 0.01058324, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.01652741, + "balance_loss_mlp": 1.01998055, + "epoch": 0.550969487449271, + "flos": 18697524894720.0, + "grad_norm": 1.5936049004617316, + "language_loss": 0.81575346, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.83675498, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9164, + "time_per_iteration": 2.3507559299468994 + }, + { + "auxiliary_loss_clip": 0.01059465, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.01304948, + "balance_loss_mlp": 1.01955223, + "epoch": 0.551029610701939, + "flos": 22707683902080.0, + "grad_norm": 2.0132349047060423, + "language_loss": 0.75174403, + "learning_rate": 1.767047695977863e-06, + "loss": 0.77274239, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3984375, + "step": 9165, + "time_per_iteration": 2.413749933242798 + }, + { + "auxiliary_loss_clip": 0.01055703, + "auxiliary_loss_mlp": 0.01042199, + "balance_loss_clip": 1.01654553, + "balance_loss_mlp": 1.01743317, + "epoch": 0.5510897339546069, + "flos": 12419722437120.0, + "grad_norm": 1.9307878726343064, + "language_loss": 0.8033185, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.82429749, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 9166, + "time_per_iteration": 2.323681592941284 + }, + { + "auxiliary_loss_clip": 0.01058961, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.01298642, + "balance_loss_mlp": 1.0184536, + "epoch": 0.5511498572072749, + "flos": 18769585674240.0, + "grad_norm": 2.0362108526293228, + "language_loss": 0.76960921, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.79059917, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40429688, + "step": 9167, + "time_per_iteration": 2.389540910720825 + }, + { + "auxiliary_loss_clip": 0.01055936, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.01993799, + "balance_loss_mlp": 1.01783371, + "epoch": 0.5512099804599428, + "flos": 19572308720640.0, + "grad_norm": 2.9724391212911816, + "language_loss": 0.81968546, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.84069109, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38085938, + "step": 9168, + "time_per_iteration": 2.347132444381714 + }, + { + "auxiliary_loss_clip": 0.01059637, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.02266049, + "balance_loss_mlp": 1.01880789, + "epoch": 0.5512701037126109, + "flos": 26244525340800.0, + "grad_norm": 1.6442007315658245, + "language_loss": 0.70274997, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.72385061, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 9169, + "time_per_iteration": 2.4525113105773926 + }, + { + "auxiliary_loss_clip": 0.01054048, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.01717997, + "balance_loss_mlp": 1.01662254, + "epoch": 0.5513302269652788, + "flos": 21944307824640.0, + "grad_norm": 2.064096716722073, + "language_loss": 0.86044455, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.88139307, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 9170, + "time_per_iteration": 2.364689350128174 + }, + { + "auxiliary_loss_clip": 0.01009583, + "auxiliary_loss_mlp": 0.01008562, + "balance_loss_clip": 1.00638044, + "balance_loss_mlp": 1.00234532, + "epoch": 0.5513903502179468, + "flos": 68232818620800.0, + "grad_norm": 0.784304240043542, + "language_loss": 0.60005444, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.62023592, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.07226562, + "step": 9171, + "time_per_iteration": 3.057135581970215 + }, + { + "auxiliary_loss_clip": 0.01056048, + "auxiliary_loss_mlp": 0.01048219, + "balance_loss_clip": 1.02291048, + "balance_loss_mlp": 1.01789653, + "epoch": 0.5514504734706147, + "flos": 18733241082240.0, + "grad_norm": 1.6370679319809294, + "language_loss": 0.71507764, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.73612034, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9172, + "time_per_iteration": 2.3793253898620605 + }, + { + "auxiliary_loss_clip": 0.01056245, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_clip": 1.01818156, + "balance_loss_mlp": 1.01813924, + "epoch": 0.5515105967232827, + "flos": 22269942875520.0, + "grad_norm": 1.9080651096520829, + "language_loss": 0.77429104, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.79528385, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 9173, + "time_per_iteration": 2.412353277206421 + }, + { + "auxiliary_loss_clip": 0.0105692, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.01752555, + "balance_loss_mlp": 1.01841056, + "epoch": 0.5515707199759508, + "flos": 22556789539200.0, + "grad_norm": 1.6313557595557857, + "language_loss": 0.76699287, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.78799868, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 9174, + "time_per_iteration": 2.4581298828125 + }, + { + "auxiliary_loss_clip": 0.01059179, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.0146904, + "balance_loss_mlp": 1.01957667, + "epoch": 0.5516308432286187, + "flos": 28289876964480.0, + "grad_norm": 1.6873688739908637, + "language_loss": 0.73654068, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.75753796, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 9175, + "time_per_iteration": 2.5249581336975098 + }, + { + "auxiliary_loss_clip": 0.01058421, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.01688695, + "balance_loss_mlp": 1.01816273, + "epoch": 0.5516909664812867, + "flos": 18763650743040.0, + "grad_norm": 1.9715915898190761, + "language_loss": 0.70634639, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.72736961, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 9176, + "time_per_iteration": 2.369164228439331 + }, + { + "auxiliary_loss_clip": 0.01056797, + "auxiliary_loss_mlp": 0.01039555, + "balance_loss_clip": 1.01319742, + "balance_loss_mlp": 1.01835287, + "epoch": 0.5517510897339546, + "flos": 27739261912320.0, + "grad_norm": 1.7803438814773822, + "language_loss": 0.72027975, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.74124324, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38476562, + "step": 9177, + "time_per_iteration": 2.485382318496704 + }, + { + "auxiliary_loss_clip": 0.01059516, + "auxiliary_loss_mlp": 0.01041565, + "balance_loss_clip": 1.01570892, + "balance_loss_mlp": 1.01976287, + "epoch": 0.5518112129866226, + "flos": 18403521402240.0, + "grad_norm": 1.6251339359601547, + "language_loss": 0.81377089, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.83478171, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 9178, + "time_per_iteration": 2.359722852706909 + }, + { + "auxiliary_loss_clip": 0.01059886, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_clip": 1.02011776, + "balance_loss_mlp": 1.01927853, + "epoch": 0.5518713362392905, + "flos": 25081498396800.0, + "grad_norm": 1.5878298282377064, + "language_loss": 0.75393605, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77501297, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 9179, + "time_per_iteration": 2.4454519748687744 + }, + { + "auxiliary_loss_clip": 0.01058863, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.01253748, + "balance_loss_mlp": 1.01919413, + "epoch": 0.5519314594919585, + "flos": 36537514041600.0, + "grad_norm": 1.6244368030687935, + "language_loss": 0.7070663, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72803032, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39648438, + "step": 9180, + "time_per_iteration": 2.5000698566436768 + }, + { + "auxiliary_loss_clip": 0.01060585, + "auxiliary_loss_mlp": 0.01046234, + "balance_loss_clip": 1.01713526, + "balance_loss_mlp": 1.02002239, + "epoch": 0.5519915827446265, + "flos": 20447581305600.0, + "grad_norm": 2.1207412318155554, + "language_loss": 0.68842822, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.70949638, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40429688, + "step": 9181, + "time_per_iteration": 2.4087836742401123 + }, + { + "auxiliary_loss_clip": 0.01060608, + "auxiliary_loss_mlp": 0.01045185, + "balance_loss_clip": 1.01588368, + "balance_loss_mlp": 1.01898932, + "epoch": 0.5520517059972945, + "flos": 23766948685440.0, + "grad_norm": 1.913604945843073, + "language_loss": 0.80197525, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.82303315, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41601562, + "step": 9182, + "time_per_iteration": 2.3929319381713867 + }, + { + "auxiliary_loss_clip": 0.01058953, + "auxiliary_loss_mlp": 0.01040612, + "balance_loss_clip": 1.01328969, + "balance_loss_mlp": 1.01886892, + "epoch": 0.5521118292499624, + "flos": 22195473212160.0, + "grad_norm": 2.3564041284992383, + "language_loss": 0.84101868, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.86201435, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 9183, + "time_per_iteration": 2.4321517944335938 + }, + { + "auxiliary_loss_clip": 0.01057446, + "auxiliary_loss_mlp": 0.01040794, + "balance_loss_clip": 1.01419806, + "balance_loss_mlp": 1.01842761, + "epoch": 0.5521719525026304, + "flos": 23582258259840.0, + "grad_norm": 1.3356188840544858, + "language_loss": 0.6805346, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.70151699, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 9184, + "time_per_iteration": 2.400406837463379 + }, + { + "auxiliary_loss_clip": 0.01059987, + "auxiliary_loss_mlp": 0.01039879, + "balance_loss_clip": 1.01117325, + "balance_loss_mlp": 1.02014494, + "epoch": 0.5522320757552983, + "flos": 26136503994240.0, + "grad_norm": 1.5977206569476512, + "language_loss": 0.77196604, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.7929647, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.3984375, + "step": 9185, + "time_per_iteration": 2.429492235183716 + }, + { + "auxiliary_loss_clip": 0.01059312, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.02195764, + "balance_loss_mlp": 1.01875532, + "epoch": 0.5522921990079663, + "flos": 24675144549120.0, + "grad_norm": 1.7531684120012776, + "language_loss": 0.75334585, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.77444363, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40625, + "step": 9186, + "time_per_iteration": 2.390329599380493 + }, + { + "auxiliary_loss_clip": 0.01061007, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_clip": 1.0211674, + "balance_loss_mlp": 1.02012503, + "epoch": 0.5523523222606344, + "flos": 22747030871040.0, + "grad_norm": 1.8701277765106823, + "language_loss": 0.67645961, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.69755113, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 9187, + "time_per_iteration": 2.4065563678741455 + }, + { + "auxiliary_loss_clip": 0.01060101, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_clip": 1.01916349, + "balance_loss_mlp": 1.01991844, + "epoch": 0.5524124455133023, + "flos": 19754799730560.0, + "grad_norm": 1.8499425442188828, + "language_loss": 0.78257847, + "learning_rate": 1.758153413657318e-06, + "loss": 0.80363363, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 9188, + "time_per_iteration": 2.377748489379883 + }, + { + "auxiliary_loss_clip": 0.01058234, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.01621377, + "balance_loss_mlp": 1.01853347, + "epoch": 0.5524725687659703, + "flos": 23293700939520.0, + "grad_norm": 1.8213033852410643, + "language_loss": 0.83582628, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.85684085, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 9189, + "time_per_iteration": 2.378924608230591 + }, + { + "auxiliary_loss_clip": 0.01058503, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.01767063, + "balance_loss_mlp": 1.01921511, + "epoch": 0.5525326920186382, + "flos": 24861056872320.0, + "grad_norm": 1.6622967414901515, + "language_loss": 0.77373821, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.79476559, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 9190, + "time_per_iteration": 2.4461700916290283 + }, + { + "auxiliary_loss_clip": 0.01061698, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.02119708, + "balance_loss_mlp": 1.0188098, + "epoch": 0.5525928152713062, + "flos": 13734725996160.0, + "grad_norm": 2.560013088717474, + "language_loss": 0.80507934, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.82622343, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.4296875, + "step": 9191, + "time_per_iteration": 2.340862274169922 + }, + { + "auxiliary_loss_clip": 0.0105698, + "auxiliary_loss_mlp": 0.01041044, + "balance_loss_clip": 1.01512825, + "balance_loss_mlp": 1.01812005, + "epoch": 0.5526529385239741, + "flos": 13070957627520.0, + "grad_norm": 1.767515491681345, + "language_loss": 0.69881183, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71979213, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38867188, + "step": 9192, + "time_per_iteration": 3.5769171714782715 + }, + { + "auxiliary_loss_clip": 0.01056235, + "auxiliary_loss_mlp": 0.01037714, + "balance_loss_clip": 1.01328766, + "balance_loss_mlp": 1.0183475, + "epoch": 0.5527130617766421, + "flos": 23147275230720.0, + "grad_norm": 1.4675193631851224, + "language_loss": 0.78197569, + "learning_rate": 1.756220509823588e-06, + "loss": 0.80291522, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 9193, + "time_per_iteration": 3.776726722717285 + }, + { + "auxiliary_loss_clip": 0.01058172, + "auxiliary_loss_mlp": 0.01047287, + "balance_loss_clip": 1.02171636, + "balance_loss_mlp": 1.01765156, + "epoch": 0.55277318502931, + "flos": 21284554262400.0, + "grad_norm": 1.5600468795275042, + "language_loss": 0.79391778, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.8149724, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40625, + "step": 9194, + "time_per_iteration": 2.3703150749206543 + }, + { + "auxiliary_loss_clip": 0.0106053, + "auxiliary_loss_mlp": 0.01041485, + "balance_loss_clip": 1.01525903, + "balance_loss_mlp": 1.01778853, + "epoch": 0.5528333082819781, + "flos": 38323077171840.0, + "grad_norm": 2.041674738626649, + "language_loss": 0.70133263, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.72235274, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.42773438, + "step": 9195, + "time_per_iteration": 3.9303488731384277 + }, + { + "auxiliary_loss_clip": 0.0106237, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_clip": 1.01774824, + "balance_loss_mlp": 1.01920605, + "epoch": 0.552893431534646, + "flos": 13552758656640.0, + "grad_norm": 1.9671246255640147, + "language_loss": 0.75125611, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.77235055, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.43164062, + "step": 9196, + "time_per_iteration": 2.403614044189453 + }, + { + "auxiliary_loss_clip": 0.01057416, + "auxiliary_loss_mlp": 0.01038935, + "balance_loss_clip": 1.01399636, + "balance_loss_mlp": 1.01893687, + "epoch": 0.552953554787314, + "flos": 21938477627520.0, + "grad_norm": 1.519266390381783, + "language_loss": 0.77587616, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79683965, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38476562, + "step": 9197, + "time_per_iteration": 2.384580612182617 + }, + { + "auxiliary_loss_clip": 0.01057892, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.01206553, + "balance_loss_mlp": 1.01864028, + "epoch": 0.5530136780399819, + "flos": 43656199528320.0, + "grad_norm": 2.729385044324578, + "language_loss": 0.7685467, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78948969, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.39257812, + "step": 9198, + "time_per_iteration": 2.6204497814178467 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.01482391, + "balance_loss_mlp": 1.01713586, + "epoch": 0.5530738012926499, + "flos": 25044350843520.0, + "grad_norm": 1.5509690090048953, + "language_loss": 0.79867828, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81963396, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38671875, + "step": 9199, + "time_per_iteration": 2.437286376953125 + }, + { + "auxiliary_loss_clip": 0.0105681, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.01164293, + "balance_loss_mlp": 1.0172224, + "epoch": 0.553133924545318, + "flos": 16471148538240.0, + "grad_norm": 1.6753277825684778, + "language_loss": 0.64931262, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.67025316, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39648438, + "step": 9200, + "time_per_iteration": 2.369597911834717 + }, + { + "auxiliary_loss_clip": 0.01059909, + "auxiliary_loss_mlp": 0.01042238, + "balance_loss_clip": 1.01150608, + "balance_loss_mlp": 1.01835704, + "epoch": 0.5531940477979859, + "flos": 24605108628480.0, + "grad_norm": 1.5441726668124012, + "language_loss": 0.66981852, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.69084001, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.41601562, + "step": 9201, + "time_per_iteration": 2.4304957389831543 + }, + { + "auxiliary_loss_clip": 0.01060132, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0194633, + "balance_loss_mlp": 1.02012205, + "epoch": 0.5532541710506539, + "flos": 22158604949760.0, + "grad_norm": 1.945114592552578, + "language_loss": 0.62201154, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.64309752, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40039062, + "step": 9202, + "time_per_iteration": 2.42339825630188 + }, + { + "auxiliary_loss_clip": 0.01057778, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.01389968, + "balance_loss_mlp": 1.01951957, + "epoch": 0.5533142943033218, + "flos": 21396206390400.0, + "grad_norm": 2.078790064874674, + "language_loss": 0.65846097, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.67944157, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 9203, + "time_per_iteration": 3.7850546836853027 + }, + { + "auxiliary_loss_clip": 0.01057199, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.01190984, + "balance_loss_mlp": 1.01755726, + "epoch": 0.5533744175559898, + "flos": 23549404803840.0, + "grad_norm": 1.5353117249854624, + "language_loss": 0.6431849, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.66414154, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 9204, + "time_per_iteration": 2.429094076156616 + }, + { + "auxiliary_loss_clip": 0.01056566, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_clip": 1.02077007, + "balance_loss_mlp": 1.01784348, + "epoch": 0.5534345408086577, + "flos": 24060358684800.0, + "grad_norm": 1.5521957660321217, + "language_loss": 0.78408074, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.80510151, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 9205, + "time_per_iteration": 2.4150044918060303 + }, + { + "auxiliary_loss_clip": 0.01058247, + "auxiliary_loss_mlp": 0.01042597, + "balance_loss_clip": 1.01757479, + "balance_loss_mlp": 1.0199337, + "epoch": 0.5534946640613257, + "flos": 33770262902400.0, + "grad_norm": 1.383561578313534, + "language_loss": 0.73353618, + "learning_rate": 1.751196045993537e-06, + "loss": 0.75454462, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 9206, + "time_per_iteration": 2.529114246368408 + }, + { + "auxiliary_loss_clip": 0.01058336, + "auxiliary_loss_mlp": 0.0104665, + "balance_loss_clip": 1.01943469, + "balance_loss_mlp": 1.01896524, + "epoch": 0.5535547873139937, + "flos": 15158309483520.0, + "grad_norm": 2.0684553373031576, + "language_loss": 0.76330268, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.78435254, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 9207, + "time_per_iteration": 2.3636984825134277 + }, + { + "auxiliary_loss_clip": 0.01061024, + "auxiliary_loss_mlp": 0.01048886, + "balance_loss_clip": 1.01944113, + "balance_loss_mlp": 1.0195483, + "epoch": 0.5536149105666617, + "flos": 16979972826240.0, + "grad_norm": 3.3978010434470387, + "language_loss": 0.64710319, + "learning_rate": 1.750423192272189e-06, + "loss": 0.66820228, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4140625, + "step": 9208, + "time_per_iteration": 2.363457441329956 + }, + { + "auxiliary_loss_clip": 0.01058564, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.01352549, + "balance_loss_mlp": 1.01869249, + "epoch": 0.5536750338193296, + "flos": 18148969612800.0, + "grad_norm": 2.0488900148248157, + "language_loss": 0.66506851, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.68605435, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 9209, + "time_per_iteration": 2.363802433013916 + }, + { + "auxiliary_loss_clip": 0.01058367, + "auxiliary_loss_mlp": 0.01050775, + "balance_loss_clip": 1.02283204, + "balance_loss_mlp": 1.01834893, + "epoch": 0.5537351570719976, + "flos": 22746681757440.0, + "grad_norm": 2.045828516512412, + "language_loss": 0.83813059, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.85922205, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40039062, + "step": 9210, + "time_per_iteration": 2.4034178256988525 + }, + { + "auxiliary_loss_clip": 0.01055989, + "auxiliary_loss_mlp": 0.01043739, + "balance_loss_clip": 1.01908612, + "balance_loss_mlp": 1.01767623, + "epoch": 0.5537952803246655, + "flos": 26354920659840.0, + "grad_norm": 4.461585705198792, + "language_loss": 0.73940337, + "learning_rate": 1.74926398270663e-06, + "loss": 0.76040059, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 9211, + "time_per_iteration": 2.4348394870758057 + }, + { + "auxiliary_loss_clip": 0.01060295, + "auxiliary_loss_mlp": 0.01047412, + "balance_loss_clip": 1.01738286, + "balance_loss_mlp": 1.01858091, + "epoch": 0.5538554035773335, + "flos": 18036549434880.0, + "grad_norm": 2.3025685649430505, + "language_loss": 0.67700422, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.69808125, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41796875, + "step": 9212, + "time_per_iteration": 2.385986566543579 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.01449835, + "balance_loss_mlp": 1.01704848, + "epoch": 0.5539155268300014, + "flos": 31684900993920.0, + "grad_norm": 1.4806686956497581, + "language_loss": 0.53044724, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.55149055, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.42578125, + "step": 9213, + "time_per_iteration": 2.466909646987915 + }, + { + "auxiliary_loss_clip": 0.01060093, + "auxiliary_loss_mlp": 0.01046559, + "balance_loss_clip": 1.02014208, + "balance_loss_mlp": 1.01943111, + "epoch": 0.5539756500826695, + "flos": 15192908507520.0, + "grad_norm": 2.666401618199012, + "language_loss": 0.87095213, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.89201862, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 9214, + "time_per_iteration": 2.366302967071533 + }, + { + "auxiliary_loss_clip": 0.01058483, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.01510561, + "balance_loss_mlp": 1.01983917, + "epoch": 0.5540357733353375, + "flos": 26352092839680.0, + "grad_norm": 1.6833256450571068, + "language_loss": 0.71177828, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.7327792, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 9215, + "time_per_iteration": 2.428729772567749 + }, + { + "auxiliary_loss_clip": 0.01060357, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.01350701, + "balance_loss_mlp": 1.01950169, + "epoch": 0.5540958965880054, + "flos": 21322644422400.0, + "grad_norm": 3.5470472541215035, + "language_loss": 0.74340463, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.76441431, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40820312, + "step": 9216, + "time_per_iteration": 2.415776491165161 + }, + { + "auxiliary_loss_clip": 0.01055119, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.01369214, + "balance_loss_mlp": 1.01777899, + "epoch": 0.5541560198406734, + "flos": 25665630220800.0, + "grad_norm": 2.1169619820523584, + "language_loss": 0.73433208, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.75525969, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37304688, + "step": 9217, + "time_per_iteration": 2.429340124130249 + }, + { + "auxiliary_loss_clip": 0.01055415, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.01384676, + "balance_loss_mlp": 1.01749587, + "epoch": 0.5542161430933413, + "flos": 21938687095680.0, + "grad_norm": 1.7233229313224785, + "language_loss": 0.79160762, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.81253815, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37890625, + "step": 9218, + "time_per_iteration": 2.3886399269104004 + }, + { + "auxiliary_loss_clip": 0.01059, + "auxiliary_loss_mlp": 0.01046397, + "balance_loss_clip": 1.01573682, + "balance_loss_mlp": 1.01734102, + "epoch": 0.5542762663460093, + "flos": 19570493329920.0, + "grad_norm": 1.6695513465954157, + "language_loss": 0.73087752, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.75193149, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.41601562, + "step": 9219, + "time_per_iteration": 2.38936448097229 + }, + { + "auxiliary_loss_clip": 0.01060027, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.01533306, + "balance_loss_mlp": 1.01891708, + "epoch": 0.5543363895986773, + "flos": 19498083436800.0, + "grad_norm": 1.6751041061945051, + "language_loss": 0.72195196, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.74297571, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 9220, + "time_per_iteration": 2.3764688968658447 + }, + { + "auxiliary_loss_clip": 0.01056131, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.01240098, + "balance_loss_mlp": 1.01766109, + "epoch": 0.5543965128513453, + "flos": 22634575781760.0, + "grad_norm": 1.5590197795281522, + "language_loss": 0.80327368, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.82420874, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38476562, + "step": 9221, + "time_per_iteration": 2.391430377960205 + }, + { + "auxiliary_loss_clip": 0.01057491, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.01555204, + "balance_loss_mlp": 1.01797605, + "epoch": 0.5544566361040132, + "flos": 25988891299200.0, + "grad_norm": 3.3599689568082294, + "language_loss": 0.84447789, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86548531, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39453125, + "step": 9222, + "time_per_iteration": 2.4187347888946533 + }, + { + "auxiliary_loss_clip": 0.0106106, + "auxiliary_loss_mlp": 0.01050578, + "balance_loss_clip": 1.02112174, + "balance_loss_mlp": 1.01922345, + "epoch": 0.5545167593566812, + "flos": 28256290369920.0, + "grad_norm": 1.655745439943986, + "language_loss": 0.76688802, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.7880044, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41796875, + "step": 9223, + "time_per_iteration": 2.443096160888672 + }, + { + "auxiliary_loss_clip": 0.01056318, + "auxiliary_loss_mlp": 0.01043665, + "balance_loss_clip": 1.01865458, + "balance_loss_mlp": 1.0175581, + "epoch": 0.5545768826093491, + "flos": 28475265617280.0, + "grad_norm": 1.7141676486825865, + "language_loss": 0.83399391, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.8549937, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 9224, + "time_per_iteration": 2.451061487197876 + }, + { + "auxiliary_loss_clip": 0.01058041, + "auxiliary_loss_mlp": 0.01051763, + "balance_loss_clip": 1.0261209, + "balance_loss_mlp": 1.01814878, + "epoch": 0.5546370058620171, + "flos": 18477083370240.0, + "grad_norm": 2.3772323927073393, + "language_loss": 0.59058785, + "learning_rate": 1.743855475904141e-06, + "loss": 0.61168587, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3984375, + "step": 9225, + "time_per_iteration": 2.3830294609069824 + }, + { + "auxiliary_loss_clip": 0.01057728, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.01447284, + "balance_loss_mlp": 1.01727414, + "epoch": 0.554697129114685, + "flos": 22929382235520.0, + "grad_norm": 1.9745354379826432, + "language_loss": 0.68642306, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.7074008, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40429688, + "step": 9226, + "time_per_iteration": 2.4468395709991455 + }, + { + "auxiliary_loss_clip": 0.01057513, + "auxiliary_loss_mlp": 0.0104388, + "balance_loss_clip": 1.01804686, + "balance_loss_mlp": 1.01761198, + "epoch": 0.5547572523673531, + "flos": 21796136547840.0, + "grad_norm": 1.5383973367518706, + "language_loss": 0.75837314, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.77938712, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 9227, + "time_per_iteration": 2.41269588470459 + }, + { + "auxiliary_loss_clip": 0.01060073, + "auxiliary_loss_mlp": 0.01044583, + "balance_loss_clip": 1.01611614, + "balance_loss_mlp": 1.01992846, + "epoch": 0.5548173756200211, + "flos": 22341829098240.0, + "grad_norm": 1.7260189792259566, + "language_loss": 0.74539554, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.76644206, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40234375, + "step": 9228, + "time_per_iteration": 2.369199275970459 + }, + { + "auxiliary_loss_clip": 0.01059161, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.01380181, + "balance_loss_mlp": 1.01848316, + "epoch": 0.554877498872689, + "flos": 17857759029120.0, + "grad_norm": 1.6917172683992476, + "language_loss": 0.773045, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.79402751, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40625, + "step": 9229, + "time_per_iteration": 2.4117865562438965 + }, + { + "auxiliary_loss_clip": 0.01057646, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_clip": 1.0191505, + "balance_loss_mlp": 1.01800489, + "epoch": 0.554937622125357, + "flos": 17237387347200.0, + "grad_norm": 1.7401500435094004, + "language_loss": 0.70279574, + "learning_rate": 1.741924325613172e-06, + "loss": 0.72384155, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39648438, + "step": 9230, + "time_per_iteration": 2.3469128608703613 + }, + { + "auxiliary_loss_clip": 0.01059352, + "auxiliary_loss_mlp": 0.01040902, + "balance_loss_clip": 1.01284027, + "balance_loss_mlp": 1.01856756, + "epoch": 0.5549977453780249, + "flos": 25367088251520.0, + "grad_norm": 3.112531765614294, + "language_loss": 0.70716316, + "learning_rate": 1.741538124855163e-06, + "loss": 0.72816569, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 9231, + "time_per_iteration": 3.6563682556152344 + }, + { + "auxiliary_loss_clip": 0.01060253, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_clip": 1.01683545, + "balance_loss_mlp": 1.01902509, + "epoch": 0.555057868630693, + "flos": 25078042172160.0, + "grad_norm": 1.633228671135284, + "language_loss": 0.79433107, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.81539232, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41210938, + "step": 9232, + "time_per_iteration": 2.4073214530944824 + }, + { + "auxiliary_loss_clip": 0.01056008, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_clip": 1.01847291, + "balance_loss_mlp": 1.01747739, + "epoch": 0.5551179918833609, + "flos": 26103022133760.0, + "grad_norm": 1.619224569939363, + "language_loss": 0.84006721, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.86106944, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9233, + "time_per_iteration": 3.844449758529663 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.01394713, + "balance_loss_mlp": 1.01825488, + "epoch": 0.5551781151360289, + "flos": 19383917690880.0, + "grad_norm": 2.610989924249469, + "language_loss": 0.77043319, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.7914753, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41796875, + "step": 9234, + "time_per_iteration": 2.3553028106689453 + }, + { + "auxiliary_loss_clip": 0.01056578, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.01279926, + "balance_loss_mlp": 1.01779366, + "epoch": 0.5552382383886968, + "flos": 21724878729600.0, + "grad_norm": 1.91421092057046, + "language_loss": 0.67222404, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.69316828, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 9235, + "time_per_iteration": 3.729624032974243 + }, + { + "auxiliary_loss_clip": 0.01058749, + "auxiliary_loss_mlp": 0.0104731, + "balance_loss_clip": 1.01825917, + "balance_loss_mlp": 1.01713562, + "epoch": 0.5552983616413648, + "flos": 14355307146240.0, + "grad_norm": 1.9562642390149214, + "language_loss": 0.69683111, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.71789175, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41601562, + "step": 9236, + "time_per_iteration": 2.334315061569214 + }, + { + "auxiliary_loss_clip": 0.01054895, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.01179123, + "balance_loss_mlp": 1.01688552, + "epoch": 0.5553584848940327, + "flos": 25477518481920.0, + "grad_norm": 1.6407874131645486, + "language_loss": 0.86509657, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88602424, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 9237, + "time_per_iteration": -0.22570204734802246 + }, + { + "auxiliary_loss_clip": 0.01055987, + "auxiliary_loss_mlp": 0.01045905, + "balance_loss_clip": 1.01969123, + "balance_loss_mlp": 1.01730657, + "epoch": 0.5554186081467007, + "flos": 22162759401600.0, + "grad_norm": 1.8260989042868407, + "language_loss": 0.75146019, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.77247918, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9238, + "time_per_iteration": 2.3745672702789307 + }, + { + "auxiliary_loss_clip": 0.01058365, + "auxiliary_loss_mlp": 0.01042123, + "balance_loss_clip": 1.01576567, + "balance_loss_mlp": 1.01737332, + "epoch": 0.5554787313993687, + "flos": 49744807994880.0, + "grad_norm": 1.6443770670761424, + "language_loss": 0.79886454, + "learning_rate": 1.73844887285358e-06, + "loss": 0.8198694, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.41015625, + "step": 9239, + "time_per_iteration": 2.655271291732788 + }, + { + "auxiliary_loss_clip": 0.01058926, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.01327538, + "balance_loss_mlp": 1.01922548, + "epoch": 0.5555388546520367, + "flos": 22126275164160.0, + "grad_norm": 1.4830358866742888, + "language_loss": 0.8080231, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82899362, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3984375, + "step": 9240, + "time_per_iteration": 2.3737666606903076 + }, + { + "auxiliary_loss_clip": 0.01058145, + "auxiliary_loss_mlp": 0.01043661, + "balance_loss_clip": 1.01693439, + "balance_loss_mlp": 1.01781738, + "epoch": 0.5555989779047047, + "flos": 24680939834880.0, + "grad_norm": 2.2400575629296204, + "language_loss": 0.66702247, + "learning_rate": 1.737676658740786e-06, + "loss": 0.68804061, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40234375, + "step": 9241, + "time_per_iteration": 2.389888048171997 + }, + { + "auxiliary_loss_clip": 0.01059436, + "auxiliary_loss_mlp": 0.01046466, + "balance_loss_clip": 1.01793909, + "balance_loss_mlp": 1.01867723, + "epoch": 0.5556591011573726, + "flos": 16105607936640.0, + "grad_norm": 1.9946343640563353, + "language_loss": 0.73946714, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.76052618, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40820312, + "step": 9242, + "time_per_iteration": 2.364753246307373 + }, + { + "auxiliary_loss_clip": 0.01058297, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.01529455, + "balance_loss_mlp": 1.01790464, + "epoch": 0.5557192244100406, + "flos": 12932840822400.0, + "grad_norm": 2.1696820197609226, + "language_loss": 0.6529848, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.67400289, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40429688, + "step": 9243, + "time_per_iteration": 3.894040107727051 + }, + { + "auxiliary_loss_clip": 0.01059827, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_clip": 1.01711059, + "balance_loss_mlp": 1.01971555, + "epoch": 0.5557793476627085, + "flos": 23110616436480.0, + "grad_norm": 1.957881261016502, + "language_loss": 0.76107538, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.78211576, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 9244, + "time_per_iteration": 2.362924098968506 + }, + { + "auxiliary_loss_clip": 0.01055836, + "auxiliary_loss_mlp": 0.01043492, + "balance_loss_clip": 1.01945949, + "balance_loss_mlp": 1.01808524, + "epoch": 0.5558394709153766, + "flos": 21427139721600.0, + "grad_norm": 2.5477495621374278, + "language_loss": 0.75771207, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77870536, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37695312, + "step": 9245, + "time_per_iteration": 2.38907790184021 + }, + { + "auxiliary_loss_clip": 0.01060548, + "auxiliary_loss_mlp": 0.01047634, + "balance_loss_clip": 1.01876187, + "balance_loss_mlp": 1.01852441, + "epoch": 0.5558995941680445, + "flos": 25077274122240.0, + "grad_norm": 2.380829236453101, + "language_loss": 0.80332035, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82440215, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.421875, + "step": 9246, + "time_per_iteration": 2.4106640815734863 + }, + { + "auxiliary_loss_clip": 0.01059011, + "auxiliary_loss_mlp": 0.01043375, + "balance_loss_clip": 1.0172441, + "balance_loss_mlp": 1.01882577, + "epoch": 0.5559597174207125, + "flos": 20010119569920.0, + "grad_norm": 1.85023833811184, + "language_loss": 0.74488884, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76591277, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 9247, + "time_per_iteration": 2.4263551235198975 + }, + { + "auxiliary_loss_clip": 0.01058955, + "auxiliary_loss_mlp": 0.01050615, + "balance_loss_clip": 1.02086043, + "balance_loss_mlp": 1.01860452, + "epoch": 0.5560198406733804, + "flos": 16834769015040.0, + "grad_norm": 3.220478147195277, + "language_loss": 0.77109385, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.7921896, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40234375, + "step": 9248, + "time_per_iteration": 2.3174362182617188 + }, + { + "auxiliary_loss_clip": 0.01009949, + "auxiliary_loss_mlp": 0.01003662, + "balance_loss_clip": 1.00111091, + "balance_loss_mlp": 1.00220883, + "epoch": 0.5560799639260484, + "flos": 70693391577600.0, + "grad_norm": 0.8534170620176549, + "language_loss": 0.59518552, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61532164, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.07763672, + "step": 9249, + "time_per_iteration": 3.1202709674835205 + }, + { + "auxiliary_loss_clip": 0.01055985, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.01439142, + "balance_loss_mlp": 1.01699352, + "epoch": 0.5561400871787163, + "flos": 23147484698880.0, + "grad_norm": 1.864325578461777, + "language_loss": 0.80538189, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82635331, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 9250, + "time_per_iteration": 2.3554506301879883 + }, + { + "auxiliary_loss_clip": 0.01057965, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.01823294, + "balance_loss_mlp": 1.01717389, + "epoch": 0.5562002104313843, + "flos": 17565466193280.0, + "grad_norm": 1.9466571000124746, + "language_loss": 0.70088965, + "learning_rate": 1.733816187358836e-06, + "loss": 0.72194374, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40820312, + "step": 9251, + "time_per_iteration": 2.3438754081726074 + }, + { + "auxiliary_loss_clip": 0.01057819, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.01660895, + "balance_loss_mlp": 1.01796377, + "epoch": 0.5562603336840523, + "flos": 25044281020800.0, + "grad_norm": 1.6409065352839491, + "language_loss": 0.76406777, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.7850889, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 9252, + "time_per_iteration": 2.411550283432007 + }, + { + "auxiliary_loss_clip": 0.01059568, + "auxiliary_loss_mlp": 0.01047551, + "balance_loss_clip": 1.01891661, + "balance_loss_mlp": 1.01853609, + "epoch": 0.5563204569367203, + "flos": 29057756607360.0, + "grad_norm": 1.4856609571445794, + "language_loss": 0.74015641, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.76122761, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 9253, + "time_per_iteration": 2.44506573677063 + }, + { + "auxiliary_loss_clip": 0.01058628, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.01719451, + "balance_loss_mlp": 1.01898479, + "epoch": 0.5563805801893883, + "flos": 22089371990400.0, + "grad_norm": 2.0022683284969665, + "language_loss": 0.83640879, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85742056, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 9254, + "time_per_iteration": 2.3668556213378906 + }, + { + "auxiliary_loss_clip": 0.01008298, + "auxiliary_loss_mlp": 0.01008935, + "balance_loss_clip": 1.00626516, + "balance_loss_mlp": 1.00119257, + "epoch": 0.5564407034420562, + "flos": 58633379544960.0, + "grad_norm": 0.873450334206818, + "language_loss": 0.64886624, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66903859, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.07128906, + "step": 9255, + "time_per_iteration": 2.882638931274414 + }, + { + "auxiliary_loss_clip": 0.01056786, + "auxiliary_loss_mlp": 0.01041714, + "balance_loss_clip": 1.01663232, + "balance_loss_mlp": 1.01857901, + "epoch": 0.5565008266947242, + "flos": 23111209929600.0, + "grad_norm": 1.691654122854711, + "language_loss": 0.70080554, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.72179055, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 9256, + "time_per_iteration": 2.367661237716675 + }, + { + "auxiliary_loss_clip": 0.01056445, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.01405787, + "balance_loss_mlp": 1.01791883, + "epoch": 0.5565609499473921, + "flos": 21577370768640.0, + "grad_norm": 1.6353600774456003, + "language_loss": 0.76965111, + "learning_rate": 1.73150038809119e-06, + "loss": 0.790609, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 9257, + "time_per_iteration": 2.400399923324585 + }, + { + "auxiliary_loss_clip": 0.01057296, + "auxiliary_loss_mlp": 0.01045754, + "balance_loss_clip": 1.01898003, + "balance_loss_mlp": 1.01761377, + "epoch": 0.5566210732000602, + "flos": 18368643087360.0, + "grad_norm": 2.6484574031828356, + "language_loss": 0.62938869, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.65041924, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39648438, + "step": 9258, + "time_per_iteration": 2.334697723388672 + }, + { + "auxiliary_loss_clip": 0.01058961, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.01683784, + "balance_loss_mlp": 1.01863742, + "epoch": 0.5566811964527281, + "flos": 25702149369600.0, + "grad_norm": 1.6445113211658102, + "language_loss": 0.80115783, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.82220972, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40234375, + "step": 9259, + "time_per_iteration": 2.427361011505127 + }, + { + "auxiliary_loss_clip": 0.01058482, + "auxiliary_loss_mlp": 0.01042647, + "balance_loss_clip": 1.01549125, + "balance_loss_mlp": 1.01782179, + "epoch": 0.5567413197053961, + "flos": 26942753088000.0, + "grad_norm": 1.774477557924321, + "language_loss": 0.83204716, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.85305846, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 9260, + "time_per_iteration": 2.421825408935547 + }, + { + "auxiliary_loss_clip": 0.0105825, + "auxiliary_loss_mlp": 0.01047793, + "balance_loss_clip": 1.01934958, + "balance_loss_mlp": 1.01871789, + "epoch": 0.556801442958064, + "flos": 20849536321920.0, + "grad_norm": 1.807488036676987, + "language_loss": 0.70392787, + "learning_rate": 1.729956725348256e-06, + "loss": 0.72498828, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39453125, + "step": 9261, + "time_per_iteration": 2.3937456607818604 + }, + { + "auxiliary_loss_clip": 0.01009073, + "auxiliary_loss_mlp": 0.010024, + "balance_loss_clip": 0.99988443, + "balance_loss_mlp": 1.00188375, + "epoch": 0.556861566210732, + "flos": 70495015898880.0, + "grad_norm": 0.7364127035065363, + "language_loss": 0.61186457, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63197929, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.07226562, + "step": 9262, + "time_per_iteration": 2.9974472522735596 + }, + { + "auxiliary_loss_clip": 0.01058992, + "auxiliary_loss_mlp": 0.01043562, + "balance_loss_clip": 1.01648927, + "balance_loss_mlp": 1.01893771, + "epoch": 0.5569216894633999, + "flos": 25336120008960.0, + "grad_norm": 1.5933336652843424, + "language_loss": 0.65901494, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.68004048, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 9263, + "time_per_iteration": 2.423570394515991 + }, + { + "auxiliary_loss_clip": 0.01056786, + "auxiliary_loss_mlp": 0.01041353, + "balance_loss_clip": 1.01502001, + "balance_loss_mlp": 1.01771235, + "epoch": 0.556981812716068, + "flos": 22637613070080.0, + "grad_norm": 1.733592292883532, + "language_loss": 0.73715144, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75813282, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.390625, + "step": 9264, + "time_per_iteration": 2.3729918003082275 + }, + { + "auxiliary_loss_clip": 0.01060203, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.02015877, + "epoch": 0.5570419359687359, + "flos": 11035066982400.0, + "grad_norm": 2.0601590034954866, + "language_loss": 0.77938616, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.80041695, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 9265, + "time_per_iteration": 2.37692928314209 + }, + { + "auxiliary_loss_clip": 0.01056342, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.0204519, + "balance_loss_mlp": 1.01858389, + "epoch": 0.5571020592214039, + "flos": 22821954382080.0, + "grad_norm": 1.372458321936311, + "language_loss": 0.71835935, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73936331, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 9266, + "time_per_iteration": 2.4074583053588867 + }, + { + "auxiliary_loss_clip": 0.01057157, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_clip": 1.01703334, + "balance_loss_mlp": 1.01839113, + "epoch": 0.5571621824740719, + "flos": 22926728972160.0, + "grad_norm": 2.0116384929176268, + "language_loss": 0.69389522, + "learning_rate": 1.727641538728533e-06, + "loss": 0.71488839, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 9267, + "time_per_iteration": 2.3831422328948975 + }, + { + "auxiliary_loss_clip": 0.01055142, + "auxiliary_loss_mlp": 0.01042562, + "balance_loss_clip": 1.01858878, + "balance_loss_mlp": 1.0178802, + "epoch": 0.5572223057267398, + "flos": 22965587182080.0, + "grad_norm": 2.119624605734058, + "language_loss": 0.76320827, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.78418535, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37304688, + "step": 9268, + "time_per_iteration": 2.3947315216064453 + }, + { + "auxiliary_loss_clip": 0.0105755, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.01623368, + "balance_loss_mlp": 1.01906157, + "epoch": 0.5572824289794078, + "flos": 20958989034240.0, + "grad_norm": 2.326644502080678, + "language_loss": 0.75527292, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77624261, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38476562, + "step": 9269, + "time_per_iteration": 2.3992502689361572 + }, + { + "auxiliary_loss_clip": 0.01057004, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.01784682, + "balance_loss_mlp": 1.01859093, + "epoch": 0.5573425522320757, + "flos": 25041348466560.0, + "grad_norm": 2.017765674936808, + "language_loss": 0.84115529, + "learning_rate": 1.726484084647256e-06, + "loss": 0.86214489, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38476562, + "step": 9270, + "time_per_iteration": 2.3983871936798096 + }, + { + "auxiliary_loss_clip": 0.01057682, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_clip": 1.01759231, + "balance_loss_mlp": 1.01880455, + "epoch": 0.5574026754847438, + "flos": 23658508402560.0, + "grad_norm": 1.8755766583253284, + "language_loss": 0.81144142, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.83244526, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 9271, + "time_per_iteration": 2.3901779651641846 + }, + { + "auxiliary_loss_clip": 0.01057754, + "auxiliary_loss_mlp": 0.01038147, + "balance_loss_clip": 1.01276731, + "balance_loss_mlp": 1.0184772, + "epoch": 0.5574627987374117, + "flos": 24781315593600.0, + "grad_norm": 1.7334005736882865, + "language_loss": 0.90879023, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92974925, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 9272, + "time_per_iteration": 3.6055901050567627 + }, + { + "auxiliary_loss_clip": 0.01055494, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.01480603, + "balance_loss_mlp": 1.01814127, + "epoch": 0.5575229219900797, + "flos": 21833877594240.0, + "grad_norm": 1.9560915059089719, + "language_loss": 0.85906267, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.88001955, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 9273, + "time_per_iteration": 3.7747347354888916 + }, + { + "auxiliary_loss_clip": 0.01058388, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.01750064, + "balance_loss_mlp": 1.01864326, + "epoch": 0.5575830452427476, + "flos": 27814010866560.0, + "grad_norm": 2.22567633067447, + "language_loss": 0.7533164, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.77433449, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 9274, + "time_per_iteration": 2.3998196125030518 + }, + { + "auxiliary_loss_clip": 0.01062628, + "auxiliary_loss_mlp": 0.01049105, + "balance_loss_clip": 1.01770544, + "balance_loss_mlp": 1.01949644, + "epoch": 0.5576431684954156, + "flos": 17812093104000.0, + "grad_norm": 2.7991036975604766, + "language_loss": 0.79678273, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.81790006, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.43164062, + "step": 9275, + "time_per_iteration": 3.7139954566955566 + }, + { + "auxiliary_loss_clip": 0.01057553, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.01472342, + "balance_loss_mlp": 1.01880944, + "epoch": 0.5577032917480835, + "flos": 15485969393280.0, + "grad_norm": 1.7858325593050102, + "language_loss": 0.76206326, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.78303093, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38671875, + "step": 9276, + "time_per_iteration": 2.3844759464263916 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.01600218, + "balance_loss_mlp": 1.01704359, + "epoch": 0.5577634150007516, + "flos": 21578697400320.0, + "grad_norm": 1.7536603982558525, + "language_loss": 0.76480746, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.78578579, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 9277, + "time_per_iteration": 2.352510452270508 + }, + { + "auxiliary_loss_clip": 0.01053602, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.01751256, + "balance_loss_mlp": 1.01572025, + "epoch": 0.5578235382534195, + "flos": 21138756958080.0, + "grad_norm": 1.6486112247985807, + "language_loss": 0.72085673, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.74180567, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37890625, + "step": 9278, + "time_per_iteration": 2.3810791969299316 + }, + { + "auxiliary_loss_clip": 0.01058954, + "auxiliary_loss_mlp": 0.01042817, + "balance_loss_clip": 1.01660275, + "balance_loss_mlp": 1.01884747, + "epoch": 0.5578836615060875, + "flos": 26503999632000.0, + "grad_norm": 1.4906734843405118, + "language_loss": 0.76550686, + "learning_rate": 1.723012284057868e-06, + "loss": 0.78652453, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 9279, + "time_per_iteration": 2.4189023971557617 + }, + { + "auxiliary_loss_clip": 0.01056025, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.01419997, + "balance_loss_mlp": 1.01711226, + "epoch": 0.5579437847587555, + "flos": 20152844674560.0, + "grad_norm": 1.5430762763666508, + "language_loss": 0.68836212, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.70932794, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 9280, + "time_per_iteration": 2.3903071880340576 + }, + { + "auxiliary_loss_clip": 0.01057436, + "auxiliary_loss_mlp": 0.01047114, + "balance_loss_clip": 1.02142489, + "balance_loss_mlp": 1.0175308, + "epoch": 0.5580039080114234, + "flos": 26101136920320.0, + "grad_norm": 1.8633502549112315, + "language_loss": 0.74431616, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.76536167, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 9281, + "time_per_iteration": 2.4069015979766846 + }, + { + "auxiliary_loss_clip": 0.01054975, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.01775491, + "balance_loss_mlp": 1.01736808, + "epoch": 0.5580640312640914, + "flos": 13770826208640.0, + "grad_norm": 3.618091544205112, + "language_loss": 0.76178563, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.78276265, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 9282, + "time_per_iteration": 2.370868444442749 + }, + { + "auxiliary_loss_clip": 0.010554, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.01408601, + "balance_loss_mlp": 1.0175879, + "epoch": 0.5581241545167593, + "flos": 17675023639680.0, + "grad_norm": 1.9628580380355938, + "language_loss": 0.68100697, + "learning_rate": 1.721469534028297e-06, + "loss": 0.70194048, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37695312, + "step": 9283, + "time_per_iteration": 3.7976675033569336 + }, + { + "auxiliary_loss_clip": 0.01057099, + "auxiliary_loss_mlp": 0.01039366, + "balance_loss_clip": 1.01470196, + "balance_loss_mlp": 1.01857007, + "epoch": 0.5581842777694274, + "flos": 19568259002880.0, + "grad_norm": 1.7396582685026583, + "language_loss": 0.84303057, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.86399519, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38476562, + "step": 9284, + "time_per_iteration": 2.377251148223877 + }, + { + "auxiliary_loss_clip": 0.0105701, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.01135612, + "balance_loss_mlp": 1.01700473, + "epoch": 0.5582444010220953, + "flos": 20594111748480.0, + "grad_norm": 2.2067019018410736, + "language_loss": 0.86766535, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.88860404, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 9285, + "time_per_iteration": 2.3620309829711914 + }, + { + "auxiliary_loss_clip": 0.01058475, + "auxiliary_loss_mlp": 0.01042217, + "balance_loss_clip": 1.01625347, + "balance_loss_mlp": 1.01772678, + "epoch": 0.5583045242747633, + "flos": 19134497871360.0, + "grad_norm": 3.1992466386470277, + "language_loss": 0.75718784, + "learning_rate": 1.720312582354912e-06, + "loss": 0.77819479, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40820312, + "step": 9286, + "time_per_iteration": 2.3977746963500977 + }, + { + "auxiliary_loss_clip": 0.01055835, + "auxiliary_loss_mlp": 0.01046607, + "balance_loss_clip": 1.01896262, + "balance_loss_mlp": 1.01689529, + "epoch": 0.5583646475274312, + "flos": 27453322944000.0, + "grad_norm": 1.5624405617002075, + "language_loss": 0.75754577, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.77857018, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38867188, + "step": 9287, + "time_per_iteration": 2.4230213165283203 + }, + { + "auxiliary_loss_clip": 0.01058374, + "auxiliary_loss_mlp": 0.01044066, + "balance_loss_clip": 1.01669598, + "balance_loss_mlp": 1.01841116, + "epoch": 0.5584247707800992, + "flos": 23652817850880.0, + "grad_norm": 5.7214271044715925, + "language_loss": 0.75936341, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.78038776, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 9288, + "time_per_iteration": 2.4312572479248047 + }, + { + "auxiliary_loss_clip": 0.01059666, + "auxiliary_loss_mlp": 0.01046473, + "balance_loss_clip": 1.01990128, + "balance_loss_mlp": 1.0200907, + "epoch": 0.5584848940327671, + "flos": 13698032290560.0, + "grad_norm": 1.9896155142706071, + "language_loss": 0.7994194, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.82048082, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 9289, + "time_per_iteration": 2.352869749069214 + }, + { + "auxiliary_loss_clip": 0.01060904, + "auxiliary_loss_mlp": 0.01041531, + "balance_loss_clip": 1.01398158, + "balance_loss_mlp": 1.01868773, + "epoch": 0.5585450172854352, + "flos": 27014988424320.0, + "grad_norm": 1.786566677015049, + "language_loss": 0.62665999, + "learning_rate": 1.718770128672817e-06, + "loss": 0.64768434, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.421875, + "step": 9290, + "time_per_iteration": 2.433828353881836 + }, + { + "auxiliary_loss_clip": 0.01058678, + "auxiliary_loss_mlp": 0.01042847, + "balance_loss_clip": 1.01544118, + "balance_loss_mlp": 1.01860046, + "epoch": 0.5586051405381031, + "flos": 23184527518080.0, + "grad_norm": 1.9153586350461669, + "language_loss": 0.68997192, + "learning_rate": 1.7183845418764e-06, + "loss": 0.71098721, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 9291, + "time_per_iteration": 2.366030693054199 + }, + { + "auxiliary_loss_clip": 0.0105727, + "auxiliary_loss_mlp": 0.01051894, + "balance_loss_clip": 1.02623999, + "balance_loss_mlp": 1.01736116, + "epoch": 0.5586652637907711, + "flos": 20774542988160.0, + "grad_norm": 1.76520953942206, + "language_loss": 0.85088307, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.87197471, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3984375, + "step": 9292, + "time_per_iteration": 2.3915297985076904 + }, + { + "auxiliary_loss_clip": 0.01055791, + "auxiliary_loss_mlp": 0.01043012, + "balance_loss_clip": 1.01901567, + "balance_loss_mlp": 1.01832891, + "epoch": 0.5587253870434391, + "flos": 28218654057600.0, + "grad_norm": 1.866151563332103, + "language_loss": 0.75336683, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.77435488, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 9293, + "time_per_iteration": 2.4078938961029053 + }, + { + "auxiliary_loss_clip": 0.0105495, + "auxiliary_loss_mlp": 0.01040617, + "balance_loss_clip": 1.01734757, + "balance_loss_mlp": 1.01691532, + "epoch": 0.558785510296107, + "flos": 26614499685120.0, + "grad_norm": 2.5493642181278244, + "language_loss": 0.73037696, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.75133264, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38085938, + "step": 9294, + "time_per_iteration": 2.417523145675659 + }, + { + "auxiliary_loss_clip": 0.01057156, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.0176084, + "balance_loss_mlp": 1.01801634, + "epoch": 0.558845633548775, + "flos": 20155742317440.0, + "grad_norm": 3.150280677610416, + "language_loss": 0.69511712, + "learning_rate": 1.716842301625806e-06, + "loss": 0.71612287, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 9295, + "time_per_iteration": 2.348029375076294 + }, + { + "auxiliary_loss_clip": 0.01056169, + "auxiliary_loss_mlp": 0.01042911, + "balance_loss_clip": 1.01712596, + "balance_loss_mlp": 1.01835454, + "epoch": 0.5589057568014429, + "flos": 24349684055040.0, + "grad_norm": 1.5364659888082042, + "language_loss": 0.82060313, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.84159398, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 9296, + "time_per_iteration": 2.469758987426758 + }, + { + "auxiliary_loss_clip": 0.010563, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.01212811, + "balance_loss_mlp": 1.01798522, + "epoch": 0.558965880054111, + "flos": 21104123022720.0, + "grad_norm": 1.6407005841635993, + "language_loss": 0.66844463, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.68936545, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3828125, + "step": 9297, + "time_per_iteration": 2.395679235458374 + }, + { + "auxiliary_loss_clip": 0.01059797, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_clip": 1.01635134, + "balance_loss_mlp": 1.01900268, + "epoch": 0.5590260033067789, + "flos": 18435257694720.0, + "grad_norm": 1.7113642474169888, + "language_loss": 0.76317871, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.78422689, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40820312, + "step": 9298, + "time_per_iteration": 2.340498447418213 + }, + { + "auxiliary_loss_clip": 0.01009064, + "auxiliary_loss_mlp": 0.01002672, + "balance_loss_clip": 1.00028825, + "balance_loss_mlp": 1.00208163, + "epoch": 0.5590861265594469, + "flos": 70574058950400.0, + "grad_norm": 0.683027179276209, + "language_loss": 0.52488124, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54499865, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06933594, + "step": 9299, + "time_per_iteration": 3.0739896297454834 + }, + { + "auxiliary_loss_clip": 0.01056106, + "auxiliary_loss_mlp": 0.01039567, + "balance_loss_clip": 1.01586831, + "balance_loss_mlp": 1.01798081, + "epoch": 0.5591462498121148, + "flos": 30663097966080.0, + "grad_norm": 1.795973136252282, + "language_loss": 0.69891268, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.71986943, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3828125, + "step": 9300, + "time_per_iteration": 2.488100051879883 + }, + { + "auxiliary_loss_clip": 0.0105805, + "auxiliary_loss_mlp": 0.01051968, + "balance_loss_clip": 1.02491927, + "balance_loss_mlp": 1.01834822, + "epoch": 0.5592063730647828, + "flos": 18149458371840.0, + "grad_norm": 1.9611668217491016, + "language_loss": 0.8278262, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84892637, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 9301, + "time_per_iteration": 2.3338632583618164 + }, + { + "auxiliary_loss_clip": 0.01055233, + "auxiliary_loss_mlp": 0.01040783, + "balance_loss_clip": 1.01585603, + "balance_loss_mlp": 1.01636755, + "epoch": 0.5592664963174507, + "flos": 24059276432640.0, + "grad_norm": 2.4500762402515983, + "language_loss": 0.68925315, + "learning_rate": 1.714143795138756e-06, + "loss": 0.7102133, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38671875, + "step": 9302, + "time_per_iteration": 2.412785768508911 + }, + { + "auxiliary_loss_clip": 0.01059388, + "auxiliary_loss_mlp": 0.01044058, + "balance_loss_clip": 1.01716399, + "balance_loss_mlp": 1.01843596, + "epoch": 0.5593266195701188, + "flos": 19826895421440.0, + "grad_norm": 1.5370312460046234, + "language_loss": 0.71311402, + "learning_rate": 1.713758337453878e-06, + "loss": 0.7341485, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 9303, + "time_per_iteration": 2.377955913543701 + }, + { + "auxiliary_loss_clip": 0.01055196, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.01736891, + "balance_loss_mlp": 1.01846015, + "epoch": 0.5593867428227867, + "flos": 25299600860160.0, + "grad_norm": 1.560473808683883, + "language_loss": 0.73405856, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.75501496, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 9304, + "time_per_iteration": 2.4263484477996826 + }, + { + "auxiliary_loss_clip": 0.0105593, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.01367116, + "balance_loss_mlp": 1.01681542, + "epoch": 0.5594468660754547, + "flos": 12932177506560.0, + "grad_norm": 1.9243818026940929, + "language_loss": 0.79250586, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.81345844, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 9305, + "time_per_iteration": 2.35794734954834 + }, + { + "auxiliary_loss_clip": 0.01053449, + "auxiliary_loss_mlp": 0.01042186, + "balance_loss_clip": 1.01842713, + "balance_loss_mlp": 1.0169909, + "epoch": 0.5595069893281227, + "flos": 19061703953280.0, + "grad_norm": 1.6968760729345957, + "language_loss": 0.70550996, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7264663, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36328125, + "step": 9306, + "time_per_iteration": 2.363891363143921 + }, + { + "auxiliary_loss_clip": 0.01008884, + "auxiliary_loss_mlp": 0.01002691, + "balance_loss_clip": 1.00028253, + "balance_loss_mlp": 1.00168788, + "epoch": 0.5595671125807906, + "flos": 70270350099840.0, + "grad_norm": 1.185923492722234, + "language_loss": 0.60368311, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62379885, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.07226562, + "step": 9307, + "time_per_iteration": 3.1439149379730225 + }, + { + "auxiliary_loss_clip": 0.01055172, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.02593017, + "balance_loss_mlp": 1.01708221, + "epoch": 0.5596272358334586, + "flos": 20664531694080.0, + "grad_norm": 1.7467367712463384, + "language_loss": 0.74987692, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.77092433, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 9308, + "time_per_iteration": 2.39603853225708 + }, + { + "auxiliary_loss_clip": 0.01057359, + "auxiliary_loss_mlp": 0.01048404, + "balance_loss_clip": 1.02156961, + "balance_loss_mlp": 1.01723135, + "epoch": 0.5596873590861265, + "flos": 25039986923520.0, + "grad_norm": 1.805438555756017, + "language_loss": 0.70827657, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.72933418, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40234375, + "step": 9309, + "time_per_iteration": 2.407064914703369 + }, + { + "auxiliary_loss_clip": 0.01056548, + "auxiliary_loss_mlp": 0.01047191, + "balance_loss_clip": 1.01804423, + "balance_loss_mlp": 1.01777279, + "epoch": 0.5597474823387946, + "flos": 25957189918080.0, + "grad_norm": 5.202721970385582, + "language_loss": 0.76524115, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.78627849, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.38867188, + "step": 9310, + "time_per_iteration": 2.458327054977417 + }, + { + "auxiliary_loss_clip": 0.01058147, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_clip": 1.01601458, + "balance_loss_mlp": 1.01849103, + "epoch": 0.5598076055914625, + "flos": 26176234988160.0, + "grad_norm": 2.517132350771058, + "language_loss": 0.71590853, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.73691702, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39648438, + "step": 9311, + "time_per_iteration": 3.8497228622436523 + }, + { + "auxiliary_loss_clip": 0.01054282, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.01807725, + "balance_loss_mlp": 1.01622272, + "epoch": 0.5598677288441305, + "flos": 11654984816640.0, + "grad_norm": 1.7499607172699307, + "language_loss": 0.73350573, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.75447893, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 9312, + "time_per_iteration": 3.8601391315460205 + }, + { + "auxiliary_loss_clip": 0.01055706, + "auxiliary_loss_mlp": 0.01039948, + "balance_loss_clip": 1.01649928, + "balance_loss_mlp": 1.01794147, + "epoch": 0.5599278520967984, + "flos": 22965482448000.0, + "grad_norm": 2.3436504583278803, + "language_loss": 0.9106487, + "learning_rate": 1.709904360003822e-06, + "loss": 0.93160522, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37890625, + "step": 9313, + "time_per_iteration": 2.4069085121154785 + }, + { + "auxiliary_loss_clip": 0.01056999, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.01565218, + "balance_loss_mlp": 1.01870525, + "epoch": 0.5599879753494664, + "flos": 21214483430400.0, + "grad_norm": 1.4516446399905245, + "language_loss": 0.78088903, + "learning_rate": 1.709519022520204e-06, + "loss": 0.80186981, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9314, + "time_per_iteration": 2.390681028366089 + }, + { + "auxiliary_loss_clip": 0.01056635, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.01484084, + "balance_loss_mlp": 1.01832342, + "epoch": 0.5600480986021343, + "flos": 31901921205120.0, + "grad_norm": 1.8375582845437513, + "language_loss": 0.7126416, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.73360145, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3828125, + "step": 9315, + "time_per_iteration": 3.8582558631896973 + }, + { + "auxiliary_loss_clip": 0.01059262, + "auxiliary_loss_mlp": 0.01052988, + "balance_loss_clip": 1.0258199, + "balance_loss_mlp": 1.01809514, + "epoch": 0.5601082218548024, + "flos": 28474776858240.0, + "grad_norm": 1.8160278998461676, + "language_loss": 0.68117023, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.70229274, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41015625, + "step": 9316, + "time_per_iteration": 2.4142086505889893 + }, + { + "auxiliary_loss_clip": 0.01055488, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.01329815, + "balance_loss_mlp": 1.01785421, + "epoch": 0.5601683451074703, + "flos": 24096039960960.0, + "grad_norm": 1.8602752808413583, + "language_loss": 0.87719476, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.89812618, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37695312, + "step": 9317, + "time_per_iteration": 2.388385772705078 + }, + { + "auxiliary_loss_clip": 0.01058518, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02001095, + "balance_loss_mlp": 1.01770496, + "epoch": 0.5602284683601383, + "flos": 26355095216640.0, + "grad_norm": 1.6815065082654364, + "language_loss": 0.78544563, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.80650777, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40820312, + "step": 9318, + "time_per_iteration": 2.393500328063965 + }, + { + "auxiliary_loss_clip": 0.01054998, + "auxiliary_loss_mlp": 0.01046602, + "balance_loss_clip": 1.02332079, + "balance_loss_mlp": 1.01719093, + "epoch": 0.5602885916128063, + "flos": 24495306802560.0, + "grad_norm": 1.463101183830637, + "language_loss": 0.76929039, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.79030639, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 9319, + "time_per_iteration": 2.417733669281006 + }, + { + "auxiliary_loss_clip": 0.01055992, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.01137495, + "balance_loss_mlp": 1.01801276, + "epoch": 0.5603487148654742, + "flos": 27343765497600.0, + "grad_norm": 2.1718038108803546, + "language_loss": 0.86273456, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.88364643, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37890625, + "step": 9320, + "time_per_iteration": 2.4052815437316895 + }, + { + "auxiliary_loss_clip": 0.01010659, + "auxiliary_loss_mlp": 0.01012668, + "balance_loss_clip": 1.01018846, + "balance_loss_mlp": 1.00343299, + "epoch": 0.5604088381181422, + "flos": 54084789550080.0, + "grad_norm": 0.7465787145657868, + "language_loss": 0.52628881, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54652202, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.07226562, + "step": 9321, + "time_per_iteration": 2.8485891819000244 + }, + { + "auxiliary_loss_clip": 0.01055973, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.01341987, + "balance_loss_mlp": 1.01812601, + "epoch": 0.5604689613708101, + "flos": 22235308940160.0, + "grad_norm": 1.6780034507753476, + "language_loss": 0.75418532, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.77511501, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 9322, + "time_per_iteration": 3.9475460052490234 + }, + { + "auxiliary_loss_clip": 0.01057453, + "auxiliary_loss_mlp": 0.0104032, + "balance_loss_clip": 1.0147022, + "balance_loss_mlp": 1.01880503, + "epoch": 0.5605290846234782, + "flos": 35296351741440.0, + "grad_norm": 1.6584998785037635, + "language_loss": 0.74390411, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76488185, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 9323, + "time_per_iteration": 2.506676435470581 + }, + { + "auxiliary_loss_clip": 0.01058525, + "auxiliary_loss_mlp": 0.01039985, + "balance_loss_clip": 1.01572645, + "balance_loss_mlp": 1.01903987, + "epoch": 0.5605892078761461, + "flos": 20262367209600.0, + "grad_norm": 1.7020353855666461, + "language_loss": 0.62398374, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.64496887, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.39453125, + "step": 9324, + "time_per_iteration": 2.378683567047119 + }, + { + "auxiliary_loss_clip": 0.01055905, + "auxiliary_loss_mlp": 0.01039712, + "balance_loss_clip": 1.01389134, + "balance_loss_mlp": 1.01714802, + "epoch": 0.5606493311288141, + "flos": 17307458179200.0, + "grad_norm": 2.229199851811408, + "language_loss": 0.88773632, + "learning_rate": 1.705281040409226e-06, + "loss": 0.90869248, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9325, + "time_per_iteration": 2.33491849899292 + }, + { + "auxiliary_loss_clip": 0.01057582, + "auxiliary_loss_mlp": 0.01038634, + "balance_loss_clip": 1.01137114, + "balance_loss_mlp": 1.01745009, + "epoch": 0.560709454381482, + "flos": 21651910254720.0, + "grad_norm": 1.4707490675697381, + "language_loss": 0.74897802, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76994014, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 9326, + "time_per_iteration": 2.4150948524475098 + }, + { + "auxiliary_loss_clip": 0.01059812, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.01265645, + "balance_loss_mlp": 1.01909375, + "epoch": 0.56076957763415, + "flos": 20302307671680.0, + "grad_norm": 1.7916213853018836, + "language_loss": 0.79537249, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.81639814, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.40820312, + "step": 9327, + "time_per_iteration": 2.4020192623138428 + }, + { + "auxiliary_loss_clip": 0.01058733, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.0116775, + "balance_loss_mlp": 1.01952553, + "epoch": 0.5608297008868179, + "flos": 25044734868480.0, + "grad_norm": 2.3282532959035285, + "language_loss": 0.79096985, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.81193376, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 9328, + "time_per_iteration": 2.4061052799224854 + }, + { + "auxiliary_loss_clip": 0.01056474, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.01639175, + "balance_loss_mlp": 1.01773012, + "epoch": 0.560889824139486, + "flos": 19865753631360.0, + "grad_norm": 1.5467392697309061, + "language_loss": 0.74498534, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.76597357, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 9329, + "time_per_iteration": 2.402282953262329 + }, + { + "auxiliary_loss_clip": 0.0105963, + "auxiliary_loss_mlp": 0.0104044, + "balance_loss_clip": 1.01234245, + "balance_loss_mlp": 1.01902533, + "epoch": 0.5609499473921539, + "flos": 22928299983360.0, + "grad_norm": 1.5377900967784905, + "language_loss": 0.84446639, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.86546707, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 9330, + "time_per_iteration": 2.3976733684539795 + }, + { + "auxiliary_loss_clip": 0.01010313, + "auxiliary_loss_mlp": 0.01004717, + "balance_loss_clip": 1.0020467, + "balance_loss_mlp": 1.00300694, + "epoch": 0.5610100706448219, + "flos": 53032716506880.0, + "grad_norm": 0.7216140745381557, + "language_loss": 0.5793041, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.5994544, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.07324219, + "step": 9331, + "time_per_iteration": 3.0341784954071045 + }, + { + "auxiliary_loss_clip": 0.01058006, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.01330066, + "balance_loss_mlp": 1.01903582, + "epoch": 0.5610701938974898, + "flos": 21833877594240.0, + "grad_norm": 1.8208126047874622, + "language_loss": 0.82594037, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84691358, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 9332, + "time_per_iteration": 2.3856351375579834 + }, + { + "auxiliary_loss_clip": 0.01059455, + "auxiliary_loss_mlp": 0.01040812, + "balance_loss_clip": 1.01332271, + "balance_loss_mlp": 1.01825738, + "epoch": 0.5611303171501578, + "flos": 17456222949120.0, + "grad_norm": 4.6049167720506885, + "language_loss": 0.83076477, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.85176742, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41210938, + "step": 9333, + "time_per_iteration": 2.340019702911377 + }, + { + "auxiliary_loss_clip": 0.01054922, + "auxiliary_loss_mlp": 0.01039279, + "balance_loss_clip": 1.01505518, + "balance_loss_mlp": 1.01676178, + "epoch": 0.5611904404028258, + "flos": 22636705374720.0, + "grad_norm": 1.8016442154367938, + "language_loss": 0.73936832, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.76031029, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38085938, + "step": 9334, + "time_per_iteration": 2.3622076511383057 + }, + { + "auxiliary_loss_clip": 0.01056352, + "auxiliary_loss_mlp": 0.01043028, + "balance_loss_clip": 1.01975834, + "balance_loss_mlp": 1.01804113, + "epoch": 0.5612505636554938, + "flos": 14315541240960.0, + "grad_norm": 3.1314566229992287, + "language_loss": 0.7279228, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.74891663, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3828125, + "step": 9335, + "time_per_iteration": 2.35103440284729 + }, + { + "auxiliary_loss_clip": 0.01056454, + "auxiliary_loss_mlp": 0.010383, + "balance_loss_clip": 1.01320601, + "balance_loss_mlp": 1.01823854, + "epoch": 0.5613106869081618, + "flos": 16507353484800.0, + "grad_norm": 1.868815445267964, + "language_loss": 0.78093797, + "learning_rate": 1.701044410566205e-06, + "loss": 0.80188549, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 9336, + "time_per_iteration": 2.335078239440918 + }, + { + "auxiliary_loss_clip": 0.01054874, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.01611674, + "balance_loss_mlp": 1.01737046, + "epoch": 0.5613708101608297, + "flos": 24057495953280.0, + "grad_norm": 2.430784907509139, + "language_loss": 0.65667808, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.67763209, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 9337, + "time_per_iteration": 2.4050772190093994 + }, + { + "auxiliary_loss_clip": 0.01008807, + "auxiliary_loss_mlp": 0.01006276, + "balance_loss_clip": 1.00361753, + "balance_loss_mlp": 1.00173378, + "epoch": 0.5614309334134977, + "flos": 64902977832960.0, + "grad_norm": 0.874617988977845, + "language_loss": 0.62668276, + "learning_rate": 1.700274261035102e-06, + "loss": 0.6468336, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.07080078, + "step": 9338, + "time_per_iteration": 2.9741971492767334 + }, + { + "auxiliary_loss_clip": 0.01057736, + "auxiliary_loss_mlp": 0.01047911, + "balance_loss_clip": 1.02094603, + "balance_loss_mlp": 1.01813722, + "epoch": 0.5614910566661656, + "flos": 32918662085760.0, + "grad_norm": 1.8273105964181269, + "language_loss": 0.67368859, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.69474506, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 9339, + "time_per_iteration": 2.490867853164673 + }, + { + "auxiliary_loss_clip": 0.01055849, + "auxiliary_loss_mlp": 0.01043894, + "balance_loss_clip": 1.01766813, + "balance_loss_mlp": 1.01847756, + "epoch": 0.5615511799188336, + "flos": 18587862714240.0, + "grad_norm": 1.821941202289845, + "language_loss": 0.71072841, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.73172581, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 9340, + "time_per_iteration": 2.334085464477539 + }, + { + "auxiliary_loss_clip": 0.01054826, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01413345, + "balance_loss_mlp": 1.01826, + "epoch": 0.5616113031715015, + "flos": 22818917093760.0, + "grad_norm": 2.090479714698616, + "language_loss": 0.7888875, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.80980611, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3671875, + "step": 9341, + "time_per_iteration": 2.3955166339874268 + }, + { + "auxiliary_loss_clip": 0.01056191, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.01298952, + "balance_loss_mlp": 1.01733911, + "epoch": 0.5616714264241696, + "flos": 22344622007040.0, + "grad_norm": 1.6905855695476195, + "language_loss": 0.80394554, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.82489312, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38867188, + "step": 9342, + "time_per_iteration": 2.362762212753296 + }, + { + "auxiliary_loss_clip": 0.010578, + "auxiliary_loss_mlp": 0.01045391, + "balance_loss_clip": 1.01902175, + "balance_loss_mlp": 1.0179441, + "epoch": 0.5617315496768375, + "flos": 18806768138880.0, + "grad_norm": 1.84055991315106, + "language_loss": 0.77753919, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.79857111, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 9343, + "time_per_iteration": 2.3555991649627686 + }, + { + "auxiliary_loss_clip": 0.01056052, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_clip": 1.02101612, + "balance_loss_mlp": 1.01828003, + "epoch": 0.5617916729295055, + "flos": 18368328885120.0, + "grad_norm": 3.230588659609028, + "language_loss": 0.70748818, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.72851241, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 9344, + "time_per_iteration": 2.3543155193328857 + }, + { + "auxiliary_loss_clip": 0.01057873, + "auxiliary_loss_mlp": 0.01041594, + "balance_loss_clip": 1.01520097, + "balance_loss_mlp": 1.01901484, + "epoch": 0.5618517961821734, + "flos": 28178818329600.0, + "grad_norm": 2.0370183942746767, + "language_loss": 0.683855, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.70484966, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 9345, + "time_per_iteration": 2.4349169731140137 + }, + { + "auxiliary_loss_clip": 0.01057461, + "auxiliary_loss_mlp": 0.01045343, + "balance_loss_clip": 1.02076197, + "balance_loss_mlp": 1.01837873, + "epoch": 0.5619119194348414, + "flos": 15485969393280.0, + "grad_norm": 2.1191707063301015, + "language_loss": 0.88249314, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.90352124, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.390625, + "step": 9346, + "time_per_iteration": 2.319436550140381 + }, + { + "auxiliary_loss_clip": 0.01057317, + "auxiliary_loss_mlp": 0.01046314, + "balance_loss_clip": 1.01921797, + "balance_loss_mlp": 1.0183996, + "epoch": 0.5619720426875094, + "flos": 29127478325760.0, + "grad_norm": 4.338464176892236, + "language_loss": 0.60727662, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.62831295, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 9347, + "time_per_iteration": 2.454298734664917 + }, + { + "auxiliary_loss_clip": 0.01059143, + "auxiliary_loss_mlp": 0.01047489, + "balance_loss_clip": 1.02051175, + "balance_loss_mlp": 1.01926911, + "epoch": 0.5620321659401774, + "flos": 18002788283520.0, + "grad_norm": 3.023624401468501, + "language_loss": 0.7154842, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.73655051, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 9348, + "time_per_iteration": 2.346240997314453 + }, + { + "auxiliary_loss_clip": 0.01060459, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.01438642, + "balance_loss_mlp": 1.01889467, + "epoch": 0.5620922891928454, + "flos": 20593483344000.0, + "grad_norm": 1.8982188024228155, + "language_loss": 0.80409181, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.82512093, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.4140625, + "step": 9349, + "time_per_iteration": 2.399677276611328 + }, + { + "auxiliary_loss_clip": 0.01058182, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.01084018, + "balance_loss_mlp": 1.01890278, + "epoch": 0.5621524124455133, + "flos": 26285792434560.0, + "grad_norm": 2.3622569765184855, + "language_loss": 0.68631899, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.70728302, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39257812, + "step": 9350, + "time_per_iteration": 2.3992104530334473 + }, + { + "auxiliary_loss_clip": 0.01059685, + "auxiliary_loss_mlp": 0.01042724, + "balance_loss_clip": 1.01633108, + "balance_loss_mlp": 1.01945043, + "epoch": 0.5622125356981813, + "flos": 12749477028480.0, + "grad_norm": 1.893234625538157, + "language_loss": 0.79780602, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.81883013, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 9351, + "time_per_iteration": 3.709948778152466 + }, + { + "auxiliary_loss_clip": 0.0106258, + "auxiliary_loss_mlp": 0.01043211, + "balance_loss_clip": 1.01501799, + "balance_loss_mlp": 1.0213511, + "epoch": 0.5622726589508492, + "flos": 23804200972800.0, + "grad_norm": 1.5998315848123077, + "language_loss": 0.59906721, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.62012517, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 9352, + "time_per_iteration": 3.7839183807373047 + }, + { + "auxiliary_loss_clip": 0.0105466, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.01559103, + "balance_loss_mlp": 1.01772523, + "epoch": 0.5623327822035172, + "flos": 24717040047360.0, + "grad_norm": 1.3834391344306762, + "language_loss": 0.72785926, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74878842, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 9353, + "time_per_iteration": 2.441418409347534 + }, + { + "auxiliary_loss_clip": 0.01058121, + "auxiliary_loss_mlp": 0.01040703, + "balance_loss_clip": 1.01433396, + "balance_loss_mlp": 1.01817644, + "epoch": 0.5623929054561851, + "flos": 14018640105600.0, + "grad_norm": 2.490437652393347, + "language_loss": 0.77509654, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.79608482, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40039062, + "step": 9354, + "time_per_iteration": 3.610450029373169 + }, + { + "auxiliary_loss_clip": 0.0105936, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.01194382, + "balance_loss_mlp": 1.01882005, + "epoch": 0.5624530287088532, + "flos": 20703354992640.0, + "grad_norm": 1.7266201035999869, + "language_loss": 0.74004686, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.76102173, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 9355, + "time_per_iteration": 2.3969128131866455 + }, + { + "auxiliary_loss_clip": 0.01057322, + "auxiliary_loss_mlp": 0.01038373, + "balance_loss_clip": 1.0124445, + "balance_loss_mlp": 1.01769531, + "epoch": 0.5625131519615211, + "flos": 21469838181120.0, + "grad_norm": 1.5605605743948345, + "language_loss": 0.74220723, + "learning_rate": 1.693344975084274e-06, + "loss": 0.76316416, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 9356, + "time_per_iteration": 2.3933475017547607 + }, + { + "auxiliary_loss_clip": 0.01055913, + "auxiliary_loss_mlp": 0.01038177, + "balance_loss_clip": 1.01377523, + "balance_loss_mlp": 1.01794136, + "epoch": 0.5625732752141891, + "flos": 18697001224320.0, + "grad_norm": 2.188850257185513, + "language_loss": 0.85069621, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.87163717, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 9357, + "time_per_iteration": 2.365926504135132 + }, + { + "auxiliary_loss_clip": 0.01055637, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.0122323, + "balance_loss_mlp": 1.01715004, + "epoch": 0.562633398466857, + "flos": 16215968344320.0, + "grad_norm": 2.273626814182386, + "language_loss": 0.73516345, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.75609684, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 9358, + "time_per_iteration": 2.362377405166626 + }, + { + "auxiliary_loss_clip": 0.01056159, + "auxiliary_loss_mlp": 0.01043798, + "balance_loss_clip": 1.01853788, + "balance_loss_mlp": 1.01786876, + "epoch": 0.562693521719525, + "flos": 22490838247680.0, + "grad_norm": 1.7550871187634007, + "language_loss": 0.78603393, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.80703342, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 9359, + "time_per_iteration": 2.4174253940582275 + }, + { + "auxiliary_loss_clip": 0.01054486, + "auxiliary_loss_mlp": 0.01042498, + "balance_loss_clip": 1.01761913, + "balance_loss_mlp": 1.01645041, + "epoch": 0.562753644972193, + "flos": 25330185077760.0, + "grad_norm": 1.712079122039542, + "language_loss": 0.72180551, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.74277538, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38085938, + "step": 9360, + "time_per_iteration": 2.40791916847229 + }, + { + "auxiliary_loss_clip": 0.01011239, + "auxiliary_loss_mlp": 0.01002477, + "balance_loss_clip": 1.00010514, + "balance_loss_mlp": 1.00399148, + "epoch": 0.562813768224861, + "flos": 67389631441920.0, + "grad_norm": 0.7833824320668576, + "language_loss": 0.55675793, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57689512, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07226562, + "step": 9361, + "time_per_iteration": 4.348718166351318 + }, + { + "auxiliary_loss_clip": 0.01055647, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.01875424, + "balance_loss_mlp": 1.0184691, + "epoch": 0.562873891477529, + "flos": 23330045531520.0, + "grad_norm": 1.437425642134479, + "language_loss": 0.82417035, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84513479, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.37109375, + "step": 9362, + "time_per_iteration": 2.3725786209106445 + }, + { + "auxiliary_loss_clip": 0.01055384, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.01782513, + "balance_loss_mlp": 1.01734865, + "epoch": 0.5629340147301969, + "flos": 38471283360000.0, + "grad_norm": 1.8534468767850678, + "language_loss": 0.75059605, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.7715863, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 9363, + "time_per_iteration": 2.584226608276367 + }, + { + "auxiliary_loss_clip": 0.01057141, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.02341151, + "balance_loss_mlp": 1.01736164, + "epoch": 0.5629941379828649, + "flos": 29240736376320.0, + "grad_norm": 1.8895204875648448, + "language_loss": 0.84109938, + "learning_rate": 1.690266496731839e-06, + "loss": 0.8621639, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 9364, + "time_per_iteration": 2.421346426010132 + }, + { + "auxiliary_loss_clip": 0.01055378, + "auxiliary_loss_mlp": 0.01041069, + "balance_loss_clip": 1.01672602, + "balance_loss_mlp": 1.01792908, + "epoch": 0.5630542612355328, + "flos": 19420052814720.0, + "grad_norm": 2.272359094482832, + "language_loss": 0.66664791, + "learning_rate": 1.689881739637642e-06, + "loss": 0.6876123, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.375, + "step": 9365, + "time_per_iteration": 2.416754722595215 + }, + { + "auxiliary_loss_clip": 0.01058656, + "auxiliary_loss_mlp": 0.01050581, + "balance_loss_clip": 1.02141058, + "balance_loss_mlp": 1.01710081, + "epoch": 0.5631143844882008, + "flos": 22265404398720.0, + "grad_norm": 2.6387864575251996, + "language_loss": 0.83952981, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.86062217, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41601562, + "step": 9366, + "time_per_iteration": 2.348599910736084 + }, + { + "auxiliary_loss_clip": 0.01056335, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.01131904, + "balance_loss_mlp": 1.01879096, + "epoch": 0.5631745077408687, + "flos": 22964225639040.0, + "grad_norm": 1.554844901162832, + "language_loss": 0.74185359, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.76276159, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.375, + "step": 9367, + "time_per_iteration": 2.4300789833068848 + }, + { + "auxiliary_loss_clip": 0.01008908, + "auxiliary_loss_mlp": 0.01007917, + "balance_loss_clip": 1.00543761, + "balance_loss_mlp": 1.00169575, + "epoch": 0.5632346309935368, + "flos": 65076948604800.0, + "grad_norm": 0.6481489262827694, + "language_loss": 0.53508335, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55525166, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.07226562, + "step": 9368, + "time_per_iteration": 3.105098247528076 + }, + { + "auxiliary_loss_clip": 0.01056551, + "auxiliary_loss_mlp": 0.01044165, + "balance_loss_clip": 1.01787949, + "balance_loss_mlp": 1.01862633, + "epoch": 0.5632947542462047, + "flos": 23001792128640.0, + "grad_norm": 1.5685351932083478, + "language_loss": 0.69633293, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71734011, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 9369, + "time_per_iteration": 2.4433796405792236 + }, + { + "auxiliary_loss_clip": 0.01054321, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.01487684, + "balance_loss_mlp": 1.0165236, + "epoch": 0.5633548774988727, + "flos": 30481270272000.0, + "grad_norm": 1.9673707397368405, + "language_loss": 0.76886958, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78981483, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 9370, + "time_per_iteration": 2.428602695465088 + }, + { + "auxiliary_loss_clip": 0.01057475, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.01540983, + "balance_loss_mlp": 1.01690209, + "epoch": 0.5634150007515406, + "flos": 18514056366720.0, + "grad_norm": 2.2314743215713193, + "language_loss": 0.76945245, + "learning_rate": 1.687573444537108e-06, + "loss": 0.79045886, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40625, + "step": 9371, + "time_per_iteration": 2.4036200046539307 + }, + { + "auxiliary_loss_clip": 0.01055208, + "auxiliary_loss_mlp": 0.0104852, + "balance_loss_clip": 1.02390337, + "balance_loss_mlp": 1.01719487, + "epoch": 0.5634751240042086, + "flos": 19243671292800.0, + "grad_norm": 1.7926796863919519, + "language_loss": 0.77414119, + "learning_rate": 1.687188770067285e-06, + "loss": 0.79517841, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 9372, + "time_per_iteration": 2.3473870754241943 + }, + { + "auxiliary_loss_clip": 0.01056143, + "auxiliary_loss_mlp": 0.01041413, + "balance_loss_clip": 1.01631939, + "balance_loss_mlp": 1.01904154, + "epoch": 0.5635352472568766, + "flos": 12019827191040.0, + "grad_norm": 2.040820058003114, + "language_loss": 0.73364133, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.75461692, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 9373, + "time_per_iteration": 2.4173049926757812 + }, + { + "auxiliary_loss_clip": 0.01060081, + "auxiliary_loss_mlp": 0.01042575, + "balance_loss_clip": 1.01447701, + "balance_loss_mlp": 1.01971531, + "epoch": 0.5635953705095446, + "flos": 21870571299840.0, + "grad_norm": 1.9942537200869856, + "language_loss": 0.84247488, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.86350143, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40429688, + "step": 9374, + "time_per_iteration": 2.362138509750366 + }, + { + "auxiliary_loss_clip": 0.01055016, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.01075602, + "balance_loss_mlp": 1.01724374, + "epoch": 0.5636554937622126, + "flos": 27124929895680.0, + "grad_norm": 1.8477550926639839, + "language_loss": 0.67851031, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.69940919, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 9375, + "time_per_iteration": 2.457329034805298 + }, + { + "auxiliary_loss_clip": 0.01057509, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_clip": 1.01539063, + "balance_loss_mlp": 1.01788831, + "epoch": 0.5637156170148805, + "flos": 12925753816320.0, + "grad_norm": 2.6765731588899127, + "language_loss": 0.8142826, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83528286, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 9376, + "time_per_iteration": 2.319622755050659 + }, + { + "auxiliary_loss_clip": 0.01059233, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.01598024, + "balance_loss_mlp": 1.01814759, + "epoch": 0.5637757402675485, + "flos": 45549295246080.0, + "grad_norm": 1.4587973807656855, + "language_loss": 0.70266855, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.72369719, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 9377, + "time_per_iteration": 2.639840841293335 + }, + { + "auxiliary_loss_clip": 0.0105568, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.01377416, + "balance_loss_mlp": 1.01979971, + "epoch": 0.5638358635202164, + "flos": 20885008129920.0, + "grad_norm": 1.4353540271301388, + "language_loss": 0.75042665, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.77136946, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 9378, + "time_per_iteration": 2.433823585510254 + }, + { + "auxiliary_loss_clip": 0.01060336, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.01623917, + "balance_loss_mlp": 1.01853633, + "epoch": 0.5638959867728844, + "flos": 18805581152640.0, + "grad_norm": 2.2720914082981167, + "language_loss": 0.83660364, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.85765159, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41796875, + "step": 9379, + "time_per_iteration": 2.385291576385498 + }, + { + "auxiliary_loss_clip": 0.01057094, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.01489377, + "balance_loss_mlp": 1.01777685, + "epoch": 0.5639561100255523, + "flos": 27489108954240.0, + "grad_norm": 3.033956504849508, + "language_loss": 0.73501676, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.75600314, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 9380, + "time_per_iteration": 2.409416675567627 + }, + { + "auxiliary_loss_clip": 0.01057954, + "auxiliary_loss_mlp": 0.01047815, + "balance_loss_clip": 1.01925242, + "balance_loss_mlp": 1.01822114, + "epoch": 0.5640162332782204, + "flos": 18075617112960.0, + "grad_norm": 2.4917643960033122, + "language_loss": 0.75122398, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.77228165, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39648438, + "step": 9381, + "time_per_iteration": 2.3851940631866455 + }, + { + "auxiliary_loss_clip": 0.01059449, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.01714814, + "balance_loss_mlp": 1.01997876, + "epoch": 0.5640763565308883, + "flos": 20883856055040.0, + "grad_norm": 2.0987239660044428, + "language_loss": 0.74058092, + "learning_rate": 1.683342680176499e-06, + "loss": 0.76160765, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 9382, + "time_per_iteration": 2.3891780376434326 + }, + { + "auxiliary_loss_clip": 0.01014248, + "auxiliary_loss_mlp": 0.01006783, + "balance_loss_clip": 1.00389779, + "balance_loss_mlp": 1.00697041, + "epoch": 0.5641364797835563, + "flos": 64444707060480.0, + "grad_norm": 0.7359058435459532, + "language_loss": 0.54467261, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56488299, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.02880859, + "router_z_loss_mlp": 0.07275391, + "step": 9383, + "time_per_iteration": 3.1711373329162598 + }, + { + "auxiliary_loss_clip": 0.01060109, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.01779938, + "balance_loss_mlp": 1.01951301, + "epoch": 0.5641966030362242, + "flos": 18659958405120.0, + "grad_norm": 2.1990885625899264, + "language_loss": 0.7146951, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73576307, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40625, + "step": 9384, + "time_per_iteration": 2.344680070877075 + }, + { + "auxiliary_loss_clip": 0.01058882, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_clip": 1.01834059, + "balance_loss_mlp": 1.01840925, + "epoch": 0.5642567262888922, + "flos": 22491222272640.0, + "grad_norm": 2.32887950802258, + "language_loss": 0.77026761, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.79130256, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40429688, + "step": 9385, + "time_per_iteration": 2.4221150875091553 + }, + { + "auxiliary_loss_clip": 0.01055828, + "auxiliary_loss_mlp": 0.0104774, + "balance_loss_clip": 1.02007115, + "balance_loss_mlp": 1.01739907, + "epoch": 0.5643168495415603, + "flos": 13003190945280.0, + "grad_norm": 2.140020689054966, + "language_loss": 0.84097928, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.86201501, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3828125, + "step": 9386, + "time_per_iteration": 2.3416872024536133 + }, + { + "auxiliary_loss_clip": 0.01062058, + "auxiliary_loss_mlp": 0.01055321, + "balance_loss_clip": 1.02696061, + "balance_loss_mlp": 1.01980495, + "epoch": 0.5643769727942282, + "flos": 18587304132480.0, + "grad_norm": 2.001553493932271, + "language_loss": 0.72132051, + "learning_rate": 1.681420084607516e-06, + "loss": 0.74249434, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.421875, + "step": 9387, + "time_per_iteration": 2.366070032119751 + }, + { + "auxiliary_loss_clip": 0.01060034, + "auxiliary_loss_mlp": 0.0105093, + "balance_loss_clip": 1.0237745, + "balance_loss_mlp": 1.01955342, + "epoch": 0.5644370960468962, + "flos": 33804757192320.0, + "grad_norm": 1.5774820526984588, + "language_loss": 0.75411272, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.77522236, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 9388, + "time_per_iteration": 2.513049364089966 + }, + { + "auxiliary_loss_clip": 0.01055408, + "auxiliary_loss_mlp": 0.01041245, + "balance_loss_clip": 1.01685429, + "balance_loss_mlp": 1.01782513, + "epoch": 0.5644972192995641, + "flos": 21213855025920.0, + "grad_norm": 2.1761693664065382, + "language_loss": 0.83304507, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.8540116, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 9389, + "time_per_iteration": 2.3622193336486816 + }, + { + "auxiliary_loss_clip": 0.01059924, + "auxiliary_loss_mlp": 0.01045277, + "balance_loss_clip": 1.01764441, + "balance_loss_mlp": 1.01883388, + "epoch": 0.5645573425522321, + "flos": 18586745550720.0, + "grad_norm": 2.7737168378033545, + "language_loss": 0.66203213, + "learning_rate": 1.680266672116467e-06, + "loss": 0.68308407, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 9390, + "time_per_iteration": 3.586611270904541 + }, + { + "auxiliary_loss_clip": 0.01056938, + "auxiliary_loss_mlp": 0.01037306, + "balance_loss_clip": 1.01363134, + "balance_loss_mlp": 1.0187912, + "epoch": 0.5646174658049, + "flos": 18112834488960.0, + "grad_norm": 1.61830463331607, + "language_loss": 0.92971283, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.95065528, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38085938, + "step": 9391, + "time_per_iteration": 2.377021551132202 + }, + { + "auxiliary_loss_clip": 0.01061841, + "auxiliary_loss_mlp": 0.01046765, + "balance_loss_clip": 1.01578212, + "balance_loss_mlp": 1.01856613, + "epoch": 0.564677589057568, + "flos": 28328700263040.0, + "grad_norm": 2.389408318807455, + "language_loss": 0.61967111, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.64075714, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.43359375, + "step": 9392, + "time_per_iteration": 3.938218355178833 + }, + { + "auxiliary_loss_clip": 0.01055888, + "auxiliary_loss_mlp": 0.01042841, + "balance_loss_clip": 1.01449287, + "balance_loss_mlp": 1.01689339, + "epoch": 0.564737712310236, + "flos": 22162654667520.0, + "grad_norm": 2.4685339141529186, + "language_loss": 0.83153039, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.85251766, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.390625, + "step": 9393, + "time_per_iteration": 2.397209882736206 + }, + { + "auxiliary_loss_clip": 0.01056906, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.02497208, + "balance_loss_mlp": 1.01839495, + "epoch": 0.564797835562904, + "flos": 20957976604800.0, + "grad_norm": 2.360077096582207, + "language_loss": 0.88241589, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.90348965, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 9394, + "time_per_iteration": 3.867898941040039 + }, + { + "auxiliary_loss_clip": 0.01057601, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.01707458, + "balance_loss_mlp": 1.01876783, + "epoch": 0.5648579588155719, + "flos": 17419354686720.0, + "grad_norm": 1.892315894350741, + "language_loss": 0.85473049, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.87573385, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 9395, + "time_per_iteration": 2.3330233097076416 + }, + { + "auxiliary_loss_clip": 0.01010274, + "auxiliary_loss_mlp": 0.01006383, + "balance_loss_clip": 1.00323594, + "balance_loss_mlp": 1.00276637, + "epoch": 0.5649180820682399, + "flos": 69925965782400.0, + "grad_norm": 0.8047315058692487, + "language_loss": 0.58497566, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60514224, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.03149414, + "router_z_loss_mlp": 0.07519531, + "step": 9396, + "time_per_iteration": 3.0166380405426025 + }, + { + "auxiliary_loss_clip": 0.01059107, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.01488137, + "balance_loss_mlp": 1.01817274, + "epoch": 0.5649782053209078, + "flos": 24971906039040.0, + "grad_norm": 2.3103382236432015, + "language_loss": 0.72458661, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.74559575, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 9397, + "time_per_iteration": 2.3875210285186768 + }, + { + "auxiliary_loss_clip": 0.01059023, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.01324344, + "balance_loss_mlp": 1.01807177, + "epoch": 0.5650383285735758, + "flos": 21725507134080.0, + "grad_norm": 1.7456950414473367, + "language_loss": 0.67838115, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69938731, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41015625, + "step": 9398, + "time_per_iteration": 2.403930425643921 + }, + { + "auxiliary_loss_clip": 0.01010616, + "auxiliary_loss_mlp": 0.01008183, + "balance_loss_clip": 1.00470173, + "balance_loss_mlp": 1.0031116, + "epoch": 0.5650984518262439, + "flos": 65901318560640.0, + "grad_norm": 0.776288550895725, + "language_loss": 0.58228523, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.6024732, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.03491211, + "router_z_loss_mlp": 0.07519531, + "step": 9399, + "time_per_iteration": 2.9562344551086426 + }, + { + "auxiliary_loss_clip": 0.01059288, + "auxiliary_loss_mlp": 0.01043908, + "balance_loss_clip": 1.01527405, + "balance_loss_mlp": 1.01887989, + "epoch": 0.5651585750789118, + "flos": 21031538572800.0, + "grad_norm": 1.9093042594447873, + "language_loss": 0.74590206, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.76693404, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40429688, + "step": 9400, + "time_per_iteration": 3.8087432384490967 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.01789403, + "balance_loss_mlp": 1.01868439, + "epoch": 0.5652186983315798, + "flos": 18550924629120.0, + "grad_norm": 2.169843438709384, + "language_loss": 0.61704648, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63811511, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 9401, + "time_per_iteration": 2.3585643768310547 + }, + { + "auxiliary_loss_clip": 0.01054549, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.01467109, + "balance_loss_mlp": 1.01697123, + "epoch": 0.5652788215842477, + "flos": 18477676863360.0, + "grad_norm": 1.9691448556945297, + "language_loss": 0.82516575, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.84611201, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 9402, + "time_per_iteration": 2.3356659412384033 + }, + { + "auxiliary_loss_clip": 0.01056453, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.01139867, + "balance_loss_mlp": 1.01856196, + "epoch": 0.5653389448369157, + "flos": 30042761195520.0, + "grad_norm": 1.509488389437512, + "language_loss": 0.78701055, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80792087, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37890625, + "step": 9403, + "time_per_iteration": 2.4841253757476807 + }, + { + "auxiliary_loss_clip": 0.0105799, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.01785922, + "balance_loss_mlp": 1.01855779, + "epoch": 0.5653990680895836, + "flos": 16726608023040.0, + "grad_norm": 1.5641480245386388, + "language_loss": 0.69610381, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71712261, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 9404, + "time_per_iteration": 2.3679237365722656 + }, + { + "auxiliary_loss_clip": 0.01055059, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.0157932, + "balance_loss_mlp": 1.01783776, + "epoch": 0.5654591913422516, + "flos": 14537379219840.0, + "grad_norm": 1.793891642667489, + "language_loss": 0.68657511, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.70751363, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37109375, + "step": 9405, + "time_per_iteration": 2.341883659362793 + }, + { + "auxiliary_loss_clip": 0.01056363, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.01695347, + "balance_loss_mlp": 1.0196743, + "epoch": 0.5655193145949196, + "flos": 26208809153280.0, + "grad_norm": 3.208832411442583, + "language_loss": 0.74926889, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.77023482, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 9406, + "time_per_iteration": 2.4525160789489746 + }, + { + "auxiliary_loss_clip": 0.01060249, + "auxiliary_loss_mlp": 0.01043293, + "balance_loss_clip": 1.01729381, + "balance_loss_mlp": 1.01904523, + "epoch": 0.5655794378475876, + "flos": 25045398184320.0, + "grad_norm": 1.7598757386560056, + "language_loss": 0.80870628, + "learning_rate": 1.673732740698882e-06, + "loss": 0.82974172, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.41210938, + "step": 9407, + "time_per_iteration": 2.4073002338409424 + }, + { + "auxiliary_loss_clip": 0.01056825, + "auxiliary_loss_mlp": 0.01041268, + "balance_loss_clip": 1.01642513, + "balance_loss_mlp": 1.01972699, + "epoch": 0.5656395611002555, + "flos": 31031431476480.0, + "grad_norm": 1.4182586048414427, + "language_loss": 0.72465122, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.74563217, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 9408, + "time_per_iteration": 2.5082805156707764 + }, + { + "auxiliary_loss_clip": 0.01056635, + "auxiliary_loss_mlp": 0.01040925, + "balance_loss_clip": 1.01602209, + "balance_loss_mlp": 1.01855147, + "epoch": 0.5656996843529235, + "flos": 20228501324160.0, + "grad_norm": 1.9555609566229977, + "language_loss": 0.82303578, + "learning_rate": 1.672964276570308e-06, + "loss": 0.84401143, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38085938, + "step": 9409, + "time_per_iteration": 2.3717715740203857 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.01358438, + "balance_loss_mlp": 1.01750541, + "epoch": 0.5657598076055914, + "flos": 20995193980800.0, + "grad_norm": 1.7111634511952234, + "language_loss": 0.79590911, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.81686604, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 9410, + "time_per_iteration": 2.3791637420654297 + }, + { + "auxiliary_loss_clip": 0.01059537, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.01598001, + "balance_loss_mlp": 1.02002931, + "epoch": 0.5658199308582594, + "flos": 11545217902080.0, + "grad_norm": 2.1813964605172247, + "language_loss": 0.84656888, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.86759365, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 9411, + "time_per_iteration": 2.3606479167938232 + }, + { + "auxiliary_loss_clip": 0.01061101, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.01427865, + "balance_loss_mlp": 1.02061427, + "epoch": 0.5658800541109275, + "flos": 14171314947840.0, + "grad_norm": 2.462035329990107, + "language_loss": 0.69344926, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.71446604, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 9412, + "time_per_iteration": 2.3773415088653564 + }, + { + "auxiliary_loss_clip": 0.01053736, + "auxiliary_loss_mlp": 0.01037664, + "balance_loss_clip": 1.01600337, + "balance_loss_mlp": 1.01746595, + "epoch": 0.5659401773635954, + "flos": 27303929769600.0, + "grad_norm": 2.19064894568191, + "language_loss": 0.59615433, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.61706829, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36328125, + "step": 9413, + "time_per_iteration": 2.420997381210327 + }, + { + "auxiliary_loss_clip": 0.01054057, + "auxiliary_loss_mlp": 0.01038118, + "balance_loss_clip": 1.0142045, + "balance_loss_mlp": 1.01702178, + "epoch": 0.5660003006162634, + "flos": 16727236427520.0, + "grad_norm": 1.832825870082147, + "language_loss": 0.70071828, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.72164005, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 9414, + "time_per_iteration": 2.376802682876587 + }, + { + "auxiliary_loss_clip": 0.01054142, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.01549363, + "balance_loss_mlp": 1.01731658, + "epoch": 0.5660604238689313, + "flos": 21652364102400.0, + "grad_norm": 1.6425773839279085, + "language_loss": 0.79047763, + "learning_rate": 1.670659182280247e-06, + "loss": 0.81139874, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3671875, + "step": 9415, + "time_per_iteration": 2.39070725440979 + }, + { + "auxiliary_loss_clip": 0.01012248, + "auxiliary_loss_mlp": 0.01011515, + "balance_loss_clip": 1.00878489, + "balance_loss_mlp": 1.00513875, + "epoch": 0.5661205471215993, + "flos": 68820755783040.0, + "grad_norm": 0.8041251921894411, + "language_loss": 0.4917706, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51200819, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.02734375, + "router_z_loss_mlp": 0.07128906, + "step": 9416, + "time_per_iteration": 3.135566234588623 + }, + { + "auxiliary_loss_clip": 0.01056526, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_clip": 1.01925969, + "balance_loss_mlp": 1.01829219, + "epoch": 0.5661806703742672, + "flos": 28620504339840.0, + "grad_norm": 2.5670646660297884, + "language_loss": 0.64985567, + "learning_rate": 1.6698909172706e-06, + "loss": 0.67085826, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3828125, + "step": 9417, + "time_per_iteration": 2.4306368827819824 + }, + { + "auxiliary_loss_clip": 0.01057156, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.01777625, + "balance_loss_mlp": 1.01813793, + "epoch": 0.5662407936269352, + "flos": 21396869706240.0, + "grad_norm": 2.0548390862183163, + "language_loss": 0.70527893, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.72627938, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 9418, + "time_per_iteration": 2.3985869884490967 + }, + { + "auxiliary_loss_clip": 0.01055809, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_clip": 1.01804948, + "balance_loss_mlp": 1.01731563, + "epoch": 0.5663009168796032, + "flos": 25658997062400.0, + "grad_norm": 1.8558630048884273, + "language_loss": 0.66156423, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.68257648, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38476562, + "step": 9419, + "time_per_iteration": 2.4074909687042236 + }, + { + "auxiliary_loss_clip": 0.01011377, + "auxiliary_loss_mlp": 0.0100358, + "balance_loss_clip": 1.00121975, + "balance_loss_mlp": 1.0043509, + "epoch": 0.5663610401322712, + "flos": 67926699020160.0, + "grad_norm": 0.7246517064192552, + "language_loss": 0.59786212, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61801171, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.0703125, + "step": 9420, + "time_per_iteration": 3.0855016708374023 + }, + { + "auxiliary_loss_clip": 0.01053929, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.02037954, + "balance_loss_mlp": 1.016922, + "epoch": 0.5664211633849391, + "flos": 24608180828160.0, + "grad_norm": 4.534910332992511, + "language_loss": 0.75317121, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.7741589, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 9421, + "time_per_iteration": 2.3974058628082275 + }, + { + "auxiliary_loss_clip": 0.01057036, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_clip": 1.01630604, + "balance_loss_mlp": 1.01817453, + "epoch": 0.5664812866376071, + "flos": 11648212012800.0, + "grad_norm": 1.9005915958894368, + "language_loss": 0.74140751, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.76239491, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38867188, + "step": 9422, + "time_per_iteration": 2.3696045875549316 + }, + { + "auxiliary_loss_clip": 0.01053853, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.01976252, + "balance_loss_mlp": 1.01749861, + "epoch": 0.566541409890275, + "flos": 24642849674880.0, + "grad_norm": 1.6270598756509607, + "language_loss": 0.82178295, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.84273762, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 9423, + "time_per_iteration": 2.3969924449920654 + }, + { + "auxiliary_loss_clip": 0.01052854, + "auxiliary_loss_mlp": 0.01052975, + "balance_loss_clip": 1.02592659, + "balance_loss_mlp": 1.01569724, + "epoch": 0.566601533142943, + "flos": 22269558850560.0, + "grad_norm": 1.6739056217873451, + "language_loss": 0.81886715, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.83992541, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37109375, + "step": 9424, + "time_per_iteration": 2.396890878677368 + }, + { + "auxiliary_loss_clip": 0.01058866, + "auxiliary_loss_mlp": 0.01053955, + "balance_loss_clip": 1.02589333, + "balance_loss_mlp": 1.01804852, + "epoch": 0.5666616563956111, + "flos": 29970351302400.0, + "grad_norm": 2.650324527112347, + "language_loss": 0.79871309, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.81984138, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40820312, + "step": 9425, + "time_per_iteration": 2.4261481761932373 + }, + { + "auxiliary_loss_clip": 0.01057049, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.02273822, + "balance_loss_mlp": 1.01878667, + "epoch": 0.566721779648279, + "flos": 17780601191040.0, + "grad_norm": 4.393953977013748, + "language_loss": 0.59929949, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.62033355, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3828125, + "step": 9426, + "time_per_iteration": 2.396416187286377 + }, + { + "auxiliary_loss_clip": 0.01058708, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.01533079, + "balance_loss_mlp": 1.01774168, + "epoch": 0.566781902900947, + "flos": 21032411356800.0, + "grad_norm": 1.837477443292821, + "language_loss": 0.8268019, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84780014, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.41015625, + "step": 9427, + "time_per_iteration": 2.399181842803955 + }, + { + "auxiliary_loss_clip": 0.01055136, + "auxiliary_loss_mlp": 0.0104172, + "balance_loss_clip": 1.01877189, + "balance_loss_mlp": 1.01791191, + "epoch": 0.5668420261536149, + "flos": 23147484698880.0, + "grad_norm": 1.9989461165912032, + "language_loss": 0.86719805, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88816655, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.37304688, + "step": 9428, + "time_per_iteration": 2.44124436378479 + }, + { + "auxiliary_loss_clip": 0.01060285, + "auxiliary_loss_mlp": 0.01045124, + "balance_loss_clip": 1.01986384, + "balance_loss_mlp": 1.02057052, + "epoch": 0.5669021494062829, + "flos": 22600500428160.0, + "grad_norm": 2.454240613713165, + "language_loss": 0.7437256, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.76477975, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 9429, + "time_per_iteration": 3.666529655456543 + }, + { + "auxiliary_loss_clip": 0.01056898, + "auxiliary_loss_mlp": 0.010433, + "balance_loss_clip": 1.01720476, + "balance_loss_mlp": 1.01699686, + "epoch": 0.5669622726589508, + "flos": 17380356831360.0, + "grad_norm": 1.8081619030812368, + "language_loss": 0.76713371, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.78813565, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 9430, + "time_per_iteration": 2.3478167057037354 + }, + { + "auxiliary_loss_clip": 0.01057194, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.01696837, + "balance_loss_mlp": 1.0176785, + "epoch": 0.5670223959116188, + "flos": 18762463756800.0, + "grad_norm": 1.8762251426933314, + "language_loss": 0.73629367, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75728726, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 9431, + "time_per_iteration": 3.847378969192505 + }, + { + "auxiliary_loss_clip": 0.01052172, + "auxiliary_loss_mlp": 0.01039764, + "balance_loss_clip": 1.01871181, + "balance_loss_mlp": 1.01828647, + "epoch": 0.5670825191642868, + "flos": 13552479365760.0, + "grad_norm": 1.6070463436402618, + "language_loss": 0.73840511, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75932443, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 9432, + "time_per_iteration": 2.3581840991973877 + }, + { + "auxiliary_loss_clip": 0.01057576, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.01816881, + "balance_loss_mlp": 1.01874304, + "epoch": 0.5671426424169548, + "flos": 22052957575680.0, + "grad_norm": 2.1405731594787847, + "language_loss": 0.78779054, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80879855, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 9433, + "time_per_iteration": 3.6906118392944336 + }, + { + "auxiliary_loss_clip": 0.01059175, + "auxiliary_loss_mlp": 0.01042458, + "balance_loss_clip": 1.01297724, + "balance_loss_mlp": 1.01850629, + "epoch": 0.5672027656696227, + "flos": 21322923713280.0, + "grad_norm": 2.653955555850362, + "language_loss": 0.65214205, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.67315841, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40625, + "step": 9434, + "time_per_iteration": 2.3835320472717285 + }, + { + "auxiliary_loss_clip": 0.01056426, + "auxiliary_loss_mlp": 0.0104048, + "balance_loss_clip": 1.01495779, + "balance_loss_mlp": 1.01882911, + "epoch": 0.5672628889222907, + "flos": 23512920566400.0, + "grad_norm": 1.6421212521881932, + "language_loss": 0.67755008, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.69851911, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 9435, + "time_per_iteration": 2.3970589637756348 + }, + { + "auxiliary_loss_clip": 0.01055247, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.01444101, + "balance_loss_mlp": 1.01849532, + "epoch": 0.5673230121749586, + "flos": 27120810355200.0, + "grad_norm": 1.5751048145444824, + "language_loss": 0.72466922, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.7455948, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 9436, + "time_per_iteration": 2.3960695266723633 + }, + { + "auxiliary_loss_clip": 0.01057485, + "auxiliary_loss_mlp": 0.01039606, + "balance_loss_clip": 1.01359487, + "balance_loss_mlp": 1.01809764, + "epoch": 0.5673831354276266, + "flos": 31140569986560.0, + "grad_norm": 1.4939608596607286, + "language_loss": 0.74899954, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76997042, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 9437, + "time_per_iteration": 2.4957878589630127 + }, + { + "auxiliary_loss_clip": 0.01061432, + "auxiliary_loss_mlp": 0.010445, + "balance_loss_clip": 1.01723671, + "balance_loss_mlp": 1.02192807, + "epoch": 0.5674432586802945, + "flos": 27671949077760.0, + "grad_norm": 1.7619277462146563, + "language_loss": 0.62179649, + "learning_rate": 1.661827179985277e-06, + "loss": 0.64285582, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 9438, + "time_per_iteration": 2.420778274536133 + }, + { + "auxiliary_loss_clip": 0.01058295, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.01480246, + "balance_loss_mlp": 1.0195843, + "epoch": 0.5675033819329626, + "flos": 26613941103360.0, + "grad_norm": 1.447575633280863, + "language_loss": 0.75890762, + "learning_rate": 1.661443332486909e-06, + "loss": 0.7799015, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9439, + "time_per_iteration": 3.8625569343566895 + }, + { + "auxiliary_loss_clip": 0.01059876, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_clip": 1.01490653, + "balance_loss_mlp": 1.02171004, + "epoch": 0.5675635051856306, + "flos": 19097385229440.0, + "grad_norm": 1.8142686041357212, + "language_loss": 0.84285086, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.86388147, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3828125, + "step": 9440, + "time_per_iteration": 2.381423234939575 + }, + { + "auxiliary_loss_clip": 0.01062558, + "auxiliary_loss_mlp": 0.01050625, + "balance_loss_clip": 1.02294481, + "balance_loss_mlp": 1.02086508, + "epoch": 0.5676236284382985, + "flos": 17565361459200.0, + "grad_norm": 2.0648410527645886, + "language_loss": 0.77664793, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.7977798, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41601562, + "step": 9441, + "time_per_iteration": 2.3416824340820312 + }, + { + "auxiliary_loss_clip": 0.01059085, + "auxiliary_loss_mlp": 0.01041662, + "balance_loss_clip": 1.01631832, + "balance_loss_mlp": 1.02015376, + "epoch": 0.5676837516909665, + "flos": 15953352030720.0, + "grad_norm": 2.0047640238209965, + "language_loss": 0.8429684, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.86397588, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38867188, + "step": 9442, + "time_per_iteration": 2.385270118713379 + }, + { + "auxiliary_loss_clip": 0.01056322, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.01266313, + "balance_loss_mlp": 1.0212245, + "epoch": 0.5677438749436344, + "flos": 18294941473920.0, + "grad_norm": 1.9635026421659756, + "language_loss": 0.75874031, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.77965128, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 9443, + "time_per_iteration": 2.3514723777770996 + }, + { + "auxiliary_loss_clip": 0.01058485, + "auxiliary_loss_mlp": 0.01044186, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.01941633, + "epoch": 0.5678039981963025, + "flos": 17930343479040.0, + "grad_norm": 2.1419919263038665, + "language_loss": 0.78862095, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.80964768, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.390625, + "step": 9444, + "time_per_iteration": 2.3738808631896973 + }, + { + "auxiliary_loss_clip": 0.0106039, + "auxiliary_loss_mlp": 0.01048381, + "balance_loss_clip": 1.02213073, + "balance_loss_mlp": 1.02116013, + "epoch": 0.5678641214489704, + "flos": 19315382958720.0, + "grad_norm": 1.716331537955263, + "language_loss": 0.8210746, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.84216231, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 9445, + "time_per_iteration": 2.356755256652832 + }, + { + "auxiliary_loss_clip": 0.01056842, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.01113653, + "balance_loss_mlp": 1.01849246, + "epoch": 0.5679242447016384, + "flos": 27749700408960.0, + "grad_norm": 1.6160276298515326, + "language_loss": 0.7160629, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73698092, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 9446, + "time_per_iteration": 2.4642629623413086 + }, + { + "auxiliary_loss_clip": 0.01059544, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.01180613, + "balance_loss_mlp": 1.01952851, + "epoch": 0.5679843679543063, + "flos": 23767961114880.0, + "grad_norm": 2.0819691741700717, + "language_loss": 0.74552643, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.76650101, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 9447, + "time_per_iteration": 2.391909599304199 + }, + { + "auxiliary_loss_clip": 0.01059726, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.01394653, + "balance_loss_mlp": 1.01958382, + "epoch": 0.5680444912069743, + "flos": 25590741621120.0, + "grad_norm": 1.9863335488184843, + "language_loss": 0.76689839, + "learning_rate": 1.657989284462725e-06, + "loss": 0.78789389, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 9448, + "time_per_iteration": 2.431476354598999 + }, + { + "auxiliary_loss_clip": 0.01059509, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.01362681, + "balance_loss_mlp": 1.0201683, + "epoch": 0.5681046144596422, + "flos": 23694678437760.0, + "grad_norm": 2.122287393100708, + "language_loss": 0.78145564, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.80244184, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 9449, + "time_per_iteration": 2.4132816791534424 + }, + { + "auxiliary_loss_clip": 0.01056529, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.01612735, + "balance_loss_mlp": 1.01776838, + "epoch": 0.5681647377123102, + "flos": 27999539164800.0, + "grad_norm": 1.771368793366794, + "language_loss": 0.75902414, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.78000879, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9450, + "time_per_iteration": 2.4368133544921875 + }, + { + "auxiliary_loss_clip": 0.01061581, + "auxiliary_loss_mlp": 0.01041775, + "balance_loss_clip": 1.01646709, + "balance_loss_mlp": 1.02082753, + "epoch": 0.5682248609649782, + "flos": 22746646846080.0, + "grad_norm": 3.262495508048384, + "language_loss": 0.6785019, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69953549, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40820312, + "step": 9451, + "time_per_iteration": 2.4191763401031494 + }, + { + "auxiliary_loss_clip": 0.01061772, + "auxiliary_loss_mlp": 0.01045199, + "balance_loss_clip": 1.01562333, + "balance_loss_mlp": 1.01940846, + "epoch": 0.5682849842176462, + "flos": 21287521728000.0, + "grad_norm": 2.2194648615807617, + "language_loss": 0.73976332, + "learning_rate": 1.656454488573026e-06, + "loss": 0.76083302, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42382812, + "step": 9452, + "time_per_iteration": 2.3792662620544434 + }, + { + "auxiliary_loss_clip": 0.01054967, + "auxiliary_loss_mlp": 0.01038254, + "balance_loss_clip": 1.01397121, + "balance_loss_mlp": 1.01766038, + "epoch": 0.5683451074703142, + "flos": 21140642171520.0, + "grad_norm": 1.5869523399215737, + "language_loss": 0.71361184, + "learning_rate": 1.656070822132428e-06, + "loss": 0.73454404, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37304688, + "step": 9453, + "time_per_iteration": 2.415027141571045 + }, + { + "auxiliary_loss_clip": 0.01056191, + "auxiliary_loss_mlp": 0.01040598, + "balance_loss_clip": 1.01586235, + "balance_loss_mlp": 1.01799524, + "epoch": 0.5684052307229821, + "flos": 22343435020800.0, + "grad_norm": 1.6019621741376417, + "language_loss": 0.71509349, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.73606133, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 9454, + "time_per_iteration": 2.396164894104004 + }, + { + "auxiliary_loss_clip": 0.01053772, + "auxiliary_loss_mlp": 0.01041082, + "balance_loss_clip": 1.01868248, + "balance_loss_mlp": 1.01671076, + "epoch": 0.5684653539756501, + "flos": 21797567913600.0, + "grad_norm": 2.056862292015668, + "language_loss": 0.62264562, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.64359415, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37109375, + "step": 9455, + "time_per_iteration": 2.3931291103363037 + }, + { + "auxiliary_loss_clip": 0.01059738, + "auxiliary_loss_mlp": 0.01052282, + "balance_loss_clip": 1.02414906, + "balance_loss_mlp": 1.01894653, + "epoch": 0.568525477228318, + "flos": 22998615194880.0, + "grad_norm": 1.7634350807731827, + "language_loss": 0.7369386, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75805879, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40820312, + "step": 9456, + "time_per_iteration": 2.377664804458618 + }, + { + "auxiliary_loss_clip": 0.01055256, + "auxiliary_loss_mlp": 0.01045655, + "balance_loss_clip": 1.02162218, + "balance_loss_mlp": 1.01709509, + "epoch": 0.568585600480986, + "flos": 21391563179520.0, + "grad_norm": 2.7593444229887325, + "language_loss": 0.77261573, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79362482, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 9457, + "time_per_iteration": 2.399627685546875 + }, + { + "auxiliary_loss_clip": 0.01056935, + "auxiliary_loss_mlp": 0.01049768, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.01751208, + "epoch": 0.568645723733654, + "flos": 30006067489920.0, + "grad_norm": 1.8220797971127245, + "language_loss": 0.67212141, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.69318843, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 9458, + "time_per_iteration": 2.439864158630371 + }, + { + "auxiliary_loss_clip": 0.01058031, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.01536322, + "balance_loss_mlp": 1.01829338, + "epoch": 0.568705846986322, + "flos": 20411620738560.0, + "grad_norm": 2.31795203439138, + "language_loss": 0.70246398, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.72345161, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 9459, + "time_per_iteration": 2.3814280033111572 + }, + { + "auxiliary_loss_clip": 0.01060259, + "auxiliary_loss_mlp": 0.01047423, + "balance_loss_clip": 1.02106643, + "balance_loss_mlp": 1.01965725, + "epoch": 0.5687659702389899, + "flos": 17455804012800.0, + "grad_norm": 2.210188125881042, + "language_loss": 0.78124559, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.80232245, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 9460, + "time_per_iteration": 2.335477828979492 + }, + { + "auxiliary_loss_clip": 0.01058284, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_clip": 1.0213517, + "balance_loss_mlp": 1.0181129, + "epoch": 0.5688260934916579, + "flos": 25405038766080.0, + "grad_norm": 1.6311125344805677, + "language_loss": 0.73046041, + "learning_rate": 1.65300196133547e-06, + "loss": 0.75153005, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 9461, + "time_per_iteration": 2.41902232170105 + }, + { + "auxiliary_loss_clip": 0.01056097, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_clip": 1.02005363, + "balance_loss_mlp": 1.01719475, + "epoch": 0.5688862167443258, + "flos": 21607186936320.0, + "grad_norm": 1.9515234225402238, + "language_loss": 0.73925102, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.76027513, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 9462, + "time_per_iteration": 2.3553099632263184 + }, + { + "auxiliary_loss_clip": 0.01053253, + "auxiliary_loss_mlp": 0.01042043, + "balance_loss_clip": 1.0197506, + "balance_loss_mlp": 1.0164938, + "epoch": 0.5689463399969938, + "flos": 22417904684160.0, + "grad_norm": 2.295071393595814, + "language_loss": 0.75005972, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.77101266, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3671875, + "step": 9463, + "time_per_iteration": 2.3887345790863037 + }, + { + "auxiliary_loss_clip": 0.01056709, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.01740086, + "balance_loss_mlp": 1.01801944, + "epoch": 0.5690064632496618, + "flos": 18295814257920.0, + "grad_norm": 1.9872401227122782, + "language_loss": 0.75659609, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.77756822, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.38671875, + "step": 9464, + "time_per_iteration": 2.3748157024383545 + }, + { + "auxiliary_loss_clip": 0.01057539, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_clip": 1.0224669, + "balance_loss_mlp": 1.01780462, + "epoch": 0.5690665865023298, + "flos": 21578208641280.0, + "grad_norm": 1.6689193831193228, + "language_loss": 0.84629148, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86734211, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39648438, + "step": 9465, + "time_per_iteration": 2.40130352973938 + }, + { + "auxiliary_loss_clip": 0.01054331, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.01368475, + "balance_loss_mlp": 1.01741147, + "epoch": 0.5691267097549978, + "flos": 24420418202880.0, + "grad_norm": 1.5128674928003591, + "language_loss": 0.73335814, + "learning_rate": 1.651084350506125e-06, + "loss": 0.75427461, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 9466, + "time_per_iteration": 2.395613431930542 + }, + { + "auxiliary_loss_clip": 0.01013885, + "auxiliary_loss_mlp": 0.01005002, + "balance_loss_clip": 1.00265348, + "balance_loss_mlp": 1.00667953, + "epoch": 0.5691868330076657, + "flos": 61654238000640.0, + "grad_norm": 0.7177133049195442, + "language_loss": 0.55458283, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57477176, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.07226562, + "step": 9467, + "time_per_iteration": 3.0899758338928223 + }, + { + "auxiliary_loss_clip": 0.01058604, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.01262999, + "balance_loss_mlp": 1.01761281, + "epoch": 0.5692469562603337, + "flos": 21324110699520.0, + "grad_norm": 2.0364122937605273, + "language_loss": 0.64847547, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66946137, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41015625, + "step": 9468, + "time_per_iteration": 2.3684208393096924 + }, + { + "auxiliary_loss_clip": 0.01055798, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.01549101, + "balance_loss_mlp": 1.0173862, + "epoch": 0.5693070795130016, + "flos": 23366774148480.0, + "grad_norm": 1.7666740836438684, + "language_loss": 0.80662835, + "learning_rate": 1.64993394266317e-06, + "loss": 0.82758605, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3828125, + "step": 9469, + "time_per_iteration": 3.639204978942871 + }, + { + "auxiliary_loss_clip": 0.01061264, + "auxiliary_loss_mlp": 0.01047305, + "balance_loss_clip": 1.01830149, + "balance_loss_mlp": 1.01935279, + "epoch": 0.5693672027656697, + "flos": 18696268085760.0, + "grad_norm": 2.0420897259085504, + "language_loss": 0.7110917, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.73217738, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.41992188, + "step": 9470, + "time_per_iteration": 2.354099750518799 + }, + { + "auxiliary_loss_clip": 0.01056507, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.01257539, + "balance_loss_mlp": 1.01812518, + "epoch": 0.5694273260183376, + "flos": 20448139887360.0, + "grad_norm": 1.7076465569960073, + "language_loss": 0.75448298, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77540612, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38476562, + "step": 9471, + "time_per_iteration": 3.779123067855835 + }, + { + "auxiliary_loss_clip": 0.01057169, + "auxiliary_loss_mlp": 0.0104089, + "balance_loss_clip": 1.01554608, + "balance_loss_mlp": 1.01911521, + "epoch": 0.5694874492710056, + "flos": 17602229721600.0, + "grad_norm": 1.6135908380893915, + "language_loss": 0.58758497, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.60856557, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 9472, + "time_per_iteration": 3.7279274463653564 + }, + { + "auxiliary_loss_clip": 0.01054897, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.01473975, + "balance_loss_mlp": 1.01849079, + "epoch": 0.5695475725236735, + "flos": 13369988355840.0, + "grad_norm": 1.8961406024202287, + "language_loss": 0.75419343, + "learning_rate": 1.648400251450638e-06, + "loss": 0.77513361, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 9473, + "time_per_iteration": 2.337942361831665 + }, + { + "auxiliary_loss_clip": 0.01015467, + "auxiliary_loss_mlp": 0.01011427, + "balance_loss_clip": 1.00900733, + "balance_loss_mlp": 1.00771809, + "epoch": 0.5696076957763415, + "flos": 68170951958400.0, + "grad_norm": 0.6840943689829038, + "language_loss": 0.5768314, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59710032, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.07714844, + "step": 9474, + "time_per_iteration": 3.043489456176758 + }, + { + "auxiliary_loss_clip": 0.01056583, + "auxiliary_loss_mlp": 0.01041031, + "balance_loss_clip": 1.01416135, + "balance_loss_mlp": 1.01797414, + "epoch": 0.5696678190290094, + "flos": 33836912421120.0, + "grad_norm": 1.8063648204545353, + "language_loss": 0.55522847, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.57620466, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 9475, + "time_per_iteration": 2.499204635620117 + }, + { + "auxiliary_loss_clip": 0.01057611, + "auxiliary_loss_mlp": 0.010457, + "balance_loss_clip": 1.01886594, + "balance_loss_mlp": 1.0176791, + "epoch": 0.5697279422816774, + "flos": 26355479241600.0, + "grad_norm": 1.503299354446975, + "language_loss": 0.80268013, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82371324, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 9476, + "time_per_iteration": 2.4094772338867188 + }, + { + "auxiliary_loss_clip": 0.01058804, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.01843786, + "balance_loss_mlp": 1.01831365, + "epoch": 0.5697880655343454, + "flos": 22929382235520.0, + "grad_norm": 2.2394583667237318, + "language_loss": 0.6794852, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.7005288, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40429688, + "step": 9477, + "time_per_iteration": 2.3922054767608643 + }, + { + "auxiliary_loss_clip": 0.01057883, + "auxiliary_loss_mlp": 0.01040683, + "balance_loss_clip": 1.01313365, + "balance_loss_mlp": 1.01775002, + "epoch": 0.5698481887870134, + "flos": 26760087521280.0, + "grad_norm": 1.924560840008803, + "language_loss": 0.71284944, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73383504, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40234375, + "step": 9478, + "time_per_iteration": 2.458282709121704 + }, + { + "auxiliary_loss_clip": 0.0105349, + "auxiliary_loss_mlp": 0.01036725, + "balance_loss_clip": 1.01339531, + "balance_loss_mlp": 1.01735449, + "epoch": 0.5699083120396814, + "flos": 15741359055360.0, + "grad_norm": 1.7893645376752374, + "language_loss": 0.70481324, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.72571534, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 9479, + "time_per_iteration": 3.8688933849334717 + }, + { + "auxiliary_loss_clip": 0.01054945, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.01467299, + "balance_loss_mlp": 1.0175724, + "epoch": 0.5699684352923493, + "flos": 19536243419520.0, + "grad_norm": 1.82323568223002, + "language_loss": 0.72467065, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.74560797, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 9480, + "time_per_iteration": 2.378706216812134 + }, + { + "auxiliary_loss_clip": 0.01056184, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.01484203, + "balance_loss_mlp": 1.01731718, + "epoch": 0.5700285585450173, + "flos": 16252417670400.0, + "grad_norm": 2.0750684695884005, + "language_loss": 0.73376185, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.75471628, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38671875, + "step": 9481, + "time_per_iteration": 2.369520902633667 + }, + { + "auxiliary_loss_clip": 0.01057109, + "auxiliary_loss_mlp": 0.01048063, + "balance_loss_clip": 1.02079964, + "balance_loss_mlp": 1.01831651, + "epoch": 0.5700886817976852, + "flos": 19863973152000.0, + "grad_norm": 1.628531207077819, + "language_loss": 0.79280233, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.81385398, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38867188, + "step": 9482, + "time_per_iteration": 2.372015953063965 + }, + { + "auxiliary_loss_clip": 0.01056512, + "auxiliary_loss_mlp": 0.01039654, + "balance_loss_clip": 1.0140475, + "balance_loss_mlp": 1.01798725, + "epoch": 0.5701488050503533, + "flos": 23840580476160.0, + "grad_norm": 1.5346923315985341, + "language_loss": 0.78712308, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80808479, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38476562, + "step": 9483, + "time_per_iteration": 2.407238245010376 + }, + { + "auxiliary_loss_clip": 0.01057018, + "auxiliary_loss_mlp": 0.01044938, + "balance_loss_clip": 1.01841438, + "balance_loss_mlp": 1.01772571, + "epoch": 0.5702089283030212, + "flos": 23658543313920.0, + "grad_norm": 1.6239014004759877, + "language_loss": 0.82265639, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.84367597, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 9484, + "time_per_iteration": 2.3969709873199463 + }, + { + "auxiliary_loss_clip": 0.0105825, + "auxiliary_loss_mlp": 0.0104159, + "balance_loss_clip": 1.01412463, + "balance_loss_mlp": 1.01772046, + "epoch": 0.5702690515556892, + "flos": 27889946807040.0, + "grad_norm": 2.116695717742763, + "language_loss": 0.6185658, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.63956416, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 9485, + "time_per_iteration": 2.4600908756256104 + }, + { + "auxiliary_loss_clip": 0.01055621, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_clip": 1.02182412, + "balance_loss_mlp": 1.01637602, + "epoch": 0.5703291748083571, + "flos": 24022827106560.0, + "grad_norm": 1.7851248679162126, + "language_loss": 0.66980362, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.6908415, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 9486, + "time_per_iteration": 2.380516529083252 + }, + { + "auxiliary_loss_clip": 0.01008927, + "auxiliary_loss_mlp": 0.01009179, + "balance_loss_clip": 1.00649667, + "balance_loss_mlp": 1.00155234, + "epoch": 0.5703892980610251, + "flos": 57019867061760.0, + "grad_norm": 0.6717152268103516, + "language_loss": 0.480317, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50049806, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.07373047, + "step": 9487, + "time_per_iteration": 3.072537660598755 + }, + { + "auxiliary_loss_clip": 0.0105596, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.01174176, + "balance_loss_mlp": 1.017802, + "epoch": 0.570449421313693, + "flos": 24349928434560.0, + "grad_norm": 1.6140677204483456, + "language_loss": 0.87307906, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.89401168, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 9488, + "time_per_iteration": 2.3926823139190674 + }, + { + "auxiliary_loss_clip": 0.01057253, + "auxiliary_loss_mlp": 0.01038441, + "balance_loss_clip": 1.01101041, + "balance_loss_mlp": 1.01665568, + "epoch": 0.570509544566361, + "flos": 24827365543680.0, + "grad_norm": 1.5987781746329246, + "language_loss": 0.7958535, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81681043, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 9489, + "time_per_iteration": 2.418394088745117 + }, + { + "auxiliary_loss_clip": 0.0105578, + "auxiliary_loss_mlp": 0.01046164, + "balance_loss_clip": 1.02219093, + "balance_loss_mlp": 1.01681876, + "epoch": 0.570569667819029, + "flos": 21396241301760.0, + "grad_norm": 1.8871972374427497, + "language_loss": 0.70452535, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72554475, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.390625, + "step": 9490, + "time_per_iteration": 2.3594772815704346 + }, + { + "auxiliary_loss_clip": 0.01056534, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.01565039, + "balance_loss_mlp": 1.01782596, + "epoch": 0.570629791071697, + "flos": 23215775051520.0, + "grad_norm": 1.5999651571646583, + "language_loss": 0.76874959, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78971696, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38671875, + "step": 9491, + "time_per_iteration": 2.4088194370269775 + }, + { + "auxiliary_loss_clip": 0.01010915, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.03097236, + "balance_loss_mlp": 1.00323987, + "epoch": 0.570689914324365, + "flos": 65281505460480.0, + "grad_norm": 0.8247426639473324, + "language_loss": 0.57479417, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59523892, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.07666016, + "step": 9492, + "time_per_iteration": 3.0613343715667725 + }, + { + "auxiliary_loss_clip": 0.01056168, + "auxiliary_loss_mlp": 0.0104244, + "balance_loss_clip": 1.01689351, + "balance_loss_mlp": 1.0182693, + "epoch": 0.5707500375770329, + "flos": 21140851639680.0, + "grad_norm": 2.0961596157082347, + "language_loss": 0.73081791, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.75180399, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 9493, + "time_per_iteration": 2.391953706741333 + }, + { + "auxiliary_loss_clip": 0.01058531, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.01410604, + "balance_loss_mlp": 1.01808178, + "epoch": 0.5708101608297009, + "flos": 20811725452800.0, + "grad_norm": 1.588551633508979, + "language_loss": 0.78733397, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80832845, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40429688, + "step": 9494, + "time_per_iteration": 2.3706045150756836 + }, + { + "auxiliary_loss_clip": 0.01059861, + "auxiliary_loss_mlp": 0.01043975, + "balance_loss_clip": 1.01413691, + "balance_loss_mlp": 1.01871896, + "epoch": 0.5708702840823688, + "flos": 25811148234240.0, + "grad_norm": 2.1364965949322197, + "language_loss": 0.82278192, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.84382027, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41015625, + "step": 9495, + "time_per_iteration": 2.420576572418213 + }, + { + "auxiliary_loss_clip": 0.01063141, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.01968515, + "epoch": 0.5709304073350369, + "flos": 23651002460160.0, + "grad_norm": 2.2610602275447262, + "language_loss": 0.6733402, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.69446617, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.43554688, + "step": 9496, + "time_per_iteration": 2.3802120685577393 + }, + { + "auxiliary_loss_clip": 0.01060997, + "auxiliary_loss_mlp": 0.01048772, + "balance_loss_clip": 1.01948261, + "balance_loss_mlp": 1.01932383, + "epoch": 0.5709905305877048, + "flos": 16106620366080.0, + "grad_norm": 3.3706200131415702, + "language_loss": 0.70314479, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.72424257, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41601562, + "step": 9497, + "time_per_iteration": 2.478105068206787 + }, + { + "auxiliary_loss_clip": 0.01060597, + "auxiliary_loss_mlp": 0.01044525, + "balance_loss_clip": 1.01536632, + "balance_loss_mlp": 1.02046728, + "epoch": 0.5710506538403728, + "flos": 24749753857920.0, + "grad_norm": 1.8996737020512495, + "language_loss": 0.82497096, + "learning_rate": 1.638819551358182e-06, + "loss": 0.84602219, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40234375, + "step": 9498, + "time_per_iteration": 2.5104923248291016 + }, + { + "auxiliary_loss_clip": 0.01059581, + "auxiliary_loss_mlp": 0.01042209, + "balance_loss_clip": 1.01483893, + "balance_loss_mlp": 1.019629, + "epoch": 0.5711107770930407, + "flos": 21981141175680.0, + "grad_norm": 1.8808834172717348, + "language_loss": 0.67554456, + "learning_rate": 1.638436499891469e-06, + "loss": 0.69656247, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3984375, + "step": 9499, + "time_per_iteration": 2.4099626541137695 + }, + { + "auxiliary_loss_clip": 0.01058559, + "auxiliary_loss_mlp": 0.01043166, + "balance_loss_clip": 1.01540232, + "balance_loss_mlp": 1.02005672, + "epoch": 0.5711709003457087, + "flos": 19572972036480.0, + "grad_norm": 1.6707626323624971, + "language_loss": 0.72868401, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.74970126, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38476562, + "step": 9500, + "time_per_iteration": 2.364004135131836 + }, + { + "auxiliary_loss_clip": 0.01059936, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.0158329, + "balance_loss_mlp": 1.01942039, + "epoch": 0.5712310235983766, + "flos": 24241557974400.0, + "grad_norm": 1.9588612500759415, + "language_loss": 0.77825814, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.7992847, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 9501, + "time_per_iteration": 2.4257142543792725 + }, + { + "auxiliary_loss_clip": 0.01059119, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.01571202, + "balance_loss_mlp": 1.01927018, + "epoch": 0.5712911468510447, + "flos": 20995089246720.0, + "grad_norm": 1.5701910873817941, + "language_loss": 0.75599229, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77698922, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3984375, + "step": 9502, + "time_per_iteration": 2.366684913635254 + }, + { + "auxiliary_loss_clip": 0.01056732, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.01599777, + "balance_loss_mlp": 1.01927543, + "epoch": 0.5713512701037126, + "flos": 18915976471680.0, + "grad_norm": 1.5411826006344016, + "language_loss": 0.83148485, + "learning_rate": 1.636904431275105e-06, + "loss": 0.8524484, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 9503, + "time_per_iteration": 2.4386911392211914 + }, + { + "auxiliary_loss_clip": 0.01057823, + "auxiliary_loss_mlp": 0.01041224, + "balance_loss_clip": 1.01578474, + "balance_loss_mlp": 1.0204879, + "epoch": 0.5714113933563806, + "flos": 17412686616960.0, + "grad_norm": 2.108042959499498, + "language_loss": 0.87428325, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.89527369, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 9504, + "time_per_iteration": 2.3479714393615723 + }, + { + "auxiliary_loss_clip": 0.01056442, + "auxiliary_loss_mlp": 0.01039399, + "balance_loss_clip": 1.01435328, + "balance_loss_mlp": 1.01867616, + "epoch": 0.5714715166090486, + "flos": 20192331288960.0, + "grad_norm": 1.693219939484887, + "language_loss": 0.76204503, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.78300345, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 9505, + "time_per_iteration": 2.362290859222412 + }, + { + "auxiliary_loss_clip": 0.01055754, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.01275039, + "balance_loss_mlp": 1.01777506, + "epoch": 0.5715316398617165, + "flos": 18550680249600.0, + "grad_norm": 1.4928605390946246, + "language_loss": 0.8296628, + "learning_rate": 1.635755524332509e-06, + "loss": 0.85058331, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 9506, + "time_per_iteration": 2.3837249279022217 + }, + { + "auxiliary_loss_clip": 0.01056486, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.01171279, + "balance_loss_mlp": 1.01831138, + "epoch": 0.5715917631143845, + "flos": 18477223015680.0, + "grad_norm": 2.3766490509489486, + "language_loss": 0.7835598, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.80447865, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38085938, + "step": 9507, + "time_per_iteration": 2.3627235889434814 + }, + { + "auxiliary_loss_clip": 0.01058824, + "auxiliary_loss_mlp": 0.01048981, + "balance_loss_clip": 1.0206089, + "balance_loss_mlp": 1.01902151, + "epoch": 0.5716518863670524, + "flos": 24019021768320.0, + "grad_norm": 1.701103427389121, + "language_loss": 0.69872522, + "learning_rate": 1.63498965540751e-06, + "loss": 0.71980327, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3984375, + "step": 9508, + "time_per_iteration": 3.7153103351593018 + }, + { + "auxiliary_loss_clip": 0.01056738, + "auxiliary_loss_mlp": 0.01037366, + "balance_loss_clip": 1.01257086, + "balance_loss_mlp": 1.01773107, + "epoch": 0.5717120096197205, + "flos": 17818586616960.0, + "grad_norm": 1.9625765442518446, + "language_loss": 0.81152916, + "learning_rate": 1.634606741699593e-06, + "loss": 0.83247024, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 9509, + "time_per_iteration": 2.3681812286376953 + }, + { + "auxiliary_loss_clip": 0.01055449, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.01763272, + "balance_loss_mlp": 1.01719654, + "epoch": 0.5717721328723884, + "flos": 21865125127680.0, + "grad_norm": 1.9993386053172415, + "language_loss": 0.7425251, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.76350415, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 9510, + "time_per_iteration": 2.356755256652832 + }, + { + "auxiliary_loss_clip": 0.01057216, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.01690948, + "balance_loss_mlp": 1.01827407, + "epoch": 0.5718322561250564, + "flos": 28436407407360.0, + "grad_norm": 1.492955321518968, + "language_loss": 0.7017312, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.72271383, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.390625, + "step": 9511, + "time_per_iteration": 3.879596471786499 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.01440561, + "balance_loss_mlp": 1.01791513, + "epoch": 0.5718923793777243, + "flos": 13551013088640.0, + "grad_norm": 6.805787727722015, + "language_loss": 0.63699365, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.65794045, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 9512, + "time_per_iteration": 3.805906057357788 + }, + { + "auxiliary_loss_clip": 0.01054895, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.01839328, + "balance_loss_mlp": 1.01691353, + "epoch": 0.5719525026303923, + "flos": 17821065323520.0, + "grad_norm": 2.8229518966912623, + "language_loss": 0.76595831, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78692794, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.38085938, + "step": 9513, + "time_per_iteration": 2.3460934162139893 + }, + { + "auxiliary_loss_clip": 0.01012936, + "auxiliary_loss_mlp": 0.01011846, + "balance_loss_clip": 1.00882959, + "balance_loss_mlp": 1.00511169, + "epoch": 0.5720126258830602, + "flos": 61295262600960.0, + "grad_norm": 0.8952054791590704, + "language_loss": 0.66925633, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68950415, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.078125, + "step": 9514, + "time_per_iteration": 3.021477222442627 + }, + { + "auxiliary_loss_clip": 0.01059387, + "auxiliary_loss_mlp": 0.01044944, + "balance_loss_clip": 1.01802695, + "balance_loss_mlp": 1.01852405, + "epoch": 0.5720727491357283, + "flos": 23986901450880.0, + "grad_norm": 1.8262650899168478, + "language_loss": 0.82732415, + "learning_rate": 1.63230955093099e-06, + "loss": 0.84836745, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41015625, + "step": 9515, + "time_per_iteration": 2.3834378719329834 + }, + { + "auxiliary_loss_clip": 0.0105251, + "auxiliary_loss_mlp": 0.01044216, + "balance_loss_clip": 1.01924133, + "balance_loss_mlp": 1.01613271, + "epoch": 0.5721328723883962, + "flos": 23404270815360.0, + "grad_norm": 1.5716611387768071, + "language_loss": 0.86725569, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88822293, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 9516, + "time_per_iteration": 2.4139959812164307 + }, + { + "auxiliary_loss_clip": 0.01054065, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_clip": 1.01893115, + "balance_loss_mlp": 1.01662636, + "epoch": 0.5721929956410642, + "flos": 18803975230080.0, + "grad_norm": 1.6856519116282316, + "language_loss": 0.8856144, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.90659088, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.375, + "step": 9517, + "time_per_iteration": 2.3733081817626953 + }, + { + "auxiliary_loss_clip": 0.01055652, + "auxiliary_loss_mlp": 0.01048178, + "balance_loss_clip": 1.02263129, + "balance_loss_mlp": 1.01839757, + "epoch": 0.5722531188937322, + "flos": 27195489486720.0, + "grad_norm": 1.818854795603614, + "language_loss": 0.8590256, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.88006389, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 9518, + "time_per_iteration": 2.4328629970550537 + }, + { + "auxiliary_loss_clip": 0.0105394, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.01448298, + "balance_loss_mlp": 1.01744103, + "epoch": 0.5723132421464001, + "flos": 15194758809600.0, + "grad_norm": 1.8079125390897117, + "language_loss": 0.79965115, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.82056725, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 9519, + "time_per_iteration": 3.889305591583252 + }, + { + "auxiliary_loss_clip": 0.01056241, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.0157032, + "balance_loss_mlp": 1.01849544, + "epoch": 0.5723733653990681, + "flos": 27598212552960.0, + "grad_norm": 1.4024176519535674, + "language_loss": 0.83497787, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85592747, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37890625, + "step": 9520, + "time_per_iteration": 2.4327268600463867 + }, + { + "auxiliary_loss_clip": 0.01057276, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.01536608, + "balance_loss_mlp": 1.01936615, + "epoch": 0.572433488651736, + "flos": 18221903176320.0, + "grad_norm": 1.9220522304433219, + "language_loss": 0.74948633, + "learning_rate": 1.630012862105243e-06, + "loss": 0.77047551, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 9521, + "time_per_iteration": 2.3500499725341797 + }, + { + "auxiliary_loss_clip": 0.01056051, + "auxiliary_loss_mlp": 0.01043779, + "balance_loss_clip": 1.01934075, + "balance_loss_mlp": 1.01744485, + "epoch": 0.5724936119044041, + "flos": 31247753460480.0, + "grad_norm": 2.0142121741845957, + "language_loss": 0.79991895, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.82091725, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38671875, + "step": 9522, + "time_per_iteration": 2.4569313526153564 + }, + { + "auxiliary_loss_clip": 0.01054261, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.01707304, + "balance_loss_mlp": 1.01765156, + "epoch": 0.572553735157072, + "flos": 19201356858240.0, + "grad_norm": 1.6144533012989668, + "language_loss": 0.72739983, + "learning_rate": 1.629247411248102e-06, + "loss": 0.74834025, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 9523, + "time_per_iteration": 2.3570334911346436 + }, + { + "auxiliary_loss_clip": 0.01053823, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.01519811, + "balance_loss_mlp": 1.01759934, + "epoch": 0.57261385840974, + "flos": 21213855025920.0, + "grad_norm": 2.201312971851895, + "language_loss": 0.71506977, + "learning_rate": 1.628864706900738e-06, + "loss": 0.73598212, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 9524, + "time_per_iteration": 2.392056941986084 + }, + { + "auxiliary_loss_clip": 0.01056607, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.01258552, + "balance_loss_mlp": 1.01898026, + "epoch": 0.5726739816624079, + "flos": 33983128661760.0, + "grad_norm": 1.6363937201212755, + "language_loss": 0.66453052, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.68546373, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37695312, + "step": 9525, + "time_per_iteration": 2.465132474899292 + }, + { + "auxiliary_loss_clip": 0.01054315, + "auxiliary_loss_mlp": 0.01036478, + "balance_loss_clip": 1.01342297, + "balance_loss_mlp": 1.01714325, + "epoch": 0.5727341049150759, + "flos": 24274935100800.0, + "grad_norm": 1.772454948715186, + "language_loss": 0.73324937, + "learning_rate": 1.628099340440984e-06, + "loss": 0.7541573, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37109375, + "step": 9526, + "time_per_iteration": 2.418215274810791 + }, + { + "auxiliary_loss_clip": 0.01054861, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.01562321, + "balance_loss_mlp": 1.01872015, + "epoch": 0.5727942281677438, + "flos": 28399364588160.0, + "grad_norm": 1.6067947480641316, + "language_loss": 0.81519699, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.83613873, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 9527, + "time_per_iteration": 2.435356855392456 + }, + { + "auxiliary_loss_clip": 0.01055246, + "auxiliary_loss_mlp": 0.01039339, + "balance_loss_clip": 1.0146389, + "balance_loss_mlp": 1.0177784, + "epoch": 0.5728543514204119, + "flos": 19535754660480.0, + "grad_norm": 1.631690567122575, + "language_loss": 0.73218423, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.75313008, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 9528, + "time_per_iteration": 2.3709001541137695 + }, + { + "auxiliary_loss_clip": 0.01057479, + "auxiliary_loss_mlp": 0.01042835, + "balance_loss_clip": 1.01783657, + "balance_loss_mlp": 1.01927447, + "epoch": 0.5729144746730798, + "flos": 21505694014080.0, + "grad_norm": 1.9557791997165668, + "language_loss": 0.87040937, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.8914125, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 9529, + "time_per_iteration": 2.364185333251953 + }, + { + "auxiliary_loss_clip": 0.01011718, + "auxiliary_loss_mlp": 0.01004513, + "balance_loss_clip": 1.00197423, + "balance_loss_mlp": 1.00431764, + "epoch": 0.5729745979257478, + "flos": 58678626533760.0, + "grad_norm": 0.7642069643283972, + "language_loss": 0.56201917, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58218151, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.07421875, + "step": 9530, + "time_per_iteration": 2.889549732208252 + }, + { + "auxiliary_loss_clip": 0.01059109, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.01439846, + "balance_loss_mlp": 1.02073693, + "epoch": 0.5730347211784158, + "flos": 18551099185920.0, + "grad_norm": 1.7934784479854953, + "language_loss": 0.68321764, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.70419455, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3828125, + "step": 9531, + "time_per_iteration": 2.3667116165161133 + }, + { + "auxiliary_loss_clip": 0.01057494, + "auxiliary_loss_mlp": 0.01042403, + "balance_loss_clip": 1.01683283, + "balance_loss_mlp": 1.01890993, + "epoch": 0.5730948444310837, + "flos": 38030051197440.0, + "grad_norm": 1.8639937626570713, + "language_loss": 0.76890218, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78990114, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 9532, + "time_per_iteration": 2.490511655807495 + }, + { + "auxiliary_loss_clip": 0.01056267, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.01871789, + "balance_loss_mlp": 1.01831198, + "epoch": 0.5731549676837517, + "flos": 25225934158080.0, + "grad_norm": 1.3439861470678454, + "language_loss": 0.7978667, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81885237, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 9533, + "time_per_iteration": 2.510737419128418 + }, + { + "auxiliary_loss_clip": 0.01056153, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.01638699, + "balance_loss_mlp": 1.01925421, + "epoch": 0.5732150909364196, + "flos": 23367088350720.0, + "grad_norm": 1.561395890016325, + "language_loss": 0.86078918, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.88173115, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36914062, + "step": 9534, + "time_per_iteration": 2.3958029747009277 + }, + { + "auxiliary_loss_clip": 0.01057189, + "auxiliary_loss_mlp": 0.01054574, + "balance_loss_clip": 1.02750206, + "balance_loss_mlp": 1.01830578, + "epoch": 0.5732752141890877, + "flos": 23078147005440.0, + "grad_norm": 1.7602103403020786, + "language_loss": 0.76200438, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.78312206, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38867188, + "step": 9535, + "time_per_iteration": 2.446237087249756 + }, + { + "auxiliary_loss_clip": 0.01059407, + "auxiliary_loss_mlp": 0.01045488, + "balance_loss_clip": 1.02075195, + "balance_loss_mlp": 1.01936531, + "epoch": 0.5733353374417556, + "flos": 24351150332160.0, + "grad_norm": 1.4825531042901026, + "language_loss": 0.71928382, + "learning_rate": 1.624273356614346e-06, + "loss": 0.74033284, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.40039062, + "step": 9536, + "time_per_iteration": 2.4280905723571777 + }, + { + "auxiliary_loss_clip": 0.01055826, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.02026796, + "balance_loss_mlp": 1.0179615, + "epoch": 0.5733954606944236, + "flos": 27197619079680.0, + "grad_norm": 1.6806462526849715, + "language_loss": 0.70715308, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.7281577, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 9537, + "time_per_iteration": 2.427006483078003 + }, + { + "auxiliary_loss_clip": 0.01055895, + "auxiliary_loss_mlp": 0.01044823, + "balance_loss_clip": 1.01894236, + "balance_loss_mlp": 1.01734495, + "epoch": 0.5734555839470915, + "flos": 28763927671680.0, + "grad_norm": 1.7944879743907458, + "language_loss": 0.64101231, + "learning_rate": 1.623508330355902e-06, + "loss": 0.66201949, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38476562, + "step": 9538, + "time_per_iteration": 2.4524643421173096 + }, + { + "auxiliary_loss_clip": 0.0105472, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.0155195, + "balance_loss_mlp": 1.01741087, + "epoch": 0.5735157071997595, + "flos": 22965691916160.0, + "grad_norm": 2.0682741367384656, + "language_loss": 0.84145463, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.86240906, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 9539, + "time_per_iteration": 2.409797191619873 + }, + { + "auxiliary_loss_clip": 0.01056875, + "auxiliary_loss_mlp": 0.01043167, + "balance_loss_clip": 1.01835966, + "balance_loss_mlp": 1.0178144, + "epoch": 0.5735758304524274, + "flos": 18988456187520.0, + "grad_norm": 2.129863662189192, + "language_loss": 0.74585205, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.7668525, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 9540, + "time_per_iteration": 2.382070779800415 + }, + { + "auxiliary_loss_clip": 0.01054394, + "auxiliary_loss_mlp": 0.01043062, + "balance_loss_clip": 1.02009046, + "balance_loss_mlp": 1.01670384, + "epoch": 0.5736359537050955, + "flos": 28396816058880.0, + "grad_norm": 1.746768083030801, + "language_loss": 0.81313366, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.83410823, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37695312, + "step": 9541, + "time_per_iteration": 2.4585421085357666 + }, + { + "auxiliary_loss_clip": 0.01058591, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.0141809, + "balance_loss_mlp": 1.01847827, + "epoch": 0.5736960769577634, + "flos": 15626460170880.0, + "grad_norm": 2.6077008280685914, + "language_loss": 0.6650753, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.68606693, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 9542, + "time_per_iteration": 2.399834632873535 + }, + { + "auxiliary_loss_clip": 0.01055622, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.0138849, + "balance_loss_mlp": 1.01652551, + "epoch": 0.5737562002104314, + "flos": 18003032663040.0, + "grad_norm": 1.9131403423004232, + "language_loss": 0.84907281, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.87000036, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.390625, + "step": 9543, + "time_per_iteration": 2.3519742488861084 + }, + { + "auxiliary_loss_clip": 0.01058311, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.01875734, + "balance_loss_mlp": 1.01785409, + "epoch": 0.5738163234630994, + "flos": 20697315327360.0, + "grad_norm": 1.8362391959046571, + "language_loss": 0.74657309, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.76761252, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 9544, + "time_per_iteration": 2.3877358436584473 + }, + { + "auxiliary_loss_clip": 0.01056761, + "auxiliary_loss_mlp": 0.01042732, + "balance_loss_clip": 1.01726878, + "balance_loss_mlp": 1.01714706, + "epoch": 0.5738764467157673, + "flos": 23148182926080.0, + "grad_norm": 1.8905485670736228, + "language_loss": 0.77390069, + "learning_rate": 1.620831188925733e-06, + "loss": 0.79489565, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 9545, + "time_per_iteration": 2.3817641735076904 + }, + { + "auxiliary_loss_clip": 0.01055785, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02039492, + "balance_loss_mlp": 1.01711965, + "epoch": 0.5739365699684353, + "flos": 29491762118400.0, + "grad_norm": 2.8624954256366597, + "language_loss": 0.57990003, + "learning_rate": 1.620448797546459e-06, + "loss": 0.60091233, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 9546, + "time_per_iteration": 2.4600863456726074 + }, + { + "auxiliary_loss_clip": 0.01055828, + "auxiliary_loss_mlp": 0.01039367, + "balance_loss_clip": 1.01486945, + "balance_loss_mlp": 1.01726961, + "epoch": 0.5739966932211032, + "flos": 14026390427520.0, + "grad_norm": 2.233097543508225, + "language_loss": 0.77461058, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.79556251, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38476562, + "step": 9547, + "time_per_iteration": 2.344376802444458 + }, + { + "auxiliary_loss_clip": 0.01056066, + "auxiliary_loss_mlp": 0.01041373, + "balance_loss_clip": 1.01667285, + "balance_loss_mlp": 1.01750493, + "epoch": 0.5740568164737713, + "flos": 19061040637440.0, + "grad_norm": 1.974300833717856, + "language_loss": 0.76437676, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.78535116, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 9548, + "time_per_iteration": 3.6503734588623047 + }, + { + "auxiliary_loss_clip": 0.01055971, + "auxiliary_loss_mlp": 0.01043589, + "balance_loss_clip": 1.01838863, + "balance_loss_mlp": 1.01619649, + "epoch": 0.5741169397264392, + "flos": 22126729011840.0, + "grad_norm": 2.1838803591765314, + "language_loss": 0.71336406, + "learning_rate": 1.619301709822355e-06, + "loss": 0.73435968, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39648438, + "step": 9549, + "time_per_iteration": 2.381854295730591 + }, + { + "auxiliary_loss_clip": 0.01057214, + "auxiliary_loss_mlp": 0.01041288, + "balance_loss_clip": 1.01758885, + "balance_loss_mlp": 1.01906312, + "epoch": 0.5741770629791072, + "flos": 24935666181120.0, + "grad_norm": 1.8544123467706213, + "language_loss": 0.80050874, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.82149374, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3828125, + "step": 9550, + "time_per_iteration": 2.423672914505005 + }, + { + "auxiliary_loss_clip": 0.01055947, + "auxiliary_loss_mlp": 0.01038587, + "balance_loss_clip": 1.01159835, + "balance_loss_mlp": 1.01665938, + "epoch": 0.5742371862317751, + "flos": 18800623739520.0, + "grad_norm": 1.900863217902449, + "language_loss": 0.6847226, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70566797, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 9551, + "time_per_iteration": 3.792492628097534 + }, + { + "auxiliary_loss_clip": 0.01058259, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.01620662, + "balance_loss_mlp": 1.01881564, + "epoch": 0.5742973094844431, + "flos": 24459555703680.0, + "grad_norm": 2.385278242115737, + "language_loss": 0.73599768, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.75699139, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39453125, + "step": 9552, + "time_per_iteration": 3.800985813140869 + }, + { + "auxiliary_loss_clip": 0.01056547, + "auxiliary_loss_mlp": 0.01044372, + "balance_loss_clip": 1.0186224, + "balance_loss_mlp": 1.01777923, + "epoch": 0.574357432737111, + "flos": 21651700786560.0, + "grad_norm": 2.076525830401587, + "language_loss": 0.81585115, + "learning_rate": 1.617772461696843e-06, + "loss": 0.8368603, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9553, + "time_per_iteration": 2.4008424282073975 + }, + { + "auxiliary_loss_clip": 0.01058238, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.01131737, + "balance_loss_mlp": 1.01804817, + "epoch": 0.5744175559897791, + "flos": 16543802810880.0, + "grad_norm": 2.0506797752593813, + "language_loss": 0.84328806, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.86425114, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40234375, + "step": 9554, + "time_per_iteration": 2.3592336177825928 + }, + { + "auxiliary_loss_clip": 0.01059461, + "auxiliary_loss_mlp": 0.01042304, + "balance_loss_clip": 1.01592314, + "balance_loss_mlp": 1.0188024, + "epoch": 0.574477679242447, + "flos": 24206435280000.0, + "grad_norm": 1.5666021644075383, + "language_loss": 0.72358561, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.74460334, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 9555, + "time_per_iteration": 2.4518346786499023 + }, + { + "auxiliary_loss_clip": 0.01055863, + "auxiliary_loss_mlp": 0.01042305, + "balance_loss_clip": 1.01567411, + "balance_loss_mlp": 1.0176785, + "epoch": 0.574537802495115, + "flos": 14902116860160.0, + "grad_norm": 4.120499054832592, + "language_loss": 0.74109733, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.762079, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 9556, + "time_per_iteration": 2.3397746086120605 + }, + { + "auxiliary_loss_clip": 0.01056754, + "auxiliary_loss_mlp": 0.01043192, + "balance_loss_clip": 1.01632237, + "balance_loss_mlp": 1.01822662, + "epoch": 0.5745979257477829, + "flos": 24933850790400.0, + "grad_norm": 1.5302490802948858, + "language_loss": 0.74935424, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.77035373, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 9557, + "time_per_iteration": 2.424532651901245 + }, + { + "auxiliary_loss_clip": 0.01056519, + "auxiliary_loss_mlp": 0.01044479, + "balance_loss_clip": 1.01822901, + "balance_loss_mlp": 1.01757216, + "epoch": 0.5746580490004509, + "flos": 17234873729280.0, + "grad_norm": 2.064412403423656, + "language_loss": 0.68950069, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.71051061, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 9558, + "time_per_iteration": 2.3553335666656494 + }, + { + "auxiliary_loss_clip": 0.01062057, + "auxiliary_loss_mlp": 0.01048792, + "balance_loss_clip": 1.01975238, + "balance_loss_mlp": 1.01969409, + "epoch": 0.5747181722531189, + "flos": 13187043498240.0, + "grad_norm": 1.9105734592067514, + "language_loss": 0.71823871, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73934722, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.42382812, + "step": 9559, + "time_per_iteration": 2.3596718311309814 + }, + { + "auxiliary_loss_clip": 0.01057058, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.01146233, + "balance_loss_mlp": 1.01893568, + "epoch": 0.5747782955057869, + "flos": 22961991312000.0, + "grad_norm": 1.6117850597342274, + "language_loss": 0.80160224, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.82252038, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3828125, + "step": 9560, + "time_per_iteration": 3.8129396438598633 + }, + { + "auxiliary_loss_clip": 0.01058193, + "auxiliary_loss_mlp": 0.01040991, + "balance_loss_clip": 1.01557541, + "balance_loss_mlp": 1.0187273, + "epoch": 0.5748384187584549, + "flos": 23402141222400.0, + "grad_norm": 1.6703156139946893, + "language_loss": 0.65132344, + "learning_rate": 1.614714662090588e-06, + "loss": 0.67231524, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39453125, + "step": 9561, + "time_per_iteration": 2.3952810764312744 + }, + { + "auxiliary_loss_clip": 0.01062905, + "auxiliary_loss_mlp": 0.01047404, + "balance_loss_clip": 1.019068, + "balance_loss_mlp": 1.02075672, + "epoch": 0.5748985420111228, + "flos": 17784546174720.0, + "grad_norm": 2.2816899113471503, + "language_loss": 0.72708106, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.7481842, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.421875, + "step": 9562, + "time_per_iteration": 2.360301971435547 + }, + { + "auxiliary_loss_clip": 0.01057477, + "auxiliary_loss_mlp": 0.01044444, + "balance_loss_clip": 1.02041101, + "balance_loss_mlp": 1.01859164, + "epoch": 0.5749586652637908, + "flos": 19865195049600.0, + "grad_norm": 1.954112149862691, + "language_loss": 0.84965956, + "learning_rate": 1.613950357999751e-06, + "loss": 0.87067872, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38867188, + "step": 9563, + "time_per_iteration": 2.3926923274993896 + }, + { + "auxiliary_loss_clip": 0.01060864, + "auxiliary_loss_mlp": 0.01050624, + "balance_loss_clip": 1.02083373, + "balance_loss_mlp": 1.01930857, + "epoch": 0.5750187885164587, + "flos": 21286195096320.0, + "grad_norm": 1.944516805137725, + "language_loss": 0.58618474, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.60729963, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41601562, + "step": 9564, + "time_per_iteration": 2.396207571029663 + }, + { + "auxiliary_loss_clip": 0.01054916, + "auxiliary_loss_mlp": 0.01042414, + "balance_loss_clip": 1.0178206, + "balance_loss_mlp": 1.01720595, + "epoch": 0.5750789117691267, + "flos": 18803730850560.0, + "grad_norm": 1.7168238394188347, + "language_loss": 0.77022219, + "learning_rate": 1.613186112465078e-06, + "loss": 0.79119551, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37695312, + "step": 9565, + "time_per_iteration": 2.3691141605377197 + }, + { + "auxiliary_loss_clip": 0.01014201, + "auxiliary_loss_mlp": 0.01009131, + "balance_loss_clip": 1.00662792, + "balance_loss_mlp": 1.00661302, + "epoch": 0.5751390350217946, + "flos": 70659490780800.0, + "grad_norm": 0.9570041109482518, + "language_loss": 0.60824341, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.6284768, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.07617188, + "step": 9566, + "time_per_iteration": 3.092327356338501 + }, + { + "auxiliary_loss_clip": 0.01057559, + "auxiliary_loss_mlp": 0.01041118, + "balance_loss_clip": 1.01369965, + "balance_loss_mlp": 1.01891398, + "epoch": 0.5751991582744627, + "flos": 14245470408960.0, + "grad_norm": 3.266615226074235, + "language_loss": 0.76611084, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.78709763, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.38671875, + "step": 9567, + "time_per_iteration": 2.3595707416534424 + }, + { + "auxiliary_loss_clip": 0.01057732, + "auxiliary_loss_mlp": 0.01043996, + "balance_loss_clip": 1.01874709, + "balance_loss_mlp": 1.01788974, + "epoch": 0.5752592815271306, + "flos": 18327306170880.0, + "grad_norm": 1.4741556993108602, + "language_loss": 0.7576738, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.77869117, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3984375, + "step": 9568, + "time_per_iteration": 2.346219539642334 + }, + { + "auxiliary_loss_clip": 0.01058347, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.0123446, + "balance_loss_mlp": 1.01827359, + "epoch": 0.5753194047797986, + "flos": 20921701835520.0, + "grad_norm": 1.6513363306708648, + "language_loss": 0.72996294, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.75093091, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 9569, + "time_per_iteration": 2.3752152919769287 + }, + { + "auxiliary_loss_clip": 0.01056967, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.01694226, + "balance_loss_mlp": 1.01703906, + "epoch": 0.5753795280324665, + "flos": 19280783934720.0, + "grad_norm": 2.8361671419822088, + "language_loss": 0.57394445, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.59496772, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3984375, + "step": 9570, + "time_per_iteration": 2.3558568954467773 + }, + { + "auxiliary_loss_clip": 0.01054539, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.01524758, + "balance_loss_mlp": 1.0163238, + "epoch": 0.5754396512851345, + "flos": 21651805520640.0, + "grad_norm": 1.5655896432295775, + "language_loss": 0.65644765, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.67739779, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 9571, + "time_per_iteration": 2.393368721008301 + }, + { + "auxiliary_loss_clip": 0.01057363, + "auxiliary_loss_mlp": 0.01046192, + "balance_loss_clip": 1.01715279, + "balance_loss_mlp": 1.01711607, + "epoch": 0.5754997745378025, + "flos": 51019871091840.0, + "grad_norm": 2.958459336154477, + "language_loss": 0.67842793, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69946349, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40234375, + "step": 9572, + "time_per_iteration": 2.6232669353485107 + }, + { + "auxiliary_loss_clip": 0.01057571, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_clip": 1.01979721, + "balance_loss_mlp": 1.01767421, + "epoch": 0.5755598977904705, + "flos": 22855785356160.0, + "grad_norm": 1.8165436994359578, + "language_loss": 0.74191487, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.76295954, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 9573, + "time_per_iteration": 2.380631446838379 + }, + { + "auxiliary_loss_clip": 0.01054121, + "auxiliary_loss_mlp": 0.01042041, + "balance_loss_clip": 1.01829445, + "balance_loss_mlp": 1.01786995, + "epoch": 0.5756200210431385, + "flos": 38471283360000.0, + "grad_norm": 1.7414687037198726, + "language_loss": 0.76890129, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78986287, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36328125, + "step": 9574, + "time_per_iteration": 2.515338659286499 + }, + { + "auxiliary_loss_clip": 0.01059482, + "auxiliary_loss_mlp": 0.01049562, + "balance_loss_clip": 1.02051044, + "balance_loss_mlp": 1.01736033, + "epoch": 0.5756801442958064, + "flos": 23909010474240.0, + "grad_norm": 2.814625144854651, + "language_loss": 0.68373936, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.70482981, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.421875, + "step": 9575, + "time_per_iteration": 2.403566837310791 + }, + { + "auxiliary_loss_clip": 0.01056071, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.01718402, + "balance_loss_mlp": 1.01827312, + "epoch": 0.5757402675484744, + "flos": 21104227756800.0, + "grad_norm": 1.4500763305336692, + "language_loss": 0.80810601, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.82908142, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 9576, + "time_per_iteration": 2.3726954460144043 + }, + { + "auxiliary_loss_clip": 0.0105461, + "auxiliary_loss_mlp": 0.01044322, + "balance_loss_clip": 1.0192647, + "balance_loss_mlp": 1.01634288, + "epoch": 0.5758003908011423, + "flos": 20558046447360.0, + "grad_norm": 1.8200115837941528, + "language_loss": 0.70378321, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.72477257, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 9577, + "time_per_iteration": 2.410407781600952 + }, + { + "auxiliary_loss_clip": 0.01059209, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.01443231, + "balance_loss_mlp": 1.01808548, + "epoch": 0.5758605140538103, + "flos": 16472056233600.0, + "grad_norm": 1.7218234156834145, + "language_loss": 0.67362154, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.69463599, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 9578, + "time_per_iteration": 2.35194730758667 + }, + { + "auxiliary_loss_clip": 0.0105694, + "auxiliary_loss_mlp": 0.01040715, + "balance_loss_clip": 1.01545501, + "balance_loss_mlp": 1.01827669, + "epoch": 0.5759206373064782, + "flos": 21286509298560.0, + "grad_norm": 1.5980151605609199, + "language_loss": 0.73586762, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75684416, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 9579, + "time_per_iteration": 2.387449264526367 + }, + { + "auxiliary_loss_clip": 0.01059945, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.01109886, + "balance_loss_mlp": 1.01806045, + "epoch": 0.5759807605591463, + "flos": 26066677541760.0, + "grad_norm": 3.1641898025400654, + "language_loss": 0.66418159, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.68518817, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.41796875, + "step": 9580, + "time_per_iteration": 2.404876232147217 + }, + { + "auxiliary_loss_clip": 0.01057571, + "auxiliary_loss_mlp": 0.01042797, + "balance_loss_clip": 1.01589179, + "balance_loss_mlp": 1.0178144, + "epoch": 0.5760408838118142, + "flos": 18872265582720.0, + "grad_norm": 1.8120562681990553, + "language_loss": 0.85973215, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.88073581, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 9581, + "time_per_iteration": 2.3992364406585693 + }, + { + "auxiliary_loss_clip": 0.01061385, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_clip": 1.01914883, + "balance_loss_mlp": 1.01978445, + "epoch": 0.5761010070644822, + "flos": 15377214908160.0, + "grad_norm": 3.234614820958906, + "language_loss": 0.69307148, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.71416068, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41601562, + "step": 9582, + "time_per_iteration": 2.404843807220459 + }, + { + "auxiliary_loss_clip": 0.01011151, + "auxiliary_loss_mlp": 0.0100305, + "balance_loss_clip": 1.0006057, + "balance_loss_mlp": 1.00341892, + "epoch": 0.5761611303171501, + "flos": 71468009112960.0, + "grad_norm": 0.6564265343705085, + "language_loss": 0.572083, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59222507, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.07714844, + "step": 9583, + "time_per_iteration": 3.177868127822876 + }, + { + "auxiliary_loss_clip": 0.01059181, + "auxiliary_loss_mlp": 0.01041749, + "balance_loss_clip": 1.0145936, + "balance_loss_mlp": 1.01838732, + "epoch": 0.5762212535698181, + "flos": 16245435398400.0, + "grad_norm": 1.8088932579830939, + "language_loss": 0.83131158, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.85232091, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40820312, + "step": 9584, + "time_per_iteration": 2.3496522903442383 + }, + { + "auxiliary_loss_clip": 0.01011165, + "auxiliary_loss_mlp": 0.0100462, + "balance_loss_clip": 1.00191438, + "balance_loss_mlp": 1.0033356, + "epoch": 0.5762813768224861, + "flos": 70181879114880.0, + "grad_norm": 0.627365348316528, + "language_loss": 0.49564311, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51580095, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.02709961, + "router_z_loss_mlp": 0.078125, + "step": 9585, + "time_per_iteration": 3.1105260848999023 + }, + { + "auxiliary_loss_clip": 0.01054824, + "auxiliary_loss_mlp": 0.01037615, + "balance_loss_clip": 1.01151979, + "balance_loss_mlp": 1.0167383, + "epoch": 0.5763415000751541, + "flos": 20517093555840.0, + "grad_norm": 1.5626981059409593, + "language_loss": 0.85430479, + "learning_rate": 1.605165098835465e-06, + "loss": 0.87522918, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 9586, + "time_per_iteration": 2.356426239013672 + }, + { + "auxiliary_loss_clip": 0.01057157, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_clip": 1.01637983, + "balance_loss_mlp": 1.01770544, + "epoch": 0.5764016233278221, + "flos": 15814606821120.0, + "grad_norm": 1.7913494493481767, + "language_loss": 0.8077755, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82877839, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 9587, + "time_per_iteration": 2.3580310344696045 + }, + { + "auxiliary_loss_clip": 0.01057491, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.00952649, + "balance_loss_mlp": 1.01784897, + "epoch": 0.57646174658049, + "flos": 20771400965760.0, + "grad_norm": 1.7425088920472596, + "language_loss": 0.66716671, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68810129, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 9588, + "time_per_iteration": 3.746570587158203 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_clip": 1.01778495, + "balance_loss_mlp": 1.01817942, + "epoch": 0.576521869833158, + "flos": 23548811310720.0, + "grad_norm": 1.8302220205137567, + "language_loss": 0.7964797, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81754047, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40234375, + "step": 9589, + "time_per_iteration": 2.398541212081909 + }, + { + "auxiliary_loss_clip": 0.01054745, + "auxiliary_loss_mlp": 0.01034577, + "balance_loss_clip": 1.0107348, + "balance_loss_mlp": 1.01670337, + "epoch": 0.5765819930858259, + "flos": 20265544143360.0, + "grad_norm": 2.1033425790581357, + "language_loss": 0.81439692, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.83529013, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38085938, + "step": 9590, + "time_per_iteration": 2.40799880027771 + }, + { + "auxiliary_loss_clip": 0.01057947, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.00741363, + "balance_loss_mlp": 1.01925242, + "epoch": 0.5766421163384939, + "flos": 23147659255680.0, + "grad_norm": 2.0425778459712287, + "language_loss": 0.64280772, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.6637094, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 9591, + "time_per_iteration": 3.851085901260376 + }, + { + "auxiliary_loss_clip": 0.01058155, + "auxiliary_loss_mlp": 0.01042655, + "balance_loss_clip": 1.0163933, + "balance_loss_mlp": 1.01847732, + "epoch": 0.5767022395911618, + "flos": 25847702294400.0, + "grad_norm": 1.5670008313729025, + "language_loss": 0.78456253, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.8055706, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 9592, + "time_per_iteration": 3.853929281234741 + }, + { + "auxiliary_loss_clip": 0.01009183, + "auxiliary_loss_mlp": 0.01002715, + "balance_loss_clip": 1.00000882, + "balance_loss_mlp": 1.0014801, + "epoch": 0.5767623628438299, + "flos": 68289586358400.0, + "grad_norm": 0.816191985922381, + "language_loss": 0.59858215, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61870116, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.02709961, + "router_z_loss_mlp": 0.07714844, + "step": 9593, + "time_per_iteration": 3.1290857791900635 + }, + { + "auxiliary_loss_clip": 0.01059364, + "auxiliary_loss_mlp": 0.01045414, + "balance_loss_clip": 1.01670814, + "balance_loss_mlp": 1.0180198, + "epoch": 0.5768224860964978, + "flos": 30187196956800.0, + "grad_norm": 1.9346675176770052, + "language_loss": 0.71688581, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73793364, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.4140625, + "step": 9594, + "time_per_iteration": 2.432359457015991 + }, + { + "auxiliary_loss_clip": 0.01057161, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.01396596, + "balance_loss_mlp": 1.01786542, + "epoch": 0.5768826093491658, + "flos": 17894068709760.0, + "grad_norm": 1.7811884400853808, + "language_loss": 0.71808088, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73903346, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.39257812, + "step": 9595, + "time_per_iteration": 2.396301507949829 + }, + { + "auxiliary_loss_clip": 0.01057875, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_clip": 1.02043521, + "balance_loss_mlp": 1.01795936, + "epoch": 0.5769427326018337, + "flos": 17456222949120.0, + "grad_norm": 1.8648511378428372, + "language_loss": 0.70623684, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72729564, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.3984375, + "step": 9596, + "time_per_iteration": 2.3457531929016113 + }, + { + "auxiliary_loss_clip": 0.01061332, + "auxiliary_loss_mlp": 0.0105015, + "balance_loss_clip": 1.01982343, + "balance_loss_mlp": 1.01861262, + "epoch": 0.5770028558545017, + "flos": 39420152824320.0, + "grad_norm": 2.637289367304534, + "language_loss": 0.69003308, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.71114784, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.42773438, + "step": 9597, + "time_per_iteration": 2.541459560394287 + }, + { + "auxiliary_loss_clip": 0.0105572, + "auxiliary_loss_mlp": 0.01044336, + "balance_loss_clip": 1.01839578, + "balance_loss_mlp": 1.01678944, + "epoch": 0.5770629791071697, + "flos": 21535510181760.0, + "grad_norm": 1.974355369407956, + "language_loss": 0.82335824, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.8443588, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 9598, + "time_per_iteration": 2.369436025619507 + }, + { + "auxiliary_loss_clip": 0.01058018, + "auxiliary_loss_mlp": 0.01044532, + "balance_loss_clip": 1.01673257, + "balance_loss_mlp": 1.01822758, + "epoch": 0.5771231023598377, + "flos": 20885741268480.0, + "grad_norm": 1.5788211809089572, + "language_loss": 0.73893368, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.75995922, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 9599, + "time_per_iteration": 3.7819316387176514 + }, + { + "auxiliary_loss_clip": 0.0105387, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.01090801, + "balance_loss_mlp": 1.01689243, + "epoch": 0.5771832256125057, + "flos": 18076245517440.0, + "grad_norm": 1.9339555296442994, + "language_loss": 0.79680347, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.81768763, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 9600, + "time_per_iteration": 2.3551025390625 + }, + { + "auxiliary_loss_clip": 0.01059152, + "auxiliary_loss_mlp": 0.0105, + "balance_loss_clip": 1.0229156, + "balance_loss_mlp": 1.01925206, + "epoch": 0.5772433488651736, + "flos": 26357888125440.0, + "grad_norm": 1.7871054422036199, + "language_loss": 0.73623025, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.75732183, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 9601, + "time_per_iteration": 2.4281699657440186 + }, + { + "auxiliary_loss_clip": 0.0105662, + "auxiliary_loss_mlp": 0.01048949, + "balance_loss_clip": 1.0219121, + "balance_loss_mlp": 1.01818264, + "epoch": 0.5773034721178416, + "flos": 19680015864960.0, + "grad_norm": 2.2747137442379834, + "language_loss": 0.69216019, + "learning_rate": 1.599058274973348e-06, + "loss": 0.71321589, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3828125, + "step": 9602, + "time_per_iteration": 2.3615126609802246 + }, + { + "auxiliary_loss_clip": 0.0105416, + "auxiliary_loss_mlp": 0.01041156, + "balance_loss_clip": 1.01701665, + "balance_loss_mlp": 1.01766896, + "epoch": 0.5773635953705095, + "flos": 25081707864960.0, + "grad_norm": 1.4490217209872032, + "language_loss": 0.74543029, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.76638341, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 9603, + "time_per_iteration": 2.4447476863861084 + }, + { + "auxiliary_loss_clip": 0.01055694, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.01185, + "balance_loss_mlp": 1.01760828, + "epoch": 0.5774237186231775, + "flos": 21031922597760.0, + "grad_norm": 1.6077537678299514, + "language_loss": 0.77694672, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.79788667, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 9604, + "time_per_iteration": 2.3969345092773438 + }, + { + "auxiliary_loss_clip": 0.01058381, + "auxiliary_loss_mlp": 0.01046333, + "balance_loss_clip": 1.01720977, + "balance_loss_mlp": 1.0179863, + "epoch": 0.5774838418758454, + "flos": 15230824110720.0, + "grad_norm": 1.6975033733354346, + "language_loss": 0.84335047, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.86439764, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40234375, + "step": 9605, + "time_per_iteration": 2.3570096492767334 + }, + { + "auxiliary_loss_clip": 0.01062191, + "auxiliary_loss_mlp": 0.01052514, + "balance_loss_clip": 1.01911211, + "balance_loss_mlp": 1.0191865, + "epoch": 0.5775439651285135, + "flos": 23581594944000.0, + "grad_norm": 2.440394033219565, + "language_loss": 0.79023451, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.81138158, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.4296875, + "step": 9606, + "time_per_iteration": 2.3742012977600098 + }, + { + "auxiliary_loss_clip": 0.01056119, + "auxiliary_loss_mlp": 0.01041625, + "balance_loss_clip": 1.0156132, + "balance_loss_mlp": 1.01791525, + "epoch": 0.5776040883811814, + "flos": 18039551811840.0, + "grad_norm": 1.711896172128816, + "language_loss": 0.74448699, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76546443, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 9607, + "time_per_iteration": 2.3592264652252197 + }, + { + "auxiliary_loss_clip": 0.010593, + "auxiliary_loss_mlp": 0.0104273, + "balance_loss_clip": 1.01614594, + "balance_loss_mlp": 1.01881349, + "epoch": 0.5776642116338494, + "flos": 18623648724480.0, + "grad_norm": 2.115830723275719, + "language_loss": 0.70205438, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.72307467, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40429688, + "step": 9608, + "time_per_iteration": 2.340737819671631 + }, + { + "auxiliary_loss_clip": 0.01057757, + "auxiliary_loss_mlp": 0.01046951, + "balance_loss_clip": 1.01972318, + "balance_loss_mlp": 1.01787519, + "epoch": 0.5777243348865173, + "flos": 28401284712960.0, + "grad_norm": 1.7356385676757342, + "language_loss": 0.78224707, + "learning_rate": 1.596387759940665e-06, + "loss": 0.80329406, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 9609, + "time_per_iteration": 2.434516668319702 + }, + { + "auxiliary_loss_clip": 0.01059426, + "auxiliary_loss_mlp": 0.01042787, + "balance_loss_clip": 1.01381922, + "balance_loss_mlp": 1.01833725, + "epoch": 0.5777844581391853, + "flos": 24023560245120.0, + "grad_norm": 1.71507321756123, + "language_loss": 0.78088254, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.80190468, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41015625, + "step": 9610, + "time_per_iteration": 2.383596420288086 + }, + { + "auxiliary_loss_clip": 0.0105715, + "auxiliary_loss_mlp": 0.0104629, + "balance_loss_clip": 1.01793039, + "balance_loss_mlp": 1.01758265, + "epoch": 0.5778445813918534, + "flos": 17776621296000.0, + "grad_norm": 2.9001799806263597, + "language_loss": 0.70885479, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.72988915, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.39453125, + "step": 9611, + "time_per_iteration": 2.339589834213257 + }, + { + "auxiliary_loss_clip": 0.01056367, + "auxiliary_loss_mlp": 0.01043517, + "balance_loss_clip": 1.01502562, + "balance_loss_mlp": 1.0171864, + "epoch": 0.5779047046445213, + "flos": 22232201829120.0, + "grad_norm": 1.9865063766774689, + "language_loss": 0.84334707, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.86434591, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39257812, + "step": 9612, + "time_per_iteration": 2.378924608230591 + }, + { + "auxiliary_loss_clip": 0.01058032, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.01650715, + "balance_loss_mlp": 1.01787198, + "epoch": 0.5779648278971893, + "flos": 21433284120960.0, + "grad_norm": 1.6743146183862951, + "language_loss": 0.80675328, + "learning_rate": 1.594862087742667e-06, + "loss": 0.8277657, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40234375, + "step": 9613, + "time_per_iteration": 2.369762420654297 + }, + { + "auxiliary_loss_clip": 0.01055825, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.01321566, + "balance_loss_mlp": 1.01740193, + "epoch": 0.5780249511498572, + "flos": 19025114981760.0, + "grad_norm": 1.8325557246204063, + "language_loss": 0.77387351, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79480535, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38476562, + "step": 9614, + "time_per_iteration": 2.36098575592041 + }, + { + "auxiliary_loss_clip": 0.01060395, + "auxiliary_loss_mlp": 0.01048587, + "balance_loss_clip": 1.02058518, + "balance_loss_mlp": 1.01845717, + "epoch": 0.5780850744025252, + "flos": 12124008288000.0, + "grad_norm": 2.298379494956112, + "language_loss": 0.81960964, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.84069943, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41992188, + "step": 9615, + "time_per_iteration": 2.3248143196105957 + }, + { + "auxiliary_loss_clip": 0.01058362, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_clip": 1.01614642, + "balance_loss_mlp": 1.01829767, + "epoch": 0.5781451976551931, + "flos": 25043303502720.0, + "grad_norm": 1.4736801224528853, + "language_loss": 0.68005568, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.70107055, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40039062, + "step": 9616, + "time_per_iteration": 2.4304521083831787 + }, + { + "auxiliary_loss_clip": 0.01056951, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.01450908, + "balance_loss_mlp": 1.01841211, + "epoch": 0.5782053209078611, + "flos": 19244578988160.0, + "grad_norm": 1.6462956688568904, + "language_loss": 0.78965247, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.81062531, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9617, + "time_per_iteration": 2.3750863075256348 + }, + { + "auxiliary_loss_clip": 0.01056287, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.01770997, + "balance_loss_mlp": 1.01757121, + "epoch": 0.578265444160529, + "flos": 25992661726080.0, + "grad_norm": 1.478044688320972, + "language_loss": 0.7624132, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.78342617, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 9618, + "time_per_iteration": 2.4222350120544434 + }, + { + "auxiliary_loss_clip": 0.01056493, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.01367044, + "balance_loss_mlp": 1.01815546, + "epoch": 0.5783255674131971, + "flos": 21797533002240.0, + "grad_norm": 1.6284973878103197, + "language_loss": 0.82992291, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.85087204, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 9619, + "time_per_iteration": 2.3860421180725098 + }, + { + "auxiliary_loss_clip": 0.01056488, + "auxiliary_loss_mlp": 0.01050187, + "balance_loss_clip": 1.02225614, + "balance_loss_mlp": 1.01820874, + "epoch": 0.578385690665865, + "flos": 24788612067840.0, + "grad_norm": 2.0482177651231095, + "language_loss": 0.73814917, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.75921595, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.3828125, + "step": 9620, + "time_per_iteration": 2.4099926948547363 + }, + { + "auxiliary_loss_clip": 0.01056304, + "auxiliary_loss_mlp": 0.01042383, + "balance_loss_clip": 1.01584744, + "balance_loss_mlp": 1.01732731, + "epoch": 0.578445813918533, + "flos": 21211865078400.0, + "grad_norm": 1.5859147209737507, + "language_loss": 0.78496319, + "learning_rate": 1.591811481689916e-06, + "loss": 0.80595005, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 9621, + "time_per_iteration": 2.3856754302978516 + }, + { + "auxiliary_loss_clip": 0.01058138, + "auxiliary_loss_mlp": 0.01043036, + "balance_loss_clip": 1.01617813, + "balance_loss_mlp": 1.0179894, + "epoch": 0.5785059371712009, + "flos": 25045607652480.0, + "grad_norm": 1.7824326290333916, + "language_loss": 0.71849024, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.73950195, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40234375, + "step": 9622, + "time_per_iteration": 2.438464879989624 + }, + { + "auxiliary_loss_clip": 0.01012319, + "auxiliary_loss_mlp": 0.0101865, + "balance_loss_clip": 1.01609921, + "balance_loss_mlp": 1.00467443, + "epoch": 0.5785660604238689, + "flos": 70839503084160.0, + "grad_norm": 0.7814281771486934, + "language_loss": 0.56029576, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58060545, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.07617188, + "step": 9623, + "time_per_iteration": 3.037371873855591 + }, + { + "auxiliary_loss_clip": 0.0106014, + "auxiliary_loss_mlp": 0.01046546, + "balance_loss_clip": 1.01669562, + "balance_loss_mlp": 1.01856542, + "epoch": 0.578626183676537, + "flos": 31648626224640.0, + "grad_norm": 1.7556502206280105, + "language_loss": 0.72541893, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.74648583, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41601562, + "step": 9624, + "time_per_iteration": 2.4599552154541016 + }, + { + "auxiliary_loss_clip": 0.01058987, + "auxiliary_loss_mlp": 0.01049237, + "balance_loss_clip": 1.0214138, + "balance_loss_mlp": 1.01916838, + "epoch": 0.5786863069292049, + "flos": 21864287255040.0, + "grad_norm": 2.047362446522189, + "language_loss": 0.83611226, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.85719442, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 9625, + "time_per_iteration": 2.3695106506347656 + }, + { + "auxiliary_loss_clip": 0.01056054, + "auxiliary_loss_mlp": 0.01046173, + "balance_loss_clip": 1.01815844, + "balance_loss_mlp": 1.01745319, + "epoch": 0.5787464301818729, + "flos": 23363911416960.0, + "grad_norm": 1.4648004450804337, + "language_loss": 0.70975208, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.73077434, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38671875, + "step": 9626, + "time_per_iteration": 2.408174514770508 + }, + { + "auxiliary_loss_clip": 0.01057288, + "auxiliary_loss_mlp": 0.01043697, + "balance_loss_clip": 1.01615953, + "balance_loss_mlp": 1.01829147, + "epoch": 0.5788065534345408, + "flos": 30002820733440.0, + "grad_norm": 1.461207446328878, + "language_loss": 0.72577387, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.74678373, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.390625, + "step": 9627, + "time_per_iteration": 3.8399624824523926 + }, + { + "auxiliary_loss_clip": 0.01055939, + "auxiliary_loss_mlp": 0.01042476, + "balance_loss_clip": 1.01634502, + "balance_loss_mlp": 1.01730394, + "epoch": 0.5788666766872088, + "flos": 24526903449600.0, + "grad_norm": 1.5555220597265118, + "language_loss": 0.85159552, + "learning_rate": 1.589143013764458e-06, + "loss": 0.87257969, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9628, + "time_per_iteration": 2.395768880844116 + }, + { + "auxiliary_loss_clip": 0.01057629, + "auxiliary_loss_mlp": 0.01045902, + "balance_loss_clip": 1.01815009, + "balance_loss_mlp": 1.01833034, + "epoch": 0.5789267999398767, + "flos": 23731686345600.0, + "grad_norm": 1.5549609405216833, + "language_loss": 0.73841393, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.75944924, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39257812, + "step": 9629, + "time_per_iteration": 2.402574300765991 + }, + { + "auxiliary_loss_clip": 0.01058163, + "auxiliary_loss_mlp": 0.01045159, + "balance_loss_clip": 1.01744223, + "balance_loss_mlp": 1.01936352, + "epoch": 0.5789869231925447, + "flos": 21134183569920.0, + "grad_norm": 10.201285133251142, + "language_loss": 0.76392716, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.78496039, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38867188, + "step": 9630, + "time_per_iteration": 2.358173370361328 + }, + { + "auxiliary_loss_clip": 0.01054795, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.01981437, + "balance_loss_mlp": 1.01704037, + "epoch": 0.5790470464452127, + "flos": 21208723056000.0, + "grad_norm": 1.657666302905593, + "language_loss": 0.79782975, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81882352, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 9631, + "time_per_iteration": 3.843238353729248 + }, + { + "auxiliary_loss_clip": 0.0105614, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.01976979, + "balance_loss_mlp": 1.01753652, + "epoch": 0.5791071696978807, + "flos": 23403258385920.0, + "grad_norm": 1.781086778691973, + "language_loss": 0.76595688, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.78697705, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9632, + "time_per_iteration": 3.8624846935272217 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.01720786, + "balance_loss_mlp": 1.01726532, + "epoch": 0.5791672929505486, + "flos": 24205387939200.0, + "grad_norm": 1.685535399947355, + "language_loss": 0.8030079, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.82401478, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 9633, + "time_per_iteration": 2.3842391967773438 + }, + { + "auxiliary_loss_clip": 0.01062234, + "auxiliary_loss_mlp": 0.01054559, + "balance_loss_clip": 1.02213442, + "balance_loss_mlp": 1.01924312, + "epoch": 0.5792274162032166, + "flos": 24347833752960.0, + "grad_norm": 1.6804724402989795, + "language_loss": 0.79375452, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.81492245, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.4296875, + "step": 9634, + "time_per_iteration": 2.4450557231903076 + }, + { + "auxiliary_loss_clip": 0.0105994, + "auxiliary_loss_mlp": 0.01053008, + "balance_loss_clip": 1.02505326, + "balance_loss_mlp": 1.01928973, + "epoch": 0.5792875394558845, + "flos": 20448349355520.0, + "grad_norm": 2.2766818334440386, + "language_loss": 0.64118737, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.6623168, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 9635, + "time_per_iteration": 2.3672003746032715 + }, + { + "auxiliary_loss_clip": 0.01055429, + "auxiliary_loss_mlp": 0.01041164, + "balance_loss_clip": 1.01770353, + "balance_loss_mlp": 1.01782441, + "epoch": 0.5793476627085525, + "flos": 24059206609920.0, + "grad_norm": 1.605461942062675, + "language_loss": 0.78430903, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.80527496, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 9636, + "time_per_iteration": 2.4172332286834717 + }, + { + "auxiliary_loss_clip": 0.01053981, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.01251769, + "balance_loss_mlp": 1.01693773, + "epoch": 0.5794077859612206, + "flos": 22053201955200.0, + "grad_norm": 1.7593519316617432, + "language_loss": 0.70011115, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.72102123, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37109375, + "step": 9637, + "time_per_iteration": 2.3615307807922363 + }, + { + "auxiliary_loss_clip": 0.01056605, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.01811254, + "balance_loss_mlp": 1.01710665, + "epoch": 0.5794679092138885, + "flos": 11434124355840.0, + "grad_norm": 2.8916856193908593, + "language_loss": 0.74326253, + "learning_rate": 1.585332242234043e-06, + "loss": 0.76428223, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 9638, + "time_per_iteration": 2.312392473220825 + }, + { + "auxiliary_loss_clip": 0.0105689, + "auxiliary_loss_mlp": 0.01042969, + "balance_loss_clip": 1.01747012, + "balance_loss_mlp": 1.01868081, + "epoch": 0.5795280324665565, + "flos": 18879212943360.0, + "grad_norm": 1.6453715448636221, + "language_loss": 0.73439491, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.75539356, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 9639, + "time_per_iteration": 3.7946226596832275 + }, + { + "auxiliary_loss_clip": 0.0105739, + "auxiliary_loss_mlp": 0.01044136, + "balance_loss_clip": 1.01761198, + "balance_loss_mlp": 1.01800632, + "epoch": 0.5795881557192244, + "flos": 13005111070080.0, + "grad_norm": 1.7753387642090828, + "language_loss": 0.70351684, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.72453207, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 9640, + "time_per_iteration": 2.3464341163635254 + }, + { + "auxiliary_loss_clip": 0.01062364, + "auxiliary_loss_mlp": 0.01052029, + "balance_loss_clip": 1.02251232, + "balance_loss_mlp": 1.01960611, + "epoch": 0.5796482789718924, + "flos": 19931530366080.0, + "grad_norm": 2.452272375985523, + "language_loss": 0.78904641, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.81019038, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.42773438, + "step": 9641, + "time_per_iteration": 2.336226224899292 + }, + { + "auxiliary_loss_clip": 0.01057222, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.01313651, + "balance_loss_mlp": 1.01856506, + "epoch": 0.5797084022245603, + "flos": 21649780661760.0, + "grad_norm": 1.9002487461427917, + "language_loss": 0.75600505, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.77695847, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 9642, + "time_per_iteration": 2.3731956481933594 + }, + { + "auxiliary_loss_clip": 0.01057753, + "auxiliary_loss_mlp": 0.01049485, + "balance_loss_clip": 1.02217436, + "balance_loss_mlp": 1.01757026, + "epoch": 0.5797685254772283, + "flos": 26030367861120.0, + "grad_norm": 1.5965365376771556, + "language_loss": 0.74851978, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.76959217, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 9643, + "time_per_iteration": 2.4040513038635254 + }, + { + "auxiliary_loss_clip": 0.01058808, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.01374662, + "balance_loss_mlp": 1.01948464, + "epoch": 0.5798286487298963, + "flos": 22704227677440.0, + "grad_norm": 1.8556618209861921, + "language_loss": 0.68342769, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.70442533, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39453125, + "step": 9644, + "time_per_iteration": 2.3822808265686035 + }, + { + "auxiliary_loss_clip": 0.01061954, + "auxiliary_loss_mlp": 0.01044557, + "balance_loss_clip": 1.01668584, + "balance_loss_mlp": 1.01964831, + "epoch": 0.5798887719825643, + "flos": 23147868723840.0, + "grad_norm": 2.2459748846739607, + "language_loss": 0.86554164, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.88660675, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.42382812, + "step": 9645, + "time_per_iteration": 2.3785078525543213 + }, + { + "auxiliary_loss_clip": 0.01059207, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.0115428, + "balance_loss_mlp": 1.01924109, + "epoch": 0.5799488952352322, + "flos": 24424886856960.0, + "grad_norm": 1.7384744022672902, + "language_loss": 0.76165122, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.78263414, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 9646, + "time_per_iteration": 2.451570749282837 + }, + { + "auxiliary_loss_clip": 0.01061042, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.01377082, + "balance_loss_mlp": 1.02032638, + "epoch": 0.5800090184879002, + "flos": 38394474635520.0, + "grad_norm": 1.717468288902804, + "language_loss": 0.60406452, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.62510276, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40820312, + "step": 9647, + "time_per_iteration": 2.5303382873535156 + }, + { + "auxiliary_loss_clip": 0.0106033, + "auxiliary_loss_mlp": 0.01044303, + "balance_loss_clip": 1.0134275, + "balance_loss_mlp": 1.01976359, + "epoch": 0.5800691417405681, + "flos": 19784022405120.0, + "grad_norm": 1.8945762046988777, + "language_loss": 0.85357642, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.87462282, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.40625, + "step": 9648, + "time_per_iteration": 2.377918243408203 + }, + { + "auxiliary_loss_clip": 0.01010252, + "auxiliary_loss_mlp": 0.01011856, + "balance_loss_clip": 1.00937653, + "balance_loss_mlp": 1.00232971, + "epoch": 0.5801292649932361, + "flos": 70311407725440.0, + "grad_norm": 0.8387995919109723, + "language_loss": 0.63089406, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65111512, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.07910156, + "step": 9649, + "time_per_iteration": 3.0349202156066895 + }, + { + "auxiliary_loss_clip": 0.01056001, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.01444983, + "balance_loss_mlp": 1.01841903, + "epoch": 0.5801893882459042, + "flos": 18733799664000.0, + "grad_norm": 1.6194712018341269, + "language_loss": 0.82837903, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84934425, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37695312, + "step": 9650, + "time_per_iteration": 2.3652777671813965 + }, + { + "auxiliary_loss_clip": 0.01060189, + "auxiliary_loss_mlp": 0.01042306, + "balance_loss_clip": 1.01531744, + "balance_loss_mlp": 1.01840568, + "epoch": 0.5802495114985721, + "flos": 15595596662400.0, + "grad_norm": 3.0833308115100992, + "language_loss": 0.78731692, + "learning_rate": 1.580380592177698e-06, + "loss": 0.80834186, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 9651, + "time_per_iteration": 2.3354527950286865 + }, + { + "auxiliary_loss_clip": 0.01059981, + "auxiliary_loss_mlp": 0.01047263, + "balance_loss_clip": 1.02008367, + "balance_loss_mlp": 1.01960063, + "epoch": 0.5803096347512401, + "flos": 18254547164160.0, + "grad_norm": 2.0570475186334556, + "language_loss": 0.75334066, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.77441311, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40429688, + "step": 9652, + "time_per_iteration": 2.3579297065734863 + }, + { + "auxiliary_loss_clip": 0.01058261, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_clip": 1.02041161, + "balance_loss_mlp": 1.0181632, + "epoch": 0.580369758003908, + "flos": 22892060125440.0, + "grad_norm": 1.993832193804271, + "language_loss": 0.78145969, + "learning_rate": 1.579619037747193e-06, + "loss": 0.80253208, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40039062, + "step": 9653, + "time_per_iteration": 2.394782543182373 + }, + { + "auxiliary_loss_clip": 0.01056679, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.02209878, + "balance_loss_mlp": 1.01730001, + "epoch": 0.580429881256576, + "flos": 18696687022080.0, + "grad_norm": 2.5784024424664143, + "language_loss": 0.76053554, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.78159904, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39453125, + "step": 9654, + "time_per_iteration": 2.355658531188965 + }, + { + "auxiliary_loss_clip": 0.01054821, + "auxiliary_loss_mlp": 0.01041025, + "balance_loss_clip": 1.01680124, + "balance_loss_mlp": 1.01804352, + "epoch": 0.5804900045092439, + "flos": 24680800189440.0, + "grad_norm": 1.7159839409931645, + "language_loss": 0.71127498, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.73223346, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 9655, + "time_per_iteration": 2.3872992992401123 + }, + { + "auxiliary_loss_clip": 0.01059301, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_clip": 1.01821387, + "balance_loss_mlp": 1.01769447, + "epoch": 0.580550127761912, + "flos": 23111663777280.0, + "grad_norm": 2.043749720728159, + "language_loss": 0.71484268, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.73590463, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41601562, + "step": 9656, + "time_per_iteration": 2.36397385597229 + }, + { + "auxiliary_loss_clip": 0.01056719, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02099323, + "balance_loss_mlp": 1.01820326, + "epoch": 0.5806102510145799, + "flos": 18474779220480.0, + "grad_norm": 1.5036982804066188, + "language_loss": 0.730088, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.7510978, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38476562, + "step": 9657, + "time_per_iteration": 2.364748954772949 + }, + { + "auxiliary_loss_clip": 0.01059642, + "auxiliary_loss_mlp": 0.01052427, + "balance_loss_clip": 1.02217197, + "balance_loss_mlp": 1.01821184, + "epoch": 0.5806703742672479, + "flos": 23914491557760.0, + "grad_norm": 2.1874765983282685, + "language_loss": 0.72076333, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.74188399, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.4140625, + "step": 9658, + "time_per_iteration": 2.3941538333892822 + }, + { + "auxiliary_loss_clip": 0.01009736, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.00162399, + "balance_loss_mlp": 1.00201821, + "epoch": 0.5807304975199158, + "flos": 66308649926400.0, + "grad_norm": 0.6537858521165671, + "language_loss": 0.53615606, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55629855, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.02880859, + "router_z_loss_mlp": 0.07714844, + "step": 9659, + "time_per_iteration": 3.0252506732940674 + }, + { + "auxiliary_loss_clip": 0.01057738, + "auxiliary_loss_mlp": 0.01057052, + "balance_loss_clip": 1.03063512, + "balance_loss_mlp": 1.01781356, + "epoch": 0.5807906207725838, + "flos": 31721105940480.0, + "grad_norm": 1.9023620005876074, + "language_loss": 0.63603634, + "learning_rate": 1.576954100136366e-06, + "loss": 0.65718424, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 9660, + "time_per_iteration": 2.4636917114257812 + }, + { + "auxiliary_loss_clip": 0.01056859, + "auxiliary_loss_mlp": 0.01049564, + "balance_loss_clip": 1.02220607, + "balance_loss_mlp": 1.01635814, + "epoch": 0.5808507440252517, + "flos": 23800151255040.0, + "grad_norm": 1.5501747012657303, + "language_loss": 0.66718972, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.68825388, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 9661, + "time_per_iteration": 2.3849096298217773 + }, + { + "auxiliary_loss_clip": 0.01053464, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.01569557, + "balance_loss_mlp": 1.01748252, + "epoch": 0.5809108672779197, + "flos": 13697613354240.0, + "grad_norm": 1.5939189637942786, + "language_loss": 0.75306082, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.7739985, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 9662, + "time_per_iteration": 2.3410990238189697 + }, + { + "auxiliary_loss_clip": 0.01009842, + "auxiliary_loss_mlp": 0.01003917, + "balance_loss_clip": 1.00123489, + "balance_loss_mlp": 1.00175309, + "epoch": 0.5809709905305876, + "flos": 69131062880640.0, + "grad_norm": 0.8762795288784939, + "language_loss": 0.58442986, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60456741, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.08105469, + "step": 9663, + "time_per_iteration": 3.1316299438476562 + }, + { + "auxiliary_loss_clip": 0.01056506, + "auxiliary_loss_mlp": 0.01047012, + "balance_loss_clip": 1.02117932, + "balance_loss_mlp": 1.01748478, + "epoch": 0.5810311137832557, + "flos": 19826546307840.0, + "grad_norm": 2.21730367635318, + "language_loss": 0.82590163, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84693682, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 9664, + "time_per_iteration": 2.360711097717285 + }, + { + "auxiliary_loss_clip": 0.0105755, + "auxiliary_loss_mlp": 0.01044814, + "balance_loss_clip": 1.01643062, + "balance_loss_mlp": 1.01728249, + "epoch": 0.5810912370359237, + "flos": 29237315063040.0, + "grad_norm": 1.6989256823917467, + "language_loss": 0.82457912, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.84560275, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40234375, + "step": 9665, + "time_per_iteration": 2.4231061935424805 + }, + { + "auxiliary_loss_clip": 0.01060954, + "auxiliary_loss_mlp": 0.01049165, + "balance_loss_clip": 1.01888609, + "balance_loss_mlp": 1.01912427, + "epoch": 0.5811513602885916, + "flos": 22784422803840.0, + "grad_norm": 1.5773210903481527, + "language_loss": 0.81668073, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83778191, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.41796875, + "step": 9666, + "time_per_iteration": 2.412925958633423 + }, + { + "auxiliary_loss_clip": 0.01054423, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_clip": 1.0177747, + "balance_loss_mlp": 1.01671541, + "epoch": 0.5812114835412596, + "flos": 18733345816320.0, + "grad_norm": 1.7820007140893226, + "language_loss": 0.81189734, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.83286488, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37695312, + "step": 9667, + "time_per_iteration": 3.643038511276245 + }, + { + "auxiliary_loss_clip": 0.01059991, + "auxiliary_loss_mlp": 0.01048293, + "balance_loss_clip": 1.01802564, + "balance_loss_mlp": 1.01790941, + "epoch": 0.5812716067939275, + "flos": 26430123461760.0, + "grad_norm": 1.633648598993552, + "language_loss": 0.79702008, + "learning_rate": 1.573909419957653e-06, + "loss": 0.81810296, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.41992188, + "step": 9668, + "time_per_iteration": 2.4294373989105225 + }, + { + "auxiliary_loss_clip": 0.01058964, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.01554716, + "balance_loss_mlp": 1.0183692, + "epoch": 0.5813317300465956, + "flos": 43396201566720.0, + "grad_norm": 2.1467203833137907, + "language_loss": 0.65852821, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.67953479, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40625, + "step": 9669, + "time_per_iteration": 2.58272647857666 + }, + { + "auxiliary_loss_clip": 0.01058776, + "auxiliary_loss_mlp": 0.01049895, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.01817513, + "epoch": 0.5813918532992635, + "flos": 24784457616000.0, + "grad_norm": 1.5629143302453479, + "language_loss": 0.74283135, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.76391804, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 9670, + "time_per_iteration": 2.450007677078247 + }, + { + "auxiliary_loss_clip": 0.0105883, + "auxiliary_loss_mlp": 0.01046524, + "balance_loss_clip": 1.01865315, + "balance_loss_mlp": 1.01834154, + "epoch": 0.5814519765519315, + "flos": 22856239203840.0, + "grad_norm": 2.0174871921656563, + "language_loss": 0.79804516, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81909877, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 9671, + "time_per_iteration": 5.123297452926636 + }, + { + "auxiliary_loss_clip": 0.01060535, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_clip": 1.01727223, + "balance_loss_mlp": 1.01894712, + "epoch": 0.5815120998045994, + "flos": 24059695368960.0, + "grad_norm": 2.4467214405452684, + "language_loss": 0.62598181, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.64705944, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41601562, + "step": 9672, + "time_per_iteration": 2.3861207962036133 + }, + { + "auxiliary_loss_clip": 0.01056388, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.0134151, + "balance_loss_mlp": 1.01863444, + "epoch": 0.5815722230572674, + "flos": 24278356414080.0, + "grad_norm": 1.615518545086455, + "language_loss": 0.82627928, + "learning_rate": 1.572007019492342e-06, + "loss": 0.84723711, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37695312, + "step": 9673, + "time_per_iteration": 2.3996458053588867 + }, + { + "auxiliary_loss_clip": 0.01059879, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.01601815, + "balance_loss_mlp": 1.01881409, + "epoch": 0.5816323463099353, + "flos": 22199278550400.0, + "grad_norm": 1.7602483374194644, + "language_loss": 0.8915112, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.91255623, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.41015625, + "step": 9674, + "time_per_iteration": 2.3507673740386963 + }, + { + "auxiliary_loss_clip": 0.01056269, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.01473272, + "balance_loss_mlp": 1.01694942, + "epoch": 0.5816924695626033, + "flos": 24133292248320.0, + "grad_norm": 1.8301467380684988, + "language_loss": 0.79698992, + "learning_rate": 1.571246172811984e-06, + "loss": 0.81796658, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 9675, + "time_per_iteration": 2.410081624984741 + }, + { + "auxiliary_loss_clip": 0.01057467, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.01232243, + "balance_loss_mlp": 1.01850343, + "epoch": 0.5817525928152713, + "flos": 21323168092800.0, + "grad_norm": 1.9080999491844914, + "language_loss": 0.70921588, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.73017812, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 9676, + "time_per_iteration": 2.3587865829467773 + }, + { + "auxiliary_loss_clip": 0.01058132, + "auxiliary_loss_mlp": 0.01047664, + "balance_loss_clip": 1.02102089, + "balance_loss_mlp": 1.01793861, + "epoch": 0.5818127160679393, + "flos": 26933536488960.0, + "grad_norm": 2.1599011095058867, + "language_loss": 0.64784658, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.66890454, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 9677, + "time_per_iteration": 2.3892252445220947 + }, + { + "auxiliary_loss_clip": 0.01013769, + "auxiliary_loss_mlp": 0.01013424, + "balance_loss_clip": 1.01072943, + "balance_loss_mlp": 1.00586987, + "epoch": 0.5818728393206073, + "flos": 63914934090240.0, + "grad_norm": 0.8174540337431729, + "language_loss": 0.54335356, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56362545, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.02697754, + "router_z_loss_mlp": 0.07910156, + "step": 9678, + "time_per_iteration": 3.0696749687194824 + }, + { + "auxiliary_loss_clip": 0.0101545, + "auxiliary_loss_mlp": 0.01016007, + "balance_loss_clip": 1.01305091, + "balance_loss_mlp": 1.00731444, + "epoch": 0.5819329625732752, + "flos": 64951017730560.0, + "grad_norm": 0.7463029450869442, + "language_loss": 0.56326097, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58357555, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.02954102, + "router_z_loss_mlp": 0.08105469, + "step": 9679, + "time_per_iteration": 4.336357355117798 + }, + { + "auxiliary_loss_clip": 0.01057994, + "auxiliary_loss_mlp": 0.01039487, + "balance_loss_clip": 1.01439369, + "balance_loss_mlp": 1.01918554, + "epoch": 0.5819930858259432, + "flos": 21214204139520.0, + "grad_norm": 1.5970847905496717, + "language_loss": 0.66561031, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.68658507, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38867188, + "step": 9680, + "time_per_iteration": 2.402846097946167 + }, + { + "auxiliary_loss_clip": 0.01056474, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.01625776, + "balance_loss_mlp": 1.01774049, + "epoch": 0.5820532090786111, + "flos": 19457654215680.0, + "grad_norm": 1.9379370282766968, + "language_loss": 0.84198004, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.86296606, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9681, + "time_per_iteration": 2.3989641666412354 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.0104456, + "balance_loss_clip": 1.01751101, + "balance_loss_mlp": 1.01852179, + "epoch": 0.5821133323312792, + "flos": 17711647522560.0, + "grad_norm": 1.699104077057305, + "language_loss": 0.76698035, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.7880072, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 9682, + "time_per_iteration": 2.380551815032959 + }, + { + "auxiliary_loss_clip": 0.01058855, + "auxiliary_loss_mlp": 0.01043067, + "balance_loss_clip": 1.01704383, + "balance_loss_mlp": 1.01883817, + "epoch": 0.5821734555839471, + "flos": 24570649249920.0, + "grad_norm": 2.0272570640066134, + "language_loss": 0.76638001, + "learning_rate": 1.568203437579977e-06, + "loss": 0.78739917, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 9683, + "time_per_iteration": 2.4015114307403564 + }, + { + "auxiliary_loss_clip": 0.01059942, + "auxiliary_loss_mlp": 0.01049352, + "balance_loss_clip": 1.0211947, + "balance_loss_mlp": 1.01877391, + "epoch": 0.5822335788366151, + "flos": 22381176067200.0, + "grad_norm": 1.718989330574959, + "language_loss": 0.74696392, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76805687, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 9684, + "time_per_iteration": 2.399181842803955 + }, + { + "auxiliary_loss_clip": 0.01057353, + "auxiliary_loss_mlp": 0.01046158, + "balance_loss_clip": 1.01933563, + "balance_loss_mlp": 1.01798964, + "epoch": 0.582293702089283, + "flos": 26721334045440.0, + "grad_norm": 2.0943156685813067, + "language_loss": 0.78831232, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80934739, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 9685, + "time_per_iteration": 2.422429323196411 + }, + { + "auxiliary_loss_clip": 0.01057904, + "auxiliary_loss_mlp": 0.0104652, + "balance_loss_clip": 1.02050889, + "balance_loss_mlp": 1.0181694, + "epoch": 0.582353825341951, + "flos": 17347677932160.0, + "grad_norm": 2.0533128560100287, + "language_loss": 0.76515245, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.78619671, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 9686, + "time_per_iteration": 2.3625597953796387 + }, + { + "auxiliary_loss_clip": 0.01014836, + "auxiliary_loss_mlp": 0.0100427, + "balance_loss_clip": 1.00114667, + "balance_loss_mlp": 1.00668001, + "epoch": 0.5824139485946189, + "flos": 55470282877440.0, + "grad_norm": 0.8189624474652363, + "language_loss": 0.57509542, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59528649, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.03125, + "router_z_loss_mlp": 0.08154297, + "step": 9687, + "time_per_iteration": 2.8404717445373535 + }, + { + "auxiliary_loss_clip": 0.01057914, + "auxiliary_loss_mlp": 0.01046718, + "balance_loss_clip": 1.02043211, + "balance_loss_mlp": 1.01852131, + "epoch": 0.582474071847287, + "flos": 20301993469440.0, + "grad_norm": 2.0031903530661275, + "language_loss": 0.7080102, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72905654, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 9688, + "time_per_iteration": 2.38323974609375 + }, + { + "auxiliary_loss_clip": 0.01057227, + "auxiliary_loss_mlp": 0.01049323, + "balance_loss_clip": 1.02445602, + "balance_loss_mlp": 1.01778328, + "epoch": 0.5825341950999549, + "flos": 23876890156800.0, + "grad_norm": 2.590336256695031, + "language_loss": 0.67444384, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.69550931, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.39453125, + "step": 9689, + "time_per_iteration": 2.3901262283325195 + }, + { + "auxiliary_loss_clip": 0.01057084, + "auxiliary_loss_mlp": 0.0104626, + "balance_loss_clip": 1.02045131, + "balance_loss_mlp": 1.01942611, + "epoch": 0.5825943183526229, + "flos": 23111908156800.0, + "grad_norm": 1.6312637743234726, + "language_loss": 0.74426997, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.76530343, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 9690, + "time_per_iteration": 2.4091169834136963 + }, + { + "auxiliary_loss_clip": 0.01058455, + "auxiliary_loss_mlp": 0.01057462, + "balance_loss_clip": 1.02907848, + "balance_loss_mlp": 1.01861525, + "epoch": 0.5826544416052909, + "flos": 22856309026560.0, + "grad_norm": 1.639701397527284, + "language_loss": 0.7661159, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78727502, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.3984375, + "step": 9691, + "time_per_iteration": 2.3949649333953857 + }, + { + "auxiliary_loss_clip": 0.01058294, + "auxiliary_loss_mlp": 0.01059686, + "balance_loss_clip": 1.03410339, + "balance_loss_mlp": 1.01796746, + "epoch": 0.5827145648579588, + "flos": 31500559681920.0, + "grad_norm": 1.7420255440375607, + "language_loss": 0.81438446, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.83556426, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 9692, + "time_per_iteration": 2.4539248943328857 + }, + { + "auxiliary_loss_clip": 0.01013284, + "auxiliary_loss_mlp": 0.01002508, + "balance_loss_clip": 0.99971884, + "balance_loss_mlp": 1.00566137, + "epoch": 0.5827746881106268, + "flos": 69808448545920.0, + "grad_norm": 0.7628410029723887, + "language_loss": 0.56947231, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58963025, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.0279541, + "router_z_loss_mlp": 0.07617188, + "step": 9693, + "time_per_iteration": 3.0034701824188232 + }, + { + "auxiliary_loss_clip": 0.01055995, + "auxiliary_loss_mlp": 0.01051393, + "balance_loss_clip": 1.0275631, + "balance_loss_mlp": 1.01717126, + "epoch": 0.5828348113632947, + "flos": 23111279752320.0, + "grad_norm": 1.9420154820857942, + "language_loss": 0.80321366, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.82428753, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38867188, + "step": 9694, + "time_per_iteration": 2.415757179260254 + }, + { + "auxiliary_loss_clip": 0.0105499, + "auxiliary_loss_mlp": 0.01051943, + "balance_loss_clip": 1.02820849, + "balance_loss_mlp": 1.01785028, + "epoch": 0.5828949346159628, + "flos": 21871967754240.0, + "grad_norm": 1.5198913888807701, + "language_loss": 0.77122825, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.7922976, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 9695, + "time_per_iteration": 2.3731281757354736 + }, + { + "auxiliary_loss_clip": 0.01010941, + "auxiliary_loss_mlp": 0.0101456, + "balance_loss_clip": 1.01169848, + "balance_loss_mlp": 1.00327277, + "epoch": 0.5829550578686307, + "flos": 65958784525440.0, + "grad_norm": 0.7916564620503238, + "language_loss": 0.55058151, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57083654, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.07666016, + "step": 9696, + "time_per_iteration": 3.108586072921753 + }, + { + "auxiliary_loss_clip": 0.01058706, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02183616, + "balance_loss_mlp": 1.01922393, + "epoch": 0.5830151811212987, + "flos": 16288866996480.0, + "grad_norm": 1.9844589090317828, + "language_loss": 0.77703589, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.79807973, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.39453125, + "step": 9697, + "time_per_iteration": 2.3509905338287354 + }, + { + "auxiliary_loss_clip": 0.01059733, + "auxiliary_loss_mlp": 0.01052893, + "balance_loss_clip": 1.0236392, + "balance_loss_mlp": 1.01857936, + "epoch": 0.5830753043739666, + "flos": 24167751626880.0, + "grad_norm": 1.6301584383224557, + "language_loss": 0.78980643, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.81093276, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 9698, + "time_per_iteration": 2.4063656330108643 + }, + { + "auxiliary_loss_clip": 0.01059658, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.01957417, + "balance_loss_mlp": 1.02055097, + "epoch": 0.5831354276266346, + "flos": 27057651972480.0, + "grad_norm": 1.8997547595693893, + "language_loss": 0.84697342, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.86802727, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 9699, + "time_per_iteration": 2.4957683086395264 + }, + { + "auxiliary_loss_clip": 0.01060169, + "auxiliary_loss_mlp": 0.01047299, + "balance_loss_clip": 1.02075088, + "balance_loss_mlp": 1.02053189, + "epoch": 0.5831955508793025, + "flos": 23622338367360.0, + "grad_norm": 2.164695343857819, + "language_loss": 0.67658496, + "learning_rate": 1.561741113828305e-06, + "loss": 0.69765967, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 9700, + "time_per_iteration": 2.422590494155884 + }, + { + "auxiliary_loss_clip": 0.01060138, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.01312399, + "balance_loss_mlp": 1.01990676, + "epoch": 0.5832556741319705, + "flos": 24972080595840.0, + "grad_norm": 1.6617463492113318, + "language_loss": 0.71865761, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.7396552, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 9701, + "time_per_iteration": 2.403135299682617 + }, + { + "auxiliary_loss_clip": 0.01060354, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.01778436, + "balance_loss_mlp": 1.02189898, + "epoch": 0.5833157973846385, + "flos": 23220453173760.0, + "grad_norm": 1.790134100145381, + "language_loss": 0.86233765, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.88336521, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38476562, + "step": 9702, + "time_per_iteration": 2.4219655990600586 + }, + { + "auxiliary_loss_clip": 0.01059277, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.01854134, + "balance_loss_mlp": 1.02166212, + "epoch": 0.5833759206373065, + "flos": 21976951812480.0, + "grad_norm": 1.5488403762154956, + "language_loss": 0.77953529, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80055416, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 9703, + "time_per_iteration": 2.386042356491089 + }, + { + "auxiliary_loss_clip": 0.01064443, + "auxiliary_loss_mlp": 0.01041012, + "balance_loss_clip": 1.01420212, + "balance_loss_mlp": 1.02393389, + "epoch": 0.5834360438899745, + "flos": 21761328055680.0, + "grad_norm": 1.7322268286976417, + "language_loss": 0.72366005, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.74471462, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40625, + "step": 9704, + "time_per_iteration": 2.4090538024902344 + }, + { + "auxiliary_loss_clip": 0.01062611, + "auxiliary_loss_mlp": 0.01045866, + "balance_loss_clip": 1.0213213, + "balance_loss_mlp": 1.02297032, + "epoch": 0.5834961671426424, + "flos": 15991791304320.0, + "grad_norm": 1.7494760691715012, + "language_loss": 0.82554686, + "learning_rate": 1.559841341236335e-06, + "loss": 0.84663159, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39648438, + "step": 9705, + "time_per_iteration": 2.368354558944702 + }, + { + "auxiliary_loss_clip": 0.01061741, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.01461017, + "balance_loss_mlp": 1.02317071, + "epoch": 0.5835562903953104, + "flos": 22817276259840.0, + "grad_norm": 1.7628637324878906, + "language_loss": 0.82132697, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.84233284, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38671875, + "step": 9706, + "time_per_iteration": 3.8437769412994385 + }, + { + "auxiliary_loss_clip": 0.01060551, + "auxiliary_loss_mlp": 0.01042041, + "balance_loss_clip": 1.01494431, + "balance_loss_mlp": 1.02231216, + "epoch": 0.5836164136479783, + "flos": 48466288673280.0, + "grad_norm": 2.3097329051112094, + "language_loss": 0.75893152, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77995741, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3828125, + "step": 9707, + "time_per_iteration": 2.6114885807037354 + }, + { + "auxiliary_loss_clip": 0.01061802, + "auxiliary_loss_mlp": 0.01040712, + "balance_loss_clip": 1.01591671, + "balance_loss_mlp": 1.02398622, + "epoch": 0.5836765369006464, + "flos": 26904802573440.0, + "grad_norm": 1.7277212846637298, + "language_loss": 0.82326746, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.84429264, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 9708, + "time_per_iteration": 2.4620330333709717 + }, + { + "auxiliary_loss_clip": 0.01062515, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.01629007, + "balance_loss_mlp": 1.02433288, + "epoch": 0.5837366601533143, + "flos": 20083018222080.0, + "grad_norm": 1.4846566879879044, + "language_loss": 0.79136592, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.81241596, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 9709, + "time_per_iteration": 2.3955883979797363 + }, + { + "auxiliary_loss_clip": 0.01016346, + "auxiliary_loss_mlp": 0.01015552, + "balance_loss_clip": 1.01296556, + "balance_loss_mlp": 1.00884533, + "epoch": 0.5837967834059823, + "flos": 65360548512000.0, + "grad_norm": 0.7736790168775292, + "language_loss": 0.56673497, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58705395, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.07519531, + "step": 9710, + "time_per_iteration": 4.419286489486694 + }, + { + "auxiliary_loss_clip": 0.01058999, + "auxiliary_loss_mlp": 0.01043962, + "balance_loss_clip": 1.01935709, + "balance_loss_mlp": 1.02112389, + "epoch": 0.5838569066586502, + "flos": 25337446640640.0, + "grad_norm": 1.57584384020284, + "language_loss": 0.67071807, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.69174767, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 9711, + "time_per_iteration": 3.9427759647369385 + }, + { + "auxiliary_loss_clip": 0.01064906, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.01828945, + "balance_loss_mlp": 1.0220778, + "epoch": 0.5839170299113182, + "flos": 22228361579520.0, + "grad_norm": 2.4206433842761763, + "language_loss": 0.80277276, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.82389879, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 9712, + "time_per_iteration": 2.3955094814300537 + }, + { + "auxiliary_loss_clip": 0.01059619, + "auxiliary_loss_mlp": 0.01048816, + "balance_loss_clip": 1.02328146, + "balance_loss_mlp": 1.02072453, + "epoch": 0.5839771531639861, + "flos": 22198929436800.0, + "grad_norm": 1.808255036455255, + "language_loss": 0.74307442, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.76415884, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 9713, + "time_per_iteration": 2.425021171569824 + }, + { + "auxiliary_loss_clip": 0.01063334, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.01752663, + "balance_loss_mlp": 1.02143598, + "epoch": 0.5840372764166541, + "flos": 22418253797760.0, + "grad_norm": 1.8131056496329903, + "language_loss": 0.70846361, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.729581, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.41992188, + "step": 9714, + "time_per_iteration": 2.398369073867798 + }, + { + "auxiliary_loss_clip": 0.01062268, + "auxiliary_loss_mlp": 0.01051991, + "balance_loss_clip": 1.02363062, + "balance_loss_mlp": 1.02103829, + "epoch": 0.5840973996693221, + "flos": 19827244535040.0, + "grad_norm": 1.6732015427109435, + "language_loss": 0.8113693, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.8325119, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41210938, + "step": 9715, + "time_per_iteration": 2.4085938930511475 + }, + { + "auxiliary_loss_clip": 0.01061213, + "auxiliary_loss_mlp": 0.01051681, + "balance_loss_clip": 1.02471626, + "balance_loss_mlp": 1.02165782, + "epoch": 0.5841575229219901, + "flos": 21141898980480.0, + "grad_norm": 1.9361805212644985, + "language_loss": 0.74431419, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.76544309, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 9716, + "time_per_iteration": 2.396636486053467 + }, + { + "auxiliary_loss_clip": 0.0105888, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.0190866, + "balance_loss_mlp": 1.02116513, + "epoch": 0.5842176461746581, + "flos": 24639288716160.0, + "grad_norm": 1.7342186036136324, + "language_loss": 0.76303291, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.78406042, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 9717, + "time_per_iteration": 2.4350569248199463 + }, + { + "auxiliary_loss_clip": 0.01059699, + "auxiliary_loss_mlp": 0.01053342, + "balance_loss_clip": 1.02673495, + "balance_loss_mlp": 1.02004921, + "epoch": 0.584277769427326, + "flos": 19130273596800.0, + "grad_norm": 2.406781584490626, + "language_loss": 0.81289351, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.83402395, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 9718, + "time_per_iteration": 2.340877056121826 + }, + { + "auxiliary_loss_clip": 0.01058515, + "auxiliary_loss_mlp": 0.01054472, + "balance_loss_clip": 1.02822232, + "balance_loss_mlp": 1.02019143, + "epoch": 0.584337892679994, + "flos": 22673992573440.0, + "grad_norm": 1.6621732173893033, + "language_loss": 0.6823613, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.70349121, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 9719, + "time_per_iteration": 3.83272385597229 + }, + { + "auxiliary_loss_clip": 0.01060028, + "auxiliary_loss_mlp": 0.01055958, + "balance_loss_clip": 1.02915967, + "balance_loss_mlp": 1.02017331, + "epoch": 0.5843980159326619, + "flos": 31282771420800.0, + "grad_norm": 2.641470012829243, + "language_loss": 0.76400101, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.7851609, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3984375, + "step": 9720, + "time_per_iteration": 2.463491201400757 + }, + { + "auxiliary_loss_clip": 0.01057975, + "auxiliary_loss_mlp": 0.01057064, + "balance_loss_clip": 1.03115952, + "balance_loss_mlp": 1.01869512, + "epoch": 0.58445813918533, + "flos": 22746995959680.0, + "grad_norm": 1.5220823301755952, + "language_loss": 0.83621228, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85736269, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39257812, + "step": 9721, + "time_per_iteration": 2.3918910026550293 + }, + { + "auxiliary_loss_clip": 0.01012137, + "auxiliary_loss_mlp": 0.01006757, + "balance_loss_clip": 1.00431275, + "balance_loss_mlp": 1.00478566, + "epoch": 0.5845182624379979, + "flos": 60683548936320.0, + "grad_norm": 0.9369790555966969, + "language_loss": 0.71478814, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73497707, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.07324219, + "step": 9722, + "time_per_iteration": 3.0392560958862305 + }, + { + "auxiliary_loss_clip": 0.01056576, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_clip": 1.02362061, + "balance_loss_mlp": 1.01796544, + "epoch": 0.5845783856906659, + "flos": 16361521269120.0, + "grad_norm": 2.741151566696127, + "language_loss": 0.9109118, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.93198299, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 9723, + "time_per_iteration": 2.3523313999176025 + }, + { + "auxiliary_loss_clip": 0.01057029, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_clip": 1.02230906, + "balance_loss_mlp": 1.01857877, + "epoch": 0.5846385089433338, + "flos": 20082389817600.0, + "grad_norm": 1.7480765550085273, + "language_loss": 0.69075775, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.71181512, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38476562, + "step": 9724, + "time_per_iteration": 2.4231467247009277 + }, + { + "auxiliary_loss_clip": 0.01061828, + "auxiliary_loss_mlp": 0.01061403, + "balance_loss_clip": 1.03195846, + "balance_loss_mlp": 1.02076185, + "epoch": 0.5846986321960018, + "flos": 17310111442560.0, + "grad_norm": 2.4149565492202454, + "language_loss": 0.87805009, + "learning_rate": 1.552246441587197e-06, + "loss": 0.89928234, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41015625, + "step": 9725, + "time_per_iteration": 2.3524668216705322 + }, + { + "auxiliary_loss_clip": 0.01061921, + "auxiliary_loss_mlp": 0.01052979, + "balance_loss_clip": 1.02432072, + "balance_loss_mlp": 1.02010047, + "epoch": 0.5847587554486697, + "flos": 17197062860160.0, + "grad_norm": 1.5831958510496844, + "language_loss": 0.840478, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.86162698, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41796875, + "step": 9726, + "time_per_iteration": 2.400054693222046 + }, + { + "auxiliary_loss_clip": 0.01059084, + "auxiliary_loss_mlp": 0.01049149, + "balance_loss_clip": 1.02307844, + "balance_loss_mlp": 1.01917517, + "epoch": 0.5848188787013378, + "flos": 24528125347200.0, + "grad_norm": 1.9164751397416517, + "language_loss": 0.67587441, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69695675, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3984375, + "step": 9727, + "time_per_iteration": 2.424271583557129 + }, + { + "auxiliary_loss_clip": 0.01062589, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 1.02173638, + "epoch": 0.5848790019540057, + "flos": 20627419052160.0, + "grad_norm": 1.6944164624528744, + "language_loss": 0.82469261, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84588182, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40820312, + "step": 9728, + "time_per_iteration": 2.4002597332000732 + }, + { + "auxiliary_loss_clip": 0.01058267, + "auxiliary_loss_mlp": 0.01050975, + "balance_loss_clip": 1.0247016, + "balance_loss_mlp": 1.01957393, + "epoch": 0.5849391252066737, + "flos": 22417765038720.0, + "grad_norm": 1.70536624127662, + "language_loss": 0.78270817, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80380058, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9729, + "time_per_iteration": 2.412889242172241 + }, + { + "auxiliary_loss_clip": 0.01060481, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_clip": 1.01850247, + "balance_loss_mlp": 1.01961231, + "epoch": 0.5849992484593417, + "flos": 25409751799680.0, + "grad_norm": 1.9475581020909587, + "language_loss": 0.71908486, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.74015832, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40820312, + "step": 9730, + "time_per_iteration": 2.417783498764038 + }, + { + "auxiliary_loss_clip": 0.01061004, + "auxiliary_loss_mlp": 0.01050643, + "balance_loss_clip": 1.0205071, + "balance_loss_mlp": 1.01900578, + "epoch": 0.5850593717120096, + "flos": 21064217472000.0, + "grad_norm": 1.6425034824779405, + "language_loss": 0.7969386, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81805503, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.41992188, + "step": 9731, + "time_per_iteration": 2.419891357421875 + }, + { + "auxiliary_loss_clip": 0.01059146, + "auxiliary_loss_mlp": 0.01050478, + "balance_loss_clip": 1.02146268, + "balance_loss_mlp": 1.01908207, + "epoch": 0.5851194949646776, + "flos": 25300368910080.0, + "grad_norm": 1.911264245744276, + "language_loss": 0.71980989, + "learning_rate": 1.549589825316528e-06, + "loss": 0.74090612, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40039062, + "step": 9732, + "time_per_iteration": 2.451434850692749 + }, + { + "auxiliary_loss_clip": 0.01063151, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.0175842, + "balance_loss_mlp": 1.02119899, + "epoch": 0.5851796182173455, + "flos": 23586098509440.0, + "grad_norm": 2.7559214044598, + "language_loss": 0.54748499, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.56860298, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.41992188, + "step": 9733, + "time_per_iteration": 2.4483511447906494 + }, + { + "auxiliary_loss_clip": 0.01061617, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_clip": 1.02011812, + "balance_loss_mlp": 1.02085495, + "epoch": 0.5852397414700136, + "flos": 24821674992000.0, + "grad_norm": 2.9352899712635034, + "language_loss": 0.89963114, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.92073709, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40820312, + "step": 9734, + "time_per_iteration": 2.39829683303833 + }, + { + "auxiliary_loss_clip": 0.01058067, + "auxiliary_loss_mlp": 0.01045235, + "balance_loss_clip": 1.01855564, + "balance_loss_mlp": 1.02112269, + "epoch": 0.5852998647226815, + "flos": 19936767070080.0, + "grad_norm": 1.5407475061721285, + "language_loss": 0.73723245, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.7582655, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.36914062, + "step": 9735, + "time_per_iteration": 2.3957083225250244 + }, + { + "auxiliary_loss_clip": 0.01063355, + "auxiliary_loss_mlp": 0.01052199, + "balance_loss_clip": 1.02268267, + "balance_loss_mlp": 1.02133012, + "epoch": 0.5853599879753495, + "flos": 16719625751040.0, + "grad_norm": 2.907834911069106, + "language_loss": 0.75574118, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.77689672, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.41992188, + "step": 9736, + "time_per_iteration": 2.3294007778167725 + }, + { + "auxiliary_loss_clip": 0.01060304, + "auxiliary_loss_mlp": 0.01045268, + "balance_loss_clip": 1.01808858, + "balance_loss_mlp": 1.02131295, + "epoch": 0.5854201112280174, + "flos": 44454872856960.0, + "grad_norm": 1.5357707854391474, + "language_loss": 0.71728212, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.73833781, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 9737, + "time_per_iteration": 2.5804479122161865 + }, + { + "auxiliary_loss_clip": 0.01060165, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_clip": 1.02080846, + "balance_loss_mlp": 1.02153492, + "epoch": 0.5854802344806854, + "flos": 20338163504640.0, + "grad_norm": 1.634433807636634, + "language_loss": 0.83826458, + "learning_rate": 1.547313391573169e-06, + "loss": 0.85933501, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 9738, + "time_per_iteration": 2.401454210281372 + }, + { + "auxiliary_loss_clip": 0.01066013, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.01735926, + "balance_loss_mlp": 1.02406943, + "epoch": 0.5855403577333533, + "flos": 20920061001600.0, + "grad_norm": 2.317941877396921, + "language_loss": 0.69406289, + "learning_rate": 1.546934045946082e-06, + "loss": 0.71518707, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.41992188, + "step": 9739, + "time_per_iteration": 2.4022514820098877 + }, + { + "auxiliary_loss_clip": 0.01063427, + "auxiliary_loss_mlp": 0.01037195, + "balance_loss_clip": 1.00980091, + "balance_loss_mlp": 1.02155352, + "epoch": 0.5856004809860214, + "flos": 20447616216960.0, + "grad_norm": 3.381840185859494, + "language_loss": 0.59718215, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.61818838, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.41796875, + "step": 9740, + "time_per_iteration": 2.3687727451324463 + }, + { + "auxiliary_loss_clip": 0.01059513, + "auxiliary_loss_mlp": 0.01042137, + "balance_loss_clip": 1.01532698, + "balance_loss_mlp": 1.02046061, + "epoch": 0.5856606042386893, + "flos": 19639900846080.0, + "grad_norm": 1.973944953396449, + "language_loss": 0.75795615, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77897263, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 9741, + "time_per_iteration": 2.4126803874969482 + }, + { + "auxiliary_loss_clip": 0.01063716, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_clip": 1.02118373, + "balance_loss_mlp": 1.02358317, + "epoch": 0.5857207274913573, + "flos": 21685182647040.0, + "grad_norm": 1.683553134901866, + "language_loss": 0.76890856, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.79004055, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40039062, + "step": 9742, + "time_per_iteration": 2.396118402481079 + }, + { + "auxiliary_loss_clip": 0.01061308, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.01697016, + "balance_loss_mlp": 1.02206182, + "epoch": 0.5857808507440253, + "flos": 23181664786560.0, + "grad_norm": 1.6629997325539856, + "language_loss": 0.76161021, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.78265727, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 9743, + "time_per_iteration": 2.4101898670196533 + }, + { + "auxiliary_loss_clip": 0.0106037, + "auxiliary_loss_mlp": 0.01041179, + "balance_loss_clip": 1.0157752, + "balance_loss_mlp": 1.02184474, + "epoch": 0.5858409739966932, + "flos": 27234068405760.0, + "grad_norm": 1.91162794777979, + "language_loss": 0.8237381, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.84475356, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 9744, + "time_per_iteration": 2.4350175857543945 + }, + { + "auxiliary_loss_clip": 0.01063383, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.01348352, + "balance_loss_mlp": 1.02220893, + "epoch": 0.5859010972493612, + "flos": 27854265530880.0, + "grad_norm": 1.7755557672198368, + "language_loss": 0.72233105, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.74336863, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41210938, + "step": 9745, + "time_per_iteration": 3.7323148250579834 + }, + { + "auxiliary_loss_clip": 0.01022966, + "auxiliary_loss_mlp": 0.01017372, + "balance_loss_clip": 1.01453495, + "balance_loss_mlp": 1.01521647, + "epoch": 0.5859612205020291, + "flos": 70003333088640.0, + "grad_norm": 0.7370768540706987, + "language_loss": 0.53314221, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55354559, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.02832031, + "router_z_loss_mlp": 0.07763672, + "step": 9746, + "time_per_iteration": 3.1080479621887207 + }, + { + "auxiliary_loss_clip": 0.01061932, + "auxiliary_loss_mlp": 0.01048794, + "balance_loss_clip": 1.02058959, + "balance_loss_mlp": 1.02080011, + "epoch": 0.5860213437546972, + "flos": 24055017246720.0, + "grad_norm": 1.9249684550651125, + "language_loss": 0.74429631, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.76540351, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 9747, + "time_per_iteration": 2.400510311126709 + }, + { + "auxiliary_loss_clip": 0.01062437, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.02106309, + "balance_loss_mlp": 1.0218823, + "epoch": 0.5860814670073651, + "flos": 18946735246080.0, + "grad_norm": 2.2526233750786444, + "language_loss": 0.82245249, + "learning_rate": 1.543520710142051e-06, + "loss": 0.84356654, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40625, + "step": 9748, + "time_per_iteration": 2.3947677612304688 + }, + { + "auxiliary_loss_clip": 0.01060535, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.01856208, + "balance_loss_mlp": 1.02019918, + "epoch": 0.5861415902600331, + "flos": 22560839256960.0, + "grad_norm": 1.7823963644150866, + "language_loss": 0.73263371, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.75371528, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40234375, + "step": 9749, + "time_per_iteration": 2.388312816619873 + }, + { + "auxiliary_loss_clip": 0.01059115, + "auxiliary_loss_mlp": 0.01050986, + "balance_loss_clip": 1.02397346, + "balance_loss_mlp": 1.02013493, + "epoch": 0.586201713512701, + "flos": 14391162979200.0, + "grad_norm": 2.5057649052766076, + "language_loss": 0.76080716, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.78190815, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 9750, + "time_per_iteration": 3.818565607070923 + }, + { + "auxiliary_loss_clip": 0.0105871, + "auxiliary_loss_mlp": 0.01050268, + "balance_loss_clip": 1.02093065, + "balance_loss_mlp": 1.01973081, + "epoch": 0.586261836765369, + "flos": 19497594677760.0, + "grad_norm": 1.9471842730553588, + "language_loss": 0.71956974, + "learning_rate": 1.542383242598344e-06, + "loss": 0.74065953, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.390625, + "step": 9751, + "time_per_iteration": 3.810014009475708 + }, + { + "auxiliary_loss_clip": 0.01059518, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_clip": 1.02404165, + "balance_loss_mlp": 1.01909554, + "epoch": 0.5863219600180369, + "flos": 20700841374720.0, + "grad_norm": 1.743053692931917, + "language_loss": 0.75879192, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.77991766, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40429688, + "step": 9752, + "time_per_iteration": 2.391598701477051 + }, + { + "auxiliary_loss_clip": 0.01056448, + "auxiliary_loss_mlp": 0.01051816, + "balance_loss_clip": 1.02423072, + "balance_loss_mlp": 1.01819229, + "epoch": 0.586382083270705, + "flos": 19791109411200.0, + "grad_norm": 1.827747827252866, + "language_loss": 0.79210883, + "learning_rate": 1.541625017642943e-06, + "loss": 0.81319147, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.3828125, + "step": 9753, + "time_per_iteration": 2.392061471939087 + }, + { + "auxiliary_loss_clip": 0.01057221, + "auxiliary_loss_mlp": 0.0104409, + "balance_loss_clip": 1.01954508, + "balance_loss_mlp": 1.02006292, + "epoch": 0.5864422065233729, + "flos": 16499847542400.0, + "grad_norm": 4.031536510987028, + "language_loss": 0.72227943, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.74329263, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 9754, + "time_per_iteration": 2.36039137840271 + }, + { + "auxiliary_loss_clip": 0.01056517, + "auxiliary_loss_mlp": 0.01051991, + "balance_loss_clip": 1.02468038, + "balance_loss_mlp": 1.0173434, + "epoch": 0.5865023297760409, + "flos": 20412214231680.0, + "grad_norm": 1.5888446302874326, + "language_loss": 0.73173845, + "learning_rate": 1.540866862214043e-06, + "loss": 0.75282359, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39257812, + "step": 9755, + "time_per_iteration": 2.4182701110839844 + }, + { + "auxiliary_loss_clip": 0.01013033, + "auxiliary_loss_mlp": 0.01006134, + "balance_loss_clip": 1.00347519, + "balance_loss_mlp": 1.00555003, + "epoch": 0.5865624530287089, + "flos": 63347666319360.0, + "grad_norm": 0.7413834788762707, + "language_loss": 0.56997204, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59016371, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.07470703, + "step": 9756, + "time_per_iteration": 3.0084025859832764 + }, + { + "auxiliary_loss_clip": 0.01055327, + "auxiliary_loss_mlp": 0.01047551, + "balance_loss_clip": 1.02332783, + "balance_loss_mlp": 1.01786399, + "epoch": 0.5866225762813768, + "flos": 27015058247040.0, + "grad_norm": 1.6775201664448427, + "language_loss": 0.77615452, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.79718328, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 9757, + "time_per_iteration": 2.4750819206237793 + }, + { + "auxiliary_loss_clip": 0.01012298, + "auxiliary_loss_mlp": 0.0102221, + "balance_loss_clip": 1.01956403, + "balance_loss_mlp": 1.00502825, + "epoch": 0.5866826995340448, + "flos": 72983554721280.0, + "grad_norm": 0.8564145177413587, + "language_loss": 0.60649371, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.6268388, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.07275391, + "step": 9758, + "time_per_iteration": 4.476247787475586 + }, + { + "auxiliary_loss_clip": 0.0105902, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.02704573, + "balance_loss_mlp": 1.01871252, + "epoch": 0.5867428227867127, + "flos": 21284728819200.0, + "grad_norm": 2.0605510637049256, + "language_loss": 0.73864412, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.75979161, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40234375, + "step": 9759, + "time_per_iteration": 2.415647029876709 + }, + { + "auxiliary_loss_clip": 0.01057867, + "auxiliary_loss_mlp": 0.01057549, + "balance_loss_clip": 1.03154922, + "balance_loss_mlp": 1.01845789, + "epoch": 0.5868029460393808, + "flos": 33467601392640.0, + "grad_norm": 1.9039061845484746, + "language_loss": 0.73792493, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75907904, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 9760, + "time_per_iteration": 2.501605272293091 + }, + { + "auxiliary_loss_clip": 0.01057031, + "auxiliary_loss_mlp": 0.01048093, + "balance_loss_clip": 1.02184296, + "balance_loss_mlp": 1.01796579, + "epoch": 0.5868630692920487, + "flos": 17888657448960.0, + "grad_norm": 3.0528149530363113, + "language_loss": 0.74023652, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.76128781, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 9761, + "time_per_iteration": 2.3685927391052246 + }, + { + "auxiliary_loss_clip": 0.01061634, + "auxiliary_loss_mlp": 0.01056957, + "balance_loss_clip": 1.02701116, + "balance_loss_mlp": 1.0205617, + "epoch": 0.5869231925447167, + "flos": 21033912545280.0, + "grad_norm": 1.8360524145174864, + "language_loss": 0.75692928, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77811515, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41015625, + "step": 9762, + "time_per_iteration": 2.3865840435028076 + }, + { + "auxiliary_loss_clip": 0.0105966, + "auxiliary_loss_mlp": 0.01053581, + "balance_loss_clip": 1.0259366, + "balance_loss_mlp": 1.02157152, + "epoch": 0.5869833157973846, + "flos": 74735707680000.0, + "grad_norm": 1.2714532985385563, + "language_loss": 0.73446083, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.7555933, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.37890625, + "step": 9763, + "time_per_iteration": 2.822741985321045 + }, + { + "auxiliary_loss_clip": 0.0105903, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.01827013, + "balance_loss_mlp": 1.02111769, + "epoch": 0.5870434390500526, + "flos": 17638050643200.0, + "grad_norm": 1.5031529366500294, + "language_loss": 0.81271434, + "learning_rate": 1.53745602625755e-06, + "loss": 0.83373535, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 9764, + "time_per_iteration": 2.428218126296997 + }, + { + "auxiliary_loss_clip": 0.01062841, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.02490783, + "balance_loss_mlp": 1.0227741, + "epoch": 0.5871035623027205, + "flos": 21505100520960.0, + "grad_norm": 1.6664280941894007, + "language_loss": 0.79999435, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.82113284, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 9765, + "time_per_iteration": 2.4260759353637695 + }, + { + "auxiliary_loss_clip": 0.01059844, + "auxiliary_loss_mlp": 0.01051337, + "balance_loss_clip": 1.02393103, + "balance_loss_mlp": 1.02197659, + "epoch": 0.5871636855553886, + "flos": 13551048000000.0, + "grad_norm": 1.8576583243589238, + "language_loss": 0.85036755, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.87147939, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37890625, + "step": 9766, + "time_per_iteration": 2.377051591873169 + }, + { + "auxiliary_loss_clip": 0.01065724, + "auxiliary_loss_mlp": 0.01050612, + "balance_loss_clip": 1.02357566, + "balance_loss_mlp": 1.02399993, + "epoch": 0.5872238088080565, + "flos": 26211741707520.0, + "grad_norm": 1.515951626664863, + "language_loss": 0.70344818, + "learning_rate": 1.536319396136257e-06, + "loss": 0.72461152, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.41796875, + "step": 9767, + "time_per_iteration": 2.4644381999969482 + }, + { + "auxiliary_loss_clip": 0.01063813, + "auxiliary_loss_mlp": 0.01039561, + "balance_loss_clip": 1.01424122, + "balance_loss_mlp": 1.02376509, + "epoch": 0.5872839320607245, + "flos": 30663866016000.0, + "grad_norm": 1.9142038107917099, + "language_loss": 0.65528989, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.67632365, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 9768, + "time_per_iteration": 2.4804739952087402 + }, + { + "auxiliary_loss_clip": 0.01022791, + "auxiliary_loss_mlp": 0.01026798, + "balance_loss_clip": 1.02427053, + "balance_loss_mlp": 1.0152812, + "epoch": 0.5873440553133924, + "flos": 60300062029440.0, + "grad_norm": 0.7398461795525371, + "language_loss": 0.54055202, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.56104791, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.07519531, + "step": 9769, + "time_per_iteration": 3.053971290588379 + }, + { + "auxiliary_loss_clip": 0.0106314, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.01515591, + "balance_loss_mlp": 1.02482939, + "epoch": 0.5874041785660604, + "flos": 21538338001920.0, + "grad_norm": 1.434541581601141, + "language_loss": 0.71239769, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73342288, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3828125, + "step": 9770, + "time_per_iteration": 2.4311909675598145 + }, + { + "auxiliary_loss_clip": 0.01061415, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.01628721, + "balance_loss_mlp": 1.0235796, + "epoch": 0.5874643018187284, + "flos": 24387809126400.0, + "grad_norm": 1.8488738731751522, + "language_loss": 0.68552989, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.70658422, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.37890625, + "step": 9771, + "time_per_iteration": 2.43574857711792 + }, + { + "auxiliary_loss_clip": 0.01064902, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.01617229, + "balance_loss_mlp": 1.02476048, + "epoch": 0.5875244250713964, + "flos": 28146453632640.0, + "grad_norm": 2.0559316889919272, + "language_loss": 0.67251682, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.69358408, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 9772, + "time_per_iteration": 2.479489803314209 + }, + { + "auxiliary_loss_clip": 0.01065879, + "auxiliary_loss_mlp": 0.01051966, + "balance_loss_clip": 1.02402306, + "balance_loss_mlp": 1.02483547, + "epoch": 0.5875845483240644, + "flos": 25811218056960.0, + "grad_norm": 2.041369653311522, + "language_loss": 0.75990045, + "learning_rate": 1.534046611017519e-06, + "loss": 0.78107888, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41015625, + "step": 9773, + "time_per_iteration": 2.4428458213806152 + }, + { + "auxiliary_loss_clip": 0.01063985, + "auxiliary_loss_mlp": 0.01051156, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.02379918, + "epoch": 0.5876446715767323, + "flos": 26905361155200.0, + "grad_norm": 2.4598936169141683, + "language_loss": 0.54777348, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.56892484, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 9774, + "time_per_iteration": 2.4470460414886475 + }, + { + "auxiliary_loss_clip": 0.0106375, + "auxiliary_loss_mlp": 0.01050041, + "balance_loss_clip": 1.02293265, + "balance_loss_mlp": 1.02386355, + "epoch": 0.5877047948294003, + "flos": 36683346257280.0, + "grad_norm": 2.4216129762090306, + "language_loss": 0.66738808, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.68852603, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3984375, + "step": 9775, + "time_per_iteration": 2.5285141468048096 + }, + { + "auxiliary_loss_clip": 0.01063399, + "auxiliary_loss_mlp": 0.0105179, + "balance_loss_clip": 1.02526605, + "balance_loss_mlp": 1.02381968, + "epoch": 0.5877649180820682, + "flos": 26723498549760.0, + "grad_norm": 1.826914951963819, + "language_loss": 0.7476725, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.76882434, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 9776, + "time_per_iteration": 2.456399917602539 + }, + { + "auxiliary_loss_clip": 0.01063431, + "auxiliary_loss_mlp": 0.01055286, + "balance_loss_clip": 1.02872646, + "balance_loss_mlp": 1.02354527, + "epoch": 0.5878250413347362, + "flos": 21031154547840.0, + "grad_norm": 1.902457289599324, + "language_loss": 0.75696641, + "learning_rate": 1.532531774126821e-06, + "loss": 0.77815354, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 9777, + "time_per_iteration": 2.4330735206604004 + }, + { + "auxiliary_loss_clip": 0.01060399, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_clip": 1.02083373, + "balance_loss_mlp": 1.02396441, + "epoch": 0.5878851645874041, + "flos": 25483069388160.0, + "grad_norm": 1.6636980117061506, + "language_loss": 0.75383389, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.77488828, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 9778, + "time_per_iteration": 2.4269490242004395 + }, + { + "auxiliary_loss_clip": 0.0106018, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.02866948, + "balance_loss_mlp": 1.02257562, + "epoch": 0.5879452878400722, + "flos": 23767996026240.0, + "grad_norm": 2.0318245475368504, + "language_loss": 0.70881462, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72994775, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 9779, + "time_per_iteration": 2.4315452575683594 + }, + { + "auxiliary_loss_clip": 0.01061388, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03116441, + "balance_loss_mlp": 1.02175117, + "epoch": 0.5880054110927401, + "flos": 17823474207360.0, + "grad_norm": 1.9399220722520079, + "language_loss": 0.68221283, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.70339119, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39648438, + "step": 9780, + "time_per_iteration": 2.402477979660034 + }, + { + "auxiliary_loss_clip": 0.01062199, + "auxiliary_loss_mlp": 0.01055693, + "balance_loss_clip": 1.02775002, + "balance_loss_mlp": 1.02363515, + "epoch": 0.5880655343454081, + "flos": 19462402160640.0, + "grad_norm": 1.9263018884879528, + "language_loss": 0.73937577, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.76055467, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38671875, + "step": 9781, + "time_per_iteration": 2.3893721103668213 + }, + { + "auxiliary_loss_clip": 0.01059597, + "auxiliary_loss_mlp": 0.0105769, + "balance_loss_clip": 1.03294241, + "balance_loss_mlp": 1.0208981, + "epoch": 0.588125657598076, + "flos": 21396520592640.0, + "grad_norm": 1.6476443585366127, + "language_loss": 0.70904386, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.73021674, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 9782, + "time_per_iteration": 2.411966323852539 + }, + { + "auxiliary_loss_clip": 0.01062036, + "auxiliary_loss_mlp": 0.01055708, + "balance_loss_clip": 1.02824259, + "balance_loss_mlp": 1.02131593, + "epoch": 0.588185780850744, + "flos": 16033721713920.0, + "grad_norm": 2.0895369699825412, + "language_loss": 0.7221247, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.74330211, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 9783, + "time_per_iteration": 2.4257309436798096 + }, + { + "auxiliary_loss_clip": 0.01060742, + "auxiliary_loss_mlp": 0.01049014, + "balance_loss_clip": 1.02064204, + "balance_loss_mlp": 1.02058792, + "epoch": 0.588245904103412, + "flos": 23727217691520.0, + "grad_norm": 2.1301783880466316, + "language_loss": 0.70159727, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.72269487, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40234375, + "step": 9784, + "time_per_iteration": 2.3934402465820312 + }, + { + "auxiliary_loss_clip": 0.01059562, + "auxiliary_loss_mlp": 0.01048355, + "balance_loss_clip": 1.02212882, + "balance_loss_mlp": 1.01947534, + "epoch": 0.58830602735608, + "flos": 33801126410880.0, + "grad_norm": 1.9242367773335916, + "language_loss": 0.69917047, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.72024965, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40039062, + "step": 9785, + "time_per_iteration": 3.8367767333984375 + }, + { + "auxiliary_loss_clip": 0.01057126, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.02340281, + "balance_loss_mlp": 1.01947689, + "epoch": 0.588366150608748, + "flos": 17089809563520.0, + "grad_norm": 2.248249477992592, + "language_loss": 0.79706579, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.81811881, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 9786, + "time_per_iteration": 2.3927433490753174 + }, + { + "auxiliary_loss_clip": 0.01057991, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03166854, + "balance_loss_mlp": 1.01885033, + "epoch": 0.5884262738614159, + "flos": 22126100607360.0, + "grad_norm": 16.03189773149027, + "language_loss": 0.80568814, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.82685715, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39257812, + "step": 9787, + "time_per_iteration": 2.404615640640259 + }, + { + "auxiliary_loss_clip": 0.01056261, + "auxiliary_loss_mlp": 0.01047426, + "balance_loss_clip": 1.02286935, + "balance_loss_mlp": 1.01739287, + "epoch": 0.5884863971140839, + "flos": 21030805434240.0, + "grad_norm": 1.5489621279929908, + "language_loss": 0.67235696, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.69339383, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38867188, + "step": 9788, + "time_per_iteration": 2.4064552783966064 + }, + { + "auxiliary_loss_clip": 0.01055916, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.01531875, + "balance_loss_mlp": 1.01893878, + "epoch": 0.5885465203667518, + "flos": 23803991504640.0, + "grad_norm": 2.0758238338132093, + "language_loss": 0.81393886, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.83489579, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 9789, + "time_per_iteration": 2.3997650146484375 + }, + { + "auxiliary_loss_clip": 0.01055438, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.0167855, + "balance_loss_mlp": 1.01817048, + "epoch": 0.5886066436194198, + "flos": 18879562056960.0, + "grad_norm": 1.4910676215258065, + "language_loss": 0.7038579, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72482657, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 9790, + "time_per_iteration": 5.266185760498047 + }, + { + "auxiliary_loss_clip": 0.01057387, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.01750648, + "balance_loss_mlp": 1.01934624, + "epoch": 0.5886667668720877, + "flos": 24788996092800.0, + "grad_norm": 1.7933219988503695, + "language_loss": 0.84635949, + "learning_rate": 1.527232084570895e-06, + "loss": 0.8673659, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 9791, + "time_per_iteration": 2.4299306869506836 + }, + { + "auxiliary_loss_clip": 0.01057898, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.01787722, + "balance_loss_mlp": 1.01936936, + "epoch": 0.5887268901247558, + "flos": 21613366247040.0, + "grad_norm": 1.5734312241864312, + "language_loss": 0.77241886, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.79343909, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9792, + "time_per_iteration": 2.3913958072662354 + }, + { + "auxiliary_loss_clip": 0.01058387, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.01885986, + "balance_loss_mlp": 1.01896536, + "epoch": 0.5887870133774237, + "flos": 20480783875200.0, + "grad_norm": 2.4198945028240546, + "language_loss": 0.70017147, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.72120297, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 9793, + "time_per_iteration": 2.4286458492279053 + }, + { + "auxiliary_loss_clip": 0.010565, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.01439214, + "balance_loss_mlp": 1.01916432, + "epoch": 0.5888471366300917, + "flos": 19205336753280.0, + "grad_norm": 1.8287091972430205, + "language_loss": 0.61531603, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.63626993, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 9794, + "time_per_iteration": 2.351283311843872 + }, + { + "auxiliary_loss_clip": 0.01060029, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.01728678, + "balance_loss_mlp": 1.02080977, + "epoch": 0.5889072598827596, + "flos": 19971924675840.0, + "grad_norm": 1.6953057625030912, + "language_loss": 0.66341114, + "learning_rate": 1.525718531219257e-06, + "loss": 0.68444717, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 9795, + "time_per_iteration": 2.408621311187744 + }, + { + "auxiliary_loss_clip": 0.0105764, + "auxiliary_loss_mlp": 0.01039172, + "balance_loss_clip": 1.01523447, + "balance_loss_mlp": 1.02005124, + "epoch": 0.5889673831354276, + "flos": 20740188343680.0, + "grad_norm": 1.5282127373746623, + "language_loss": 0.74914491, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.77011299, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 9796, + "time_per_iteration": 2.399714708328247 + }, + { + "auxiliary_loss_clip": 0.01058704, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.01826644, + "balance_loss_mlp": 1.01985061, + "epoch": 0.5890275063880956, + "flos": 25299775416960.0, + "grad_norm": 1.557435602958131, + "language_loss": 0.83843076, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85943615, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38867188, + "step": 9797, + "time_per_iteration": 2.426107883453369 + }, + { + "auxiliary_loss_clip": 0.01056258, + "auxiliary_loss_mlp": 0.01040449, + "balance_loss_clip": 1.01546288, + "balance_loss_mlp": 1.01915383, + "epoch": 0.5890876296407636, + "flos": 11764577174400.0, + "grad_norm": 2.095080868489773, + "language_loss": 0.79941326, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.82038033, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 9798, + "time_per_iteration": 3.7924790382385254 + }, + { + "auxiliary_loss_clip": 0.01057082, + "auxiliary_loss_mlp": 0.01037167, + "balance_loss_clip": 1.01387334, + "balance_loss_mlp": 1.01947284, + "epoch": 0.5891477528934316, + "flos": 13588614489600.0, + "grad_norm": 5.07606119058867, + "language_loss": 0.76382101, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.78476351, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37695312, + "step": 9799, + "time_per_iteration": 2.4188294410705566 + }, + { + "auxiliary_loss_clip": 0.01060951, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.01572967, + "balance_loss_mlp": 1.02046514, + "epoch": 0.5892078761460995, + "flos": 15048298189440.0, + "grad_norm": 2.0324980849070373, + "language_loss": 0.77970183, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.80074131, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40625, + "step": 9800, + "time_per_iteration": 2.3576500415802 + }, + { + "auxiliary_loss_clip": 0.01060581, + "auxiliary_loss_mlp": 0.01047019, + "balance_loss_clip": 1.02190161, + "balance_loss_mlp": 1.02092004, + "epoch": 0.5892679993987675, + "flos": 15777319622400.0, + "grad_norm": 1.8803605095584501, + "language_loss": 0.8060469, + "learning_rate": 1.523448741022722e-06, + "loss": 0.82712293, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39648438, + "step": 9801, + "time_per_iteration": 2.3911850452423096 + }, + { + "auxiliary_loss_clip": 0.01061109, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.00904655, + "balance_loss_mlp": 1.02143645, + "epoch": 0.5893281226514354, + "flos": 25264024318080.0, + "grad_norm": 1.6901665474660512, + "language_loss": 0.67164814, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.69261169, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3984375, + "step": 9802, + "time_per_iteration": 2.419430732727051 + }, + { + "auxiliary_loss_clip": 0.01058657, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.01482868, + "balance_loss_mlp": 1.01986456, + "epoch": 0.5893882459041034, + "flos": 19457374924800.0, + "grad_norm": 1.5644278859641219, + "language_loss": 0.7867099, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80769086, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38671875, + "step": 9803, + "time_per_iteration": 2.402287244796753 + }, + { + "auxiliary_loss_clip": 0.01060116, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.0169611, + "balance_loss_mlp": 1.01996279, + "epoch": 0.5894483691567713, + "flos": 20632935047040.0, + "grad_norm": 1.7366428014516204, + "language_loss": 0.73642284, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75744605, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40039062, + "step": 9804, + "time_per_iteration": 2.4120917320251465 + }, + { + "auxiliary_loss_clip": 0.01058858, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.01565182, + "balance_loss_mlp": 1.02099419, + "epoch": 0.5895084924094394, + "flos": 17777459168640.0, + "grad_norm": 1.5008006217082384, + "language_loss": 0.76376498, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.78476387, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 9805, + "time_per_iteration": 2.3930094242095947 + }, + { + "auxiliary_loss_clip": 0.0106411, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.0173192, + "balance_loss_mlp": 1.0221014, + "epoch": 0.5895686156621073, + "flos": 20120026129920.0, + "grad_norm": 1.874442911382488, + "language_loss": 0.79020405, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.81129086, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41992188, + "step": 9806, + "time_per_iteration": 2.4049558639526367 + }, + { + "auxiliary_loss_clip": 0.01058028, + "auxiliary_loss_mlp": 0.01042248, + "balance_loss_clip": 1.01488936, + "balance_loss_mlp": 1.01937485, + "epoch": 0.5896287389147753, + "flos": 20849012651520.0, + "grad_norm": 1.8709451041246972, + "language_loss": 0.78523362, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.80623639, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 9807, + "time_per_iteration": 2.4193782806396484 + }, + { + "auxiliary_loss_clip": 0.01062218, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.01332688, + "balance_loss_mlp": 1.02217817, + "epoch": 0.5896888621674432, + "flos": 14537030106240.0, + "grad_norm": 1.8416547567645436, + "language_loss": 0.76081586, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.78184533, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 9808, + "time_per_iteration": 2.3565709590911865 + }, + { + "auxiliary_loss_clip": 0.01062188, + "auxiliary_loss_mlp": 0.01043033, + "balance_loss_clip": 1.01500654, + "balance_loss_mlp": 1.02157044, + "epoch": 0.5897489854201112, + "flos": 20885706357120.0, + "grad_norm": 1.9715277990572166, + "language_loss": 0.73674321, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.75779545, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40625, + "step": 9809, + "time_per_iteration": 2.4090802669525146 + }, + { + "auxiliary_loss_clip": 0.01061602, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.01836324, + "balance_loss_mlp": 1.02087998, + "epoch": 0.5898091086727792, + "flos": 20010119569920.0, + "grad_norm": 2.0932944629257344, + "language_loss": 0.83882666, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.85990149, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40625, + "step": 9810, + "time_per_iteration": 2.3816840648651123 + }, + { + "auxiliary_loss_clip": 0.01057954, + "auxiliary_loss_mlp": 0.01043143, + "balance_loss_clip": 1.01771617, + "balance_loss_mlp": 1.01990831, + "epoch": 0.5898692319254472, + "flos": 16252312936320.0, + "grad_norm": 1.5535813052226084, + "language_loss": 0.82372475, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.84473568, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 9811, + "time_per_iteration": 2.3717074394226074 + }, + { + "auxiliary_loss_clip": 0.01061246, + "auxiliary_loss_mlp": 0.01049205, + "balance_loss_clip": 1.02176261, + "balance_loss_mlp": 1.0219698, + "epoch": 0.5899293551781152, + "flos": 20447511482880.0, + "grad_norm": 1.676837051849893, + "language_loss": 0.77675062, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79785514, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39257812, + "step": 9812, + "time_per_iteration": 2.4155492782592773 + }, + { + "auxiliary_loss_clip": 0.01059694, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.01994228, + "balance_loss_mlp": 1.02118349, + "epoch": 0.5899894784307831, + "flos": 13880837502720.0, + "grad_norm": 1.6774681844377857, + "language_loss": 0.71719193, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.73821473, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.38476562, + "step": 9813, + "time_per_iteration": 2.3653905391693115 + }, + { + "auxiliary_loss_clip": 0.01056898, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_clip": 1.01827228, + "balance_loss_mlp": 1.01851416, + "epoch": 0.5900496016834511, + "flos": 20082773842560.0, + "grad_norm": 1.613512684382054, + "language_loss": 0.73272288, + "learning_rate": 1.518533098148494e-06, + "loss": 0.75373244, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 9814, + "time_per_iteration": 2.414436101913452 + }, + { + "auxiliary_loss_clip": 0.01058862, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.0164727, + "balance_loss_mlp": 1.02067578, + "epoch": 0.590109724936119, + "flos": 20258317491840.0, + "grad_norm": 1.91464007456179, + "language_loss": 0.79493254, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.81593937, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 9815, + "time_per_iteration": 2.4081943035125732 + }, + { + "auxiliary_loss_clip": 0.01060801, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_clip": 1.0195508, + "balance_loss_mlp": 1.01989233, + "epoch": 0.590169848188787, + "flos": 24234156766080.0, + "grad_norm": 2.453378709548626, + "language_loss": 0.77481234, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.79589581, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41015625, + "step": 9816, + "time_per_iteration": 2.4371511936187744 + }, + { + "auxiliary_loss_clip": 0.01057114, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.01510656, + "balance_loss_mlp": 1.01916194, + "epoch": 0.590229971441455, + "flos": 17783778124800.0, + "grad_norm": 1.7607873733410155, + "language_loss": 0.81979162, + "learning_rate": 1.517399156051309e-06, + "loss": 0.84075952, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 9817, + "time_per_iteration": 2.3585567474365234 + }, + { + "auxiliary_loss_clip": 0.01058057, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_clip": 1.02321291, + "balance_loss_mlp": 1.01883864, + "epoch": 0.590290094694123, + "flos": 22235797699200.0, + "grad_norm": 1.9731097852977253, + "language_loss": 0.77723837, + "learning_rate": 1.517021211933682e-06, + "loss": 0.79831088, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 9818, + "time_per_iteration": 2.3959803581237793 + }, + { + "auxiliary_loss_clip": 0.01056844, + "auxiliary_loss_mlp": 0.01043764, + "balance_loss_clip": 1.01870644, + "balance_loss_mlp": 1.01869321, + "epoch": 0.5903502179467909, + "flos": 19097629608960.0, + "grad_norm": 1.823609893029903, + "language_loss": 0.68174821, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.70275432, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 9819, + "time_per_iteration": 2.3611340522766113 + }, + { + "auxiliary_loss_clip": 0.01057376, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.0159831, + "balance_loss_mlp": 1.01903284, + "epoch": 0.5904103411994589, + "flos": 24234575702400.0, + "grad_norm": 1.825395038309527, + "language_loss": 0.78959, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.81057847, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9820, + "time_per_iteration": 2.424339532852173 + }, + { + "auxiliary_loss_clip": 0.0100955, + "auxiliary_loss_mlp": 0.01017824, + "balance_loss_clip": 1.01507068, + "balance_loss_mlp": 1.00209427, + "epoch": 0.5904704644521268, + "flos": 64873650424320.0, + "grad_norm": 0.9245256291211371, + "language_loss": 0.65184909, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67212278, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.07421875, + "step": 9821, + "time_per_iteration": 2.996143102645874 + }, + { + "auxiliary_loss_clip": 0.01056067, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.01919866, + "balance_loss_mlp": 1.0181222, + "epoch": 0.5905305877047948, + "flos": 19608967514880.0, + "grad_norm": 2.043262325818316, + "language_loss": 0.62228703, + "learning_rate": 1.515509618752521e-06, + "loss": 0.64328659, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37890625, + "step": 9822, + "time_per_iteration": 2.3726558685302734 + }, + { + "auxiliary_loss_clip": 0.01058401, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.01972818, + "balance_loss_mlp": 1.01880682, + "epoch": 0.5905907109574628, + "flos": 18988630744320.0, + "grad_norm": 1.950797215480475, + "language_loss": 0.84151828, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.86255801, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 9823, + "time_per_iteration": 2.380657196044922 + }, + { + "auxiliary_loss_clip": 0.0105583, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.0135293, + "balance_loss_mlp": 1.0183301, + "epoch": 0.5906508342101308, + "flos": 22199313461760.0, + "grad_norm": 2.160961605694398, + "language_loss": 0.74832028, + "learning_rate": 1.514753932336165e-06, + "loss": 0.7692703, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 9824, + "time_per_iteration": 2.388810634613037 + }, + { + "auxiliary_loss_clip": 0.01063196, + "auxiliary_loss_mlp": 0.01043948, + "balance_loss_clip": 1.01433635, + "balance_loss_mlp": 1.02013254, + "epoch": 0.5907109574627988, + "flos": 20885636534400.0, + "grad_norm": 2.0251671970825527, + "language_loss": 0.84598768, + "learning_rate": 1.514376116721693e-06, + "loss": 0.86705911, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4296875, + "step": 9825, + "time_per_iteration": 3.591508150100708 + }, + { + "auxiliary_loss_clip": 0.01054315, + "auxiliary_loss_mlp": 0.01038642, + "balance_loss_clip": 1.01594436, + "balance_loss_mlp": 1.01808512, + "epoch": 0.5907710807154667, + "flos": 21505589280000.0, + "grad_norm": 1.8172376098103686, + "language_loss": 0.76905835, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78998792, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 9826, + "time_per_iteration": 2.374995470046997 + }, + { + "auxiliary_loss_clip": 0.01054362, + "auxiliary_loss_mlp": 0.01036558, + "balance_loss_clip": 1.01094031, + "balance_loss_mlp": 1.01706445, + "epoch": 0.5908312039681347, + "flos": 22017276299520.0, + "grad_norm": 1.6017280068908066, + "language_loss": 0.73321772, + "learning_rate": 1.513620540751793e-06, + "loss": 0.75412691, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 9827, + "time_per_iteration": 2.397533655166626 + }, + { + "auxiliary_loss_clip": 0.0105623, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.01385856, + "balance_loss_mlp": 1.01774395, + "epoch": 0.5908913272208026, + "flos": 18478514736000.0, + "grad_norm": 1.7070527771604371, + "language_loss": 0.80268192, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.82362902, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38476562, + "step": 9828, + "time_per_iteration": 2.3873817920684814 + }, + { + "auxiliary_loss_clip": 0.0105924, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.01680732, + "balance_loss_mlp": 1.0194962, + "epoch": 0.5909514504734706, + "flos": 12311386888320.0, + "grad_norm": 3.309359692456124, + "language_loss": 0.89635217, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.91738546, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3984375, + "step": 9829, + "time_per_iteration": 5.3355231285095215 + }, + { + "auxiliary_loss_clip": 0.0101241, + "auxiliary_loss_mlp": 0.01004755, + "balance_loss_clip": 1.00231135, + "balance_loss_mlp": 1.00474179, + "epoch": 0.5910115737261386, + "flos": 70209879891840.0, + "grad_norm": 0.7588367477791231, + "language_loss": 0.57929075, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59946239, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.07666016, + "step": 9830, + "time_per_iteration": 2.9433066844940186 + }, + { + "auxiliary_loss_clip": 0.01061439, + "auxiliary_loss_mlp": 0.01046706, + "balance_loss_clip": 1.01707029, + "balance_loss_mlp": 1.01885688, + "epoch": 0.5910716969788066, + "flos": 22016682806400.0, + "grad_norm": 4.580588646340233, + "language_loss": 0.78057647, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.80165792, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.42578125, + "step": 9831, + "time_per_iteration": 2.38224720954895 + }, + { + "auxiliary_loss_clip": 0.01055348, + "auxiliary_loss_mlp": 0.01038375, + "balance_loss_clip": 1.01378214, + "balance_loss_mlp": 1.01824069, + "epoch": 0.5911318202314745, + "flos": 21250583642880.0, + "grad_norm": 1.9566153667282091, + "language_loss": 0.7826122, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80354947, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 9832, + "time_per_iteration": 2.4239137172698975 + }, + { + "auxiliary_loss_clip": 0.01056781, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.01039279, + "balance_loss_mlp": 1.01832116, + "epoch": 0.5911919434841425, + "flos": 17820646387200.0, + "grad_norm": 1.937199388278729, + "language_loss": 0.84898812, + "learning_rate": 1.511354255945847e-06, + "loss": 0.86989999, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38476562, + "step": 9833, + "time_per_iteration": 2.404972791671753 + }, + { + "auxiliary_loss_clip": 0.01056287, + "auxiliary_loss_mlp": 0.01038423, + "balance_loss_clip": 1.01181543, + "balance_loss_mlp": 1.01803768, + "epoch": 0.5912520667368104, + "flos": 20373774958080.0, + "grad_norm": 1.8074089091538157, + "language_loss": 0.75033998, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.77128708, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 9834, + "time_per_iteration": 2.3910415172576904 + }, + { + "auxiliary_loss_clip": 0.01055881, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.01408744, + "balance_loss_mlp": 1.0174408, + "epoch": 0.5913121899894784, + "flos": 17929610340480.0, + "grad_norm": 2.2768533605267436, + "language_loss": 0.79449487, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.81544822, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 9835, + "time_per_iteration": 2.343096971511841 + }, + { + "auxiliary_loss_clip": 0.01057262, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.01332974, + "balance_loss_mlp": 1.01889133, + "epoch": 0.5913723132421465, + "flos": 22125856227840.0, + "grad_norm": 2.053847170654326, + "language_loss": 0.74529678, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76625741, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9836, + "time_per_iteration": 2.4411516189575195 + }, + { + "auxiliary_loss_clip": 0.01056961, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.01173639, + "balance_loss_mlp": 1.01809418, + "epoch": 0.5914324364948144, + "flos": 15697229230080.0, + "grad_norm": 1.9953831664624084, + "language_loss": 0.84092164, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.86187053, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 9837, + "time_per_iteration": 3.890681266784668 + }, + { + "auxiliary_loss_clip": 0.01057029, + "auxiliary_loss_mlp": 0.01041279, + "balance_loss_clip": 1.01523137, + "balance_loss_mlp": 1.01858115, + "epoch": 0.5914925597474824, + "flos": 22746227909760.0, + "grad_norm": 1.6790512099861714, + "language_loss": 0.80551982, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.82650292, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38476562, + "step": 9838, + "time_per_iteration": 2.4116079807281494 + }, + { + "auxiliary_loss_clip": 0.01057049, + "auxiliary_loss_mlp": 0.01040898, + "balance_loss_clip": 1.01399231, + "balance_loss_mlp": 1.01827657, + "epoch": 0.5915526830001503, + "flos": 18291904185600.0, + "grad_norm": 1.8720173676136334, + "language_loss": 0.7120378, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.73301721, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 9839, + "time_per_iteration": 2.354358673095703 + }, + { + "auxiliary_loss_clip": 0.01058309, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.01655614, + "balance_loss_mlp": 1.01803267, + "epoch": 0.5916128062528183, + "flos": 17018132808960.0, + "grad_norm": 2.727759830983989, + "language_loss": 0.66573399, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.6867463, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 9840, + "time_per_iteration": 2.375784158706665 + }, + { + "auxiliary_loss_clip": 0.01057151, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.01082826, + "balance_loss_mlp": 1.01785111, + "epoch": 0.5916729295054862, + "flos": 24753070437120.0, + "grad_norm": 1.8564403581734852, + "language_loss": 0.82463914, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.84557152, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39257812, + "step": 9841, + "time_per_iteration": 2.4149649143218994 + }, + { + "auxiliary_loss_clip": 0.0105366, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.01498878, + "balance_loss_mlp": 1.01666641, + "epoch": 0.5917330527581542, + "flos": 15957366837120.0, + "grad_norm": 2.4381471451219485, + "language_loss": 0.70548391, + "learning_rate": 1.507956080444291e-06, + "loss": 0.72641063, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 9842, + "time_per_iteration": 2.3622469902038574 + }, + { + "auxiliary_loss_clip": 0.01058411, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.01700926, + "balance_loss_mlp": 1.01889896, + "epoch": 0.5917931760108222, + "flos": 23799732318720.0, + "grad_norm": 1.9267965696881344, + "language_loss": 0.83556116, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85657728, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 9843, + "time_per_iteration": 2.403280258178711 + }, + { + "auxiliary_loss_clip": 0.01056112, + "auxiliary_loss_mlp": 0.01040922, + "balance_loss_clip": 1.01511312, + "balance_loss_mlp": 1.01756716, + "epoch": 0.5918532992634902, + "flos": 23248733241600.0, + "grad_norm": 2.482368880578817, + "language_loss": 0.83418965, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.85516, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9844, + "time_per_iteration": 2.3888766765594482 + }, + { + "auxiliary_loss_clip": 0.01058288, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.01318645, + "balance_loss_mlp": 1.01833081, + "epoch": 0.5919134225161581, + "flos": 19498851486720.0, + "grad_norm": 2.0166191137273946, + "language_loss": 0.75658673, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.77754396, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.40039062, + "step": 9845, + "time_per_iteration": 2.370279312133789 + }, + { + "auxiliary_loss_clip": 0.01057539, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.0160588, + "balance_loss_mlp": 1.01825953, + "epoch": 0.5919735457688261, + "flos": 38799397117440.0, + "grad_norm": 1.9341905431498867, + "language_loss": 0.6548453, + "learning_rate": 1.506446264718213e-06, + "loss": 0.67585969, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39257812, + "step": 9846, + "time_per_iteration": 2.552384376525879 + }, + { + "auxiliary_loss_clip": 0.01053124, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.01106048, + "balance_loss_mlp": 1.01749229, + "epoch": 0.592033669021494, + "flos": 22162899047040.0, + "grad_norm": 1.8267633889259995, + "language_loss": 0.77746105, + "learning_rate": 1.506068857539931e-06, + "loss": 0.79832441, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 9847, + "time_per_iteration": 2.382852077484131 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01041442, + "balance_loss_clip": 1.01572812, + "balance_loss_mlp": 1.01765275, + "epoch": 0.592093792274162, + "flos": 22709883317760.0, + "grad_norm": 2.548019539662544, + "language_loss": 0.64375609, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.66473746, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 9848, + "time_per_iteration": 2.4100582599639893 + }, + { + "auxiliary_loss_clip": 0.01057545, + "auxiliary_loss_mlp": 0.01040261, + "balance_loss_clip": 1.01495337, + "balance_loss_mlp": 1.01806593, + "epoch": 0.59215391552683, + "flos": 22527846155520.0, + "grad_norm": 1.802175035767617, + "language_loss": 0.77724826, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.79822636, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39453125, + "step": 9849, + "time_per_iteration": 2.395768880844116 + }, + { + "auxiliary_loss_clip": 0.0105708, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.01740086, + "balance_loss_mlp": 1.01740742, + "epoch": 0.592214038779498, + "flos": 24497855331840.0, + "grad_norm": 1.8359711267540675, + "language_loss": 0.7651121, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.78613937, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.39648438, + "step": 9850, + "time_per_iteration": 2.4219248294830322 + }, + { + "auxiliary_loss_clip": 0.01055289, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.02072382, + "balance_loss_mlp": 1.01694489, + "epoch": 0.592274162032166, + "flos": 21830386458240.0, + "grad_norm": 3.196406052552999, + "language_loss": 0.76592416, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.78693676, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 9851, + "time_per_iteration": 2.3682548999786377 + }, + { + "auxiliary_loss_clip": 0.01056986, + "auxiliary_loss_mlp": 0.01038899, + "balance_loss_clip": 1.01483095, + "balance_loss_mlp": 1.01754022, + "epoch": 0.5923342852848339, + "flos": 24606993841920.0, + "grad_norm": 2.2216923732476874, + "language_loss": 0.71832216, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.739281, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.39453125, + "step": 9852, + "time_per_iteration": 2.4349143505096436 + }, + { + "auxiliary_loss_clip": 0.01060008, + "auxiliary_loss_mlp": 0.01050138, + "balance_loss_clip": 1.02174234, + "balance_loss_mlp": 1.01838684, + "epoch": 0.5923944085375019, + "flos": 19937116183680.0, + "grad_norm": 1.6194195799452875, + "language_loss": 0.81216586, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.83326733, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.41601562, + "step": 9853, + "time_per_iteration": 2.4016530513763428 + }, + { + "auxiliary_loss_clip": 0.01054389, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.01340091, + "balance_loss_mlp": 1.01733184, + "epoch": 0.5924545317901698, + "flos": 28657232956800.0, + "grad_norm": 1.6575426671886002, + "language_loss": 0.6867671, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.7076844, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 9854, + "time_per_iteration": 2.4455151557922363 + }, + { + "auxiliary_loss_clip": 0.01054825, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.01297963, + "balance_loss_mlp": 1.01723671, + "epoch": 0.5925146550428378, + "flos": 19863868417920.0, + "grad_norm": 1.8661664873016393, + "language_loss": 0.89946586, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.92037833, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 9855, + "time_per_iteration": 2.3918418884277344 + }, + { + "auxiliary_loss_clip": 0.01055549, + "auxiliary_loss_mlp": 0.01045431, + "balance_loss_clip": 1.02100527, + "balance_loss_mlp": 1.01771665, + "epoch": 0.5925747782955058, + "flos": 15122069625600.0, + "grad_norm": 1.6583025908691962, + "language_loss": 0.87809771, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.89910746, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 9856, + "time_per_iteration": 2.3781585693359375 + }, + { + "auxiliary_loss_clip": 0.01056994, + "auxiliary_loss_mlp": 0.01041954, + "balance_loss_clip": 1.01727772, + "balance_loss_mlp": 1.01789248, + "epoch": 0.5926349015481738, + "flos": 18404464008960.0, + "grad_norm": 1.8783595993335969, + "language_loss": 0.78435898, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.80534846, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.390625, + "step": 9857, + "time_per_iteration": 2.361701488494873 + }, + { + "auxiliary_loss_clip": 0.01057232, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_clip": 1.02162671, + "balance_loss_mlp": 1.0182209, + "epoch": 0.5926950248008417, + "flos": 23110057854720.0, + "grad_norm": 2.0581972726452884, + "language_loss": 0.66536951, + "learning_rate": 1.501918617901419e-06, + "loss": 0.68639863, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.390625, + "step": 9858, + "time_per_iteration": 2.377786636352539 + }, + { + "auxiliary_loss_clip": 0.01055086, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.01615644, + "balance_loss_mlp": 1.01647568, + "epoch": 0.5927551480535097, + "flos": 28032776645760.0, + "grad_norm": 1.7312770721302726, + "language_loss": 0.78262144, + "learning_rate": 1.501541436426501e-06, + "loss": 0.8035655, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38671875, + "step": 9859, + "time_per_iteration": 2.4458224773406982 + }, + { + "auxiliary_loss_clip": 0.01057866, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.01256275, + "balance_loss_mlp": 1.01764834, + "epoch": 0.5928152713061776, + "flos": 21797602824960.0, + "grad_norm": 2.40942840287197, + "language_loss": 0.77119768, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.7921735, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40234375, + "step": 9860, + "time_per_iteration": 2.375856399536133 + }, + { + "auxiliary_loss_clip": 0.01055434, + "auxiliary_loss_mlp": 0.01041873, + "balance_loss_clip": 1.01735139, + "balance_loss_mlp": 1.01670718, + "epoch": 0.5928753945588456, + "flos": 24315678524160.0, + "grad_norm": 1.6332966781191838, + "language_loss": 0.77835977, + "learning_rate": 1.500787130195763e-06, + "loss": 0.7993328, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 9861, + "time_per_iteration": 2.416691303253174 + }, + { + "auxiliary_loss_clip": 0.01054315, + "auxiliary_loss_mlp": 0.01036027, + "balance_loss_clip": 1.01191044, + "balance_loss_mlp": 1.01673079, + "epoch": 0.5929355178115137, + "flos": 26463535499520.0, + "grad_norm": 1.6428174741266346, + "language_loss": 0.71606749, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.7369709, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 9862, + "time_per_iteration": 2.4039266109466553 + }, + { + "auxiliary_loss_clip": 0.0105719, + "auxiliary_loss_mlp": 0.01037525, + "balance_loss_clip": 1.0125035, + "balance_loss_mlp": 1.01870704, + "epoch": 0.5929956410641816, + "flos": 24965028501120.0, + "grad_norm": 1.8238357687828608, + "language_loss": 0.7888006, + "learning_rate": 1.500032899685832e-06, + "loss": 0.80974776, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 9863, + "time_per_iteration": 3.783052921295166 + }, + { + "auxiliary_loss_clip": 0.01055435, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.01317787, + "balance_loss_mlp": 1.01732397, + "epoch": 0.5930557643168496, + "flos": 26207273053440.0, + "grad_norm": 1.6561127682033918, + "language_loss": 0.71510959, + "learning_rate": 1.499655812861921e-06, + "loss": 0.73604923, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 9864, + "time_per_iteration": 2.4254157543182373 + }, + { + "auxiliary_loss_clip": 0.0105672, + "auxiliary_loss_mlp": 0.0103568, + "balance_loss_clip": 1.01108682, + "balance_loss_mlp": 1.01750231, + "epoch": 0.5931158875695175, + "flos": 27853706949120.0, + "grad_norm": 1.5979681763400244, + "language_loss": 0.68778747, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.7087115, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39257812, + "step": 9865, + "time_per_iteration": 2.4131698608398438 + }, + { + "auxiliary_loss_clip": 0.01058285, + "auxiliary_loss_mlp": 0.01045682, + "balance_loss_clip": 1.01753664, + "balance_loss_mlp": 1.01779819, + "epoch": 0.5931760108221855, + "flos": 15412756538880.0, + "grad_norm": 2.20776128885764, + "language_loss": 0.8028183, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.82385802, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40429688, + "step": 9866, + "time_per_iteration": 2.34771466255188 + }, + { + "auxiliary_loss_clip": 0.01055316, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.01113236, + "balance_loss_mlp": 1.0172112, + "epoch": 0.5932361340748534, + "flos": 30187266779520.0, + "grad_norm": 1.9552084251447066, + "language_loss": 0.73629677, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.75720525, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 9867, + "time_per_iteration": 2.442326545715332 + }, + { + "auxiliary_loss_clip": 0.01055979, + "auxiliary_loss_mlp": 0.01041129, + "balance_loss_clip": 1.01567745, + "balance_loss_mlp": 1.01819146, + "epoch": 0.5932962573275214, + "flos": 20156510367360.0, + "grad_norm": 1.558930669342135, + "language_loss": 0.68402773, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.70499879, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 9868, + "time_per_iteration": 2.3611552715301514 + }, + { + "auxiliary_loss_clip": 0.01058047, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.01235127, + "balance_loss_mlp": 1.01802313, + "epoch": 0.5933563805801894, + "flos": 25444769760000.0, + "grad_norm": 1.5142922592889356, + "language_loss": 0.76241916, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.78339422, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40039062, + "step": 9869, + "time_per_iteration": 5.0830559730529785 + }, + { + "auxiliary_loss_clip": 0.01057822, + "auxiliary_loss_mlp": 0.01042124, + "balance_loss_clip": 1.01673269, + "balance_loss_mlp": 1.01866794, + "epoch": 0.5934165038328574, + "flos": 59993701781760.0, + "grad_norm": 1.6271390468488758, + "language_loss": 0.74956501, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.77056444, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 9870, + "time_per_iteration": 2.7256340980529785 + }, + { + "auxiliary_loss_clip": 0.01057905, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.01189566, + "balance_loss_mlp": 1.01789045, + "epoch": 0.5934766270855253, + "flos": 24419545418880.0, + "grad_norm": 2.4236132085305786, + "language_loss": 0.73340839, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.75436223, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40039062, + "step": 9871, + "time_per_iteration": 2.394937515258789 + }, + { + "auxiliary_loss_clip": 0.01058893, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.01147604, + "balance_loss_mlp": 1.018466, + "epoch": 0.5935367503381933, + "flos": 23512047782400.0, + "grad_norm": 1.8965951428462107, + "language_loss": 0.76538217, + "learning_rate": 1.496639802503271e-06, + "loss": 0.78635681, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 9872, + "time_per_iteration": 2.411900758743286 + }, + { + "auxiliary_loss_clip": 0.01057412, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.01446772, + "balance_loss_mlp": 1.01738369, + "epoch": 0.5935968735908612, + "flos": 18947468384640.0, + "grad_norm": 1.9642003065547438, + "language_loss": 0.80026293, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.82124817, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40039062, + "step": 9873, + "time_per_iteration": 2.3314599990844727 + }, + { + "auxiliary_loss_clip": 0.01056732, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.01628566, + "balance_loss_mlp": 1.01739824, + "epoch": 0.5936569968435292, + "flos": 25482266426880.0, + "grad_norm": 2.1135977215408452, + "language_loss": 0.85698926, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87797451, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 9874, + "time_per_iteration": 2.4232192039489746 + }, + { + "auxiliary_loss_clip": 0.01009775, + "auxiliary_loss_mlp": 0.01002146, + "balance_loss_clip": 0.99966669, + "balance_loss_mlp": 1.00243056, + "epoch": 0.5937171200961973, + "flos": 66375194711040.0, + "grad_norm": 0.7102851665113262, + "language_loss": 0.60183477, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62195396, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.07324219, + "step": 9875, + "time_per_iteration": 3.1292707920074463 + }, + { + "auxiliary_loss_clip": 0.01058601, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.01232839, + "balance_loss_mlp": 1.01740289, + "epoch": 0.5937772433488652, + "flos": 14902570707840.0, + "grad_norm": 1.9212546015040861, + "language_loss": 0.78712887, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.80812013, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.41210938, + "step": 9876, + "time_per_iteration": 2.355015516281128 + }, + { + "auxiliary_loss_clip": 0.01051439, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.01304793, + "balance_loss_mlp": 1.01543403, + "epoch": 0.5938373666015332, + "flos": 22560490143360.0, + "grad_norm": 2.7260805860399056, + "language_loss": 0.7635507, + "learning_rate": 1.494755415907243e-06, + "loss": 0.7844274, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 9877, + "time_per_iteration": 3.811195135116577 + }, + { + "auxiliary_loss_clip": 0.01056243, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.0138073, + "balance_loss_mlp": 1.01738334, + "epoch": 0.5938974898542011, + "flos": 18439935816960.0, + "grad_norm": 2.2181731239797817, + "language_loss": 0.82335013, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.84430718, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38867188, + "step": 9878, + "time_per_iteration": 2.3364570140838623 + }, + { + "auxiliary_loss_clip": 0.01055395, + "auxiliary_loss_mlp": 0.01040638, + "balance_loss_clip": 1.01453161, + "balance_loss_mlp": 1.01746964, + "epoch": 0.5939576131068691, + "flos": 45585011433600.0, + "grad_norm": 1.9624140787621422, + "language_loss": 0.72286201, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.7438224, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 9879, + "time_per_iteration": 2.585948944091797 + }, + { + "auxiliary_loss_clip": 0.01055601, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.01768196, + "balance_loss_mlp": 1.01762104, + "epoch": 0.594017736359537, + "flos": 23586552357120.0, + "grad_norm": 1.5320826756895167, + "language_loss": 0.58841634, + "learning_rate": 1.493625013742401e-06, + "loss": 0.60938168, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 9880, + "time_per_iteration": 2.3939645290374756 + }, + { + "auxiliary_loss_clip": 0.01055188, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.01584208, + "balance_loss_mlp": 1.01658571, + "epoch": 0.594077859612205, + "flos": 29456045930880.0, + "grad_norm": 1.795227670567752, + "language_loss": 0.78589994, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.80687463, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38476562, + "step": 9881, + "time_per_iteration": 2.4415647983551025 + }, + { + "auxiliary_loss_clip": 0.01055471, + "auxiliary_loss_mlp": 0.01040933, + "balance_loss_clip": 1.01361036, + "balance_loss_mlp": 1.01706493, + "epoch": 0.594137982864873, + "flos": 16799157561600.0, + "grad_norm": 2.0988005771417098, + "language_loss": 0.84078759, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.86175162, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3828125, + "step": 9882, + "time_per_iteration": 2.339254856109619 + }, + { + "auxiliary_loss_clip": 0.01054639, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.01218104, + "balance_loss_mlp": 1.01738071, + "epoch": 0.594198106117541, + "flos": 12749442117120.0, + "grad_norm": 3.0819409682543766, + "language_loss": 0.81676972, + "learning_rate": 1.492494784393667e-06, + "loss": 0.83767635, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 9883, + "time_per_iteration": 2.347867012023926 + }, + { + "auxiliary_loss_clip": 0.01059509, + "auxiliary_loss_mlp": 0.01045472, + "balance_loss_clip": 1.01817298, + "balance_loss_mlp": 1.01916552, + "epoch": 0.5942582293702089, + "flos": 20995473271680.0, + "grad_norm": 1.831010830545076, + "language_loss": 0.75645328, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.77750301, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 9884, + "time_per_iteration": 2.427304744720459 + }, + { + "auxiliary_loss_clip": 0.01057077, + "auxiliary_loss_mlp": 0.01038717, + "balance_loss_clip": 1.01253867, + "balance_loss_mlp": 1.01793611, + "epoch": 0.5943183526228769, + "flos": 28290226078080.0, + "grad_norm": 2.4006219401471984, + "language_loss": 0.68194604, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.70290399, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 9885, + "time_per_iteration": 2.464165449142456 + }, + { + "auxiliary_loss_clip": 0.01055731, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.01546299, + "balance_loss_mlp": 1.01775503, + "epoch": 0.5943784758755448, + "flos": 26613417432960.0, + "grad_norm": 2.368896367707395, + "language_loss": 0.7816304, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.80260158, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 9886, + "time_per_iteration": 2.3972744941711426 + }, + { + "auxiliary_loss_clip": 0.0100962, + "auxiliary_loss_mlp": 0.01011648, + "balance_loss_clip": 1.00893044, + "balance_loss_mlp": 1.00189066, + "epoch": 0.5944385991282128, + "flos": 64187781298560.0, + "grad_norm": 0.8672627612741726, + "language_loss": 0.64732945, + "learning_rate": 1.490988081420423e-06, + "loss": 0.6675421, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.07714844, + "step": 9887, + "time_per_iteration": 2.9264235496520996 + }, + { + "auxiliary_loss_clip": 0.01053627, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.01586711, + "balance_loss_mlp": 1.0166229, + "epoch": 0.5944987223808808, + "flos": 19571017000320.0, + "grad_norm": 3.43726645402833, + "language_loss": 0.6994127, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.72035033, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 9888, + "time_per_iteration": 2.352416515350342 + }, + { + "auxiliary_loss_clip": 0.01056373, + "auxiliary_loss_mlp": 0.01038711, + "balance_loss_clip": 1.01377201, + "balance_loss_mlp": 1.01816702, + "epoch": 0.5945588456335488, + "flos": 26176374633600.0, + "grad_norm": 1.65752255212212, + "language_loss": 0.80502188, + "learning_rate": 1.490234845687366e-06, + "loss": 0.82597268, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3828125, + "step": 9889, + "time_per_iteration": 2.451280355453491 + }, + { + "auxiliary_loss_clip": 0.01055055, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.0172348, + "balance_loss_mlp": 1.01717949, + "epoch": 0.5946189688862168, + "flos": 20445521535360.0, + "grad_norm": 1.6224895588702797, + "language_loss": 0.7203306, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.74128902, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37890625, + "step": 9890, + "time_per_iteration": 2.3797669410705566 + }, + { + "auxiliary_loss_clip": 0.01056843, + "auxiliary_loss_mlp": 0.01041325, + "balance_loss_clip": 1.01718545, + "balance_loss_mlp": 1.01816809, + "epoch": 0.5946790921388847, + "flos": 13436847342720.0, + "grad_norm": 2.1329797560472903, + "language_loss": 0.69724262, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71822441, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38671875, + "step": 9891, + "time_per_iteration": 2.372577428817749 + }, + { + "auxiliary_loss_clip": 0.01053171, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.01422012, + "balance_loss_mlp": 1.01726472, + "epoch": 0.5947392153915527, + "flos": 20411236713600.0, + "grad_norm": 2.1399311138857446, + "language_loss": 0.55106956, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.5719763, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 9892, + "time_per_iteration": 2.3510234355926514 + }, + { + "auxiliary_loss_clip": 0.01008537, + "auxiliary_loss_mlp": 0.01003397, + "balance_loss_clip": 1.00054777, + "balance_loss_mlp": 1.00095344, + "epoch": 0.5947993386442206, + "flos": 65615798528640.0, + "grad_norm": 0.6867370855182499, + "language_loss": 0.5467301, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56684941, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.07568359, + "step": 9893, + "time_per_iteration": 3.104739189147949 + }, + { + "auxiliary_loss_clip": 0.01055364, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.0175637, + "balance_loss_mlp": 1.01860166, + "epoch": 0.5948594618968887, + "flos": 23182048811520.0, + "grad_norm": 1.650462342265301, + "language_loss": 0.75747645, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.77843416, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 9894, + "time_per_iteration": 2.3839035034179688 + }, + { + "auxiliary_loss_clip": 0.01055906, + "auxiliary_loss_mlp": 0.01039858, + "balance_loss_clip": 1.01605177, + "balance_loss_mlp": 1.01788211, + "epoch": 0.5949195851495566, + "flos": 13625901688320.0, + "grad_norm": 1.7935500194522842, + "language_loss": 0.79213643, + "learning_rate": 1.487975602873434e-06, + "loss": 0.81309408, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38085938, + "step": 9895, + "time_per_iteration": 2.3822174072265625 + }, + { + "auxiliary_loss_clip": 0.01056791, + "auxiliary_loss_mlp": 0.01041874, + "balance_loss_clip": 1.01830637, + "balance_loss_mlp": 1.01847553, + "epoch": 0.5949797084022246, + "flos": 19750121608320.0, + "grad_norm": 3.2978235670272786, + "language_loss": 0.80176431, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.82275099, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3828125, + "step": 9896, + "time_per_iteration": 2.3826372623443604 + }, + { + "auxiliary_loss_clip": 0.01055581, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_clip": 1.01751029, + "balance_loss_mlp": 1.01780522, + "epoch": 0.5950398316548925, + "flos": 25772743872000.0, + "grad_norm": 1.5108685772478063, + "language_loss": 0.84167981, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.86265969, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37695312, + "step": 9897, + "time_per_iteration": 2.4422972202301025 + }, + { + "auxiliary_loss_clip": 0.01056247, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.01623988, + "balance_loss_mlp": 1.01792884, + "epoch": 0.5950999549075605, + "flos": 23037927252480.0, + "grad_norm": 2.9502264577751407, + "language_loss": 0.721506, + "learning_rate": 1.486846243389939e-06, + "loss": 0.74246883, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 9898, + "time_per_iteration": 2.368438959121704 + }, + { + "auxiliary_loss_clip": 0.01058999, + "auxiliary_loss_mlp": 0.0103989, + "balance_loss_clip": 1.01291275, + "balance_loss_mlp": 1.01813173, + "epoch": 0.5951600781602284, + "flos": 32445169960320.0, + "grad_norm": 2.2667441289113417, + "language_loss": 0.65112615, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.67211503, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40820312, + "step": 9899, + "time_per_iteration": 2.479994773864746 + }, + { + "auxiliary_loss_clip": 0.01055436, + "auxiliary_loss_mlp": 0.01037167, + "balance_loss_clip": 1.01498222, + "balance_loss_mlp": 1.01812291, + "epoch": 0.5952202014128964, + "flos": 23799871964160.0, + "grad_norm": 2.015337672202236, + "language_loss": 0.73150074, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.7524268, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.37109375, + "step": 9900, + "time_per_iteration": 2.3990750312805176 + }, + { + "auxiliary_loss_clip": 0.01055003, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.01199329, + "balance_loss_mlp": 1.01874018, + "epoch": 0.5952803246655644, + "flos": 22491082627200.0, + "grad_norm": 1.7052892578310308, + "language_loss": 0.85480225, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.87572217, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 9901, + "time_per_iteration": 2.4071247577667236 + }, + { + "auxiliary_loss_clip": 0.01008991, + "auxiliary_loss_mlp": 0.0100763, + "balance_loss_clip": 1.00491166, + "balance_loss_mlp": 1.00149095, + "epoch": 0.5953404479182324, + "flos": 51232001846400.0, + "grad_norm": 0.7943805889503383, + "language_loss": 0.58186322, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60202944, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.07519531, + "step": 9902, + "time_per_iteration": 2.8641674518585205 + }, + { + "auxiliary_loss_clip": 0.01055501, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_clip": 1.01840627, + "balance_loss_mlp": 1.01691866, + "epoch": 0.5954005711709004, + "flos": 23111559043200.0, + "grad_norm": 1.6951676230507706, + "language_loss": 0.78530157, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.80627894, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38671875, + "step": 9903, + "time_per_iteration": 3.7552719116210938 + }, + { + "auxiliary_loss_clip": 0.01056424, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.01770008, + "balance_loss_mlp": 1.01800632, + "epoch": 0.5954606944235683, + "flos": 35953277483520.0, + "grad_norm": 1.749271141857159, + "language_loss": 0.79100209, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.81198633, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38476562, + "step": 9904, + "time_per_iteration": 2.502774715423584 + }, + { + "auxiliary_loss_clip": 0.01057561, + "auxiliary_loss_mlp": 0.01042081, + "balance_loss_clip": 1.01698756, + "balance_loss_mlp": 1.01839435, + "epoch": 0.5955208176762363, + "flos": 30442412062080.0, + "grad_norm": 2.3521941388620142, + "language_loss": 0.73856032, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75955677, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39257812, + "step": 9905, + "time_per_iteration": 2.45149564743042 + }, + { + "auxiliary_loss_clip": 0.01055277, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.01524186, + "balance_loss_mlp": 1.01699579, + "epoch": 0.5955809409289042, + "flos": 17639132895360.0, + "grad_norm": 1.696122289872541, + "language_loss": 0.71036243, + "learning_rate": 1.483835475336295e-06, + "loss": 0.73132384, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 9906, + "time_per_iteration": 2.334688425064087 + }, + { + "auxiliary_loss_clip": 0.01056157, + "auxiliary_loss_mlp": 0.01045375, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.01705253, + "epoch": 0.5956410641815723, + "flos": 24278740439040.0, + "grad_norm": 2.1188421897124194, + "language_loss": 0.75998938, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.78100473, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.390625, + "step": 9907, + "time_per_iteration": 2.3913822174072266 + }, + { + "auxiliary_loss_clip": 0.01055887, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02097619, + "balance_loss_mlp": 1.01688683, + "epoch": 0.5957011874342402, + "flos": 35732870870400.0, + "grad_norm": 1.5681992688545845, + "language_loss": 0.68389702, + "learning_rate": 1.483082978767595e-06, + "loss": 0.70491356, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 9908, + "time_per_iteration": 3.9403486251831055 + }, + { + "auxiliary_loss_clip": 0.01056566, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.01335001, + "balance_loss_mlp": 1.01848197, + "epoch": 0.5957613106869082, + "flos": 21244125041280.0, + "grad_norm": 2.1618850614092304, + "language_loss": 0.7746954, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.79563701, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38085938, + "step": 9909, + "time_per_iteration": 3.7156882286071777 + }, + { + "auxiliary_loss_clip": 0.01010295, + "auxiliary_loss_mlp": 0.01004772, + "balance_loss_clip": 1.00251937, + "balance_loss_mlp": 1.00297236, + "epoch": 0.5958214339395761, + "flos": 65937802798080.0, + "grad_norm": 0.9351411240151967, + "language_loss": 0.73531342, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75546408, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.07324219, + "step": 9910, + "time_per_iteration": 3.06516170501709 + }, + { + "auxiliary_loss_clip": 0.01056067, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.01845372, + "balance_loss_mlp": 1.01727247, + "epoch": 0.5958815571922441, + "flos": 23217660264960.0, + "grad_norm": 1.635279359389376, + "language_loss": 0.70587265, + "learning_rate": 1.481954380961799e-06, + "loss": 0.72687757, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 9911, + "time_per_iteration": 2.400561571121216 + }, + { + "auxiliary_loss_clip": 0.01061073, + "auxiliary_loss_mlp": 0.01046035, + "balance_loss_clip": 1.01750779, + "balance_loss_mlp": 1.01962876, + "epoch": 0.595941680444912, + "flos": 16537867879680.0, + "grad_norm": 2.051306269517258, + "language_loss": 0.67658317, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.69765425, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 9912, + "time_per_iteration": 2.3497612476348877 + }, + { + "auxiliary_loss_clip": 0.01056845, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.01720011, + "balance_loss_mlp": 1.01779556, + "epoch": 0.59600180369758, + "flos": 27817641648000.0, + "grad_norm": 2.202082479614937, + "language_loss": 0.74147433, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.76247525, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 9913, + "time_per_iteration": 2.428071975708008 + }, + { + "auxiliary_loss_clip": 0.01057991, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.01306891, + "balance_loss_mlp": 1.0172509, + "epoch": 0.596061926950248, + "flos": 29490435486720.0, + "grad_norm": 1.9372761455362892, + "language_loss": 0.81776726, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.83873332, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40625, + "step": 9914, + "time_per_iteration": 2.4447903633117676 + }, + { + "auxiliary_loss_clip": 0.01055215, + "auxiliary_loss_mlp": 0.01037712, + "balance_loss_clip": 1.01227319, + "balance_loss_mlp": 1.01799452, + "epoch": 0.596122050202916, + "flos": 16835851267200.0, + "grad_norm": 1.9013282223746726, + "language_loss": 0.69112027, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.7120496, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 9915, + "time_per_iteration": 2.3612301349639893 + }, + { + "auxiliary_loss_clip": 0.01056751, + "auxiliary_loss_mlp": 0.01038292, + "balance_loss_clip": 1.01335323, + "balance_loss_mlp": 1.01750326, + "epoch": 0.596182173455584, + "flos": 20995578005760.0, + "grad_norm": 1.6262143831614215, + "language_loss": 0.79884487, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81979531, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39257812, + "step": 9916, + "time_per_iteration": 2.3785958290100098 + }, + { + "auxiliary_loss_clip": 0.01056722, + "auxiliary_loss_mlp": 0.01041018, + "balance_loss_clip": 1.01712847, + "balance_loss_mlp": 1.01834774, + "epoch": 0.5962422967082519, + "flos": 16064899424640.0, + "grad_norm": 1.9198378051777651, + "language_loss": 0.85161823, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.87259561, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38476562, + "step": 9917, + "time_per_iteration": 3.809262752532959 + }, + { + "auxiliary_loss_clip": 0.01055045, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.0138824, + "balance_loss_mlp": 1.01746321, + "epoch": 0.5963024199609199, + "flos": 12166148165760.0, + "grad_norm": 1.7948635965720272, + "language_loss": 0.77894306, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79985976, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.375, + "step": 9918, + "time_per_iteration": 2.3467795848846436 + }, + { + "auxiliary_loss_clip": 0.01057092, + "auxiliary_loss_mlp": 0.01040553, + "balance_loss_clip": 1.01603222, + "balance_loss_mlp": 1.01854801, + "epoch": 0.5963625432135878, + "flos": 28073031310080.0, + "grad_norm": 1.4212061824327238, + "language_loss": 0.79327917, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81425565, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38476562, + "step": 9919, + "time_per_iteration": 2.4675228595733643 + }, + { + "auxiliary_loss_clip": 0.01054684, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.00994968, + "balance_loss_mlp": 1.0175916, + "epoch": 0.5964226664662559, + "flos": 19859434675200.0, + "grad_norm": 2.056656753931069, + "language_loss": 0.79270834, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.81360394, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 9920, + "time_per_iteration": 2.3454315662384033 + }, + { + "auxiliary_loss_clip": 0.01058311, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.0181818, + "balance_loss_mlp": 1.01927614, + "epoch": 0.5964827897189238, + "flos": 12931793481600.0, + "grad_norm": 2.1718086821826863, + "language_loss": 0.84334207, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.86434597, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.390625, + "step": 9921, + "time_per_iteration": 2.3803868293762207 + }, + { + "auxiliary_loss_clip": 0.01055088, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.01191318, + "balance_loss_mlp": 1.01743519, + "epoch": 0.5965429129715918, + "flos": 18149807485440.0, + "grad_norm": 2.031014583400477, + "language_loss": 0.82404685, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.84496367, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37695312, + "step": 9922, + "time_per_iteration": 2.3407955169677734 + }, + { + "auxiliary_loss_clip": 0.01054653, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.01107657, + "balance_loss_mlp": 1.01686072, + "epoch": 0.5966030362242597, + "flos": 21762131016960.0, + "grad_norm": 1.700717390348759, + "language_loss": 0.78359008, + "learning_rate": 1.477441761580111e-06, + "loss": 0.80448771, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37695312, + "step": 9923, + "time_per_iteration": 2.4004054069519043 + }, + { + "auxiliary_loss_clip": 0.01058583, + "auxiliary_loss_mlp": 0.01045536, + "balance_loss_clip": 1.01848722, + "balance_loss_mlp": 1.01824808, + "epoch": 0.5966631594769277, + "flos": 18806209557120.0, + "grad_norm": 1.9852961445349215, + "language_loss": 0.77100259, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.79204375, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 9924, + "time_per_iteration": 2.3756442070007324 + }, + { + "auxiliary_loss_clip": 0.01053916, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.01413524, + "balance_loss_mlp": 1.01732218, + "epoch": 0.5967232827295956, + "flos": 14063293601280.0, + "grad_norm": 1.7644070749144674, + "language_loss": 0.67087138, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.69179165, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 9925, + "time_per_iteration": 2.3415942192077637 + }, + { + "auxiliary_loss_clip": 0.0105466, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.0090549, + "balance_loss_mlp": 1.01838708, + "epoch": 0.5967834059822636, + "flos": 17237282613120.0, + "grad_norm": 2.05458727826708, + "language_loss": 0.72999245, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.75086445, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 9926, + "time_per_iteration": 2.3691956996917725 + }, + { + "auxiliary_loss_clip": 0.01057669, + "auxiliary_loss_mlp": 0.01042016, + "balance_loss_clip": 1.01513422, + "balance_loss_mlp": 1.01772356, + "epoch": 0.5968435292349316, + "flos": 42518659743360.0, + "grad_norm": 1.751084712656822, + "language_loss": 0.71684861, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.73784548, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 9927, + "time_per_iteration": 2.5719106197357178 + }, + { + "auxiliary_loss_clip": 0.01057901, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.01525187, + "balance_loss_mlp": 1.01753831, + "epoch": 0.5969036524875996, + "flos": 37629457724160.0, + "grad_norm": 1.6003024549540563, + "language_loss": 0.64646018, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.66746926, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40234375, + "step": 9928, + "time_per_iteration": 2.511996269226074 + }, + { + "auxiliary_loss_clip": 0.01052553, + "auxiliary_loss_mlp": 0.01045488, + "balance_loss_clip": 1.02209949, + "balance_loss_mlp": 1.01576388, + "epoch": 0.5969637757402676, + "flos": 23147275230720.0, + "grad_norm": 1.6866114616404946, + "language_loss": 0.69966412, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.72064459, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 9929, + "time_per_iteration": 2.3840153217315674 + }, + { + "auxiliary_loss_clip": 0.01054163, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.01319814, + "balance_loss_mlp": 1.01749647, + "epoch": 0.5970238989929355, + "flos": 24019894552320.0, + "grad_norm": 2.177440534955975, + "language_loss": 0.78718126, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.80809605, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 9930, + "time_per_iteration": 2.3896005153656006 + }, + { + "auxiliary_loss_clip": 0.01057542, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.01843047, + "balance_loss_mlp": 1.01850128, + "epoch": 0.5970840222456035, + "flos": 19425883011840.0, + "grad_norm": 1.8992300168237586, + "language_loss": 0.70998168, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.73098415, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.390625, + "step": 9931, + "time_per_iteration": 2.3762331008911133 + }, + { + "auxiliary_loss_clip": 0.01011174, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00083113, + "balance_loss_mlp": 1.00399065, + "epoch": 0.5971441454982714, + "flos": 62973781902720.0, + "grad_norm": 0.8601237645092096, + "language_loss": 0.64256954, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66271508, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.07177734, + "step": 9932, + "time_per_iteration": 2.9297099113464355 + }, + { + "auxiliary_loss_clip": 0.01055588, + "auxiliary_loss_mlp": 0.01043209, + "balance_loss_clip": 1.01817501, + "balance_loss_mlp": 1.0174942, + "epoch": 0.5972042687509395, + "flos": 20265195029760.0, + "grad_norm": 1.8033175774765589, + "language_loss": 0.75206226, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.77305019, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 9933, + "time_per_iteration": 2.379478693008423 + }, + { + "auxiliary_loss_clip": 0.01010376, + "auxiliary_loss_mlp": 0.01003762, + "balance_loss_clip": 1.00106823, + "balance_loss_mlp": 1.00329423, + "epoch": 0.5972643920036074, + "flos": 71648510175360.0, + "grad_norm": 0.6660497446821488, + "language_loss": 0.52077883, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54092026, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.02697754, + "router_z_loss_mlp": 0.0703125, + "step": 9934, + "time_per_iteration": 3.1557915210723877 + }, + { + "auxiliary_loss_clip": 0.01010248, + "auxiliary_loss_mlp": 0.01002996, + "balance_loss_clip": 1.00044477, + "balance_loss_mlp": 1.00300455, + "epoch": 0.5973245152562754, + "flos": 56889781735680.0, + "grad_norm": 0.8325285759992868, + "language_loss": 0.54316813, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56330055, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.07226562, + "step": 9935, + "time_per_iteration": 3.08651065826416 + }, + { + "auxiliary_loss_clip": 0.01056062, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.01733732, + "balance_loss_mlp": 1.01803327, + "epoch": 0.5973846385089433, + "flos": 24163387706880.0, + "grad_norm": 1.7365500033417096, + "language_loss": 0.66849494, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68947721, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 9936, + "time_per_iteration": 2.398763656616211 + }, + { + "auxiliary_loss_clip": 0.01055984, + "auxiliary_loss_mlp": 0.01042412, + "balance_loss_clip": 1.01812899, + "balance_loss_mlp": 1.01756144, + "epoch": 0.5974447617616113, + "flos": 17669786935680.0, + "grad_norm": 2.0818594543520956, + "language_loss": 0.68737364, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.70835757, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.38476562, + "step": 9937, + "time_per_iteration": 2.383715867996216 + }, + { + "auxiliary_loss_clip": 0.01055687, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.01318407, + "balance_loss_mlp": 1.01654959, + "epoch": 0.5975048850142792, + "flos": 22891431720960.0, + "grad_norm": 2.0628814772001447, + "language_loss": 0.78649455, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.80743659, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 9938, + "time_per_iteration": 2.378601551055908 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01045055, + "balance_loss_clip": 1.02064061, + "balance_loss_mlp": 1.01711214, + "epoch": 0.5975650082669473, + "flos": 24351953293440.0, + "grad_norm": 1.4402495932248698, + "language_loss": 0.7680434, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78904098, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 9939, + "time_per_iteration": 2.4574036598205566 + }, + { + "auxiliary_loss_clip": 0.01057991, + "auxiliary_loss_mlp": 0.01040984, + "balance_loss_clip": 1.01213503, + "balance_loss_mlp": 1.01744592, + "epoch": 0.5976251315196152, + "flos": 20922295328640.0, + "grad_norm": 2.4299259578614567, + "language_loss": 0.70922345, + "learning_rate": 1.471053774486878e-06, + "loss": 0.73021317, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40625, + "step": 9940, + "time_per_iteration": 2.3637375831604004 + }, + { + "auxiliary_loss_clip": 0.01053536, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.01649833, + "balance_loss_mlp": 1.0168736, + "epoch": 0.5976852547722832, + "flos": 35843161455360.0, + "grad_norm": 1.3841775875988074, + "language_loss": 0.70814443, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72906661, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3671875, + "step": 9941, + "time_per_iteration": 2.528798818588257 + }, + { + "auxiliary_loss_clip": 0.01054665, + "auxiliary_loss_mlp": 0.01040417, + "balance_loss_clip": 1.01724303, + "balance_loss_mlp": 1.01662493, + "epoch": 0.5977453780249512, + "flos": 12855229136640.0, + "grad_norm": 1.7899245678780817, + "language_loss": 0.78945941, + "learning_rate": 1.470302626336386e-06, + "loss": 0.81041026, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.38085938, + "step": 9942, + "time_per_iteration": 2.3339056968688965 + }, + { + "auxiliary_loss_clip": 0.01055599, + "auxiliary_loss_mlp": 0.01046306, + "balance_loss_clip": 1.02143931, + "balance_loss_mlp": 1.0170697, + "epoch": 0.5978055012776191, + "flos": 20958116250240.0, + "grad_norm": 1.8791643231170954, + "language_loss": 0.76706159, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78808063, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 9943, + "time_per_iteration": 3.6250364780426025 + }, + { + "auxiliary_loss_clip": 0.01055285, + "auxiliary_loss_mlp": 0.01041011, + "balance_loss_clip": 1.01707339, + "balance_loss_mlp": 1.01821935, + "epoch": 0.5978656245302871, + "flos": 34056585895680.0, + "grad_norm": 2.129793730277272, + "language_loss": 0.63407278, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.65503573, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 9944, + "time_per_iteration": 2.487746238708496 + }, + { + "auxiliary_loss_clip": 0.01056656, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.01773524, + "balance_loss_mlp": 1.01834917, + "epoch": 0.597925747782955, + "flos": 37371903557760.0, + "grad_norm": 1.6528238211017168, + "language_loss": 0.73335743, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.7543658, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 9945, + "time_per_iteration": 2.538015842437744 + }, + { + "auxiliary_loss_clip": 0.01055395, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.01283383, + "balance_loss_mlp": 1.0168376, + "epoch": 0.5979858710356231, + "flos": 25373616675840.0, + "grad_norm": 3.105106744641933, + "language_loss": 0.68694031, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.7078703, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 9946, + "time_per_iteration": 2.3913629055023193 + }, + { + "auxiliary_loss_clip": 0.01057742, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.01486063, + "balance_loss_mlp": 1.01851964, + "epoch": 0.598045994288291, + "flos": 13697578442880.0, + "grad_norm": 2.0147583691020117, + "language_loss": 0.89563942, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91662455, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 9947, + "time_per_iteration": 2.3525450229644775 + }, + { + "auxiliary_loss_clip": 0.01053644, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_clip": 1.01912367, + "balance_loss_mlp": 1.01766515, + "epoch": 0.598106117540959, + "flos": 21980268391680.0, + "grad_norm": 1.8359819156631376, + "language_loss": 0.73363698, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.7546038, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 9948, + "time_per_iteration": 3.868403196334839 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_clip": 1.0176034, + "balance_loss_mlp": 1.01957297, + "epoch": 0.5981662407936269, + "flos": 20558290826880.0, + "grad_norm": 1.9085672715621058, + "language_loss": 0.90419406, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.9252187, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.390625, + "step": 9949, + "time_per_iteration": 2.373253345489502 + }, + { + "auxiliary_loss_clip": 0.0105723, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.01698613, + "balance_loss_mlp": 1.01994979, + "epoch": 0.5982263640462949, + "flos": 14062979399040.0, + "grad_norm": 1.6913773671150722, + "language_loss": 0.71487403, + "learning_rate": 1.467298838320673e-06, + "loss": 0.73584259, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37304688, + "step": 9950, + "time_per_iteration": 2.3334810733795166 + }, + { + "auxiliary_loss_clip": 0.01056438, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.01514721, + "balance_loss_mlp": 1.01852107, + "epoch": 0.5982864872989628, + "flos": 17706410818560.0, + "grad_norm": 1.529870542961733, + "language_loss": 0.78716111, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.8081215, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 9951, + "time_per_iteration": 2.3631749153137207 + }, + { + "auxiliary_loss_clip": 0.0105764, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_clip": 1.01713943, + "balance_loss_mlp": 1.01881647, + "epoch": 0.5983466105516309, + "flos": 16763825399040.0, + "grad_norm": 1.8742363290683295, + "language_loss": 0.74962986, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.77065921, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.38671875, + "step": 9952, + "time_per_iteration": 2.360698938369751 + }, + { + "auxiliary_loss_clip": 0.01059592, + "auxiliary_loss_mlp": 0.01040939, + "balance_loss_clip": 1.01313925, + "balance_loss_mlp": 1.0196569, + "epoch": 0.5984067338042988, + "flos": 20041820951040.0, + "grad_norm": 2.077515582325337, + "language_loss": 0.79660285, + "learning_rate": 1.466172750724613e-06, + "loss": 0.81760818, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 9953, + "time_per_iteration": 2.3742663860321045 + }, + { + "auxiliary_loss_clip": 0.0105649, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.0161736, + "balance_loss_mlp": 1.0185622, + "epoch": 0.5984668570569668, + "flos": 26318785536000.0, + "grad_norm": 1.5317461866514381, + "language_loss": 0.70584619, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.72681904, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 9954, + "time_per_iteration": 2.4102349281311035 + }, + { + "auxiliary_loss_clip": 0.01058621, + "auxiliary_loss_mlp": 0.01037562, + "balance_loss_clip": 1.01292157, + "balance_loss_mlp": 1.01981902, + "epoch": 0.5985269803096348, + "flos": 20592715294080.0, + "grad_norm": 1.747905215288434, + "language_loss": 0.74190533, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.76286721, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38867188, + "step": 9955, + "time_per_iteration": 2.3617281913757324 + }, + { + "auxiliary_loss_clip": 0.01056393, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.01273346, + "balance_loss_mlp": 1.01818943, + "epoch": 0.5985871035623027, + "flos": 26864303529600.0, + "grad_norm": 1.5313390534872875, + "language_loss": 0.69448864, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.7154156, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3828125, + "step": 9956, + "time_per_iteration": 3.9031474590301514 + }, + { + "auxiliary_loss_clip": 0.01057381, + "auxiliary_loss_mlp": 0.01041673, + "balance_loss_clip": 1.01680624, + "balance_loss_mlp": 1.01848388, + "epoch": 0.5986472268149707, + "flos": 19608688224000.0, + "grad_norm": 2.144083995236414, + "language_loss": 0.74774683, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.76873744, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38867188, + "step": 9957, + "time_per_iteration": 2.359659194946289 + }, + { + "auxiliary_loss_clip": 0.01053547, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.01652944, + "balance_loss_mlp": 1.01787305, + "epoch": 0.5987073500676386, + "flos": 21793657841280.0, + "grad_norm": 2.7272328792314013, + "language_loss": 0.86016589, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.88109899, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 9958, + "time_per_iteration": 2.374685049057007 + }, + { + "auxiliary_loss_clip": 0.01058445, + "auxiliary_loss_mlp": 0.0104366, + "balance_loss_clip": 1.01746976, + "balance_loss_mlp": 1.01889539, + "epoch": 0.5987674733203067, + "flos": 24313269640320.0, + "grad_norm": 1.8990965587152735, + "language_loss": 0.67715812, + "learning_rate": 1.463921122471864e-06, + "loss": 0.69817913, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 9959, + "time_per_iteration": 2.388361692428589 + }, + { + "auxiliary_loss_clip": 0.01058156, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.01458216, + "balance_loss_mlp": 1.0193783, + "epoch": 0.5988275965729746, + "flos": 21319258020480.0, + "grad_norm": 1.7429088556643662, + "language_loss": 0.84876347, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.86974943, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 9960, + "time_per_iteration": 2.396484136581421 + }, + { + "auxiliary_loss_clip": 0.01055824, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.01239824, + "balance_loss_mlp": 1.01775956, + "epoch": 0.5988877198256426, + "flos": 25116900382080.0, + "grad_norm": 1.521707695732219, + "language_loss": 0.80874467, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.82966113, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.38085938, + "step": 9961, + "time_per_iteration": 2.4081528186798096 + }, + { + "auxiliary_loss_clip": 0.01057454, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.01640236, + "balance_loss_mlp": 1.0193603, + "epoch": 0.5989478430783105, + "flos": 26427993868800.0, + "grad_norm": 2.041265034522868, + "language_loss": 0.67894268, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69992876, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 9962, + "time_per_iteration": 2.3977408409118652 + }, + { + "auxiliary_loss_clip": 0.01056124, + "auxiliary_loss_mlp": 0.01041693, + "balance_loss_clip": 1.01811385, + "balance_loss_mlp": 1.01806092, + "epoch": 0.5990079663309785, + "flos": 25777177614720.0, + "grad_norm": 1.4157428472610571, + "language_loss": 0.74773693, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76871514, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38085938, + "step": 9963, + "time_per_iteration": 2.4470884799957275 + }, + { + "auxiliary_loss_clip": 0.01055007, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.01420593, + "balance_loss_mlp": 1.0174706, + "epoch": 0.5990680895836464, + "flos": 36830260725120.0, + "grad_norm": 1.6925047739996948, + "language_loss": 0.68722701, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70816827, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 9964, + "time_per_iteration": 2.5061001777648926 + }, + { + "auxiliary_loss_clip": 0.010539, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.01021242, + "balance_loss_mlp": 1.01701176, + "epoch": 0.5991282128363145, + "flos": 24132419464320.0, + "grad_norm": 2.264245204125111, + "language_loss": 0.77349401, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79437852, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36914062, + "step": 9965, + "time_per_iteration": 2.4127469062805176 + }, + { + "auxiliary_loss_clip": 0.01054206, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.01013029, + "balance_loss_mlp": 1.01629996, + "epoch": 0.5991883360889824, + "flos": 10303357374720.0, + "grad_norm": 1.7109538725366742, + "language_loss": 0.78370655, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.80459511, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 9966, + "time_per_iteration": 2.331923723220825 + }, + { + "auxiliary_loss_clip": 0.01056231, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.01381636, + "balance_loss_mlp": 1.01877761, + "epoch": 0.5992484593416504, + "flos": 23950068099840.0, + "grad_norm": 1.5567797919069344, + "language_loss": 0.74803042, + "learning_rate": 1.460920090376422e-06, + "loss": 0.76896179, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.375, + "step": 9967, + "time_per_iteration": 2.416025400161743 + }, + { + "auxiliary_loss_clip": 0.01058218, + "auxiliary_loss_mlp": 0.01047597, + "balance_loss_clip": 1.0205009, + "balance_loss_mlp": 1.01729393, + "epoch": 0.5993085825943184, + "flos": 11943402491520.0, + "grad_norm": 2.714080626937971, + "language_loss": 0.68751591, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70857406, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.41015625, + "step": 9968, + "time_per_iteration": 2.345857620239258 + }, + { + "auxiliary_loss_clip": 0.01056321, + "auxiliary_loss_mlp": 0.01043237, + "balance_loss_clip": 1.01699924, + "balance_loss_mlp": 1.01764548, + "epoch": 0.5993687058469863, + "flos": 19025813208960.0, + "grad_norm": 1.5801331112902455, + "language_loss": 0.80215585, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.82315141, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 9969, + "time_per_iteration": 2.3613781929016113 + }, + { + "auxiliary_loss_clip": 0.01056281, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.01493299, + "balance_loss_mlp": 1.01684535, + "epoch": 0.5994288290996543, + "flos": 14282094291840.0, + "grad_norm": 1.7574182580157922, + "language_loss": 0.82637519, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.84734315, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39257812, + "step": 9970, + "time_per_iteration": 2.329341173171997 + }, + { + "auxiliary_loss_clip": 0.0105807, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.01158118, + "balance_loss_mlp": 1.01724946, + "epoch": 0.5994889523523222, + "flos": 19205685866880.0, + "grad_norm": 1.9217755984884692, + "language_loss": 0.63575971, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.65674841, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40820312, + "step": 9971, + "time_per_iteration": 2.347970962524414 + }, + { + "auxiliary_loss_clip": 0.01053542, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.01327038, + "balance_loss_mlp": 1.01745903, + "epoch": 0.5995490756049903, + "flos": 28035813934080.0, + "grad_norm": 1.8486144461196083, + "language_loss": 0.79985332, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.82075208, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 9972, + "time_per_iteration": 2.3997459411621094 + }, + { + "auxiliary_loss_clip": 0.01060481, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_clip": 1.0180198, + "balance_loss_mlp": 1.01787972, + "epoch": 0.5996091988576582, + "flos": 29051856587520.0, + "grad_norm": 3.723828446511799, + "language_loss": 0.76723301, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78831089, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.42578125, + "step": 9973, + "time_per_iteration": 2.4451162815093994 + }, + { + "auxiliary_loss_clip": 0.01054285, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.01639891, + "balance_loss_mlp": 1.01605177, + "epoch": 0.5996693221103262, + "flos": 20812912439040.0, + "grad_norm": 2.0709075017681853, + "language_loss": 0.67050302, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.69146311, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 9974, + "time_per_iteration": 2.352391004562378 + }, + { + "auxiliary_loss_clip": 0.0105634, + "auxiliary_loss_mlp": 0.01046836, + "balance_loss_clip": 1.01880956, + "balance_loss_mlp": 1.01752388, + "epoch": 0.5997294453629941, + "flos": 23767786558080.0, + "grad_norm": 1.597075682178523, + "language_loss": 0.75655019, + "learning_rate": 1.457920366566428e-06, + "loss": 0.77758193, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.38867188, + "step": 9975, + "time_per_iteration": 2.411155939102173 + }, + { + "auxiliary_loss_clip": 0.01056382, + "auxiliary_loss_mlp": 0.01045277, + "balance_loss_clip": 1.01938438, + "balance_loss_mlp": 1.01772535, + "epoch": 0.5997895686156621, + "flos": 20958954122880.0, + "grad_norm": 1.8252498726278144, + "language_loss": 0.78694606, + "learning_rate": 1.457545493441611e-06, + "loss": 0.80796266, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 9976, + "time_per_iteration": 2.353914260864258 + }, + { + "auxiliary_loss_clip": 0.01055287, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.01361823, + "balance_loss_mlp": 1.01642394, + "epoch": 0.59984969186833, + "flos": 28364206982400.0, + "grad_norm": 2.7139976296865886, + "language_loss": 0.76729989, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.78826278, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38867188, + "step": 9977, + "time_per_iteration": 2.432612895965576 + }, + { + "auxiliary_loss_clip": 0.01055615, + "auxiliary_loss_mlp": 0.01042101, + "balance_loss_clip": 1.01611316, + "balance_loss_mlp": 1.01627135, + "epoch": 0.5999098151209981, + "flos": 22564784240640.0, + "grad_norm": 1.5372321596116194, + "language_loss": 0.69984853, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.72082567, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39257812, + "step": 9978, + "time_per_iteration": 2.3688621520996094 + }, + { + "auxiliary_loss_clip": 0.01060556, + "auxiliary_loss_mlp": 0.01038968, + "balance_loss_clip": 1.01251519, + "balance_loss_mlp": 1.01985979, + "epoch": 0.599969938373666, + "flos": 18767770283520.0, + "grad_norm": 2.468577529800031, + "language_loss": 0.82661968, + "learning_rate": 1.456420997543594e-06, + "loss": 0.84761494, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40820312, + "step": 9979, + "time_per_iteration": 2.361079692840576 + }, + { + "auxiliary_loss_clip": 0.01054221, + "auxiliary_loss_mlp": 0.0104174, + "balance_loss_clip": 1.01665807, + "balance_loss_mlp": 1.01705003, + "epoch": 0.600030061626334, + "flos": 11326452122880.0, + "grad_norm": 2.1125025964226527, + "language_loss": 0.72336268, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.7443223, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 9980, + "time_per_iteration": 2.330484390258789 + }, + { + "auxiliary_loss_clip": 0.01058806, + "auxiliary_loss_mlp": 0.0104373, + "balance_loss_clip": 1.01507223, + "balance_loss_mlp": 1.01834583, + "epoch": 0.600090184879002, + "flos": 16577808341760.0, + "grad_norm": 2.691356307603904, + "language_loss": 0.71007049, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.73109591, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40429688, + "step": 9981, + "time_per_iteration": 2.3744113445281982 + }, + { + "auxiliary_loss_clip": 0.01054324, + "auxiliary_loss_mlp": 0.01043941, + "balance_loss_clip": 1.02074337, + "balance_loss_mlp": 1.01676023, + "epoch": 0.6001503081316699, + "flos": 23617625333760.0, + "grad_norm": 1.8151439109385232, + "language_loss": 0.7930423, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.81402498, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 9982, + "time_per_iteration": 3.713190793991089 + }, + { + "auxiliary_loss_clip": 0.01056258, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.01872849, + "balance_loss_mlp": 1.01786613, + "epoch": 0.6002104313843379, + "flos": 20666626375680.0, + "grad_norm": 1.6960455449591838, + "language_loss": 0.73771095, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.75873286, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38476562, + "step": 9983, + "time_per_iteration": 2.3946287631988525 + }, + { + "auxiliary_loss_clip": 0.01056429, + "auxiliary_loss_mlp": 0.01045113, + "balance_loss_clip": 1.01868439, + "balance_loss_mlp": 1.017259, + "epoch": 0.6002705546370058, + "flos": 22454144542080.0, + "grad_norm": 1.9451310040349825, + "language_loss": 0.79418409, + "learning_rate": 1.454547250154447e-06, + "loss": 0.81519949, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.390625, + "step": 9984, + "time_per_iteration": 2.363572359085083 + }, + { + "auxiliary_loss_clip": 0.01055958, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_clip": 1.01701164, + "balance_loss_mlp": 1.01757669, + "epoch": 0.6003306778896739, + "flos": 25190811463680.0, + "grad_norm": 1.8448951584694433, + "language_loss": 0.83990037, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.860874, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3828125, + "step": 9985, + "time_per_iteration": 2.4531543254852295 + }, + { + "auxiliary_loss_clip": 0.01055934, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.01652622, + "balance_loss_mlp": 1.01802087, + "epoch": 0.6003908011423418, + "flos": 26686525553280.0, + "grad_norm": 1.746543116683562, + "language_loss": 0.73202038, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.75299686, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 9986, + "time_per_iteration": 2.4075582027435303 + }, + { + "auxiliary_loss_clip": 0.01059873, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.01282144, + "balance_loss_mlp": 1.02011991, + "epoch": 0.6004509243950098, + "flos": 22563981279360.0, + "grad_norm": 1.5043397889060346, + "language_loss": 0.72951221, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.75049245, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3984375, + "step": 9987, + "time_per_iteration": 5.2034924030303955 + }, + { + "auxiliary_loss_clip": 0.01054176, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.01363516, + "balance_loss_mlp": 1.01721001, + "epoch": 0.6005110476476777, + "flos": 19718280581760.0, + "grad_norm": 1.754951353683002, + "language_loss": 0.8568939, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.87781852, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 9988, + "time_per_iteration": 2.369492292404175 + }, + { + "auxiliary_loss_clip": 0.01055153, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.0162586, + "balance_loss_mlp": 1.0177424, + "epoch": 0.6005711709003457, + "flos": 17711577699840.0, + "grad_norm": 1.8724557476127222, + "language_loss": 0.66228807, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.68324733, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37304688, + "step": 9989, + "time_per_iteration": 2.3601319789886475 + }, + { + "auxiliary_loss_clip": 0.0105641, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.01349139, + "balance_loss_mlp": 1.01835728, + "epoch": 0.6006312941530136, + "flos": 18513497784960.0, + "grad_norm": 1.4911187259181997, + "language_loss": 0.81869841, + "learning_rate": 1.452299436003257e-06, + "loss": 0.83962923, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38085938, + "step": 9990, + "time_per_iteration": 2.389498233795166 + }, + { + "auxiliary_loss_clip": 0.0105646, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.01334703, + "balance_loss_mlp": 1.01792336, + "epoch": 0.6006914174056817, + "flos": 21389957256960.0, + "grad_norm": 2.0059310199922757, + "language_loss": 0.84070396, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.86165786, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38476562, + "step": 9991, + "time_per_iteration": 2.404491662979126 + }, + { + "auxiliary_loss_clip": 0.01055017, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01061249, + "balance_loss_mlp": 1.01803172, + "epoch": 0.6007515406583496, + "flos": 12749686496640.0, + "grad_norm": 2.692204334681764, + "language_loss": 0.84414041, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.86503434, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 9992, + "time_per_iteration": 2.3645944595336914 + }, + { + "auxiliary_loss_clip": 0.01055299, + "auxiliary_loss_mlp": 0.01044516, + "balance_loss_clip": 1.01969624, + "balance_loss_mlp": 1.01739645, + "epoch": 0.6008116639110176, + "flos": 19205930246400.0, + "grad_norm": 2.429295269533105, + "language_loss": 0.67316431, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.69416249, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 9993, + "time_per_iteration": 2.402932643890381 + }, + { + "auxiliary_loss_clip": 0.01053501, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.01395476, + "balance_loss_mlp": 1.0168674, + "epoch": 0.6008717871636855, + "flos": 17054407578240.0, + "grad_norm": 2.489186767266464, + "language_loss": 0.82356226, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.84448516, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 9994, + "time_per_iteration": 2.361377239227295 + }, + { + "auxiliary_loss_clip": 0.01052578, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.01183712, + "balance_loss_mlp": 1.01771355, + "epoch": 0.6009319104163535, + "flos": 20297769194880.0, + "grad_norm": 1.9537563075947522, + "language_loss": 0.73049939, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.75134861, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34960938, + "step": 9995, + "time_per_iteration": 3.8425729274749756 + }, + { + "auxiliary_loss_clip": 0.01055342, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.01603198, + "balance_loss_mlp": 1.01613283, + "epoch": 0.6009920336690215, + "flos": 21835658073600.0, + "grad_norm": 1.6212383282764546, + "language_loss": 0.82098114, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.84196448, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 9996, + "time_per_iteration": 2.43393611907959 + }, + { + "auxiliary_loss_clip": 0.01056322, + "auxiliary_loss_mlp": 0.01045815, + "balance_loss_clip": 1.02138865, + "balance_loss_mlp": 1.0188911, + "epoch": 0.6010521569216895, + "flos": 22595158990080.0, + "grad_norm": 1.7207174660075102, + "language_loss": 0.79705715, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.81807852, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 9997, + "time_per_iteration": 2.364619493484497 + }, + { + "auxiliary_loss_clip": 0.0105659, + "auxiliary_loss_mlp": 0.01043088, + "balance_loss_clip": 1.01710069, + "balance_loss_mlp": 1.01768064, + "epoch": 0.6011122801743575, + "flos": 19170702817920.0, + "grad_norm": 2.2689863991073267, + "language_loss": 0.73958039, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.76057714, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 9998, + "time_per_iteration": 2.3612189292907715 + }, + { + "auxiliary_loss_clip": 0.01053494, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.01728046, + "balance_loss_mlp": 1.01641142, + "epoch": 0.6011724034270254, + "flos": 25008844124160.0, + "grad_norm": 1.6077152778076338, + "language_loss": 0.73018277, + "learning_rate": 1.448929117633027e-06, + "loss": 0.75112164, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 9999, + "time_per_iteration": 2.386514902114868 + }, + { + "auxiliary_loss_clip": 0.01058043, + "auxiliary_loss_mlp": 0.01046285, + "balance_loss_clip": 1.01994014, + "balance_loss_mlp": 1.01736701, + "epoch": 0.6012325266796934, + "flos": 21796625306880.0, + "grad_norm": 1.5769567983087593, + "language_loss": 0.79357123, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.81461453, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40820312, + "step": 10000, + "time_per_iteration": 2.4028587341308594 + }, + { + "auxiliary_loss_clip": 0.01059081, + "auxiliary_loss_mlp": 0.01042817, + "balance_loss_clip": 1.01587546, + "balance_loss_mlp": 1.01847351, + "epoch": 0.6012926499323613, + "flos": 19571994518400.0, + "grad_norm": 1.9518899461351125, + "language_loss": 0.7891736, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.81019258, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40625, + "step": 10001, + "time_per_iteration": 2.3408384323120117 + }, + { + "auxiliary_loss_clip": 0.01056929, + "auxiliary_loss_mlp": 0.010418, + "balance_loss_clip": 1.0135591, + "balance_loss_mlp": 1.0167706, + "epoch": 0.6013527731850293, + "flos": 34859343853440.0, + "grad_norm": 2.106335718015344, + "language_loss": 0.59159625, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.61258352, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40234375, + "step": 10002, + "time_per_iteration": 2.500988245010376 + }, + { + "auxiliary_loss_clip": 0.01059886, + "auxiliary_loss_mlp": 0.01050315, + "balance_loss_clip": 1.02023816, + "balance_loss_mlp": 1.01917601, + "epoch": 0.6014128964376972, + "flos": 23290908030720.0, + "grad_norm": 1.5730375891371469, + "language_loss": 0.79155594, + "learning_rate": 1.447431741055314e-06, + "loss": 0.81265795, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.40625, + "step": 10003, + "time_per_iteration": 2.3700995445251465 + }, + { + "auxiliary_loss_clip": 0.01057572, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_clip": 1.01513052, + "balance_loss_mlp": 1.01725578, + "epoch": 0.6014730196903653, + "flos": 24819929424000.0, + "grad_norm": 2.104655564803409, + "language_loss": 0.78388047, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.8048811, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40234375, + "step": 10004, + "time_per_iteration": 2.395785331726074 + }, + { + "auxiliary_loss_clip": 0.01055433, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.01545525, + "balance_loss_mlp": 1.01712239, + "epoch": 0.6015331429430332, + "flos": 23111244840960.0, + "grad_norm": 1.5539206305330344, + "language_loss": 0.73296744, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.75393188, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 10005, + "time_per_iteration": 2.400291681289673 + }, + { + "auxiliary_loss_clip": 0.01055305, + "auxiliary_loss_mlp": 0.01037304, + "balance_loss_clip": 1.01392722, + "balance_loss_mlp": 1.01806271, + "epoch": 0.6015932661957012, + "flos": 19200553896960.0, + "grad_norm": 3.55078194571101, + "language_loss": 0.75541055, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77633667, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37304688, + "step": 10006, + "time_per_iteration": 2.3416922092437744 + }, + { + "auxiliary_loss_clip": 0.0105439, + "auxiliary_loss_mlp": 0.01040009, + "balance_loss_clip": 1.01381862, + "balance_loss_mlp": 1.01624405, + "epoch": 0.6016533894483691, + "flos": 18112659932160.0, + "grad_norm": 1.7889122695374988, + "language_loss": 0.75690806, + "learning_rate": 1.445934699732685e-06, + "loss": 0.77785206, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 10007, + "time_per_iteration": 2.428571939468384 + }, + { + "auxiliary_loss_clip": 0.01055588, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.01238275, + "balance_loss_mlp": 1.01716971, + "epoch": 0.6017135127010371, + "flos": 16215968344320.0, + "grad_norm": 1.8043965835336002, + "language_loss": 0.7120136, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.73294365, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 10008, + "time_per_iteration": 2.369271993637085 + }, + { + "auxiliary_loss_clip": 0.010542, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.01087904, + "balance_loss_mlp": 1.01692271, + "epoch": 0.6017736359537051, + "flos": 23443024291200.0, + "grad_norm": 2.3145706592441133, + "language_loss": 0.77223259, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.79312086, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 10009, + "time_per_iteration": 2.4240803718566895 + }, + { + "auxiliary_loss_clip": 0.0105679, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.01342261, + "balance_loss_mlp": 1.01682019, + "epoch": 0.601833759206373, + "flos": 23512920566400.0, + "grad_norm": 2.2627455316789464, + "language_loss": 0.75834471, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.77928662, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3984375, + "step": 10010, + "time_per_iteration": 2.3826427459716797 + }, + { + "auxiliary_loss_clip": 0.01009479, + "auxiliary_loss_mlp": 0.01006877, + "balance_loss_clip": 1.00438547, + "balance_loss_mlp": 1.00219572, + "epoch": 0.6018938824590411, + "flos": 63987972387840.0, + "grad_norm": 0.8173790918715339, + "language_loss": 0.55142069, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57158434, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.07275391, + "step": 10011, + "time_per_iteration": 3.0754191875457764 + }, + { + "auxiliary_loss_clip": 0.01057946, + "auxiliary_loss_mlp": 0.01044657, + "balance_loss_clip": 1.0193367, + "balance_loss_mlp": 1.01919627, + "epoch": 0.601954005711709, + "flos": 34638623038080.0, + "grad_norm": 1.4194221558653177, + "language_loss": 0.63043129, + "learning_rate": 1.44406387091556e-06, + "loss": 0.65145737, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 10012, + "time_per_iteration": 2.505100965499878 + }, + { + "auxiliary_loss_clip": 0.01055718, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.01031852, + "balance_loss_mlp": 1.01814413, + "epoch": 0.602014128964377, + "flos": 19426057568640.0, + "grad_norm": 1.7833400399957942, + "language_loss": 0.75456846, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77546304, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 10013, + "time_per_iteration": 2.3759562969207764 + }, + { + "auxiliary_loss_clip": 0.01052294, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.01197708, + "balance_loss_mlp": 1.01705289, + "epoch": 0.6020742522170449, + "flos": 28328141681280.0, + "grad_norm": 1.7244470837625046, + "language_loss": 0.82183379, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.84269416, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35351562, + "step": 10014, + "time_per_iteration": 2.4222571849823 + }, + { + "auxiliary_loss_clip": 0.01052041, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.00931859, + "balance_loss_mlp": 1.0168612, + "epoch": 0.6021343754697129, + "flos": 22745948618880.0, + "grad_norm": 1.3590615099265648, + "language_loss": 0.73751742, + "learning_rate": 1.442941626485624e-06, + "loss": 0.7583611, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 10015, + "time_per_iteration": 2.413330554962158 + }, + { + "auxiliary_loss_clip": 0.01009296, + "auxiliary_loss_mlp": 0.01004067, + "balance_loss_clip": 1.00173008, + "balance_loss_mlp": 1.00185907, + "epoch": 0.6021944987223808, + "flos": 65749027743360.0, + "grad_norm": 0.8290510624287949, + "language_loss": 0.54894686, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56908047, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.07421875, + "step": 10016, + "time_per_iteration": 2.9392285346984863 + }, + { + "auxiliary_loss_clip": 0.01055958, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.01205862, + "balance_loss_mlp": 1.01884198, + "epoch": 0.6022546219750489, + "flos": 16104316216320.0, + "grad_norm": 1.5236522114454711, + "language_loss": 0.84080267, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.86172545, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 10017, + "time_per_iteration": 2.3530538082122803 + }, + { + "auxiliary_loss_clip": 0.01055195, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.01438272, + "balance_loss_mlp": 1.01849008, + "epoch": 0.6023147452277168, + "flos": 25511593835520.0, + "grad_norm": 2.4719039476460534, + "language_loss": 0.84156585, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.86249685, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3671875, + "step": 10018, + "time_per_iteration": 2.411412239074707 + }, + { + "auxiliary_loss_clip": 0.01058921, + "auxiliary_loss_mlp": 0.01045368, + "balance_loss_clip": 1.01952338, + "balance_loss_mlp": 1.01913595, + "epoch": 0.6023748684803848, + "flos": 22635029629440.0, + "grad_norm": 1.7112613379056687, + "language_loss": 0.79485518, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.81589806, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3984375, + "step": 10019, + "time_per_iteration": 2.396043062210083 + }, + { + "auxiliary_loss_clip": 0.0105294, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.01522386, + "balance_loss_mlp": 1.01525283, + "epoch": 0.6024349917330527, + "flos": 26209332823680.0, + "grad_norm": 1.5118120171970497, + "language_loss": 0.74529499, + "learning_rate": 1.441071641765681e-06, + "loss": 0.7662102, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37695312, + "step": 10020, + "time_per_iteration": 2.434704303741455 + }, + { + "auxiliary_loss_clip": 0.01055262, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.01791286, + "balance_loss_mlp": 1.01770687, + "epoch": 0.6024951149857207, + "flos": 21250688376960.0, + "grad_norm": 1.449336209591949, + "language_loss": 0.64630264, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66730487, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.375, + "step": 10021, + "time_per_iteration": 3.710780620574951 + }, + { + "auxiliary_loss_clip": 0.01054676, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.01173806, + "balance_loss_mlp": 1.01748908, + "epoch": 0.6025552382383887, + "flos": 26942229417600.0, + "grad_norm": 5.805897605448316, + "language_loss": 0.8182494, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.83915985, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 10022, + "time_per_iteration": 2.431185722351074 + }, + { + "auxiliary_loss_clip": 0.01056872, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.01113081, + "balance_loss_mlp": 1.0181613, + "epoch": 0.6026153614910567, + "flos": 31683085603200.0, + "grad_norm": 1.685334479549561, + "language_loss": 0.67582685, + "learning_rate": 1.439949905155693e-06, + "loss": 0.69675171, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 10023, + "time_per_iteration": 2.4425570964813232 + }, + { + "auxiliary_loss_clip": 0.01057958, + "auxiliary_loss_mlp": 0.01042096, + "balance_loss_clip": 1.01790857, + "balance_loss_mlp": 1.01904917, + "epoch": 0.6026754847437247, + "flos": 29311505435520.0, + "grad_norm": 1.933775113647562, + "language_loss": 0.75696617, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.77796674, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38867188, + "step": 10024, + "time_per_iteration": 2.4413654804229736 + }, + { + "auxiliary_loss_clip": 0.01054933, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.01341343, + "balance_loss_mlp": 1.01769423, + "epoch": 0.6027356079963926, + "flos": 23585644661760.0, + "grad_norm": 1.7873045696077303, + "language_loss": 0.73554862, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75647157, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 10025, + "time_per_iteration": 2.4041390419006348 + }, + { + "auxiliary_loss_clip": 0.01059276, + "auxiliary_loss_mlp": 0.01037079, + "balance_loss_clip": 1.01067436, + "balance_loss_mlp": 1.0190767, + "epoch": 0.6027957312490606, + "flos": 20812702970880.0, + "grad_norm": 2.2684067959744336, + "language_loss": 0.69361383, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.71457738, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 10026, + "time_per_iteration": 2.4444937705993652 + }, + { + "auxiliary_loss_clip": 0.01053028, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.01538134, + "balance_loss_mlp": 1.01715994, + "epoch": 0.6028558545017285, + "flos": 19934812033920.0, + "grad_norm": 1.7470013012371683, + "language_loss": 0.81298667, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.83389568, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 10027, + "time_per_iteration": 5.252922058105469 + }, + { + "auxiliary_loss_clip": 0.01056721, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02140164, + "balance_loss_mlp": 1.01780713, + "epoch": 0.6029159777543965, + "flos": 22819720055040.0, + "grad_norm": 2.0225026042525127, + "language_loss": 0.72393072, + "learning_rate": 1.438080769071171e-06, + "loss": 0.74495661, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38867188, + "step": 10028, + "time_per_iteration": 2.384355068206787 + }, + { + "auxiliary_loss_clip": 0.01057726, + "auxiliary_loss_mlp": 0.01046766, + "balance_loss_clip": 1.02064705, + "balance_loss_mlp": 1.01820076, + "epoch": 0.6029761010070644, + "flos": 23586098509440.0, + "grad_norm": 1.719159671660472, + "language_loss": 0.84940588, + "learning_rate": 1.437707005721669e-06, + "loss": 0.87045085, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 10029, + "time_per_iteration": 2.402247667312622 + }, + { + "auxiliary_loss_clip": 0.0105408, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.01441467, + "balance_loss_mlp": 1.01681924, + "epoch": 0.6030362242597325, + "flos": 13661582964480.0, + "grad_norm": 1.8934965916828035, + "language_loss": 0.81761378, + "learning_rate": 1.437333263694373e-06, + "loss": 0.83853173, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37304688, + "step": 10030, + "time_per_iteration": 2.3609673976898193 + }, + { + "auxiliary_loss_clip": 0.01057984, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.01332474, + "balance_loss_mlp": 1.01878834, + "epoch": 0.6030963475124004, + "flos": 24421814657280.0, + "grad_norm": 1.5272563858611599, + "language_loss": 0.72435725, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.74532551, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 10031, + "time_per_iteration": 2.4335687160491943 + }, + { + "auxiliary_loss_clip": 0.01057108, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_clip": 1.01532018, + "balance_loss_mlp": 1.01725149, + "epoch": 0.6031564707650684, + "flos": 29642726304000.0, + "grad_norm": 1.5445109829831585, + "language_loss": 0.74033219, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.76133537, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.3984375, + "step": 10032, + "time_per_iteration": 2.4263806343078613 + }, + { + "auxiliary_loss_clip": 0.01057151, + "auxiliary_loss_mlp": 0.01042515, + "balance_loss_clip": 1.01644433, + "balance_loss_mlp": 1.01807332, + "epoch": 0.6032165940177363, + "flos": 16617818626560.0, + "grad_norm": 2.06502912268565, + "language_loss": 0.69918638, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.72018301, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 10033, + "time_per_iteration": 2.3496081829071045 + }, + { + "auxiliary_loss_clip": 0.01055948, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.01403928, + "balance_loss_mlp": 1.01826239, + "epoch": 0.6032767172704043, + "flos": 17487365748480.0, + "grad_norm": 1.8489189181336203, + "language_loss": 0.76930106, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.790254, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 10034, + "time_per_iteration": 2.3508565425872803 + }, + { + "auxiliary_loss_clip": 0.01057262, + "auxiliary_loss_mlp": 0.01041623, + "balance_loss_clip": 1.0153966, + "balance_loss_mlp": 1.01828384, + "epoch": 0.6033368405230723, + "flos": 26831764275840.0, + "grad_norm": 1.7045397186395337, + "language_loss": 0.75598866, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.77697754, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 10035, + "time_per_iteration": 3.836616277694702 + }, + { + "auxiliary_loss_clip": 0.0105434, + "auxiliary_loss_mlp": 0.01035393, + "balance_loss_clip": 1.01211119, + "balance_loss_mlp": 1.0175097, + "epoch": 0.6033969637757403, + "flos": 16908959387520.0, + "grad_norm": 1.565425281351307, + "language_loss": 0.86986268, + "learning_rate": 1.435091260090536e-06, + "loss": 0.89075994, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 10036, + "time_per_iteration": 2.328681230545044 + }, + { + "auxiliary_loss_clip": 0.01055956, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.01527119, + "balance_loss_mlp": 1.01645041, + "epoch": 0.6034570870284083, + "flos": 22928963299200.0, + "grad_norm": 1.8789243297714495, + "language_loss": 0.71435505, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.73533249, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 10037, + "time_per_iteration": 2.4216697216033936 + }, + { + "auxiliary_loss_clip": 0.01054903, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.01774096, + "balance_loss_mlp": 1.01776338, + "epoch": 0.6035172102810762, + "flos": 23365238048640.0, + "grad_norm": 1.9236525893463865, + "language_loss": 0.86398172, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.88494164, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 10038, + "time_per_iteration": 2.381261110305786 + }, + { + "auxiliary_loss_clip": 0.01056943, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.01596022, + "balance_loss_mlp": 1.01764607, + "epoch": 0.6035773335337442, + "flos": 20886020559360.0, + "grad_norm": 1.9464623804513126, + "language_loss": 0.7773329, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.79831517, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.39257812, + "step": 10039, + "time_per_iteration": 2.3926968574523926 + }, + { + "auxiliary_loss_clip": 0.01054725, + "auxiliary_loss_mlp": 0.01037392, + "balance_loss_clip": 1.01285887, + "balance_loss_mlp": 1.01697338, + "epoch": 0.6036374567864121, + "flos": 24935142510720.0, + "grad_norm": 1.7579240797709554, + "language_loss": 0.72440612, + "learning_rate": 1.433597019260301e-06, + "loss": 0.74532729, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37695312, + "step": 10040, + "time_per_iteration": 2.3875391483306885 + }, + { + "auxiliary_loss_clip": 0.01057963, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.01448798, + "balance_loss_mlp": 1.01755738, + "epoch": 0.6036975800390801, + "flos": 23147170496640.0, + "grad_norm": 1.940968858127802, + "language_loss": 0.79477692, + "learning_rate": 1.433223512712475e-06, + "loss": 0.81578535, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40429688, + "step": 10041, + "time_per_iteration": 2.3988823890686035 + }, + { + "auxiliary_loss_clip": 0.01055852, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.0143708, + "balance_loss_mlp": 1.01788008, + "epoch": 0.603757703291748, + "flos": 18659748936960.0, + "grad_norm": 1.7909329400534297, + "language_loss": 0.7650072, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.78595924, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 10042, + "time_per_iteration": 2.338707447052002 + }, + { + "auxiliary_loss_clip": 0.01054471, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.01672673, + "epoch": 0.6038178265444161, + "flos": 19681586876160.0, + "grad_norm": 2.667931501560209, + "language_loss": 0.85688448, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.87782705, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37695312, + "step": 10043, + "time_per_iteration": 2.410823106765747 + }, + { + "auxiliary_loss_clip": 0.01057471, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_clip": 1.01502049, + "balance_loss_mlp": 1.01749182, + "epoch": 0.603877949797084, + "flos": 22637124311040.0, + "grad_norm": 2.1318722219908826, + "language_loss": 0.70641315, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72739196, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.40039062, + "step": 10044, + "time_per_iteration": 2.356569290161133 + }, + { + "auxiliary_loss_clip": 0.01057583, + "auxiliary_loss_mlp": 0.01047286, + "balance_loss_clip": 1.01891375, + "balance_loss_mlp": 1.01788473, + "epoch": 0.603938073049752, + "flos": 25446689884800.0, + "grad_norm": 1.6320284887579466, + "language_loss": 0.78429461, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80534333, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39648438, + "step": 10045, + "time_per_iteration": 2.416003942489624 + }, + { + "auxiliary_loss_clip": 0.0105459, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.01613116, + "balance_loss_mlp": 1.01711988, + "epoch": 0.6039981963024199, + "flos": 22339210746240.0, + "grad_norm": 1.775896432647438, + "language_loss": 0.77880311, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.7997427, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 10046, + "time_per_iteration": 2.4374520778656006 + }, + { + "auxiliary_loss_clip": 0.01054852, + "auxiliary_loss_mlp": 0.010375, + "balance_loss_clip": 1.01388431, + "balance_loss_mlp": 1.01696754, + "epoch": 0.6040583195550879, + "flos": 20702133095040.0, + "grad_norm": 1.7282788967952203, + "language_loss": 0.87887394, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89979744, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 10047, + "time_per_iteration": 2.3855438232421875 + }, + { + "auxiliary_loss_clip": 0.01054551, + "auxiliary_loss_mlp": 0.01040132, + "balance_loss_clip": 1.01749444, + "balance_loss_mlp": 1.01750207, + "epoch": 0.604118442807756, + "flos": 27161867980800.0, + "grad_norm": 1.5712018331529578, + "language_loss": 0.7649647, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.78591156, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 10048, + "time_per_iteration": 2.4322800636291504 + }, + { + "auxiliary_loss_clip": 0.01060273, + "auxiliary_loss_mlp": 0.01056746, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.01855052, + "epoch": 0.6041785660604239, + "flos": 30880257822720.0, + "grad_norm": 2.3635233156213507, + "language_loss": 0.67641109, + "learning_rate": 1.430236235239386e-06, + "loss": 0.69758129, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.41796875, + "step": 10049, + "time_per_iteration": 2.466089963912964 + }, + { + "auxiliary_loss_clip": 0.01055097, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.01677692, + "balance_loss_mlp": 1.01747668, + "epoch": 0.6042386893130919, + "flos": 19937186006400.0, + "grad_norm": 1.5201463575166956, + "language_loss": 0.677145, + "learning_rate": 1.429862922631336e-06, + "loss": 0.69810927, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 10050, + "time_per_iteration": 2.3757522106170654 + }, + { + "auxiliary_loss_clip": 0.01057579, + "auxiliary_loss_mlp": 0.01041316, + "balance_loss_clip": 1.01554346, + "balance_loss_mlp": 1.01858163, + "epoch": 0.6042988125657598, + "flos": 32414550831360.0, + "grad_norm": 1.9245496196008511, + "language_loss": 0.71052706, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.731516, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 10051, + "time_per_iteration": 2.4590303897857666 + }, + { + "auxiliary_loss_clip": 0.01053915, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.01177287, + "balance_loss_mlp": 1.015944, + "epoch": 0.6043589358184278, + "flos": 17419843445760.0, + "grad_norm": 1.9028008264227345, + "language_loss": 0.66022241, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.68112564, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 10052, + "time_per_iteration": 2.38143253326416 + }, + { + "auxiliary_loss_clip": 0.01055313, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.01691055, + "balance_loss_mlp": 1.01698875, + "epoch": 0.6044190590710957, + "flos": 27671599964160.0, + "grad_norm": 1.5570425749807124, + "language_loss": 0.69829649, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71929908, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.3828125, + "step": 10053, + "time_per_iteration": 2.4447693824768066 + }, + { + "auxiliary_loss_clip": 0.01010278, + "auxiliary_loss_mlp": 0.01004769, + "balance_loss_clip": 1.00246859, + "balance_loss_mlp": 1.00270414, + "epoch": 0.6044791823237637, + "flos": 65313241752960.0, + "grad_norm": 0.7230929446614806, + "language_loss": 0.60504025, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62519073, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.07568359, + "step": 10054, + "time_per_iteration": 3.1382691860198975 + }, + { + "auxiliary_loss_clip": 0.01054525, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.01219368, + "balance_loss_mlp": 1.01694822, + "epoch": 0.6045393055764317, + "flos": 24491396730240.0, + "grad_norm": 1.5748647512038563, + "language_loss": 0.86454016, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.88547021, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 10055, + "time_per_iteration": 2.3962295055389404 + }, + { + "auxiliary_loss_clip": 0.01060795, + "auxiliary_loss_mlp": 0.01045945, + "balance_loss_clip": 1.01950479, + "balance_loss_mlp": 1.02026033, + "epoch": 0.6045994288290997, + "flos": 19053569606400.0, + "grad_norm": 2.4912222679583254, + "language_loss": 0.7495544, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.77062184, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40625, + "step": 10056, + "time_per_iteration": 2.3341498374938965 + }, + { + "auxiliary_loss_clip": 0.01055512, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.01389146, + "balance_loss_mlp": 1.01713741, + "epoch": 0.6046595520817676, + "flos": 26575536741120.0, + "grad_norm": 1.8932602138371952, + "language_loss": 0.81159711, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.83254445, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 10057, + "time_per_iteration": 2.444180488586426 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.01000714, + "balance_loss_mlp": 1.01658833, + "epoch": 0.6047196753344356, + "flos": 13581632217600.0, + "grad_norm": 2.5832678790579986, + "language_loss": 0.76563412, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.78653604, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 10058, + "time_per_iteration": 2.372636556625366 + }, + { + "auxiliary_loss_clip": 0.01054687, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.00920796, + "balance_loss_mlp": 1.01693988, + "epoch": 0.6047797985871035, + "flos": 25519274334720.0, + "grad_norm": 1.8355110180155307, + "language_loss": 0.7154628, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73635876, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 10059, + "time_per_iteration": 2.4261057376861572 + }, + { + "auxiliary_loss_clip": 0.01055896, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.01307499, + "balance_loss_mlp": 1.01699531, + "epoch": 0.6048399218397715, + "flos": 20519153326080.0, + "grad_norm": 1.5278120639064248, + "language_loss": 0.77328753, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.79424936, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 10060, + "time_per_iteration": 2.417599678039551 + }, + { + "auxiliary_loss_clip": 0.01055454, + "auxiliary_loss_mlp": 0.01042322, + "balance_loss_clip": 1.017169, + "balance_loss_mlp": 1.01740122, + "epoch": 0.6049000450924396, + "flos": 20407850311680.0, + "grad_norm": 2.848556310749835, + "language_loss": 0.74938083, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.77035856, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 10061, + "time_per_iteration": 3.7193291187286377 + }, + { + "auxiliary_loss_clip": 0.01056309, + "auxiliary_loss_mlp": 0.01044526, + "balance_loss_clip": 1.01757324, + "balance_loss_mlp": 1.01675189, + "epoch": 0.6049601683451075, + "flos": 20740293077760.0, + "grad_norm": 1.701312392823698, + "language_loss": 0.67951524, + "learning_rate": 1.425384861715639e-06, + "loss": 0.70052356, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 10062, + "time_per_iteration": 2.381218910217285 + }, + { + "auxiliary_loss_clip": 0.01054421, + "auxiliary_loss_mlp": 0.01044493, + "balance_loss_clip": 1.01917291, + "balance_loss_mlp": 1.01605022, + "epoch": 0.6050202915977755, + "flos": 20082110526720.0, + "grad_norm": 2.0681578088813826, + "language_loss": 0.73166335, + "learning_rate": 1.425011831266978e-06, + "loss": 0.75265253, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 10063, + "time_per_iteration": 2.3881235122680664 + }, + { + "auxiliary_loss_clip": 0.01054826, + "auxiliary_loss_mlp": 0.01036505, + "balance_loss_clip": 1.01281857, + "balance_loss_mlp": 1.01689732, + "epoch": 0.6050804148504434, + "flos": 15959915366400.0, + "grad_norm": 1.580038064418922, + "language_loss": 0.85104436, + "learning_rate": 1.424638822621926e-06, + "loss": 0.87195766, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 10064, + "time_per_iteration": 2.3497180938720703 + }, + { + "auxiliary_loss_clip": 0.0105528, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.01369107, + "balance_loss_mlp": 1.01752234, + "epoch": 0.6051405381031114, + "flos": 17455699278720.0, + "grad_norm": 2.622611413800532, + "language_loss": 0.81740904, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.83834445, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 10065, + "time_per_iteration": 2.3603100776672363 + }, + { + "auxiliary_loss_clip": 0.0105929, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.01387715, + "balance_loss_mlp": 1.01826715, + "epoch": 0.6052006613557793, + "flos": 11399350775040.0, + "grad_norm": 2.0601715135281453, + "language_loss": 0.7943424, + "learning_rate": 1.423892870799226e-06, + "loss": 0.8153618, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.41015625, + "step": 10066, + "time_per_iteration": 3.765894889831543 + }, + { + "auxiliary_loss_clip": 0.01056523, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.01431513, + "balance_loss_mlp": 1.01682794, + "epoch": 0.6052607846084473, + "flos": 24749928414720.0, + "grad_norm": 1.506716426556112, + "language_loss": 0.74434, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.7653079, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 10067, + "time_per_iteration": 3.7590901851654053 + }, + { + "auxiliary_loss_clip": 0.01054371, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.01476824, + "balance_loss_mlp": 1.01654458, + "epoch": 0.6053209078611153, + "flos": 20740083609600.0, + "grad_norm": 1.4982910934938019, + "language_loss": 0.69326591, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.71421325, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 10068, + "time_per_iteration": 2.4085068702697754 + }, + { + "auxiliary_loss_clip": 0.01055416, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.01274872, + "balance_loss_mlp": 1.01605046, + "epoch": 0.6053810311137833, + "flos": 18952146506880.0, + "grad_norm": 1.9950215390282955, + "language_loss": 0.88109636, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.90203893, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 10069, + "time_per_iteration": 2.3394198417663574 + }, + { + "auxiliary_loss_clip": 0.01054371, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.01282048, + "balance_loss_mlp": 1.01621854, + "epoch": 0.6054411543664512, + "flos": 23949998277120.0, + "grad_norm": 1.4749230058685392, + "language_loss": 0.84257948, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.86348808, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3828125, + "step": 10070, + "time_per_iteration": 2.4191813468933105 + }, + { + "auxiliary_loss_clip": 0.01056672, + "auxiliary_loss_mlp": 0.01041095, + "balance_loss_clip": 1.01418936, + "balance_loss_mlp": 1.01650548, + "epoch": 0.6055012776191192, + "flos": 20592959673600.0, + "grad_norm": 1.601699119502509, + "language_loss": 0.86926472, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.89024234, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 10071, + "time_per_iteration": 2.3607540130615234 + }, + { + "auxiliary_loss_clip": 0.01058041, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.01769185, + "balance_loss_mlp": 1.01771057, + "epoch": 0.6055614008717871, + "flos": 30296928960000.0, + "grad_norm": 1.894987789549741, + "language_loss": 0.78396165, + "learning_rate": 1.421655540088603e-06, + "loss": 0.80499828, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 10072, + "time_per_iteration": 2.4543874263763428 + }, + { + "auxiliary_loss_clip": 0.01055083, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.01456332, + "balance_loss_mlp": 1.01606882, + "epoch": 0.6056215241244551, + "flos": 27123812732160.0, + "grad_norm": 1.907186574017839, + "language_loss": 0.75374997, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.77471662, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 10073, + "time_per_iteration": 2.4208884239196777 + }, + { + "auxiliary_loss_clip": 0.01009151, + "auxiliary_loss_mlp": 0.01020763, + "balance_loss_clip": 1.01850975, + "balance_loss_mlp": 1.00195098, + "epoch": 0.6056816473771232, + "flos": 56004699058560.0, + "grad_norm": 0.7797854709302074, + "language_loss": 0.55255222, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57285142, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.07226562, + "step": 10074, + "time_per_iteration": 4.456384658813477 + }, + { + "auxiliary_loss_clip": 0.01055443, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.01051092, + "balance_loss_mlp": 1.01842856, + "epoch": 0.6057417706297911, + "flos": 23548392374400.0, + "grad_norm": 1.7159582571694112, + "language_loss": 0.82389599, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.84479851, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 10075, + "time_per_iteration": 2.3855326175689697 + }, + { + "auxiliary_loss_clip": 0.01055121, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.00937581, + "balance_loss_mlp": 1.01667547, + "epoch": 0.6058018938824591, + "flos": 27743102161920.0, + "grad_norm": 1.7011406953373038, + "language_loss": 0.80101311, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.82191372, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38476562, + "step": 10076, + "time_per_iteration": 2.4487080574035645 + }, + { + "auxiliary_loss_clip": 0.01056373, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.0152061, + "balance_loss_mlp": 1.01709342, + "epoch": 0.605862017135127, + "flos": 22782293210880.0, + "grad_norm": 1.8341809904738935, + "language_loss": 0.74470919, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.76568401, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39257812, + "step": 10077, + "time_per_iteration": 2.3789799213409424 + }, + { + "auxiliary_loss_clip": 0.0105576, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.01895118, + "balance_loss_mlp": 1.0172596, + "epoch": 0.605922140387795, + "flos": 21213959760000.0, + "grad_norm": 1.6255654281743943, + "language_loss": 0.56921166, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.59019756, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38476562, + "step": 10078, + "time_per_iteration": 2.388460874557495 + }, + { + "auxiliary_loss_clip": 0.01058821, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.01378584, + "balance_loss_mlp": 1.01912868, + "epoch": 0.6059822636404629, + "flos": 27267236064000.0, + "grad_norm": 1.7395304027303617, + "language_loss": 0.71811163, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.73909926, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 10079, + "time_per_iteration": 2.5044214725494385 + }, + { + "auxiliary_loss_clip": 0.01054597, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_clip": 1.02132797, + "balance_loss_mlp": 1.01703238, + "epoch": 0.606042386893131, + "flos": 20630281783680.0, + "grad_norm": 1.992765764897413, + "language_loss": 0.63633847, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.6573413, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 10080, + "time_per_iteration": 2.3873131275177 + }, + { + "auxiliary_loss_clip": 0.01056763, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.01221049, + "balance_loss_mlp": 1.0177567, + "epoch": 0.6061025101457989, + "flos": 23001198635520.0, + "grad_norm": 2.0270619902429567, + "language_loss": 0.72244698, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.74339569, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 10081, + "time_per_iteration": 2.377899169921875 + }, + { + "auxiliary_loss_clip": 0.01054777, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.01174045, + "balance_loss_mlp": 1.0177865, + "epoch": 0.6061626333984669, + "flos": 29897627207040.0, + "grad_norm": 1.6710469538541883, + "language_loss": 0.70533222, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.72623026, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 10082, + "time_per_iteration": 2.4608869552612305 + }, + { + "auxiliary_loss_clip": 0.01056959, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.01772022, + "balance_loss_mlp": 1.01929641, + "epoch": 0.6062227566511348, + "flos": 25008041162880.0, + "grad_norm": 1.6822961843600346, + "language_loss": 0.67198169, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.69295752, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.375, + "step": 10083, + "time_per_iteration": 2.396024227142334 + }, + { + "auxiliary_loss_clip": 0.01056149, + "auxiliary_loss_mlp": 0.01040752, + "balance_loss_clip": 1.01645708, + "balance_loss_mlp": 1.01827884, + "epoch": 0.6062828799038028, + "flos": 19462925831040.0, + "grad_norm": 2.5143542788375886, + "language_loss": 0.75408173, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.77505076, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37890625, + "step": 10084, + "time_per_iteration": 2.3692259788513184 + }, + { + "auxiliary_loss_clip": 0.01056187, + "auxiliary_loss_mlp": 0.01043974, + "balance_loss_clip": 1.01689017, + "balance_loss_mlp": 1.01702046, + "epoch": 0.6063430031564707, + "flos": 13588719223680.0, + "grad_norm": 2.55211160879116, + "language_loss": 0.73665428, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.75765586, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 10085, + "time_per_iteration": 2.3311548233032227 + }, + { + "auxiliary_loss_clip": 0.01054212, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.01109791, + "balance_loss_mlp": 1.01668596, + "epoch": 0.6064031264091387, + "flos": 23254458704640.0, + "grad_norm": 2.087533667759812, + "language_loss": 0.77719009, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.79808784, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 10086, + "time_per_iteration": 2.4106812477111816 + }, + { + "auxiliary_loss_clip": 0.01054169, + "auxiliary_loss_mlp": 0.01034118, + "balance_loss_clip": 1.01056278, + "balance_loss_mlp": 1.01735806, + "epoch": 0.6064632496618068, + "flos": 22457286564480.0, + "grad_norm": 1.3770110018893293, + "language_loss": 0.73893034, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.75981319, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3671875, + "step": 10087, + "time_per_iteration": 2.3975656032562256 + }, + { + "auxiliary_loss_clip": 0.01053203, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.01409674, + "balance_loss_mlp": 1.01719618, + "epoch": 0.6065233729144747, + "flos": 25117493875200.0, + "grad_norm": 1.7153978872844409, + "language_loss": 0.84079677, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86169624, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 10088, + "time_per_iteration": 2.4119889736175537 + }, + { + "auxiliary_loss_clip": 0.01054636, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.01168704, + "balance_loss_mlp": 1.01660979, + "epoch": 0.6065834961671427, + "flos": 23476226860800.0, + "grad_norm": 2.0189233974031464, + "language_loss": 0.72458029, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.74548852, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38085938, + "step": 10089, + "time_per_iteration": 2.3739147186279297 + }, + { + "auxiliary_loss_clip": 0.01056275, + "auxiliary_loss_mlp": 0.01039434, + "balance_loss_clip": 1.01608086, + "balance_loss_mlp": 1.01762629, + "epoch": 0.6066436194198106, + "flos": 17018447011200.0, + "grad_norm": 2.7693877014255857, + "language_loss": 0.84235466, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.86331177, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38671875, + "step": 10090, + "time_per_iteration": 2.345466375350952 + }, + { + "auxiliary_loss_clip": 0.01059503, + "auxiliary_loss_mlp": 0.01047961, + "balance_loss_clip": 1.01873064, + "balance_loss_mlp": 1.01839459, + "epoch": 0.6067037426724786, + "flos": 18513777075840.0, + "grad_norm": 2.424577423063455, + "language_loss": 0.76737815, + "learning_rate": 1.4145758826341e-06, + "loss": 0.78845274, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41015625, + "step": 10091, + "time_per_iteration": 2.355814218521118 + }, + { + "auxiliary_loss_clip": 0.01052879, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.01486039, + "balance_loss_mlp": 1.01593173, + "epoch": 0.6067638659251465, + "flos": 22344901297920.0, + "grad_norm": 1.6518140662506577, + "language_loss": 0.80776978, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.82869905, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36914062, + "step": 10092, + "time_per_iteration": 2.442143440246582 + }, + { + "auxiliary_loss_clip": 0.01055538, + "auxiliary_loss_mlp": 0.01040932, + "balance_loss_clip": 1.01517093, + "balance_loss_mlp": 1.01653039, + "epoch": 0.6068239891778145, + "flos": 12450411388800.0, + "grad_norm": 1.9774632579834142, + "language_loss": 0.77380615, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.7947709, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 10093, + "time_per_iteration": 2.351202964782715 + }, + { + "auxiliary_loss_clip": 0.01053842, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.01557958, + "balance_loss_mlp": 1.01815844, + "epoch": 0.6068841124304825, + "flos": 23184736986240.0, + "grad_norm": 2.1601069835477875, + "language_loss": 0.88294041, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.9038738, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35742188, + "step": 10094, + "time_per_iteration": 2.3961546421051025 + }, + { + "auxiliary_loss_clip": 0.01055206, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.01291299, + "balance_loss_mlp": 1.01780248, + "epoch": 0.6069442356831505, + "flos": 18586920107520.0, + "grad_norm": 2.0603463091263143, + "language_loss": 0.73000485, + "learning_rate": 1.413086446353919e-06, + "loss": 0.7509222, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 10095, + "time_per_iteration": 2.40669584274292 + }, + { + "auxiliary_loss_clip": 0.0105607, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.01554251, + "balance_loss_mlp": 1.01700127, + "epoch": 0.6070043589358184, + "flos": 20959268325120.0, + "grad_norm": 1.61878679684336, + "language_loss": 0.78026825, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.80121505, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.390625, + "step": 10096, + "time_per_iteration": 2.3878166675567627 + }, + { + "auxiliary_loss_clip": 0.01055844, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.01480627, + "balance_loss_mlp": 1.0173347, + "epoch": 0.6070644821884864, + "flos": 11691643610880.0, + "grad_norm": 1.6758034508983441, + "language_loss": 0.8097719, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.83070916, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.38476562, + "step": 10097, + "time_per_iteration": 2.4160642623901367 + }, + { + "auxiliary_loss_clip": 0.01052273, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.01338482, + "balance_loss_mlp": 1.01633072, + "epoch": 0.6071246054411543, + "flos": 19309762229760.0, + "grad_norm": 2.005187811436659, + "language_loss": 0.68526053, + "learning_rate": 1.411969602780478e-06, + "loss": 0.70615709, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 10098, + "time_per_iteration": 2.362675428390503 + }, + { + "auxiliary_loss_clip": 0.01055171, + "auxiliary_loss_mlp": 0.01040537, + "balance_loss_clip": 1.01679087, + "balance_loss_mlp": 1.01741374, + "epoch": 0.6071847286938223, + "flos": 17748061937280.0, + "grad_norm": 1.856235238382897, + "language_loss": 0.81502193, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.83597904, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37695312, + "step": 10099, + "time_per_iteration": 2.3933286666870117 + }, + { + "auxiliary_loss_clip": 0.01057484, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.01451981, + "balance_loss_mlp": 1.01740289, + "epoch": 0.6072448519464904, + "flos": 22636426083840.0, + "grad_norm": 2.288897385660846, + "language_loss": 0.7204082, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.74137902, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.40234375, + "step": 10100, + "time_per_iteration": 2.3931686878204346 + }, + { + "auxiliary_loss_clip": 0.01058658, + "auxiliary_loss_mlp": 0.01047478, + "balance_loss_clip": 1.02084661, + "balance_loss_mlp": 1.01983559, + "epoch": 0.6073049751991583, + "flos": 19536278330880.0, + "grad_norm": 1.6201068969416446, + "language_loss": 0.71823186, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.73929322, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38867188, + "step": 10101, + "time_per_iteration": 3.707383632659912 + }, + { + "auxiliary_loss_clip": 0.01053618, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.01719391, + "balance_loss_mlp": 1.01651239, + "epoch": 0.6073650984518263, + "flos": 28292949164160.0, + "grad_norm": 1.9703308230510506, + "language_loss": 0.6996755, + "learning_rate": 1.410480790256154e-06, + "loss": 0.72062123, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 10102, + "time_per_iteration": 2.462928056716919 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.0104715, + "balance_loss_clip": 1.02241421, + "balance_loss_mlp": 1.01835012, + "epoch": 0.6074252217044942, + "flos": 25663291159680.0, + "grad_norm": 1.8427691617554462, + "language_loss": 0.7433483, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.76438546, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3828125, + "step": 10103, + "time_per_iteration": 2.383058547973633 + }, + { + "auxiliary_loss_clip": 0.01058114, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.01413381, + "balance_loss_mlp": 1.01850438, + "epoch": 0.6074853449571622, + "flos": 22855994824320.0, + "grad_norm": 2.5120703533469575, + "language_loss": 0.77557468, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.79655594, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 10104, + "time_per_iteration": 2.4433412551879883 + }, + { + "auxiliary_loss_clip": 0.01008282, + "auxiliary_loss_mlp": 0.01003094, + "balance_loss_clip": 1.00057912, + "balance_loss_mlp": 1.00146842, + "epoch": 0.6075454682098301, + "flos": 67108126216320.0, + "grad_norm": 0.7110531802209868, + "language_loss": 0.56128699, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58140075, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.06835938, + "step": 10105, + "time_per_iteration": 3.0163962841033936 + }, + { + "auxiliary_loss_clip": 0.01008878, + "auxiliary_loss_mlp": 0.01004866, + "balance_loss_clip": 1.00235057, + "balance_loss_mlp": 1.00187433, + "epoch": 0.6076055914624982, + "flos": 70708963910400.0, + "grad_norm": 0.7600719995737946, + "language_loss": 0.56845045, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58858788, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.0703125, + "step": 10106, + "time_per_iteration": 5.7897655963897705 + }, + { + "auxiliary_loss_clip": 0.01052679, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.01601529, + "balance_loss_mlp": 1.01635015, + "epoch": 0.6076657147151661, + "flos": 28363334198400.0, + "grad_norm": 1.613839047428806, + "language_loss": 0.69394398, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.7148689, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 10107, + "time_per_iteration": 2.453108072280884 + }, + { + "auxiliary_loss_clip": 0.01057271, + "auxiliary_loss_mlp": 0.01038146, + "balance_loss_clip": 1.01448345, + "balance_loss_mlp": 1.01807451, + "epoch": 0.6077258379678341, + "flos": 15048856771200.0, + "grad_norm": 2.116723110189438, + "language_loss": 0.81562561, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.8365798, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.390625, + "step": 10108, + "time_per_iteration": 2.3345375061035156 + }, + { + "auxiliary_loss_clip": 0.01058664, + "auxiliary_loss_mlp": 0.01042803, + "balance_loss_clip": 1.01581407, + "balance_loss_mlp": 1.01834416, + "epoch": 0.607785961220502, + "flos": 36165968686080.0, + "grad_norm": 1.8961280078830969, + "language_loss": 0.72577816, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.74679279, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40234375, + "step": 10109, + "time_per_iteration": 2.4980127811431885 + }, + { + "auxiliary_loss_clip": 0.01051448, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.01329267, + "balance_loss_mlp": 1.01575446, + "epoch": 0.60784608447317, + "flos": 22523272767360.0, + "grad_norm": 1.6241161522472802, + "language_loss": 0.80653965, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82741904, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 10110, + "time_per_iteration": 2.3622288703918457 + }, + { + "auxiliary_loss_clip": 0.01055939, + "auxiliary_loss_mlp": 0.01037201, + "balance_loss_clip": 1.01239347, + "balance_loss_mlp": 1.01682174, + "epoch": 0.6079062077258379, + "flos": 23840056805760.0, + "grad_norm": 2.3711157006717696, + "language_loss": 0.71512127, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.73605275, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 10111, + "time_per_iteration": 2.4092485904693604 + }, + { + "auxiliary_loss_clip": 0.01057262, + "auxiliary_loss_mlp": 0.01041627, + "balance_loss_clip": 1.01552057, + "balance_loss_mlp": 1.0183593, + "epoch": 0.6079663309785059, + "flos": 23365936275840.0, + "grad_norm": 2.667199785939236, + "language_loss": 0.66453522, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.68552411, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 10112, + "time_per_iteration": 2.3841166496276855 + }, + { + "auxiliary_loss_clip": 0.01008851, + "auxiliary_loss_mlp": 0.01002767, + "balance_loss_clip": 1.00032282, + "balance_loss_mlp": 1.00211012, + "epoch": 0.6080264542311739, + "flos": 71379400348800.0, + "grad_norm": 0.6281910934367857, + "language_loss": 0.49674919, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51686537, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.06738281, + "step": 10113, + "time_per_iteration": 4.4834418296813965 + }, + { + "auxiliary_loss_clip": 0.01008987, + "auxiliary_loss_mlp": 0.01004038, + "balance_loss_clip": 1.00158238, + "balance_loss_mlp": 1.00224495, + "epoch": 0.6080865774838419, + "flos": 66526508010240.0, + "grad_norm": 0.8433108303729301, + "language_loss": 0.57013965, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59026992, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.06738281, + "step": 10114, + "time_per_iteration": 2.9610743522644043 + }, + { + "auxiliary_loss_clip": 0.01057116, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.00939691, + "balance_loss_mlp": 1.01800776, + "epoch": 0.6081467007365099, + "flos": 19206942675840.0, + "grad_norm": 1.8804016506162635, + "language_loss": 0.72220814, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.74314713, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 10115, + "time_per_iteration": 2.3535733222961426 + }, + { + "auxiliary_loss_clip": 0.01055967, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.01467538, + "balance_loss_mlp": 1.01777315, + "epoch": 0.6082068239891778, + "flos": 24166669374720.0, + "grad_norm": 1.7578055187957127, + "language_loss": 0.73360687, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.75456661, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 10116, + "time_per_iteration": 2.437307357788086 + }, + { + "auxiliary_loss_clip": 0.0105756, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.01834249, + "balance_loss_mlp": 1.0175817, + "epoch": 0.6082669472418458, + "flos": 37411844019840.0, + "grad_norm": 4.247967243604657, + "language_loss": 0.54834235, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.56937903, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40039062, + "step": 10117, + "time_per_iteration": 2.5046637058258057 + }, + { + "auxiliary_loss_clip": 0.01056063, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.01443982, + "balance_loss_mlp": 1.01726079, + "epoch": 0.6083270704945137, + "flos": 15084642781440.0, + "grad_norm": 1.7947523458000971, + "language_loss": 0.71800816, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.73897153, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 10118, + "time_per_iteration": 2.384993076324463 + }, + { + "auxiliary_loss_clip": 0.01054958, + "auxiliary_loss_mlp": 0.0103875, + "balance_loss_clip": 1.01465774, + "balance_loss_mlp": 1.01675773, + "epoch": 0.6083871937471818, + "flos": 20667394425600.0, + "grad_norm": 1.5490050955863737, + "language_loss": 0.75703186, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.77796888, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 10119, + "time_per_iteration": 2.3810994625091553 + }, + { + "auxiliary_loss_clip": 0.01055809, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.01303279, + "balance_loss_mlp": 1.01757467, + "epoch": 0.6084473169998497, + "flos": 21505833659520.0, + "grad_norm": 1.7074518740409932, + "language_loss": 0.68140876, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.70234609, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3828125, + "step": 10120, + "time_per_iteration": 2.419356346130371 + }, + { + "auxiliary_loss_clip": 0.01057815, + "auxiliary_loss_mlp": 0.01047515, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.01755023, + "epoch": 0.6085074402525177, + "flos": 26868842006400.0, + "grad_norm": 1.7306925516631244, + "language_loss": 0.75380182, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.77485514, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 10121, + "time_per_iteration": 2.4245526790618896 + }, + { + "auxiliary_loss_clip": 0.01056456, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.01466691, + "balance_loss_mlp": 1.01795244, + "epoch": 0.6085675635051856, + "flos": 10889060209920.0, + "grad_norm": 2.5088655894724035, + "language_loss": 0.81576145, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.83670324, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.38476562, + "step": 10122, + "time_per_iteration": 2.367236375808716 + }, + { + "auxiliary_loss_clip": 0.01054861, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.01937008, + "balance_loss_mlp": 1.01707411, + "epoch": 0.6086276867578536, + "flos": 34860705396480.0, + "grad_norm": 1.6657428658776678, + "language_loss": 0.57003474, + "learning_rate": 1.402670413578284e-06, + "loss": 0.59102428, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37890625, + "step": 10123, + "time_per_iteration": 2.487635850906372 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01041974, + "balance_loss_clip": 1.01782215, + "balance_loss_mlp": 1.01813877, + "epoch": 0.6086878100105215, + "flos": 20046673630080.0, + "grad_norm": 1.965663591587554, + "language_loss": 0.75476539, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.77575171, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38476562, + "step": 10124, + "time_per_iteration": 2.3803932666778564 + }, + { + "auxiliary_loss_clip": 0.01056311, + "auxiliary_loss_mlp": 0.01040838, + "balance_loss_clip": 1.01544666, + "balance_loss_mlp": 1.01782489, + "epoch": 0.6087479332631895, + "flos": 18331495534080.0, + "grad_norm": 1.8947008136279415, + "language_loss": 0.66711384, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.68808532, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 10125, + "time_per_iteration": 2.340538740158081 + }, + { + "auxiliary_loss_clip": 0.0105453, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.01114011, + "balance_loss_mlp": 1.01773667, + "epoch": 0.6088080565158575, + "flos": 24492409159680.0, + "grad_norm": 1.7709671151275594, + "language_loss": 0.77409565, + "learning_rate": 1.40155545786479e-06, + "loss": 0.79499245, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 10126, + "time_per_iteration": 2.424031972885132 + }, + { + "auxiliary_loss_clip": 0.01057237, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.01025915, + "balance_loss_mlp": 1.01737905, + "epoch": 0.6088681797685255, + "flos": 10268269591680.0, + "grad_norm": 2.496299378804706, + "language_loss": 0.74478281, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.7657212, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 10127, + "time_per_iteration": 2.3721301555633545 + }, + { + "auxiliary_loss_clip": 0.01056421, + "auxiliary_loss_mlp": 0.01042539, + "balance_loss_clip": 1.01602721, + "balance_loss_mlp": 1.01731229, + "epoch": 0.6089283030211935, + "flos": 21972832272000.0, + "grad_norm": 2.570220221718418, + "language_loss": 0.73415744, + "learning_rate": 1.400812267497691e-06, + "loss": 0.75514698, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 10128, + "time_per_iteration": 2.4035398960113525 + }, + { + "auxiliary_loss_clip": 0.0105358, + "auxiliary_loss_mlp": 0.01039233, + "balance_loss_clip": 1.01524794, + "balance_loss_mlp": 1.01639342, + "epoch": 0.6089884262738614, + "flos": 17784231972480.0, + "grad_norm": 1.963888248489555, + "language_loss": 0.74838346, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.76931161, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 10129, + "time_per_iteration": 2.3678672313690186 + }, + { + "auxiliary_loss_clip": 0.01053522, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.01338315, + "balance_loss_mlp": 1.01534474, + "epoch": 0.6090485495265294, + "flos": 36908745194880.0, + "grad_norm": 1.5534316782780562, + "language_loss": 0.66597307, + "learning_rate": 1.400069168015626e-06, + "loss": 0.68688899, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 10130, + "time_per_iteration": 2.5567409992218018 + }, + { + "auxiliary_loss_clip": 0.01052551, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.00985789, + "balance_loss_mlp": 1.01585579, + "epoch": 0.6091086727791973, + "flos": 19898083416960.0, + "grad_norm": 1.5718001089771045, + "language_loss": 0.7835381, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.80439246, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3671875, + "step": 10131, + "time_per_iteration": 2.3770458698272705 + }, + { + "auxiliary_loss_clip": 0.0105548, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.01797938, + "balance_loss_mlp": 1.01770759, + "epoch": 0.6091687960318654, + "flos": 22162549933440.0, + "grad_norm": 1.8199195292763626, + "language_loss": 0.78012693, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.80111158, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 10132, + "time_per_iteration": 2.364088296890259 + }, + { + "auxiliary_loss_clip": 0.01052153, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.01568651, + "balance_loss_mlp": 1.01659656, + "epoch": 0.6092289192845333, + "flos": 21464357097600.0, + "grad_norm": 1.6218005773946338, + "language_loss": 0.76167333, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.78257895, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 10133, + "time_per_iteration": 2.3851053714752197 + }, + { + "auxiliary_loss_clip": 0.01053623, + "auxiliary_loss_mlp": 0.01041701, + "balance_loss_clip": 1.01599956, + "balance_loss_mlp": 1.01611757, + "epoch": 0.6092890425372013, + "flos": 28693647371520.0, + "grad_norm": 1.9082199846094894, + "language_loss": 0.65023088, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.67118418, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 10134, + "time_per_iteration": 2.4249978065490723 + }, + { + "auxiliary_loss_clip": 0.01052435, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.01011419, + "balance_loss_mlp": 1.01582146, + "epoch": 0.6093491657898692, + "flos": 20812144389120.0, + "grad_norm": 1.835014484177221, + "language_loss": 0.80065668, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.82151961, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 10135, + "time_per_iteration": 2.371340036392212 + }, + { + "auxiliary_loss_clip": 0.01055371, + "auxiliary_loss_mlp": 0.01039586, + "balance_loss_clip": 1.01603043, + "balance_loss_mlp": 1.01705718, + "epoch": 0.6094092890425372, + "flos": 25445817100800.0, + "grad_norm": 1.7831916235547955, + "language_loss": 0.72851038, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74945998, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3828125, + "step": 10136, + "time_per_iteration": 2.3884825706481934 + }, + { + "auxiliary_loss_clip": 0.01055116, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.01458442, + "balance_loss_mlp": 1.0169723, + "epoch": 0.6094694122952051, + "flos": 35619961933440.0, + "grad_norm": 1.8277198989264358, + "language_loss": 0.75970495, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.78066164, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10137, + "time_per_iteration": 2.504307508468628 + }, + { + "auxiliary_loss_clip": 0.01057126, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.01623273, + "balance_loss_mlp": 1.01786017, + "epoch": 0.6095295355478731, + "flos": 24455959833600.0, + "grad_norm": 1.6649886771836895, + "language_loss": 0.80864334, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82964075, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 10138, + "time_per_iteration": 2.3972814083099365 + }, + { + "auxiliary_loss_clip": 0.01052349, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.01531875, + "balance_loss_mlp": 1.01709402, + "epoch": 0.6095896588005411, + "flos": 15632290368000.0, + "grad_norm": 4.4079750506959225, + "language_loss": 0.81825644, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83917868, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35351562, + "step": 10139, + "time_per_iteration": 2.354684352874756 + }, + { + "auxiliary_loss_clip": 0.01057743, + "auxiliary_loss_mlp": 0.01041667, + "balance_loss_clip": 1.01466644, + "balance_loss_mlp": 1.01807809, + "epoch": 0.6096497820532091, + "flos": 15549930737280.0, + "grad_norm": 2.3605001165086374, + "language_loss": 0.84766901, + "learning_rate": 1.396355037825315e-06, + "loss": 0.86866313, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 10140, + "time_per_iteration": 2.321728229522705 + }, + { + "auxiliary_loss_clip": 0.01054915, + "auxiliary_loss_mlp": 0.01043356, + "balance_loss_clip": 1.01859593, + "balance_loss_mlp": 1.01595902, + "epoch": 0.6097099053058771, + "flos": 24203397991680.0, + "grad_norm": 2.8669061180079267, + "language_loss": 0.76539171, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.78637445, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.390625, + "step": 10141, + "time_per_iteration": 3.65199613571167 + }, + { + "auxiliary_loss_clip": 0.01054371, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.01069069, + "balance_loss_mlp": 1.0160346, + "epoch": 0.609770028558545, + "flos": 19569306343680.0, + "grad_norm": 1.8116825012556448, + "language_loss": 0.7709986, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.79189277, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3828125, + "step": 10142, + "time_per_iteration": 2.3339269161224365 + }, + { + "auxiliary_loss_clip": 0.0105498, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.01644874, + "balance_loss_mlp": 1.01692009, + "epoch": 0.609830151811213, + "flos": 23948113063680.0, + "grad_norm": 1.6484057234219356, + "language_loss": 0.78430855, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.80528283, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38085938, + "step": 10143, + "time_per_iteration": 2.404829740524292 + }, + { + "auxiliary_loss_clip": 0.01054236, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.01385069, + "balance_loss_mlp": 1.01596713, + "epoch": 0.6098902750638809, + "flos": 16178820791040.0, + "grad_norm": 1.8533256669931164, + "language_loss": 0.76428032, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.78521854, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10144, + "time_per_iteration": 2.3285562992095947 + }, + { + "auxiliary_loss_clip": 0.01056201, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.01797342, + "balance_loss_mlp": 1.01676166, + "epoch": 0.609950398316549, + "flos": 44524769132160.0, + "grad_norm": 1.7557018421791746, + "language_loss": 0.74702156, + "learning_rate": 1.394498830235383e-06, + "loss": 0.76801878, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 10145, + "time_per_iteration": 3.9736719131469727 + }, + { + "auxiliary_loss_clip": 0.01053712, + "auxiliary_loss_mlp": 0.0103954, + "balance_loss_clip": 1.01542425, + "balance_loss_mlp": 1.01723123, + "epoch": 0.6100105215692169, + "flos": 23220627730560.0, + "grad_norm": 2.0027674695441697, + "language_loss": 0.7036798, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.7246123, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 10146, + "time_per_iteration": 3.8053507804870605 + }, + { + "auxiliary_loss_clip": 0.01052525, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.01123703, + "balance_loss_mlp": 1.01660931, + "epoch": 0.6100706448218849, + "flos": 15011674306560.0, + "grad_norm": 1.6418260738230064, + "language_loss": 0.78046823, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.80133045, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 10147, + "time_per_iteration": 2.3386895656585693 + }, + { + "auxiliary_loss_clip": 0.01054295, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.01220298, + "balance_loss_mlp": 1.01617718, + "epoch": 0.6101307680745528, + "flos": 19639132796160.0, + "grad_norm": 1.9619752613431187, + "language_loss": 0.79608566, + "learning_rate": 1.393385381096786e-06, + "loss": 0.81698608, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.38085938, + "step": 10148, + "time_per_iteration": 2.357222557067871 + }, + { + "auxiliary_loss_clip": 0.01058404, + "auxiliary_loss_mlp": 0.01044315, + "balance_loss_clip": 1.0158596, + "balance_loss_mlp": 1.01703727, + "epoch": 0.6101908913272208, + "flos": 29934251089920.0, + "grad_norm": 2.0238669635046516, + "language_loss": 0.551687, + "learning_rate": 1.39301427737093e-06, + "loss": 0.57271421, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.4140625, + "step": 10149, + "time_per_iteration": 2.4173364639282227 + }, + { + "auxiliary_loss_clip": 0.01053336, + "auxiliary_loss_mlp": 0.01042939, + "balance_loss_clip": 1.01784575, + "balance_loss_mlp": 1.01710916, + "epoch": 0.6102510145798887, + "flos": 21797567913600.0, + "grad_norm": 3.352150278897715, + "language_loss": 0.8120327, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.83299541, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 10150, + "time_per_iteration": 2.3728349208831787 + }, + { + "auxiliary_loss_clip": 0.01057943, + "auxiliary_loss_mlp": 0.01048915, + "balance_loss_clip": 1.02292728, + "balance_loss_mlp": 1.0188365, + "epoch": 0.6103111378325567, + "flos": 20705030737920.0, + "grad_norm": 1.6817292173346392, + "language_loss": 0.69949913, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.72056764, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 10151, + "time_per_iteration": 2.376608371734619 + }, + { + "auxiliary_loss_clip": 0.01054216, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.01117432, + "balance_loss_mlp": 1.01674056, + "epoch": 0.6103712610852247, + "flos": 29380528926720.0, + "grad_norm": 1.683685675848623, + "language_loss": 0.72168148, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.74256516, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.375, + "step": 10152, + "time_per_iteration": 2.4395201206207275 + }, + { + "auxiliary_loss_clip": 0.01056303, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.01585603, + "balance_loss_mlp": 1.01718307, + "epoch": 0.6104313843378927, + "flos": 20812004743680.0, + "grad_norm": 1.7474882844734831, + "language_loss": 0.79200125, + "learning_rate": 1.391530092777811e-06, + "loss": 0.81297946, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 10153, + "time_per_iteration": 3.7833759784698486 + }, + { + "auxiliary_loss_clip": 0.01055352, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.01412892, + "balance_loss_mlp": 1.01696491, + "epoch": 0.6104915075905607, + "flos": 26577247397760.0, + "grad_norm": 1.7556249128454096, + "language_loss": 0.80533963, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.82628453, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 10154, + "time_per_iteration": 2.3988234996795654 + }, + { + "auxiliary_loss_clip": 0.01054412, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.0140599, + "balance_loss_mlp": 1.01753998, + "epoch": 0.6105516308432286, + "flos": 23914631203200.0, + "grad_norm": 1.561216030409758, + "language_loss": 0.71023667, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.73115873, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 10155, + "time_per_iteration": 2.384814739227295 + }, + { + "auxiliary_loss_clip": 0.01056681, + "auxiliary_loss_mlp": 0.01048257, + "balance_loss_clip": 1.02018285, + "balance_loss_mlp": 1.0186193, + "epoch": 0.6106117540958966, + "flos": 31576006863360.0, + "grad_norm": 1.6748906020840442, + "language_loss": 0.72298396, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.7440334, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.38085938, + "step": 10156, + "time_per_iteration": 2.473940849304199 + }, + { + "auxiliary_loss_clip": 0.01053198, + "auxiliary_loss_mlp": 0.01045781, + "balance_loss_clip": 1.02097368, + "balance_loss_mlp": 1.01704359, + "epoch": 0.6106718773485645, + "flos": 19607187035520.0, + "grad_norm": 6.09325101433792, + "language_loss": 0.67993784, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.70092762, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 10157, + "time_per_iteration": 2.3599741458892822 + }, + { + "auxiliary_loss_clip": 0.0105382, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.01311183, + "balance_loss_mlp": 1.01578784, + "epoch": 0.6107320006012326, + "flos": 17123081955840.0, + "grad_norm": 2.0104429449004515, + "language_loss": 0.7380724, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.75899076, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 10158, + "time_per_iteration": 2.3594093322753906 + }, + { + "auxiliary_loss_clip": 0.01056364, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.01416445, + "balance_loss_mlp": 1.01749945, + "epoch": 0.6107921238539005, + "flos": 30147081937920.0, + "grad_norm": 1.5542448216428797, + "language_loss": 0.70227909, + "learning_rate": 1.389304508366635e-06, + "loss": 0.72323835, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38867188, + "step": 10159, + "time_per_iteration": 2.432460308074951 + }, + { + "auxiliary_loss_clip": 0.01055312, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.01002026, + "balance_loss_mlp": 1.01678276, + "epoch": 0.6108522471065685, + "flos": 18439342323840.0, + "grad_norm": 1.7240088646101073, + "language_loss": 0.79647923, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81738383, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 10160, + "time_per_iteration": 2.38443922996521 + }, + { + "auxiliary_loss_clip": 0.01008971, + "auxiliary_loss_mlp": 0.01009802, + "balance_loss_clip": 1.00744152, + "balance_loss_mlp": 1.00168073, + "epoch": 0.6109123703592364, + "flos": 64131814656000.0, + "grad_norm": 0.8302795922540579, + "language_loss": 0.61560953, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63579726, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07324219, + "step": 10161, + "time_per_iteration": 3.174032688140869 + }, + { + "auxiliary_loss_clip": 0.01056635, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_clip": 1.01724243, + "balance_loss_mlp": 1.01769304, + "epoch": 0.6109724936119044, + "flos": 20666800932480.0, + "grad_norm": 1.5863965532564583, + "language_loss": 0.77472609, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.79572046, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 10162, + "time_per_iteration": 2.374626398086548 + }, + { + "auxiliary_loss_clip": 0.0105515, + "auxiliary_loss_mlp": 0.01041392, + "balance_loss_clip": 1.01574993, + "balance_loss_mlp": 1.01734328, + "epoch": 0.6110326168645723, + "flos": 31350712659840.0, + "grad_norm": 2.500014481168638, + "language_loss": 0.72777152, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.74873698, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 10163, + "time_per_iteration": 2.471022605895996 + }, + { + "auxiliary_loss_clip": 0.01052608, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.01316738, + "balance_loss_mlp": 1.01606166, + "epoch": 0.6110927401172404, + "flos": 25002385522560.0, + "grad_norm": 1.8317847307123063, + "language_loss": 0.60648441, + "learning_rate": 1.387450491396625e-06, + "loss": 0.62736678, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36523438, + "step": 10164, + "time_per_iteration": 2.3980040550231934 + }, + { + "auxiliary_loss_clip": 0.010538, + "auxiliary_loss_mlp": 0.01043405, + "balance_loss_clip": 1.01949179, + "balance_loss_mlp": 1.0163908, + "epoch": 0.6111528633699083, + "flos": 26246934224640.0, + "grad_norm": 1.6860364501075091, + "language_loss": 0.76352775, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.78449982, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37304688, + "step": 10165, + "time_per_iteration": 2.3831772804260254 + }, + { + "auxiliary_loss_clip": 0.01054119, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.015136, + "balance_loss_mlp": 1.01777327, + "epoch": 0.6112129866225763, + "flos": 22381385535360.0, + "grad_norm": 1.534629945281974, + "language_loss": 0.80208755, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.82302582, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 10166, + "time_per_iteration": 2.3842854499816895 + }, + { + "auxiliary_loss_clip": 0.01055495, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.01199269, + "balance_loss_mlp": 1.01745558, + "epoch": 0.6112731098752443, + "flos": 25226737119360.0, + "grad_norm": 1.8347021853031908, + "language_loss": 0.69073033, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.71166354, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 10167, + "time_per_iteration": 2.3885860443115234 + }, + { + "auxiliary_loss_clip": 0.01054151, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.01427603, + "balance_loss_mlp": 1.01762319, + "epoch": 0.6113332331279122, + "flos": 22892060125440.0, + "grad_norm": 1.7883391119799896, + "language_loss": 0.80245984, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.82336146, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36523438, + "step": 10168, + "time_per_iteration": 2.3915746212005615 + }, + { + "auxiliary_loss_clip": 0.01058703, + "auxiliary_loss_mlp": 0.01049086, + "balance_loss_clip": 1.02134562, + "balance_loss_mlp": 1.01760066, + "epoch": 0.6113933563805802, + "flos": 18619459361280.0, + "grad_norm": 2.751229441709021, + "language_loss": 0.87331653, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.8943944, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.41015625, + "step": 10169, + "time_per_iteration": 2.3326714038848877 + }, + { + "auxiliary_loss_clip": 0.01053535, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.01040816, + "balance_loss_mlp": 1.01629817, + "epoch": 0.6114534796332481, + "flos": 41864631644160.0, + "grad_norm": 1.717906418074867, + "language_loss": 0.79408246, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81495261, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37109375, + "step": 10170, + "time_per_iteration": 2.5349748134613037 + }, + { + "auxiliary_loss_clip": 0.01057348, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_clip": 1.02035153, + "balance_loss_mlp": 1.01666176, + "epoch": 0.6115136028859162, + "flos": 21907369739520.0, + "grad_norm": 1.825873760285473, + "language_loss": 0.6976009, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71866417, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40625, + "step": 10171, + "time_per_iteration": 2.3608834743499756 + }, + { + "auxiliary_loss_clip": 0.01057816, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.01457417, + "balance_loss_mlp": 1.01757491, + "epoch": 0.6115737261385841, + "flos": 28803553931520.0, + "grad_norm": 1.6821312382291538, + "language_loss": 0.80177891, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.82279629, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.40234375, + "step": 10172, + "time_per_iteration": 2.424043655395508 + }, + { + "auxiliary_loss_clip": 0.01058011, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.01299238, + "balance_loss_mlp": 1.01785517, + "epoch": 0.6116338493912521, + "flos": 21250409086080.0, + "grad_norm": 3.9431915422615016, + "language_loss": 0.68149912, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.70247221, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.40234375, + "step": 10173, + "time_per_iteration": 2.36535382270813 + }, + { + "auxiliary_loss_clip": 0.01056654, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.01320553, + "balance_loss_mlp": 1.01741242, + "epoch": 0.61169397264392, + "flos": 17529226335360.0, + "grad_norm": 1.7880653718311157, + "language_loss": 0.56936991, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.59032089, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.39257812, + "step": 10174, + "time_per_iteration": 2.3660430908203125 + }, + { + "auxiliary_loss_clip": 0.0105774, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.0145098, + "balance_loss_mlp": 1.01729321, + "epoch": 0.611754095896588, + "flos": 23950417213440.0, + "grad_norm": 1.896923775863193, + "language_loss": 0.67183858, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.69283324, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 10175, + "time_per_iteration": 2.3844029903411865 + }, + { + "auxiliary_loss_clip": 0.01054156, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.01537299, + "balance_loss_mlp": 1.01597261, + "epoch": 0.6118142191492559, + "flos": 25993674155520.0, + "grad_norm": 1.908534130005781, + "language_loss": 0.84326243, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.8642047, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38085938, + "step": 10176, + "time_per_iteration": 2.427708387374878 + }, + { + "auxiliary_loss_clip": 0.01055338, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.0165664, + "balance_loss_mlp": 1.01626813, + "epoch": 0.611874342401924, + "flos": 24602176074240.0, + "grad_norm": 1.8423979073802392, + "language_loss": 0.78243244, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.80342239, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 10177, + "time_per_iteration": 2.376904249191284 + }, + { + "auxiliary_loss_clip": 0.01055135, + "auxiliary_loss_mlp": 0.01040226, + "balance_loss_clip": 1.01603818, + "balance_loss_mlp": 1.01671124, + "epoch": 0.6119344656545919, + "flos": 15886248664320.0, + "grad_norm": 1.9243683209452622, + "language_loss": 0.76282728, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.78378093, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38476562, + "step": 10178, + "time_per_iteration": 2.354708671569824 + }, + { + "auxiliary_loss_clip": 0.01056221, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.01583087, + "balance_loss_mlp": 1.01747489, + "epoch": 0.6119945889072599, + "flos": 21651805520640.0, + "grad_norm": 1.7692669478788137, + "language_loss": 0.68690848, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.70788276, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 10179, + "time_per_iteration": 2.359222888946533 + }, + { + "auxiliary_loss_clip": 0.01057435, + "auxiliary_loss_mlp": 0.01045283, + "balance_loss_clip": 1.02052319, + "balance_loss_mlp": 1.01838076, + "epoch": 0.6120547121599279, + "flos": 13771664081280.0, + "grad_norm": 1.738499797352307, + "language_loss": 0.84995198, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.87097919, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.390625, + "step": 10180, + "time_per_iteration": 3.562486171722412 + }, + { + "auxiliary_loss_clip": 0.0105449, + "auxiliary_loss_mlp": 0.01038721, + "balance_loss_clip": 1.01366353, + "balance_loss_mlp": 1.01689053, + "epoch": 0.6121148354125958, + "flos": 20078270277120.0, + "grad_norm": 1.6952667504803487, + "language_loss": 0.78417945, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.80511159, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 10181, + "time_per_iteration": 2.344153642654419 + }, + { + "auxiliary_loss_clip": 0.01056061, + "auxiliary_loss_mlp": 0.01040135, + "balance_loss_clip": 1.01481485, + "balance_loss_mlp": 1.01739192, + "epoch": 0.6121749586652638, + "flos": 13470713228160.0, + "grad_norm": 2.1728932977114934, + "language_loss": 0.81810594, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.83906788, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 10182, + "time_per_iteration": 2.322718858718872 + }, + { + "auxiliary_loss_clip": 0.01052974, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.01076102, + "balance_loss_mlp": 1.01730478, + "epoch": 0.6122350819179317, + "flos": 20119502459520.0, + "grad_norm": 2.0149375199125696, + "language_loss": 0.83989525, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.86075157, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35546875, + "step": 10183, + "time_per_iteration": 2.356590747833252 + }, + { + "auxiliary_loss_clip": 0.01008265, + "auxiliary_loss_mlp": 0.01003071, + "balance_loss_clip": 1.00050795, + "balance_loss_mlp": 1.0013411, + "epoch": 0.6122952051705998, + "flos": 65426115778560.0, + "grad_norm": 0.7051638764271655, + "language_loss": 0.62855649, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64866984, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.06933594, + "step": 10184, + "time_per_iteration": 3.1742677688598633 + }, + { + "auxiliary_loss_clip": 0.01056308, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.01563776, + "balance_loss_mlp": 1.01947689, + "epoch": 0.6123553284232677, + "flos": 20375206323840.0, + "grad_norm": 1.8000322856006856, + "language_loss": 0.83147037, + "learning_rate": 1.379669981812101e-06, + "loss": 0.85242462, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36914062, + "step": 10185, + "time_per_iteration": 3.80475115776062 + }, + { + "auxiliary_loss_clip": 0.01055704, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.01433229, + "balance_loss_mlp": 1.01691246, + "epoch": 0.6124154516759357, + "flos": 23986517425920.0, + "grad_norm": 1.9191027119604105, + "language_loss": 0.75684536, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.77781188, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 10186, + "time_per_iteration": 3.7379934787750244 + }, + { + "auxiliary_loss_clip": 0.01053502, + "auxiliary_loss_mlp": 0.0104161, + "balance_loss_clip": 1.01804256, + "balance_loss_mlp": 1.01620746, + "epoch": 0.6124755749286036, + "flos": 21467778410880.0, + "grad_norm": 1.6109793806709114, + "language_loss": 0.79559231, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.81654346, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37304688, + "step": 10187, + "time_per_iteration": 2.4205245971679688 + }, + { + "auxiliary_loss_clip": 0.01053315, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.01507473, + "balance_loss_mlp": 1.01641059, + "epoch": 0.6125356981812716, + "flos": 23878042231680.0, + "grad_norm": 1.7474831335627108, + "language_loss": 0.84508407, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.86599654, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36914062, + "step": 10188, + "time_per_iteration": 2.3839361667633057 + }, + { + "auxiliary_loss_clip": 0.01053581, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.01317739, + "balance_loss_mlp": 1.01585829, + "epoch": 0.6125958214339395, + "flos": 14424819396480.0, + "grad_norm": 1.77500660677378, + "language_loss": 0.7667526, + "learning_rate": 1.378189152155896e-06, + "loss": 0.78766084, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37695312, + "step": 10189, + "time_per_iteration": 2.3534929752349854 + }, + { + "auxiliary_loss_clip": 0.01053215, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.01361036, + "balance_loss_mlp": 1.0156424, + "epoch": 0.6126559446866076, + "flos": 23257949840640.0, + "grad_norm": 1.5653663069103256, + "language_loss": 0.75092119, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.77182186, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37695312, + "step": 10190, + "time_per_iteration": 2.3563454151153564 + }, + { + "auxiliary_loss_clip": 0.01055206, + "auxiliary_loss_mlp": 0.01039704, + "balance_loss_clip": 1.01536143, + "balance_loss_mlp": 1.017079, + "epoch": 0.6127160679392755, + "flos": 26863744947840.0, + "grad_norm": 1.5867625152382085, + "language_loss": 0.68825495, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70920408, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 10191, + "time_per_iteration": 2.410029411315918 + }, + { + "auxiliary_loss_clip": 0.01053726, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.02035809, + "balance_loss_mlp": 1.01559806, + "epoch": 0.6127761911919435, + "flos": 26395210235520.0, + "grad_norm": 2.1274911362553914, + "language_loss": 0.75510478, + "learning_rate": 1.377078777445467e-06, + "loss": 0.77609193, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38085938, + "step": 10192, + "time_per_iteration": 2.39237904548645 + }, + { + "auxiliary_loss_clip": 0.01053261, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.01450706, + "balance_loss_mlp": 1.01705575, + "epoch": 0.6128363144446115, + "flos": 22633737909120.0, + "grad_norm": 1.7675420024990942, + "language_loss": 0.85267615, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.87359035, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 10193, + "time_per_iteration": 3.9120144844055176 + }, + { + "auxiliary_loss_clip": 0.01053981, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.01170492, + "balance_loss_mlp": 1.01621807, + "epoch": 0.6128964376972794, + "flos": 26757888105600.0, + "grad_norm": 2.240736804769411, + "language_loss": 0.70975566, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.73066521, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 10194, + "time_per_iteration": 2.401890754699707 + }, + { + "auxiliary_loss_clip": 0.01008405, + "auxiliary_loss_mlp": 0.01008209, + "balance_loss_clip": 1.00581288, + "balance_loss_mlp": 1.00138462, + "epoch": 0.6129565609499474, + "flos": 65565000633600.0, + "grad_norm": 0.8289476287768276, + "language_loss": 0.58703166, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60719776, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.0703125, + "step": 10195, + "time_per_iteration": 2.780883550643921 + }, + { + "auxiliary_loss_clip": 0.01056731, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.01446915, + "balance_loss_mlp": 1.01794362, + "epoch": 0.6130166842026153, + "flos": 16361172155520.0, + "grad_norm": 2.242929368284764, + "language_loss": 0.70608711, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.7270689, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 10196, + "time_per_iteration": 2.347914695739746 + }, + { + "auxiliary_loss_clip": 0.01054594, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.01619744, + "balance_loss_mlp": 1.01733232, + "epoch": 0.6130768074552834, + "flos": 23651526130560.0, + "grad_norm": 1.8225504378453137, + "language_loss": 0.71979177, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.74073696, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37304688, + "step": 10197, + "time_per_iteration": 2.3711631298065186 + }, + { + "auxiliary_loss_clip": 0.01055412, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.0205543, + "balance_loss_mlp": 1.01716948, + "epoch": 0.6131369307079513, + "flos": 20046429250560.0, + "grad_norm": 1.8377312522588325, + "language_loss": 0.79929185, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.82029772, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 10198, + "time_per_iteration": 2.358750104904175 + }, + { + "auxiliary_loss_clip": 0.01055631, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.0163151, + "balance_loss_mlp": 1.016909, + "epoch": 0.6131970539606193, + "flos": 22671129841920.0, + "grad_norm": 1.6489714895117142, + "language_loss": 0.75058103, + "learning_rate": 1.374488730519181e-06, + "loss": 0.77154899, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 10199, + "time_per_iteration": 2.397869110107422 + }, + { + "auxiliary_loss_clip": 0.01055961, + "auxiliary_loss_mlp": 0.01042451, + "balance_loss_clip": 1.01480651, + "balance_loss_mlp": 1.01601136, + "epoch": 0.6132571772132872, + "flos": 26869679879040.0, + "grad_norm": 1.6034883609832455, + "language_loss": 0.6379528, + "learning_rate": 1.374118818580993e-06, + "loss": 0.65893686, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 10200, + "time_per_iteration": 2.404062032699585 + }, + { + "auxiliary_loss_clip": 0.01052475, + "auxiliary_loss_mlp": 0.01043518, + "balance_loss_clip": 1.02052224, + "balance_loss_mlp": 1.01610017, + "epoch": 0.6133173004659552, + "flos": 22891571366400.0, + "grad_norm": 1.8212952594075678, + "language_loss": 0.69763029, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.71859014, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 10201, + "time_per_iteration": 2.3810291290283203 + }, + { + "auxiliary_loss_clip": 0.01052275, + "auxiliary_loss_mlp": 0.01037043, + "balance_loss_clip": 1.01212859, + "balance_loss_mlp": 1.01483464, + "epoch": 0.6133774237186231, + "flos": 20484065543040.0, + "grad_norm": 1.8776983272142918, + "language_loss": 0.85426438, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.87515759, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 10202, + "time_per_iteration": 2.3629202842712402 + }, + { + "auxiliary_loss_clip": 0.01008047, + "auxiliary_loss_mlp": 0.01004299, + "balance_loss_clip": 1.00198603, + "balance_loss_mlp": 1.00095081, + "epoch": 0.6134375469712912, + "flos": 69409635552000.0, + "grad_norm": 0.869363193653156, + "language_loss": 0.67108035, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69120371, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.07128906, + "step": 10203, + "time_per_iteration": 3.034778356552124 + }, + { + "auxiliary_loss_clip": 0.01053416, + "auxiliary_loss_mlp": 0.01038983, + "balance_loss_clip": 1.01500964, + "balance_loss_mlp": 1.01595151, + "epoch": 0.6134976702239591, + "flos": 41279941238400.0, + "grad_norm": 1.748467560663141, + "language_loss": 0.62548184, + "learning_rate": 1.37263940830327e-06, + "loss": 0.64640582, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 10204, + "time_per_iteration": 2.545292377471924 + }, + { + "auxiliary_loss_clip": 0.0105159, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.01443672, + "balance_loss_mlp": 1.01575184, + "epoch": 0.6135577934766271, + "flos": 22345494791040.0, + "grad_norm": 1.8412972155130083, + "language_loss": 0.73780286, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.75869679, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 10205, + "time_per_iteration": 2.3859293460845947 + }, + { + "auxiliary_loss_clip": 0.01052947, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.01017237, + "balance_loss_mlp": 1.01639128, + "epoch": 0.6136179167292951, + "flos": 23727147868800.0, + "grad_norm": 2.065818482946654, + "language_loss": 0.76636755, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78724623, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 10206, + "time_per_iteration": 2.37298583984375 + }, + { + "auxiliary_loss_clip": 0.01054342, + "auxiliary_loss_mlp": 0.01037806, + "balance_loss_clip": 1.01178229, + "balance_loss_mlp": 1.01594996, + "epoch": 0.613678039981963, + "flos": 26023664880000.0, + "grad_norm": 2.1593557529822713, + "language_loss": 0.76716059, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.78808212, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10207, + "time_per_iteration": 2.4210500717163086 + }, + { + "auxiliary_loss_clip": 0.01054399, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.0130415, + "balance_loss_mlp": 1.01706111, + "epoch": 0.613738163234631, + "flos": 9859437037440.0, + "grad_norm": 2.2259107365022976, + "language_loss": 0.83611965, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.85703135, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37304688, + "step": 10208, + "time_per_iteration": 2.3206613063812256 + }, + { + "auxiliary_loss_clip": 0.01056267, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01374316, + "balance_loss_mlp": 1.01740086, + "epoch": 0.613798286487299, + "flos": 33180684906240.0, + "grad_norm": 1.7877839453468847, + "language_loss": 0.74147761, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.76243091, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38867188, + "step": 10209, + "time_per_iteration": 2.470982074737549 + }, + { + "auxiliary_loss_clip": 0.0105376, + "auxiliary_loss_mlp": 0.01042862, + "balance_loss_clip": 1.0190078, + "balance_loss_mlp": 1.01730633, + "epoch": 0.613858409739967, + "flos": 25626772010880.0, + "grad_norm": 1.7012824297373796, + "language_loss": 0.75243425, + "learning_rate": 1.37042100685438e-06, + "loss": 0.77340043, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 10210, + "time_per_iteration": 2.3884389400482178 + }, + { + "auxiliary_loss_clip": 0.01008892, + "auxiliary_loss_mlp": 0.01005425, + "balance_loss_clip": 1.00276709, + "balance_loss_mlp": 1.00178123, + "epoch": 0.6139185329926349, + "flos": 67188810101760.0, + "grad_norm": 0.869753624934555, + "language_loss": 0.65221488, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67235804, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.07128906, + "step": 10211, + "time_per_iteration": 3.1526570320129395 + }, + { + "auxiliary_loss_clip": 0.01054701, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.01602137, + "balance_loss_mlp": 1.01760817, + "epoch": 0.6139786562453029, + "flos": 21542562276480.0, + "grad_norm": 1.7289763263905074, + "language_loss": 0.7687909, + "learning_rate": 1.369681730544801e-06, + "loss": 0.78973877, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 10212, + "time_per_iteration": 2.399231433868408 + }, + { + "auxiliary_loss_clip": 0.01055044, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.01602054, + "balance_loss_mlp": 1.01703811, + "epoch": 0.6140387794979708, + "flos": 26067271034880.0, + "grad_norm": 1.6016841733824052, + "language_loss": 0.7557143, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.7766664, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38085938, + "step": 10213, + "time_per_iteration": 2.414647340774536 + }, + { + "auxiliary_loss_clip": 0.01056253, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_clip": 1.0151248, + "balance_loss_mlp": 1.01714766, + "epoch": 0.6140989027506388, + "flos": 23693526362880.0, + "grad_norm": 1.5425392749307867, + "language_loss": 0.74246567, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.76343286, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 10214, + "time_per_iteration": 2.3918840885162354 + }, + { + "auxiliary_loss_clip": 0.01054482, + "auxiliary_loss_mlp": 0.01038777, + "balance_loss_clip": 1.0130868, + "balance_loss_mlp": 1.01636612, + "epoch": 0.6141590260033067, + "flos": 22230770463360.0, + "grad_norm": 1.6837767344832155, + "language_loss": 0.75449544, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.775428, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10215, + "time_per_iteration": 2.3961873054504395 + }, + { + "auxiliary_loss_clip": 0.01053219, + "auxiliary_loss_mlp": 0.0103921, + "balance_loss_clip": 1.01521349, + "balance_loss_mlp": 1.01715159, + "epoch": 0.6142191492559748, + "flos": 23870710846080.0, + "grad_norm": 1.6665008478300087, + "language_loss": 0.79640222, + "learning_rate": 1.368203464858542e-06, + "loss": 0.81732649, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 10216, + "time_per_iteration": 2.3681938648223877 + }, + { + "auxiliary_loss_clip": 0.01053691, + "auxiliary_loss_mlp": 0.01038531, + "balance_loss_clip": 1.01279366, + "balance_loss_mlp": 1.01687467, + "epoch": 0.6142792725086427, + "flos": 15041804676480.0, + "grad_norm": 1.9849839900093875, + "language_loss": 0.80889416, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.8298164, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 10217, + "time_per_iteration": 2.327357053756714 + }, + { + "auxiliary_loss_clip": 0.01055229, + "auxiliary_loss_mlp": 0.01040568, + "balance_loss_clip": 1.01465154, + "balance_loss_mlp": 1.01712871, + "epoch": 0.6143393957613107, + "flos": 23329836063360.0, + "grad_norm": 2.2239238124845904, + "language_loss": 0.79952163, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.82047963, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 10218, + "time_per_iteration": 2.410734176635742 + }, + { + "auxiliary_loss_clip": 0.01053551, + "auxiliary_loss_mlp": 0.01038894, + "balance_loss_clip": 1.01262021, + "balance_loss_mlp": 1.01672411, + "epoch": 0.6143995190139786, + "flos": 20116150968960.0, + "grad_norm": 1.5550103081969515, + "language_loss": 0.82754916, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84847367, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3671875, + "step": 10219, + "time_per_iteration": 2.369307279586792 + }, + { + "auxiliary_loss_clip": 0.01055387, + "auxiliary_loss_mlp": 0.01039185, + "balance_loss_clip": 1.01311409, + "balance_loss_mlp": 1.01677132, + "epoch": 0.6144596422666466, + "flos": 42301918823040.0, + "grad_norm": 2.352525811538234, + "language_loss": 0.68480939, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.70575511, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 10220, + "time_per_iteration": 3.775808811187744 + }, + { + "auxiliary_loss_clip": 0.01052776, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.00900102, + "balance_loss_mlp": 1.01596713, + "epoch": 0.6145197655193146, + "flos": 21572727557760.0, + "grad_norm": 1.8490856269044695, + "language_loss": 0.73503464, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.75588727, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 10221, + "time_per_iteration": 2.3571949005126953 + }, + { + "auxiliary_loss_clip": 0.010526, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.0147922, + "balance_loss_mlp": 1.01622057, + "epoch": 0.6145798887719826, + "flos": 21470012737920.0, + "grad_norm": 1.7346930662986375, + "language_loss": 0.80517596, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.82606173, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.36328125, + "step": 10222, + "time_per_iteration": 2.389357089996338 + }, + { + "auxiliary_loss_clip": 0.01055023, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01338708, + "balance_loss_mlp": 1.01711321, + "epoch": 0.6146400120246506, + "flos": 20775974353920.0, + "grad_norm": 1.7454310919919303, + "language_loss": 0.77426469, + "learning_rate": 1.365617422821788e-06, + "loss": 0.79518437, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37890625, + "step": 10223, + "time_per_iteration": 2.3504133224487305 + }, + { + "auxiliary_loss_clip": 0.01053038, + "auxiliary_loss_mlp": 0.0103977, + "balance_loss_clip": 1.01667953, + "balance_loss_mlp": 1.01713014, + "epoch": 0.6147001352773185, + "flos": 13880453477760.0, + "grad_norm": 2.0809952392060835, + "language_loss": 0.79173291, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.81266105, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 10224, + "time_per_iteration": 2.3573110103607178 + }, + { + "auxiliary_loss_clip": 0.01051518, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01433206, + "balance_loss_mlp": 1.01594579, + "epoch": 0.6147602585299865, + "flos": 56639770289280.0, + "grad_norm": 1.2197501730766158, + "language_loss": 0.66887689, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68975532, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35546875, + "step": 10225, + "time_per_iteration": 5.372615098953247 + }, + { + "auxiliary_loss_clip": 0.01054586, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.01230264, + "balance_loss_mlp": 1.01596236, + "epoch": 0.6148203817826544, + "flos": 32815842531840.0, + "grad_norm": 2.2101406814378706, + "language_loss": 0.64934772, + "learning_rate": 1.364509479649357e-06, + "loss": 0.6702677, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 10226, + "time_per_iteration": 2.4486351013183594 + }, + { + "auxiliary_loss_clip": 0.01055686, + "auxiliary_loss_mlp": 0.01044571, + "balance_loss_clip": 1.01890564, + "balance_loss_mlp": 1.01733434, + "epoch": 0.6148805050353224, + "flos": 18331076597760.0, + "grad_norm": 1.87647394166594, + "language_loss": 0.76754045, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.78854311, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10227, + "time_per_iteration": 2.4433724880218506 + }, + { + "auxiliary_loss_clip": 0.01055443, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.01587689, + "balance_loss_mlp": 1.01700306, + "epoch": 0.6149406282879903, + "flos": 14063119044480.0, + "grad_norm": 1.9352413361730647, + "language_loss": 0.63466549, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.65563786, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38476562, + "step": 10228, + "time_per_iteration": 2.3503243923187256 + }, + { + "auxiliary_loss_clip": 0.01052764, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.01356089, + "balance_loss_mlp": 1.01667464, + "epoch": 0.6150007515406584, + "flos": 25189065895680.0, + "grad_norm": 1.3449134419844209, + "language_loss": 0.752653, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.77355295, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 10229, + "time_per_iteration": 2.3935534954071045 + }, + { + "auxiliary_loss_clip": 0.01055503, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.01478243, + "balance_loss_mlp": 1.01802897, + "epoch": 0.6150608747933263, + "flos": 21944168179200.0, + "grad_norm": 1.86963620302402, + "language_loss": 0.78970277, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.81066138, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 10230, + "time_per_iteration": 2.3655385971069336 + }, + { + "auxiliary_loss_clip": 0.01054341, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.01646745, + "balance_loss_mlp": 1.01662731, + "epoch": 0.6151209980459943, + "flos": 30116148606720.0, + "grad_norm": 1.561731816668647, + "language_loss": 0.7349093, + "learning_rate": 1.36266338983927e-06, + "loss": 0.7558741, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 10231, + "time_per_iteration": 2.434199571609497 + }, + { + "auxiliary_loss_clip": 0.01055955, + "auxiliary_loss_mlp": 0.01040519, + "balance_loss_clip": 1.01605725, + "balance_loss_mlp": 1.01745749, + "epoch": 0.6151811212986622, + "flos": 30007045008000.0, + "grad_norm": 1.5908896989075247, + "language_loss": 0.71306002, + "learning_rate": 1.362294244324858e-06, + "loss": 0.73402476, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 10232, + "time_per_iteration": 2.442121982574463 + }, + { + "auxiliary_loss_clip": 0.01052635, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.01115084, + "balance_loss_mlp": 1.01648402, + "epoch": 0.6152412445513302, + "flos": 18872091025920.0, + "grad_norm": 2.0172774399317412, + "language_loss": 0.93094569, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.95181507, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 10233, + "time_per_iteration": 3.7522194385528564 + }, + { + "auxiliary_loss_clip": 0.010521, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.01888597, + "balance_loss_mlp": 1.01619315, + "epoch": 0.6153013678039982, + "flos": 25702393749120.0, + "grad_norm": 1.701100955987421, + "language_loss": 0.71901029, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73994738, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 10234, + "time_per_iteration": 2.4451911449432373 + }, + { + "auxiliary_loss_clip": 0.01054427, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.01461971, + "balance_loss_mlp": 1.0161525, + "epoch": 0.6153614910566662, + "flos": 28509061680000.0, + "grad_norm": 2.4922907446166183, + "language_loss": 0.67438912, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69534868, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3828125, + "step": 10235, + "time_per_iteration": 2.4527862071990967 + }, + { + "auxiliary_loss_clip": 0.01056044, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.01501679, + "balance_loss_mlp": 1.01713443, + "epoch": 0.6154216143093342, + "flos": 23548671665280.0, + "grad_norm": 1.9439115783916816, + "language_loss": 0.82019401, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.84115052, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38867188, + "step": 10236, + "time_per_iteration": 2.384989023208618 + }, + { + "auxiliary_loss_clip": 0.01054426, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.01341081, + "balance_loss_mlp": 1.01611996, + "epoch": 0.6154817375620021, + "flos": 22746961048320.0, + "grad_norm": 1.3718782843607344, + "language_loss": 0.81386501, + "learning_rate": 1.360448879760721e-06, + "loss": 0.83479202, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3828125, + "step": 10237, + "time_per_iteration": 2.3981287479400635 + }, + { + "auxiliary_loss_clip": 0.01055235, + "auxiliary_loss_mlp": 0.01041564, + "balance_loss_clip": 1.01694679, + "balance_loss_mlp": 1.0171144, + "epoch": 0.6155418608146701, + "flos": 27161728335360.0, + "grad_norm": 1.6604622607031265, + "language_loss": 0.77474344, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.7957114, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38085938, + "step": 10238, + "time_per_iteration": 2.4053943157196045 + }, + { + "auxiliary_loss_clip": 0.01008751, + "auxiliary_loss_mlp": 0.0100629, + "balance_loss_clip": 1.00360799, + "balance_loss_mlp": 1.00174809, + "epoch": 0.615601984067338, + "flos": 68808257516160.0, + "grad_norm": 0.7576818578528131, + "language_loss": 0.57717693, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59732729, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.02685547, + "router_z_loss_mlp": 0.0703125, + "step": 10239, + "time_per_iteration": 2.9996843338012695 + }, + { + "auxiliary_loss_clip": 0.01054205, + "auxiliary_loss_mlp": 0.01039689, + "balance_loss_clip": 1.01348722, + "balance_loss_mlp": 1.01677489, + "epoch": 0.615662107320006, + "flos": 15516413965440.0, + "grad_norm": 1.8405235236782518, + "language_loss": 0.79239941, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.8133384, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 10240, + "time_per_iteration": 2.3343698978424072 + }, + { + "auxiliary_loss_clip": 0.01057665, + "auxiliary_loss_mlp": 0.01043812, + "balance_loss_clip": 1.01857579, + "balance_loss_mlp": 1.01912189, + "epoch": 0.615722230572674, + "flos": 21062786106240.0, + "grad_norm": 3.456049119748707, + "language_loss": 0.73933661, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.76035136, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 10241, + "time_per_iteration": 2.3832666873931885 + }, + { + "auxiliary_loss_clip": 0.01054275, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.01340795, + "balance_loss_mlp": 1.0171051, + "epoch": 0.615782353825342, + "flos": 23255715513600.0, + "grad_norm": 1.6603409239949884, + "language_loss": 0.72514284, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74605596, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 10242, + "time_per_iteration": 2.3843417167663574 + }, + { + "auxiliary_loss_clip": 0.01054158, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.01356947, + "balance_loss_mlp": 1.01710916, + "epoch": 0.6158424770780099, + "flos": 21102901125120.0, + "grad_norm": 2.1122451554037815, + "language_loss": 0.7303412, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.75125515, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 10243, + "time_per_iteration": 2.371317148208618 + }, + { + "auxiliary_loss_clip": 0.01008589, + "auxiliary_loss_mlp": 0.01003141, + "balance_loss_clip": 1.00053072, + "balance_loss_mlp": 1.00142574, + "epoch": 0.6159026003306779, + "flos": 70329596544000.0, + "grad_norm": 0.7597256756502746, + "language_loss": 0.56890279, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58902007, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.07128906, + "step": 10244, + "time_per_iteration": 3.06811785697937 + }, + { + "auxiliary_loss_clip": 0.01053841, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.01279724, + "balance_loss_mlp": 1.01576865, + "epoch": 0.6159627235833458, + "flos": 33872977722240.0, + "grad_norm": 2.0228981835662245, + "language_loss": 0.64571655, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.66664851, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38085938, + "step": 10245, + "time_per_iteration": 2.494758367538452 + }, + { + "auxiliary_loss_clip": 0.01052912, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.01297283, + "balance_loss_mlp": 1.01661897, + "epoch": 0.6160228468360138, + "flos": 26574314843520.0, + "grad_norm": 2.440600416961405, + "language_loss": 0.80311692, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.82402492, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 10246, + "time_per_iteration": 2.3971683979034424 + }, + { + "auxiliary_loss_clip": 0.01057691, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.01842451, + "balance_loss_mlp": 1.01804972, + "epoch": 0.6160829700886818, + "flos": 17192559294720.0, + "grad_norm": 2.613669713426707, + "language_loss": 0.88897514, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.91001618, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.39648438, + "step": 10247, + "time_per_iteration": 2.3688583374023438 + }, + { + "auxiliary_loss_clip": 0.01056096, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.01455772, + "balance_loss_mlp": 1.01767635, + "epoch": 0.6161430933413498, + "flos": 23622408190080.0, + "grad_norm": 1.6465646638280664, + "language_loss": 0.81161904, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.83257306, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38476562, + "step": 10248, + "time_per_iteration": 2.381340980529785 + }, + { + "auxiliary_loss_clip": 0.01053012, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.01216912, + "balance_loss_mlp": 1.01630378, + "epoch": 0.6162032165940178, + "flos": 23001338280960.0, + "grad_norm": 1.9644989263730592, + "language_loss": 0.88374931, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.90462983, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 10249, + "time_per_iteration": 2.4324100017547607 + }, + { + "auxiliary_loss_clip": 0.01055624, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.01279712, + "balance_loss_mlp": 1.01790881, + "epoch": 0.6162633398466857, + "flos": 39420397203840.0, + "grad_norm": 2.3715895951350783, + "language_loss": 0.70475543, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.72570646, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.37695312, + "step": 10250, + "time_per_iteration": 2.512131690979004 + }, + { + "auxiliary_loss_clip": 0.01050222, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.00799632, + "balance_loss_mlp": 1.01568758, + "epoch": 0.6163234630993537, + "flos": 19243671292800.0, + "grad_norm": 1.9641841445918256, + "language_loss": 0.74353087, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76433742, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 10251, + "time_per_iteration": 2.35385799407959 + }, + { + "auxiliary_loss_clip": 0.01053859, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.01347899, + "balance_loss_mlp": 1.0167402, + "epoch": 0.6163835863520216, + "flos": 15960857973120.0, + "grad_norm": 2.2412589181805953, + "language_loss": 0.69304579, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.71396506, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 10252, + "time_per_iteration": 2.3457908630371094 + }, + { + "auxiliary_loss_clip": 0.01009027, + "auxiliary_loss_mlp": 0.0100345, + "balance_loss_clip": 1.00110197, + "balance_loss_mlp": 1.00162506, + "epoch": 0.6164437096046896, + "flos": 68100322544640.0, + "grad_norm": 0.8870067258278813, + "language_loss": 0.58072412, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.60084891, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.07421875, + "step": 10253, + "time_per_iteration": 3.041614294052124 + }, + { + "auxiliary_loss_clip": 0.01054579, + "auxiliary_loss_mlp": 0.01038814, + "balance_loss_clip": 1.01275516, + "balance_loss_mlp": 1.01621079, + "epoch": 0.6165038328573575, + "flos": 21360141089280.0, + "grad_norm": 1.6602442248727862, + "language_loss": 0.80623412, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.82716799, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10254, + "time_per_iteration": 2.37368106842041 + }, + { + "auxiliary_loss_clip": 0.01056998, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.01435339, + "balance_loss_mlp": 1.01669383, + "epoch": 0.6165639561100256, + "flos": 21101015911680.0, + "grad_norm": 1.831894490075482, + "language_loss": 0.81658947, + "learning_rate": 1.353810600008846e-06, + "loss": 0.83755958, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 10255, + "time_per_iteration": 2.4080846309661865 + }, + { + "auxiliary_loss_clip": 0.0105654, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.01153016, + "balance_loss_mlp": 1.01723945, + "epoch": 0.6166240793626935, + "flos": 25337341906560.0, + "grad_norm": 1.936676156350165, + "language_loss": 0.67137593, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.69234264, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39257812, + "step": 10256, + "time_per_iteration": 2.399456739425659 + }, + { + "auxiliary_loss_clip": 0.01053975, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.0124892, + "balance_loss_mlp": 1.01733899, + "epoch": 0.6166842026153615, + "flos": 19681621787520.0, + "grad_norm": 2.0517921745849272, + "language_loss": 0.74059153, + "learning_rate": 1.353073501949825e-06, + "loss": 0.76150471, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36523438, + "step": 10257, + "time_per_iteration": 2.389092206954956 + }, + { + "auxiliary_loss_clip": 0.01056466, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.01079535, + "balance_loss_mlp": 1.01806748, + "epoch": 0.6167443258680294, + "flos": 19317337994880.0, + "grad_norm": 2.172536155293353, + "language_loss": 0.73687088, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.75779325, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 10258, + "time_per_iteration": 2.3538520336151123 + }, + { + "auxiliary_loss_clip": 0.01054878, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.01027846, + "balance_loss_mlp": 1.01679492, + "epoch": 0.6168044491206974, + "flos": 25264059229440.0, + "grad_norm": 10.557538578325099, + "language_loss": 0.67084986, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.69175309, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 10259, + "time_per_iteration": 3.726186990737915 + }, + { + "auxiliary_loss_clip": 0.01054227, + "auxiliary_loss_mlp": 0.01042188, + "balance_loss_clip": 1.01640296, + "balance_loss_mlp": 1.0171504, + "epoch": 0.6168645723733654, + "flos": 13219198727040.0, + "grad_norm": 2.2075743277811752, + "language_loss": 0.72696292, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.74792713, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 10260, + "time_per_iteration": 2.3431553840637207 + }, + { + "auxiliary_loss_clip": 0.01056651, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.01605713, + "balance_loss_mlp": 1.01725161, + "epoch": 0.6169246956260334, + "flos": 26650809365760.0, + "grad_norm": 1.947618265433298, + "language_loss": 0.69551229, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.71653259, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.39453125, + "step": 10261, + "time_per_iteration": 2.4185078144073486 + }, + { + "auxiliary_loss_clip": 0.01053805, + "auxiliary_loss_mlp": 0.01040657, + "balance_loss_clip": 1.01557577, + "balance_loss_mlp": 1.01586092, + "epoch": 0.6169848188787014, + "flos": 23147310142080.0, + "grad_norm": 1.756720759065761, + "language_loss": 0.72147393, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.74241859, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 10262, + "time_per_iteration": 2.3805980682373047 + }, + { + "auxiliary_loss_clip": 0.01054027, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.01376319, + "balance_loss_mlp": 1.01614368, + "epoch": 0.6170449421313693, + "flos": 23330778670080.0, + "grad_norm": 1.7260363041589926, + "language_loss": 0.71075237, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.73169541, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 10263, + "time_per_iteration": 2.407578706741333 + }, + { + "auxiliary_loss_clip": 0.01055583, + "auxiliary_loss_mlp": 0.01035875, + "balance_loss_clip": 1.01075768, + "balance_loss_mlp": 1.01717091, + "epoch": 0.6171050653840373, + "flos": 15850707033600.0, + "grad_norm": 2.685011693283178, + "language_loss": 0.77635193, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.79726648, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38476562, + "step": 10264, + "time_per_iteration": 3.785489320755005 + }, + { + "auxiliary_loss_clip": 0.01053922, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.01134551, + "balance_loss_mlp": 1.01663923, + "epoch": 0.6171651886367052, + "flos": 20044544037120.0, + "grad_norm": 2.1079325188530573, + "language_loss": 0.85828745, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87919688, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 10265, + "time_per_iteration": 3.7775566577911377 + }, + { + "auxiliary_loss_clip": 0.01054652, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.01150584, + "balance_loss_mlp": 1.01765394, + "epoch": 0.6172253118893732, + "flos": 26431485004800.0, + "grad_norm": 1.7396464251862396, + "language_loss": 0.65309322, + "learning_rate": 1.349757776608153e-06, + "loss": 0.67401308, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 10266, + "time_per_iteration": 2.4222776889801025 + }, + { + "auxiliary_loss_clip": 0.01053917, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.01222086, + "balance_loss_mlp": 1.01579821, + "epoch": 0.6172854351420412, + "flos": 22631922518400.0, + "grad_norm": 1.6489298152757297, + "language_loss": 0.7649281, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.78584063, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 10267, + "time_per_iteration": 2.362180471420288 + }, + { + "auxiliary_loss_clip": 0.01054582, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.01369798, + "balance_loss_mlp": 1.01560068, + "epoch": 0.6173455583947092, + "flos": 21211934901120.0, + "grad_norm": 1.8582415292749932, + "language_loss": 0.76160079, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.78256214, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.390625, + "step": 10268, + "time_per_iteration": 2.3989670276641846 + }, + { + "auxiliary_loss_clip": 0.01056129, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.01425314, + "balance_loss_mlp": 1.01649904, + "epoch": 0.6174056816473771, + "flos": 19499270423040.0, + "grad_norm": 1.7775074809634057, + "language_loss": 0.76622427, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.78718972, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 10269, + "time_per_iteration": 2.3619232177734375 + }, + { + "auxiliary_loss_clip": 0.01052151, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.01100588, + "balance_loss_mlp": 1.01525557, + "epoch": 0.6174658049000451, + "flos": 15996434515200.0, + "grad_norm": 2.0086018334237483, + "language_loss": 0.77202344, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.79291201, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36914062, + "step": 10270, + "time_per_iteration": 2.363550901412964 + }, + { + "auxiliary_loss_clip": 0.01055738, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.0147872, + "balance_loss_mlp": 1.0177145, + "epoch": 0.617525928152713, + "flos": 21902935996800.0, + "grad_norm": 2.926390188583105, + "language_loss": 0.83611596, + "learning_rate": 1.347916569325736e-06, + "loss": 0.85707688, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 10271, + "time_per_iteration": 2.3760952949523926 + }, + { + "auxiliary_loss_clip": 0.0105335, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.01553941, + "balance_loss_mlp": 1.01665282, + "epoch": 0.617586051405381, + "flos": 21104891072640.0, + "grad_norm": 1.7458202842304997, + "language_loss": 0.77960277, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.80053532, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 10272, + "time_per_iteration": 2.3724489212036133 + }, + { + "auxiliary_loss_clip": 0.01008822, + "auxiliary_loss_mlp": 0.01004991, + "balance_loss_clip": 1.00239205, + "balance_loss_mlp": 1.00173926, + "epoch": 0.617646174658049, + "flos": 58607717829120.0, + "grad_norm": 0.8920638960036812, + "language_loss": 0.59255981, + "learning_rate": 1.347180259404513e-06, + "loss": 0.6126979, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.07128906, + "step": 10273, + "time_per_iteration": 4.353146076202393 + }, + { + "auxiliary_loss_clip": 0.01052657, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.01006174, + "balance_loss_mlp": 1.01503253, + "epoch": 0.617706297910717, + "flos": 13877904948480.0, + "grad_norm": 2.7652999626952526, + "language_loss": 0.73832226, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.7592079, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 10274, + "time_per_iteration": 2.339550495147705 + }, + { + "auxiliary_loss_clip": 0.01053843, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.01127064, + "balance_loss_mlp": 1.01650143, + "epoch": 0.617766421163385, + "flos": 19207431434880.0, + "grad_norm": 1.7187404709621221, + "language_loss": 0.79134226, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.81224436, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 10275, + "time_per_iteration": 2.3611490726470947 + }, + { + "auxiliary_loss_clip": 0.01054217, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.01438999, + "balance_loss_mlp": 1.01645803, + "epoch": 0.6178265444160529, + "flos": 22564854063360.0, + "grad_norm": 1.6553921392704254, + "language_loss": 0.80571318, + "learning_rate": 1.346075980219998e-06, + "loss": 0.82664216, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37695312, + "step": 10276, + "time_per_iteration": 2.409409761428833 + }, + { + "auxiliary_loss_clip": 0.01054935, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.01930356, + "balance_loss_mlp": 1.01654732, + "epoch": 0.6178866676687209, + "flos": 11983482599040.0, + "grad_norm": 2.8905722807712824, + "language_loss": 0.82108289, + "learning_rate": 1.345707936733612e-06, + "loss": 0.8420859, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10277, + "time_per_iteration": 2.3472745418548584 + }, + { + "auxiliary_loss_clip": 0.01056096, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.01441121, + "balance_loss_mlp": 1.01728797, + "epoch": 0.6179467909213888, + "flos": 20990585681280.0, + "grad_norm": 1.661407368833782, + "language_loss": 0.82660329, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.84756595, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 10278, + "time_per_iteration": 2.366598606109619 + }, + { + "auxiliary_loss_clip": 0.01054881, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.01285315, + "balance_loss_mlp": 1.01668274, + "epoch": 0.6180069141740568, + "flos": 25336922970240.0, + "grad_norm": 1.5563691517418181, + "language_loss": 0.74772286, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.76865506, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 10279, + "time_per_iteration": 2.389824390411377 + }, + { + "auxiliary_loss_clip": 0.01052638, + "auxiliary_loss_mlp": 0.01036708, + "balance_loss_clip": 1.01290154, + "balance_loss_mlp": 1.01511788, + "epoch": 0.6180670374267248, + "flos": 19644718613760.0, + "grad_norm": 1.5780966301453538, + "language_loss": 0.72004664, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.74094009, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 10280, + "time_per_iteration": 2.364516258239746 + }, + { + "auxiliary_loss_clip": 0.01054357, + "auxiliary_loss_mlp": 0.01041095, + "balance_loss_clip": 1.01700258, + "balance_loss_mlp": 1.01661611, + "epoch": 0.6181271606793928, + "flos": 19463833526400.0, + "grad_norm": 1.4657528659903063, + "language_loss": 0.73122168, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75217617, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37890625, + "step": 10281, + "time_per_iteration": 2.348449468612671 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.01236033, + "balance_loss_mlp": 1.01657701, + "epoch": 0.6181872839320607, + "flos": 25593080682240.0, + "grad_norm": 1.4634231713041268, + "language_loss": 0.77753198, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.79841071, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35742188, + "step": 10282, + "time_per_iteration": 2.4308903217315674 + }, + { + "auxiliary_loss_clip": 0.01056012, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.01357746, + "balance_loss_mlp": 1.01625896, + "epoch": 0.6182474071847287, + "flos": 25550766247680.0, + "grad_norm": 1.7046759284202693, + "language_loss": 0.69830477, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71929526, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.39648438, + "step": 10283, + "time_per_iteration": 2.4231510162353516 + }, + { + "auxiliary_loss_clip": 0.01059471, + "auxiliary_loss_mlp": 0.01041052, + "balance_loss_clip": 1.01339555, + "balance_loss_mlp": 1.01716161, + "epoch": 0.6183075304373966, + "flos": 22122749116800.0, + "grad_norm": 1.8273736937816063, + "language_loss": 0.75947732, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.78048253, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.42382812, + "step": 10284, + "time_per_iteration": 2.411320686340332 + }, + { + "auxiliary_loss_clip": 0.01051461, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.0108341, + "balance_loss_mlp": 1.01670432, + "epoch": 0.6183676536900646, + "flos": 22454493655680.0, + "grad_norm": 1.4991984395311004, + "language_loss": 0.76419777, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.78506792, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.34765625, + "step": 10285, + "time_per_iteration": 2.455461263656616 + }, + { + "auxiliary_loss_clip": 0.010549, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.01326847, + "balance_loss_mlp": 1.01640105, + "epoch": 0.6184277769427327, + "flos": 23363108455680.0, + "grad_norm": 1.5301748845805834, + "language_loss": 0.73832828, + "learning_rate": 1.342396663517503e-06, + "loss": 0.75926852, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 10286, + "time_per_iteration": 2.4174907207489014 + }, + { + "auxiliary_loss_clip": 0.01052373, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.0158577, + "balance_loss_mlp": 1.01582766, + "epoch": 0.6184879001954006, + "flos": 22709953140480.0, + "grad_norm": 1.6846940569030369, + "language_loss": 0.76935333, + "learning_rate": 1.342028868767199e-06, + "loss": 0.79027307, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 10287, + "time_per_iteration": 2.364710807800293 + }, + { + "auxiliary_loss_clip": 0.01054395, + "auxiliary_loss_mlp": 0.01041322, + "balance_loss_clip": 1.01662207, + "balance_loss_mlp": 1.0169965, + "epoch": 0.6185480234480686, + "flos": 23840789944320.0, + "grad_norm": 1.7112884699958915, + "language_loss": 0.736817, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75777423, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 10288, + "time_per_iteration": 2.4099583625793457 + }, + { + "auxiliary_loss_clip": 0.0105082, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0124197, + "balance_loss_mlp": 1.01588869, + "epoch": 0.6186081467007365, + "flos": 45475872923520.0, + "grad_norm": 1.8519807596649471, + "language_loss": 0.73922062, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.76007378, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 10289, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.01056723, + "auxiliary_loss_mlp": 0.01044848, + "balance_loss_clip": 1.01881289, + "balance_loss_mlp": 1.01646495, + "epoch": 0.6186682699534045, + "flos": 23549719006080.0, + "grad_norm": 1.469722787756303, + "language_loss": 0.79939497, + "learning_rate": 1.340925634274056e-06, + "loss": 0.82041073, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40234375, + "step": 10290, + "time_per_iteration": 2.4029996395111084 + }, + { + "auxiliary_loss_clip": 0.01055565, + "auxiliary_loss_mlp": 0.01038899, + "balance_loss_clip": 1.01323342, + "balance_loss_mlp": 1.01727176, + "epoch": 0.6187283932060724, + "flos": 25773058074240.0, + "grad_norm": 1.5213891059345583, + "language_loss": 0.82381678, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.84476143, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10291, + "time_per_iteration": 2.404261827468872 + }, + { + "auxiliary_loss_clip": 0.0105314, + "auxiliary_loss_mlp": 0.01046934, + "balance_loss_clip": 1.02149487, + "balance_loss_mlp": 1.01630592, + "epoch": 0.6187885164587404, + "flos": 25264024318080.0, + "grad_norm": 1.6358674137150262, + "language_loss": 0.78600669, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.80700743, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 10292, + "time_per_iteration": 2.4843029975891113 + }, + { + "auxiliary_loss_clip": 0.01059354, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_clip": 1.02003193, + "balance_loss_mlp": 1.01810622, + "epoch": 0.6188486397114084, + "flos": 26249552576640.0, + "grad_norm": 2.2784556043987965, + "language_loss": 0.74439883, + "learning_rate": 1.339822624710401e-06, + "loss": 0.7654928, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.41210938, + "step": 10293, + "time_per_iteration": 2.4044783115386963 + }, + { + "auxiliary_loss_clip": 0.01054438, + "auxiliary_loss_mlp": 0.01044451, + "balance_loss_clip": 1.01961994, + "balance_loss_mlp": 1.01728702, + "epoch": 0.6189087629640764, + "flos": 20922330240000.0, + "grad_norm": 1.571774766573395, + "language_loss": 0.84459788, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.86558676, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 10294, + "time_per_iteration": 2.379075288772583 + }, + { + "auxiliary_loss_clip": 0.01055582, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.01165724, + "balance_loss_mlp": 1.01729977, + "epoch": 0.6189688862167443, + "flos": 14828938917120.0, + "grad_norm": 2.6394107789561745, + "language_loss": 0.72642815, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.74735332, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 10295, + "time_per_iteration": 2.3259530067443848 + }, + { + "auxiliary_loss_clip": 0.01053919, + "auxiliary_loss_mlp": 0.01049258, + "balance_loss_clip": 1.02273369, + "balance_loss_mlp": 1.01670241, + "epoch": 0.6190290094694123, + "flos": 24283767674880.0, + "grad_norm": 1.9515641311095626, + "language_loss": 0.71184027, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.73287201, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37304688, + "step": 10296, + "time_per_iteration": 2.4911317825317383 + }, + { + "auxiliary_loss_clip": 0.01056844, + "auxiliary_loss_mlp": 0.01046619, + "balance_loss_clip": 1.01970172, + "balance_loss_mlp": 1.01795268, + "epoch": 0.6190891327220802, + "flos": 22528334914560.0, + "grad_norm": 1.8461428611174853, + "language_loss": 0.72231054, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.7433452, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38867188, + "step": 10297, + "time_per_iteration": 2.3764867782592773 + }, + { + "auxiliary_loss_clip": 0.01009424, + "auxiliary_loss_mlp": 0.0100451, + "balance_loss_clip": 1.00193524, + "balance_loss_mlp": 1.00219917, + "epoch": 0.6191492559747482, + "flos": 67726123925760.0, + "grad_norm": 0.8977100334170434, + "language_loss": 0.64326662, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66340595, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.07226562, + "step": 10298, + "time_per_iteration": 2.916808605194092 + }, + { + "auxiliary_loss_clip": 0.01055173, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.01360488, + "balance_loss_mlp": 1.01676452, + "epoch": 0.6192093792274163, + "flos": 22345564613760.0, + "grad_norm": 1.6317495176143209, + "language_loss": 0.75302231, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.77396011, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 10299, + "time_per_iteration": 3.6500940322875977 + }, + { + "auxiliary_loss_clip": 0.01056283, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.01204658, + "balance_loss_mlp": 1.0166328, + "epoch": 0.6192695024800842, + "flos": 13553072858880.0, + "grad_norm": 1.5907186791749184, + "language_loss": 0.68937629, + "learning_rate": 1.337249812568732e-06, + "loss": 0.71031588, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39648438, + "step": 10300, + "time_per_iteration": 2.3524250984191895 + }, + { + "auxiliary_loss_clip": 0.01057077, + "auxiliary_loss_mlp": 0.01044745, + "balance_loss_clip": 1.01985407, + "balance_loss_mlp": 1.0183841, + "epoch": 0.6193296257327522, + "flos": 17414502007680.0, + "grad_norm": 1.8268386698941323, + "language_loss": 0.68212569, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.70314389, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38671875, + "step": 10301, + "time_per_iteration": 2.3298444747924805 + }, + { + "auxiliary_loss_clip": 0.01054863, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.01416111, + "balance_loss_mlp": 1.01612198, + "epoch": 0.6193897489854201, + "flos": 31099826563200.0, + "grad_norm": 1.5691273205467593, + "language_loss": 0.74045044, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.76138031, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38671875, + "step": 10302, + "time_per_iteration": 2.4525365829467773 + }, + { + "auxiliary_loss_clip": 0.01053549, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.01272845, + "balance_loss_mlp": 1.01633358, + "epoch": 0.6194498722380881, + "flos": 19133066505600.0, + "grad_norm": 1.6929313180039014, + "language_loss": 0.81947589, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.84038746, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 10303, + "time_per_iteration": 2.35943603515625 + }, + { + "auxiliary_loss_clip": 0.01055232, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.00998724, + "balance_loss_mlp": 1.01640201, + "epoch": 0.619509995490756, + "flos": 21834017239680.0, + "grad_norm": 1.7371303633106103, + "language_loss": 0.77469724, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.79562718, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38671875, + "step": 10304, + "time_per_iteration": 3.749577045440674 + }, + { + "auxiliary_loss_clip": 0.01056535, + "auxiliary_loss_mlp": 0.01045112, + "balance_loss_clip": 1.0170145, + "balance_loss_mlp": 1.01636052, + "epoch": 0.619570118743424, + "flos": 23805387959040.0, + "grad_norm": 1.856185229947296, + "language_loss": 0.78238231, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.80339873, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 10305, + "time_per_iteration": 3.7299325466156006 + }, + { + "auxiliary_loss_clip": 0.0105857, + "auxiliary_loss_mlp": 0.01043211, + "balance_loss_clip": 1.01636517, + "balance_loss_mlp": 1.01872826, + "epoch": 0.619630241996092, + "flos": 21100666798080.0, + "grad_norm": 1.5879580408886338, + "language_loss": 0.79563439, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81665224, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3984375, + "step": 10306, + "time_per_iteration": 2.361147165298462 + }, + { + "auxiliary_loss_clip": 0.01051246, + "auxiliary_loss_mlp": 0.01034227, + "balance_loss_clip": 1.01230478, + "balance_loss_mlp": 1.015836, + "epoch": 0.61969036524876, + "flos": 27307036880640.0, + "grad_norm": 1.633922511296341, + "language_loss": 0.81282783, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.83368266, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 10307, + "time_per_iteration": 2.4607291221618652 + }, + { + "auxiliary_loss_clip": 0.01009305, + "auxiliary_loss_mlp": 0.01009695, + "balance_loss_clip": 1.00752521, + "balance_loss_mlp": 1.00210214, + "epoch": 0.6197504885014279, + "flos": 51645896547840.0, + "grad_norm": 0.8016492260076375, + "language_loss": 0.59420097, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61439097, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.07226562, + "step": 10308, + "time_per_iteration": 3.0528175830841064 + }, + { + "auxiliary_loss_clip": 0.01051976, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.01133299, + "balance_loss_mlp": 1.01611245, + "epoch": 0.6198106117540959, + "flos": 30555739935360.0, + "grad_norm": 1.8832124686079856, + "language_loss": 0.68878251, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70964038, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 10309, + "time_per_iteration": 2.462857246398926 + }, + { + "auxiliary_loss_clip": 0.01052175, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.01650226, + "balance_loss_mlp": 1.01623785, + "epoch": 0.6198707350067638, + "flos": 18908924376960.0, + "grad_norm": 1.5806301970392462, + "language_loss": 0.73132318, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.75223964, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 10310, + "time_per_iteration": 2.352304220199585 + }, + { + "auxiliary_loss_clip": 0.01055858, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.01523697, + "balance_loss_mlp": 1.01792216, + "epoch": 0.6199308582594318, + "flos": 21432795361920.0, + "grad_norm": 3.7554705719012635, + "language_loss": 0.79621375, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81719297, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.37890625, + "step": 10311, + "time_per_iteration": 2.3850057125091553 + }, + { + "auxiliary_loss_clip": 0.01053659, + "auxiliary_loss_mlp": 0.01041289, + "balance_loss_clip": 1.01564693, + "balance_loss_mlp": 1.01604319, + "epoch": 0.6199909815120999, + "flos": 18406349222400.0, + "grad_norm": 1.6188888866100697, + "language_loss": 0.73873174, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75968134, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 10312, + "time_per_iteration": 3.8706939220428467 + }, + { + "auxiliary_loss_clip": 0.01053712, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.01350629, + "balance_loss_mlp": 1.01682067, + "epoch": 0.6200511047647678, + "flos": 21465893197440.0, + "grad_norm": 1.9933267918420687, + "language_loss": 0.73134398, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.75227916, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.36914062, + "step": 10313, + "time_per_iteration": 2.3704605102539062 + }, + { + "auxiliary_loss_clip": 0.01055898, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_clip": 1.01719666, + "balance_loss_mlp": 1.01715398, + "epoch": 0.6201112280174358, + "flos": 18215130372480.0, + "grad_norm": 1.9568416383467089, + "language_loss": 0.79571635, + "learning_rate": 1.332107887401416e-06, + "loss": 0.8167038, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 10314, + "time_per_iteration": 2.3326687812805176 + }, + { + "auxiliary_loss_clip": 0.01054547, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.01435208, + "balance_loss_mlp": 1.01666141, + "epoch": 0.6201713512701037, + "flos": 20010154481280.0, + "grad_norm": 1.7050344524696863, + "language_loss": 0.79191619, + "learning_rate": 1.331740796528812e-06, + "loss": 0.81285644, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 10315, + "time_per_iteration": 2.3799993991851807 + }, + { + "auxiliary_loss_clip": 0.01057236, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.01686883, + "balance_loss_mlp": 1.01810431, + "epoch": 0.6202314745227717, + "flos": 22486718707200.0, + "grad_norm": 1.6794934166236695, + "language_loss": 0.76978415, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.79077446, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 10316, + "time_per_iteration": 2.369307279586792 + }, + { + "auxiliary_loss_clip": 0.01053031, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.01139414, + "balance_loss_mlp": 1.01531613, + "epoch": 0.6202915977754396, + "flos": 26827609824000.0, + "grad_norm": 1.8215213697045738, + "language_loss": 0.78852355, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.809434, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37695312, + "step": 10317, + "time_per_iteration": 2.4071438312530518 + }, + { + "auxiliary_loss_clip": 0.01009245, + "auxiliary_loss_mlp": 0.01006998, + "balance_loss_clip": 1.00441146, + "balance_loss_mlp": 1.00192142, + "epoch": 0.6203517210281076, + "flos": 62739269233920.0, + "grad_norm": 0.7199198521556569, + "language_loss": 0.5912407, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61140305, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.07324219, + "step": 10318, + "time_per_iteration": 3.072370767593384 + }, + { + "auxiliary_loss_clip": 0.01056019, + "auxiliary_loss_mlp": 0.0104046, + "balance_loss_clip": 1.01425791, + "balance_loss_mlp": 1.01796484, + "epoch": 0.6204118442807756, + "flos": 23403153651840.0, + "grad_norm": 1.5146564364093789, + "language_loss": 0.79088676, + "learning_rate": 1.330272686582143e-06, + "loss": 0.8118515, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 10319, + "time_per_iteration": 2.371276378631592 + }, + { + "auxiliary_loss_clip": 0.01052956, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.01374817, + "balance_loss_mlp": 1.01680577, + "epoch": 0.6204719675334436, + "flos": 20192610579840.0, + "grad_norm": 2.168946592111342, + "language_loss": 0.6761651, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.69706142, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36132812, + "step": 10320, + "time_per_iteration": 2.357863426208496 + }, + { + "auxiliary_loss_clip": 0.01053043, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.01386285, + "balance_loss_mlp": 1.01631737, + "epoch": 0.6205320907861115, + "flos": 13187218055040.0, + "grad_norm": 6.625752664763431, + "language_loss": 0.77168214, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.79257977, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 10321, + "time_per_iteration": 2.3517634868621826 + }, + { + "auxiliary_loss_clip": 0.01052825, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.01081717, + "balance_loss_mlp": 1.01656437, + "epoch": 0.6205922140387795, + "flos": 20667324602880.0, + "grad_norm": 1.7985303966577608, + "language_loss": 0.74595988, + "learning_rate": 1.329171870732758e-06, + "loss": 0.76682085, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36328125, + "step": 10322, + "time_per_iteration": 2.355745792388916 + }, + { + "auxiliary_loss_clip": 0.01053605, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.01276565, + "balance_loss_mlp": 1.01661158, + "epoch": 0.6206523372914474, + "flos": 23876715600000.0, + "grad_norm": 1.6204904579828854, + "language_loss": 0.74622321, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.76711315, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 10323, + "time_per_iteration": 2.4112660884857178 + }, + { + "auxiliary_loss_clip": 0.0105865, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_clip": 1.01764631, + "balance_loss_mlp": 1.01857066, + "epoch": 0.6207124605441154, + "flos": 13405774366080.0, + "grad_norm": 2.3984670192435447, + "language_loss": 0.5996629, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.62068284, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.40234375, + "step": 10324, + "time_per_iteration": 2.3247218132019043 + }, + { + "auxiliary_loss_clip": 0.01055329, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.01525986, + "balance_loss_mlp": 1.01818037, + "epoch": 0.6207725837967835, + "flos": 18915348067200.0, + "grad_norm": 1.8196881155683957, + "language_loss": 0.77910483, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.80005527, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 10325, + "time_per_iteration": 2.3593223094940186 + }, + { + "auxiliary_loss_clip": 0.01055917, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.01104593, + "balance_loss_mlp": 1.01690257, + "epoch": 0.6208327070494514, + "flos": 23979290774400.0, + "grad_norm": 1.7863978289153462, + "language_loss": 0.7331813, + "learning_rate": 1.327704472462003e-06, + "loss": 0.75411743, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38867188, + "step": 10326, + "time_per_iteration": 2.381061315536499 + }, + { + "auxiliary_loss_clip": 0.01056266, + "auxiliary_loss_mlp": 0.01048716, + "balance_loss_clip": 1.02219224, + "balance_loss_mlp": 1.01760566, + "epoch": 0.6208928303021194, + "flos": 22819301118720.0, + "grad_norm": 2.8565994117141877, + "language_loss": 0.75227427, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.77332401, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 10327, + "time_per_iteration": 2.380669116973877 + }, + { + "auxiliary_loss_clip": 0.01056462, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_clip": 1.01794553, + "balance_loss_mlp": 1.0170213, + "epoch": 0.6209529535547873, + "flos": 17563615891200.0, + "grad_norm": 3.6940257385244974, + "language_loss": 0.8211838, + "learning_rate": 1.326970926232066e-06, + "loss": 0.84219599, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39453125, + "step": 10328, + "time_per_iteration": 2.326030731201172 + }, + { + "auxiliary_loss_clip": 0.01054472, + "auxiliary_loss_mlp": 0.01044254, + "balance_loss_clip": 1.01956606, + "balance_loss_mlp": 1.01650882, + "epoch": 0.6210130768074553, + "flos": 22010992254720.0, + "grad_norm": 2.6188011340152793, + "language_loss": 0.78938097, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.81036818, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37890625, + "step": 10329, + "time_per_iteration": 2.407322406768799 + }, + { + "auxiliary_loss_clip": 0.01009209, + "auxiliary_loss_mlp": 0.01005976, + "balance_loss_clip": 1.00350809, + "balance_loss_mlp": 1.00218725, + "epoch": 0.6210732000601232, + "flos": 63673825040640.0, + "grad_norm": 0.834593320396637, + "language_loss": 0.62317693, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64332879, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.0246582, + "router_z_loss_mlp": 0.0703125, + "step": 10330, + "time_per_iteration": 2.9504528045654297 + }, + { + "auxiliary_loss_clip": 0.01057254, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.02184844, + "balance_loss_mlp": 1.0185082, + "epoch": 0.6211333233127913, + "flos": 24242221290240.0, + "grad_norm": 2.1092282525960924, + "language_loss": 0.79457295, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.81562757, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 10331, + "time_per_iteration": 2.401834487915039 + }, + { + "auxiliary_loss_clip": 0.0105606, + "auxiliary_loss_mlp": 0.0104453, + "balance_loss_clip": 1.02067602, + "balance_loss_mlp": 1.01758325, + "epoch": 0.6211934465654592, + "flos": 16942930007040.0, + "grad_norm": 2.0442346171897285, + "language_loss": 0.68850219, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.70950806, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38476562, + "step": 10332, + "time_per_iteration": 2.363570213317871 + }, + { + "auxiliary_loss_clip": 0.01052267, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.01127183, + "balance_loss_mlp": 1.01592302, + "epoch": 0.6212535698181272, + "flos": 15266505386880.0, + "grad_norm": 1.67780319072786, + "language_loss": 0.77428395, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.7951563, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 10333, + "time_per_iteration": 2.374277114868164 + }, + { + "auxiliary_loss_clip": 0.01052975, + "auxiliary_loss_mlp": 0.01039939, + "balance_loss_clip": 1.01609671, + "balance_loss_mlp": 1.01691544, + "epoch": 0.6213136930707951, + "flos": 13443096476160.0, + "grad_norm": 4.904472164224447, + "language_loss": 0.71535999, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.73628908, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 10334, + "time_per_iteration": 2.3289544582366943 + }, + { + "auxiliary_loss_clip": 0.01053894, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.01451302, + "balance_loss_mlp": 1.01790524, + "epoch": 0.6213738163234631, + "flos": 18110320871040.0, + "grad_norm": 1.7256149939076006, + "language_loss": 0.71313226, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.73402905, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.359375, + "step": 10335, + "time_per_iteration": 2.3913121223449707 + }, + { + "auxiliary_loss_clip": 0.01051566, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.01426828, + "balance_loss_mlp": 1.0162555, + "epoch": 0.621433939576131, + "flos": 25336189831680.0, + "grad_norm": 1.504678699249693, + "language_loss": 0.81549585, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.83638561, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35351562, + "step": 10336, + "time_per_iteration": 2.39898943901062 + }, + { + "auxiliary_loss_clip": 0.01051485, + "auxiliary_loss_mlp": 0.01036088, + "balance_loss_clip": 1.01474905, + "balance_loss_mlp": 1.01626754, + "epoch": 0.621494062828799, + "flos": 22564504949760.0, + "grad_norm": 1.7804675422657101, + "language_loss": 0.7455467, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.76642239, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 10337, + "time_per_iteration": 2.444209337234497 + }, + { + "auxiliary_loss_clip": 0.01054724, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.01238143, + "balance_loss_mlp": 1.01650012, + "epoch": 0.621554186081467, + "flos": 27416733972480.0, + "grad_norm": 2.2293843961455058, + "language_loss": 0.64943683, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.67036706, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10338, + "time_per_iteration": 2.4169082641601562 + }, + { + "auxiliary_loss_clip": 0.01052826, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.01211762, + "balance_loss_mlp": 1.01614809, + "epoch": 0.621614309334135, + "flos": 22345704259200.0, + "grad_norm": 1.677295296419799, + "language_loss": 0.72334045, + "learning_rate": 1.322938249724991e-06, + "loss": 0.74421847, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 10339, + "time_per_iteration": 3.6075921058654785 + }, + { + "auxiliary_loss_clip": 0.01052205, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.01201558, + "balance_loss_mlp": 1.01659977, + "epoch": 0.621674432586803, + "flos": 19280225352960.0, + "grad_norm": 1.5194873957947403, + "language_loss": 0.70773685, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.72860587, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 10340, + "time_per_iteration": 2.3465890884399414 + }, + { + "auxiliary_loss_clip": 0.01051156, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.01514614, + "balance_loss_mlp": 1.01596808, + "epoch": 0.6217345558394709, + "flos": 21608653213440.0, + "grad_norm": 1.8917461559509743, + "language_loss": 0.7086519, + "learning_rate": 1.322205369037788e-06, + "loss": 0.72954071, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 10341, + "time_per_iteration": 2.3737545013427734 + }, + { + "auxiliary_loss_clip": 0.01054803, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.01501954, + "balance_loss_mlp": 1.01761925, + "epoch": 0.6217946790921389, + "flos": 18003137397120.0, + "grad_norm": 5.733564295140985, + "language_loss": 0.82298255, + "learning_rate": 1.321838967240299e-06, + "loss": 0.84392124, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 10342, + "time_per_iteration": 2.3412342071533203 + }, + { + "auxiliary_loss_clip": 0.01009257, + "auxiliary_loss_mlp": 0.01002644, + "balance_loss_clip": 0.99998587, + "balance_loss_mlp": 1.00215602, + "epoch": 0.6218548023448068, + "flos": 61970307338880.0, + "grad_norm": 0.775701546284952, + "language_loss": 0.57408607, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59420502, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.07128906, + "step": 10343, + "time_per_iteration": 4.372875690460205 + }, + { + "auxiliary_loss_clip": 0.01050229, + "auxiliary_loss_mlp": 0.01040786, + "balance_loss_clip": 1.02048445, + "balance_loss_mlp": 1.01596463, + "epoch": 0.6219149255974749, + "flos": 25737970291200.0, + "grad_norm": 1.99436827598977, + "language_loss": 0.74039197, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.76130211, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34179688, + "step": 10344, + "time_per_iteration": 3.8489503860473633 + }, + { + "auxiliary_loss_clip": 0.01053768, + "auxiliary_loss_mlp": 0.01043465, + "balance_loss_clip": 1.02012444, + "balance_loss_mlp": 1.01747143, + "epoch": 0.6219750488501428, + "flos": 25409891445120.0, + "grad_norm": 1.8180348618964912, + "language_loss": 0.61310446, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.63407671, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 10345, + "time_per_iteration": 2.3838841915130615 + }, + { + "auxiliary_loss_clip": 0.01052855, + "auxiliary_loss_mlp": 0.01040656, + "balance_loss_clip": 1.01744604, + "balance_loss_mlp": 1.01627374, + "epoch": 0.6220351721028108, + "flos": 20046359427840.0, + "grad_norm": 1.7175506947687915, + "language_loss": 0.79313666, + "learning_rate": 1.320373617348614e-06, + "loss": 0.81407171, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 10346, + "time_per_iteration": 2.3566694259643555 + }, + { + "auxiliary_loss_clip": 0.01054232, + "auxiliary_loss_mlp": 0.01044333, + "balance_loss_clip": 1.01751113, + "balance_loss_mlp": 1.01632118, + "epoch": 0.6220952953554787, + "flos": 27487223740800.0, + "grad_norm": 1.5753575271455187, + "language_loss": 0.72491145, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.74589717, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 10347, + "time_per_iteration": 2.406557321548462 + }, + { + "auxiliary_loss_clip": 0.01052212, + "auxiliary_loss_mlp": 0.0104059, + "balance_loss_clip": 1.01776123, + "balance_loss_mlp": 1.01574659, + "epoch": 0.6221554186081467, + "flos": 19206628473600.0, + "grad_norm": 1.764215531408927, + "language_loss": 0.72804976, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74897778, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 10348, + "time_per_iteration": 2.3912715911865234 + }, + { + "auxiliary_loss_clip": 0.01008222, + "auxiliary_loss_mlp": 0.01009025, + "balance_loss_clip": 1.00630677, + "balance_loss_mlp": 1.00117147, + "epoch": 0.6222155418608146, + "flos": 62947805984640.0, + "grad_norm": 0.813812655238156, + "language_loss": 0.54242289, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56259537, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.0703125, + "step": 10349, + "time_per_iteration": 3.0444953441619873 + }, + { + "auxiliary_loss_clip": 0.01053792, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.01589835, + "balance_loss_mlp": 1.01751971, + "epoch": 0.6222756651134826, + "flos": 22600011669120.0, + "grad_norm": 2.144965586643581, + "language_loss": 0.71577364, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.73669153, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 10350, + "time_per_iteration": 2.408726930618286 + }, + { + "auxiliary_loss_clip": 0.01054081, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.01901054, + "balance_loss_mlp": 1.01642859, + "epoch": 0.6223357883661506, + "flos": 21141165841920.0, + "grad_norm": 1.890829756460324, + "language_loss": 0.58642673, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.60739911, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 10351, + "time_per_iteration": 2.3930017948150635 + }, + { + "auxiliary_loss_clip": 0.01007691, + "auxiliary_loss_mlp": 0.01015959, + "balance_loss_clip": 1.01339567, + "balance_loss_mlp": 1.00079894, + "epoch": 0.6223959116188186, + "flos": 63761595932160.0, + "grad_norm": 0.8090585668858306, + "language_loss": 0.61277419, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63301075, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.06884766, + "step": 10352, + "time_per_iteration": 4.415473461151123 + }, + { + "auxiliary_loss_clip": 0.01051297, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.01761985, + "balance_loss_mlp": 1.01574922, + "epoch": 0.6224560348714866, + "flos": 22564609683840.0, + "grad_norm": 1.9343754949635912, + "language_loss": 0.83055502, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.85147274, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 10353, + "time_per_iteration": 2.4050681591033936 + }, + { + "auxiliary_loss_clip": 0.01049434, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.01622462, + "balance_loss_mlp": 1.01516294, + "epoch": 0.6225161581241545, + "flos": 24096598542720.0, + "grad_norm": 1.478828853788124, + "language_loss": 0.76349938, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.78437328, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 10354, + "time_per_iteration": 2.45405650138855 + }, + { + "auxiliary_loss_clip": 0.01052999, + "auxiliary_loss_mlp": 0.01038444, + "balance_loss_clip": 1.014328, + "balance_loss_mlp": 1.01627827, + "epoch": 0.6225762813768225, + "flos": 20442623892480.0, + "grad_norm": 1.512867933292012, + "language_loss": 0.79636383, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.81727827, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 10355, + "time_per_iteration": 2.4461629390716553 + }, + { + "auxiliary_loss_clip": 0.01053338, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.01527429, + "balance_loss_mlp": 1.01724982, + "epoch": 0.6226364046294904, + "flos": 27196920852480.0, + "grad_norm": 4.750038477446377, + "language_loss": 0.79113263, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.81203365, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.359375, + "step": 10356, + "time_per_iteration": 2.4590542316436768 + }, + { + "auxiliary_loss_clip": 0.01058318, + "auxiliary_loss_mlp": 0.01044884, + "balance_loss_clip": 1.01785946, + "balance_loss_mlp": 1.01910281, + "epoch": 0.6226965278821585, + "flos": 20444823308160.0, + "grad_norm": 2.2580023699977616, + "language_loss": 0.69446462, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.71549666, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 10357, + "time_per_iteration": 2.374389886856079 + }, + { + "auxiliary_loss_clip": 0.01057059, + "auxiliary_loss_mlp": 0.01042209, + "balance_loss_clip": 1.01429057, + "balance_loss_mlp": 1.01819611, + "epoch": 0.6227566511348264, + "flos": 22161677149440.0, + "grad_norm": 2.635816565190352, + "language_loss": 0.76905507, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.79004776, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38867188, + "step": 10358, + "time_per_iteration": 2.4003114700317383 + }, + { + "auxiliary_loss_clip": 0.01054556, + "auxiliary_loss_mlp": 0.01041603, + "balance_loss_clip": 1.01675975, + "balance_loss_mlp": 1.01698041, + "epoch": 0.6228167743874944, + "flos": 18039900925440.0, + "grad_norm": 2.165154962910996, + "language_loss": 0.8364917, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.85745335, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 10359, + "time_per_iteration": 2.3589015007019043 + }, + { + "auxiliary_loss_clip": 0.01052814, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.02299857, + "balance_loss_mlp": 1.01727796, + "epoch": 0.6228768976401623, + "flos": 17742057183360.0, + "grad_norm": 2.2542880978542588, + "language_loss": 0.74243176, + "learning_rate": 1.315248145768822e-06, + "loss": 0.76343107, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35546875, + "step": 10360, + "time_per_iteration": 2.337226629257202 + }, + { + "auxiliary_loss_clip": 0.0105459, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.01623333, + "balance_loss_mlp": 1.01714683, + "epoch": 0.6229370208928303, + "flos": 17893963975680.0, + "grad_norm": 2.1044991348075848, + "language_loss": 0.78679955, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.80774581, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 10361, + "time_per_iteration": 2.3586506843566895 + }, + { + "auxiliary_loss_clip": 0.01053074, + "auxiliary_loss_mlp": 0.01038982, + "balance_loss_clip": 1.015903, + "balance_loss_mlp": 1.01704717, + "epoch": 0.6229971441454982, + "flos": 17346805148160.0, + "grad_norm": 1.723405357730571, + "language_loss": 0.68679726, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.70771784, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 10362, + "time_per_iteration": 2.3847625255584717 + }, + { + "auxiliary_loss_clip": 0.01056658, + "auxiliary_loss_mlp": 0.01038214, + "balance_loss_clip": 1.01312065, + "balance_loss_mlp": 1.0183481, + "epoch": 0.6230572673981662, + "flos": 29240107971840.0, + "grad_norm": 2.201055614045916, + "language_loss": 0.68717456, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.70812327, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 10363, + "time_per_iteration": 2.4558916091918945 + }, + { + "auxiliary_loss_clip": 0.01056671, + "auxiliary_loss_mlp": 0.01041594, + "balance_loss_clip": 1.01430702, + "balance_loss_mlp": 1.01827061, + "epoch": 0.6231173906508342, + "flos": 16325037031680.0, + "grad_norm": 2.0454598406220574, + "language_loss": 0.87868273, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.89966536, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38476562, + "step": 10364, + "time_per_iteration": 2.3656580448150635 + }, + { + "auxiliary_loss_clip": 0.01011992, + "auxiliary_loss_mlp": 0.0100734, + "balance_loss_clip": 1.00518203, + "balance_loss_mlp": 1.00429797, + "epoch": 0.6231775139035022, + "flos": 68699119006080.0, + "grad_norm": 0.8819379211903688, + "language_loss": 0.60916281, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62935615, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.07714844, + "step": 10365, + "time_per_iteration": 3.0280003547668457 + }, + { + "auxiliary_loss_clip": 0.01057873, + "auxiliary_loss_mlp": 0.0104876, + "balance_loss_clip": 1.02012622, + "balance_loss_mlp": 1.01846504, + "epoch": 0.6232376371561702, + "flos": 19820227351680.0, + "grad_norm": 1.7282713085913322, + "language_loss": 0.76847243, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.7895388, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.39453125, + "step": 10366, + "time_per_iteration": 2.3816006183624268 + }, + { + "auxiliary_loss_clip": 0.01055338, + "auxiliary_loss_mlp": 0.01042214, + "balance_loss_clip": 1.01679897, + "balance_loss_mlp": 1.01749349, + "epoch": 0.6232977604088381, + "flos": 23257146879360.0, + "grad_norm": 1.8525592566391478, + "language_loss": 0.78050572, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.80148125, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 10367, + "time_per_iteration": 2.45912766456604 + }, + { + "auxiliary_loss_clip": 0.01053599, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.0124104, + "balance_loss_mlp": 1.0175997, + "epoch": 0.6233578836615061, + "flos": 21105344920320.0, + "grad_norm": 1.6354004855902522, + "language_loss": 0.79289371, + "learning_rate": 1.312321587418457e-06, + "loss": 0.8137958, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 10368, + "time_per_iteration": 2.399183750152588 + }, + { + "auxiliary_loss_clip": 0.01055217, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.01361001, + "balance_loss_mlp": 1.01776361, + "epoch": 0.623418006914174, + "flos": 23768275317120.0, + "grad_norm": 1.8066895665749894, + "language_loss": 0.70241654, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.7233628, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 10369, + "time_per_iteration": 2.425150156021118 + }, + { + "auxiliary_loss_clip": 0.01054611, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.01798606, + "balance_loss_mlp": 1.01670289, + "epoch": 0.6234781301668421, + "flos": 17889634967040.0, + "grad_norm": 2.0829942566115727, + "language_loss": 0.89214563, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.91311777, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 10370, + "time_per_iteration": 2.3363070487976074 + }, + { + "auxiliary_loss_clip": 0.010509, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.01165271, + "balance_loss_mlp": 1.01570845, + "epoch": 0.62353825341951, + "flos": 26174349774720.0, + "grad_norm": 1.415569294073299, + "language_loss": 0.66857195, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68943477, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35351562, + "step": 10371, + "time_per_iteration": 2.4231669902801514 + }, + { + "auxiliary_loss_clip": 0.01047885, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.01371098, + "balance_loss_mlp": 1.01482391, + "epoch": 0.623598376672178, + "flos": 31138545127680.0, + "grad_norm": 1.675645161935773, + "language_loss": 0.77849656, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79930013, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.33007812, + "step": 10372, + "time_per_iteration": 2.47499418258667 + }, + { + "auxiliary_loss_clip": 0.01053687, + "auxiliary_loss_mlp": 0.01040837, + "balance_loss_clip": 1.0170188, + "balance_loss_mlp": 1.01653159, + "epoch": 0.6236584999248459, + "flos": 23729137816320.0, + "grad_norm": 1.5715844806074482, + "language_loss": 0.78089297, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.80183816, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 10373, + "time_per_iteration": 2.402592420578003 + }, + { + "auxiliary_loss_clip": 0.01050558, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.019449, + "balance_loss_mlp": 1.01555681, + "epoch": 0.6237186231775139, + "flos": 21761677169280.0, + "grad_norm": 1.4724963280863845, + "language_loss": 0.70541281, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.72633684, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34960938, + "step": 10374, + "time_per_iteration": 2.4063589572906494 + }, + { + "auxiliary_loss_clip": 0.01052249, + "auxiliary_loss_mlp": 0.01040545, + "balance_loss_clip": 1.01878881, + "balance_loss_mlp": 1.01603293, + "epoch": 0.6237787464301818, + "flos": 14938601097600.0, + "grad_norm": 1.6915383108618636, + "language_loss": 0.78314036, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.80406833, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.36132812, + "step": 10375, + "time_per_iteration": 2.3391923904418945 + }, + { + "auxiliary_loss_clip": 0.01052199, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.01941943, + "balance_loss_mlp": 1.01702964, + "epoch": 0.6238388696828499, + "flos": 35588854045440.0, + "grad_norm": 1.4071605051182923, + "language_loss": 0.71686786, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.73780191, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.3515625, + "step": 10376, + "time_per_iteration": 2.543391466140747 + }, + { + "auxiliary_loss_clip": 0.01053741, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02173531, + "balance_loss_mlp": 1.01623321, + "epoch": 0.6238989929355178, + "flos": 23622373278720.0, + "grad_norm": 1.573291646142195, + "language_loss": 0.78272569, + "learning_rate": 1.309031204505301e-06, + "loss": 0.80373931, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 10377, + "time_per_iteration": 2.4061696529388428 + }, + { + "auxiliary_loss_clip": 0.01053483, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.01223803, + "balance_loss_mlp": 1.01721096, + "epoch": 0.6239591161881858, + "flos": 22086474347520.0, + "grad_norm": 1.7015181006235787, + "language_loss": 0.69565392, + "learning_rate": 1.308665737227052e-06, + "loss": 0.71654141, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 10378, + "time_per_iteration": 3.627535581588745 + }, + { + "auxiliary_loss_clip": 0.01052879, + "auxiliary_loss_mlp": 0.01038257, + "balance_loss_clip": 1.01555967, + "balance_loss_mlp": 1.01597142, + "epoch": 0.6240192394408538, + "flos": 24534758505600.0, + "grad_norm": 1.7072616048456617, + "language_loss": 0.77200609, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.79291743, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 10379, + "time_per_iteration": 2.4317829608917236 + }, + { + "auxiliary_loss_clip": 0.01052552, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.01560068, + "balance_loss_mlp": 1.01630712, + "epoch": 0.6240793626935217, + "flos": 27930585496320.0, + "grad_norm": 1.4653428445869618, + "language_loss": 0.79947734, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.82038736, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 10380, + "time_per_iteration": 2.4452779293060303 + }, + { + "auxiliary_loss_clip": 0.01052739, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.01769471, + "balance_loss_mlp": 1.01790142, + "epoch": 0.6241394859461897, + "flos": 22891431720960.0, + "grad_norm": 1.532092472179868, + "language_loss": 0.80638897, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34765625, + "step": 10381, + "time_per_iteration": 2.424164056777954 + }, + { + "auxiliary_loss_clip": 0.01054272, + "auxiliary_loss_mlp": 0.01035424, + "balance_loss_clip": 1.01180863, + "balance_loss_mlp": 1.01737905, + "epoch": 0.6241996091988576, + "flos": 12749930876160.0, + "grad_norm": 1.850215485497891, + "language_loss": 0.75545204, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.77634895, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 10382, + "time_per_iteration": 2.3303074836730957 + }, + { + "auxiliary_loss_clip": 0.01050913, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_clip": 1.00999999, + "balance_loss_mlp": 1.01606536, + "epoch": 0.6242597324515257, + "flos": 25850041355520.0, + "grad_norm": 1.401425278727459, + "language_loss": 0.79464471, + "learning_rate": 1.306838794344911e-06, + "loss": 0.81546974, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34960938, + "step": 10383, + "time_per_iteration": 3.860297679901123 + }, + { + "auxiliary_loss_clip": 0.01052503, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.01251531, + "balance_loss_mlp": 1.01706171, + "epoch": 0.6243198557041936, + "flos": 19936697247360.0, + "grad_norm": 1.9574102467733465, + "language_loss": 0.76416814, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.78503549, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35351562, + "step": 10384, + "time_per_iteration": 3.7070844173431396 + }, + { + "auxiliary_loss_clip": 0.01053655, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.01526022, + "balance_loss_mlp": 1.01678753, + "epoch": 0.6243799789568616, + "flos": 18405197147520.0, + "grad_norm": 1.9423526662972488, + "language_loss": 0.6768074, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.69773668, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 10385, + "time_per_iteration": 2.350911855697632 + }, + { + "auxiliary_loss_clip": 0.01009698, + "auxiliary_loss_mlp": 0.01014362, + "balance_loss_clip": 1.01169205, + "balance_loss_mlp": 1.00266314, + "epoch": 0.6244401022095295, + "flos": 66024037589760.0, + "grad_norm": 0.7613603335947061, + "language_loss": 0.62089193, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64113259, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.0703125, + "step": 10386, + "time_per_iteration": 3.0550146102905273 + }, + { + "auxiliary_loss_clip": 0.01054037, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.01390982, + "balance_loss_mlp": 1.0169754, + "epoch": 0.6245002254621975, + "flos": 24570125579520.0, + "grad_norm": 2.314102804626421, + "language_loss": 0.72964084, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.75056255, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 10387, + "time_per_iteration": 2.382514715194702 + }, + { + "auxiliary_loss_clip": 0.01058721, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.01729965, + "balance_loss_mlp": 1.01875198, + "epoch": 0.6245603487148654, + "flos": 29167558433280.0, + "grad_norm": 2.065989784685282, + "language_loss": 0.66475785, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.6857779, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.40039062, + "step": 10388, + "time_per_iteration": 2.445387125015259 + }, + { + "auxiliary_loss_clip": 0.01053355, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.01131415, + "balance_loss_mlp": 1.01735973, + "epoch": 0.6246204719675335, + "flos": 14789312657280.0, + "grad_norm": 1.5590397146944934, + "language_loss": 0.80171514, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.82257378, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.36132812, + "step": 10389, + "time_per_iteration": 2.3365225791931152 + }, + { + "auxiliary_loss_clip": 0.01053177, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.01441884, + "balance_loss_mlp": 1.01757646, + "epoch": 0.6246805952202014, + "flos": 12492760734720.0, + "grad_norm": 1.676858400231038, + "language_loss": 0.61217564, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.63309419, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 10390, + "time_per_iteration": 2.378221273422241 + }, + { + "auxiliary_loss_clip": 0.01054673, + "auxiliary_loss_mlp": 0.01040947, + "balance_loss_clip": 1.01853561, + "balance_loss_mlp": 1.01768661, + "epoch": 0.6247407184728694, + "flos": 12785856531840.0, + "grad_norm": 1.8529478692312638, + "language_loss": 0.79009598, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.8110522, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36914062, + "step": 10391, + "time_per_iteration": 2.3453776836395264 + }, + { + "auxiliary_loss_clip": 0.01055609, + "auxiliary_loss_mlp": 0.01040564, + "balance_loss_clip": 1.01729476, + "balance_loss_mlp": 1.0194447, + "epoch": 0.6248008417255374, + "flos": 40627484150400.0, + "grad_norm": 1.769448494277114, + "language_loss": 0.65832156, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.67928326, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 10392, + "time_per_iteration": 3.9396097660064697 + }, + { + "auxiliary_loss_clip": 0.01055013, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.01821959, + "balance_loss_mlp": 1.01799822, + "epoch": 0.6248609649782053, + "flos": 19900981059840.0, + "grad_norm": 1.8247558843499425, + "language_loss": 0.77717435, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.7981559, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36914062, + "step": 10393, + "time_per_iteration": 2.352872848510742 + }, + { + "auxiliary_loss_clip": 0.01055475, + "auxiliary_loss_mlp": 0.01043752, + "balance_loss_clip": 1.02024412, + "balance_loss_mlp": 1.01880813, + "epoch": 0.6249210882308733, + "flos": 19681726521600.0, + "grad_norm": 1.7577350271273602, + "language_loss": 0.8382569, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.85924917, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3671875, + "step": 10394, + "time_per_iteration": 2.416154623031616 + }, + { + "auxiliary_loss_clip": 0.01055078, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_clip": 1.0208081, + "balance_loss_mlp": 1.01788104, + "epoch": 0.6249812114835412, + "flos": 13989871278720.0, + "grad_norm": 1.8502069453458474, + "language_loss": 0.76604658, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.78704411, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 10395, + "time_per_iteration": 2.3470935821533203 + }, + { + "auxiliary_loss_clip": 0.0105578, + "auxiliary_loss_mlp": 0.01043067, + "balance_loss_clip": 1.01903486, + "balance_loss_mlp": 1.0178858, + "epoch": 0.6250413347362093, + "flos": 14529384518400.0, + "grad_norm": 2.418631137979976, + "language_loss": 0.73625791, + "learning_rate": 1.302091822487119e-06, + "loss": 0.75724649, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 10396, + "time_per_iteration": 2.3322646617889404 + }, + { + "auxiliary_loss_clip": 0.01054565, + "auxiliary_loss_mlp": 0.01039238, + "balance_loss_clip": 1.01701748, + "balance_loss_mlp": 1.01791549, + "epoch": 0.6251014579888772, + "flos": 22961991312000.0, + "grad_norm": 1.9033091483329845, + "language_loss": 0.77224505, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.79318309, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3671875, + "step": 10397, + "time_per_iteration": 2.373284339904785 + }, + { + "auxiliary_loss_clip": 0.01053702, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.01656771, + "balance_loss_mlp": 1.01788378, + "epoch": 0.6251615812415452, + "flos": 28109969395200.0, + "grad_norm": 2.5835589994262986, + "language_loss": 0.7682333, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.78917038, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35742188, + "step": 10398, + "time_per_iteration": 2.487374782562256 + }, + { + "auxiliary_loss_clip": 0.01053618, + "auxiliary_loss_mlp": 0.0104341, + "balance_loss_clip": 1.01984239, + "balance_loss_mlp": 1.01679373, + "epoch": 0.6252217044942131, + "flos": 26723254170240.0, + "grad_norm": 1.738572524526894, + "language_loss": 0.75255007, + "learning_rate": 1.300997001489483e-06, + "loss": 0.77352035, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 10399, + "time_per_iteration": 2.4130094051361084 + }, + { + "auxiliary_loss_clip": 0.01053161, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_clip": 1.02014732, + "balance_loss_mlp": 1.01713037, + "epoch": 0.6252818277468811, + "flos": 20005860384000.0, + "grad_norm": 1.9416818484992924, + "language_loss": 0.75680745, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.77778435, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 10400, + "time_per_iteration": 2.385928153991699 + }, + { + "auxiliary_loss_clip": 0.01008595, + "auxiliary_loss_mlp": 0.01003157, + "balance_loss_clip": 1.00096369, + "balance_loss_mlp": 1.00167942, + "epoch": 0.625341950999549, + "flos": 59274907511040.0, + "grad_norm": 0.806394618414399, + "language_loss": 0.56494671, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58506423, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06933594, + "step": 10401, + "time_per_iteration": 3.077420473098755 + }, + { + "auxiliary_loss_clip": 0.01055005, + "auxiliary_loss_mlp": 0.01042464, + "balance_loss_clip": 1.01913452, + "balance_loss_mlp": 1.01790309, + "epoch": 0.625402074252217, + "flos": 20156056519680.0, + "grad_norm": 2.202435230977698, + "language_loss": 0.84251082, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.86348546, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 10402, + "time_per_iteration": 2.37788987159729 + }, + { + "auxiliary_loss_clip": 0.0105205, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.01875329, + "balance_loss_mlp": 1.01595449, + "epoch": 0.625462197504885, + "flos": 29131248752640.0, + "grad_norm": 1.9496345276339333, + "language_loss": 0.70224255, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.72318137, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 10403, + "time_per_iteration": 2.4419331550598145 + }, + { + "auxiliary_loss_clip": 0.01054097, + "auxiliary_loss_mlp": 0.01041003, + "balance_loss_clip": 1.01657724, + "balance_loss_mlp": 1.01648772, + "epoch": 0.625522320757553, + "flos": 26103231601920.0, + "grad_norm": 1.5468519915695669, + "language_loss": 0.73480642, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.75575739, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 10404, + "time_per_iteration": 2.4396274089813232 + }, + { + "auxiliary_loss_clip": 0.01054059, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.01844847, + "balance_loss_mlp": 1.01727653, + "epoch": 0.625582444010221, + "flos": 20629932670080.0, + "grad_norm": 1.7876897860227412, + "language_loss": 0.7105279, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.73149276, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 10405, + "time_per_iteration": 2.35520601272583 + }, + { + "auxiliary_loss_clip": 0.01053794, + "auxiliary_loss_mlp": 0.0104517, + "balance_loss_clip": 1.02105427, + "balance_loss_mlp": 1.01674199, + "epoch": 0.6256425672628889, + "flos": 20520479957760.0, + "grad_norm": 1.572519998634553, + "language_loss": 0.79665995, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81764966, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37109375, + "step": 10406, + "time_per_iteration": 2.39420747756958 + }, + { + "auxiliary_loss_clip": 0.01054422, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02190351, + "balance_loss_mlp": 1.017838, + "epoch": 0.6257026905155569, + "flos": 29528036887680.0, + "grad_norm": 1.6690621680314628, + "language_loss": 0.7019639, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.72296429, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 10407, + "time_per_iteration": 2.4289591312408447 + }, + { + "auxiliary_loss_clip": 0.01051384, + "auxiliary_loss_mlp": 0.01043783, + "balance_loss_clip": 1.02201533, + "balance_loss_mlp": 1.01644075, + "epoch": 0.6257628137682248, + "flos": 24023734801920.0, + "grad_norm": 1.7165498652451934, + "language_loss": 0.86782885, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.88878053, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34960938, + "step": 10408, + "time_per_iteration": 2.4138877391815186 + }, + { + "auxiliary_loss_clip": 0.01052313, + "auxiliary_loss_mlp": 0.01042705, + "balance_loss_clip": 1.02036488, + "balance_loss_mlp": 1.01633728, + "epoch": 0.6258229370208929, + "flos": 20849885435520.0, + "grad_norm": 1.6627959219809607, + "language_loss": 0.8040418, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.82499206, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.359375, + "step": 10409, + "time_per_iteration": 2.380603313446045 + }, + { + "auxiliary_loss_clip": 0.01051628, + "auxiliary_loss_mlp": 0.01039578, + "balance_loss_clip": 1.01671338, + "balance_loss_mlp": 1.01532733, + "epoch": 0.6258830602735608, + "flos": 22230595906560.0, + "grad_norm": 2.8588586005566885, + "language_loss": 0.71231639, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.73322845, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 10410, + "time_per_iteration": 2.3872859477996826 + }, + { + "auxiliary_loss_clip": 0.01052816, + "auxiliary_loss_mlp": 0.01040115, + "balance_loss_clip": 1.01772738, + "balance_loss_mlp": 1.01723766, + "epoch": 0.6259431835262288, + "flos": 25075877667840.0, + "grad_norm": 1.761882910366046, + "language_loss": 0.68642527, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.70735455, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 10411, + "time_per_iteration": 2.4011940956115723 + }, + { + "auxiliary_loss_clip": 0.01053731, + "auxiliary_loss_mlp": 0.01043735, + "balance_loss_clip": 1.01953506, + "balance_loss_mlp": 1.01679504, + "epoch": 0.6260033067788967, + "flos": 28251158400000.0, + "grad_norm": 2.001412536550126, + "language_loss": 0.71159488, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.73256952, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 10412, + "time_per_iteration": 2.476613759994507 + }, + { + "auxiliary_loss_clip": 0.01052065, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.01745856, + "balance_loss_mlp": 1.01644361, + "epoch": 0.6260634300315647, + "flos": 23366320300800.0, + "grad_norm": 8.203350369684953, + "language_loss": 0.70451885, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.7254346, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 10413, + "time_per_iteration": 2.3794362545013428 + }, + { + "auxiliary_loss_clip": 0.01055928, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.01793635, + "balance_loss_mlp": 1.01699185, + "epoch": 0.6261235532842326, + "flos": 18034489664640.0, + "grad_norm": 2.8099916866485297, + "language_loss": 0.8126049, + "learning_rate": 1.295526482316796e-06, + "loss": 0.83362103, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.390625, + "step": 10414, + "time_per_iteration": 2.3492469787597656 + }, + { + "auxiliary_loss_clip": 0.01056347, + "auxiliary_loss_mlp": 0.0104819, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.01862228, + "epoch": 0.6261836765369007, + "flos": 22010363850240.0, + "grad_norm": 1.776530005457135, + "language_loss": 0.75674725, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.77779263, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 10415, + "time_per_iteration": 2.4029667377471924 + }, + { + "auxiliary_loss_clip": 0.01053897, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.01462889, + "balance_loss_mlp": 1.01743698, + "epoch": 0.6262437997895686, + "flos": 24934863219840.0, + "grad_norm": 1.7724508029424213, + "language_loss": 0.75616497, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.77707803, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 10416, + "time_per_iteration": 2.4498000144958496 + }, + { + "auxiliary_loss_clip": 0.01054554, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.01210022, + "balance_loss_mlp": 1.01900053, + "epoch": 0.6263039230422366, + "flos": 31607219485440.0, + "grad_norm": 1.5276670756965094, + "language_loss": 0.85273445, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.87362218, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 10417, + "time_per_iteration": 2.4961016178131104 + }, + { + "auxiliary_loss_clip": 0.01054565, + "auxiliary_loss_mlp": 0.01038576, + "balance_loss_clip": 1.01414967, + "balance_loss_mlp": 1.01828229, + "epoch": 0.6263640462949046, + "flos": 17638504490880.0, + "grad_norm": 2.208849435474345, + "language_loss": 0.58673292, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.60766435, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 10418, + "time_per_iteration": 3.609483003616333 + }, + { + "auxiliary_loss_clip": 0.0105849, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.01485062, + "balance_loss_mlp": 1.01877141, + "epoch": 0.6264241695475725, + "flos": 19973914623360.0, + "grad_norm": 1.817707450002884, + "language_loss": 0.85063887, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.87163985, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39648438, + "step": 10419, + "time_per_iteration": 2.356804132461548 + }, + { + "auxiliary_loss_clip": 0.01054497, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.0125432, + "balance_loss_mlp": 1.01770234, + "epoch": 0.6264842928002405, + "flos": 27343102181760.0, + "grad_norm": 1.714348133702366, + "language_loss": 0.65384924, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.67475104, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 10420, + "time_per_iteration": 2.503810167312622 + }, + { + "auxiliary_loss_clip": 0.01056972, + "auxiliary_loss_mlp": 0.01041867, + "balance_loss_clip": 1.01415038, + "balance_loss_mlp": 1.01939845, + "epoch": 0.6265444160529084, + "flos": 22996311045120.0, + "grad_norm": 1.7521513655525907, + "language_loss": 0.872679, + "learning_rate": 1.292975627485741e-06, + "loss": 0.89366734, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.375, + "step": 10421, + "time_per_iteration": 2.376208543777466 + }, + { + "auxiliary_loss_clip": 0.01055824, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.01301527, + "balance_loss_mlp": 1.01883101, + "epoch": 0.6266045393055765, + "flos": 19937290740480.0, + "grad_norm": 2.349912281322267, + "language_loss": 0.80629253, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.82721072, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36914062, + "step": 10422, + "time_per_iteration": 3.7676563262939453 + }, + { + "auxiliary_loss_clip": 0.0105241, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.0080862, + "balance_loss_mlp": 1.01687336, + "epoch": 0.6266646625582444, + "flos": 24387948771840.0, + "grad_norm": 1.5751908297218915, + "language_loss": 0.75787604, + "learning_rate": 1.292247052906389e-06, + "loss": 0.77871239, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 10423, + "time_per_iteration": 3.90198016166687 + }, + { + "auxiliary_loss_clip": 0.01054665, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01020026, + "balance_loss_mlp": 1.01797962, + "epoch": 0.6267247858109124, + "flos": 14682932144640.0, + "grad_norm": 1.7947600463761983, + "language_loss": 0.78213811, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.80301505, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 10424, + "time_per_iteration": 2.3382835388183594 + }, + { + "auxiliary_loss_clip": 0.01053798, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.01543236, + "balance_loss_mlp": 1.01783872, + "epoch": 0.6267849090635803, + "flos": 24928998111360.0, + "grad_norm": 1.933031412551169, + "language_loss": 0.7017858, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.72274971, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.359375, + "step": 10425, + "time_per_iteration": 2.42175555229187 + }, + { + "auxiliary_loss_clip": 0.01051218, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.01176202, + "balance_loss_mlp": 1.01681626, + "epoch": 0.6268450323162483, + "flos": 25336678590720.0, + "grad_norm": 1.5233851794142177, + "language_loss": 0.75649893, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.77733773, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 10426, + "time_per_iteration": 2.403594732284546 + }, + { + "auxiliary_loss_clip": 0.01055293, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.01296639, + "balance_loss_mlp": 1.01839304, + "epoch": 0.6269051555689162, + "flos": 26176095342720.0, + "grad_norm": 1.545790664009311, + "language_loss": 0.81243455, + "learning_rate": 1.290790225914929e-06, + "loss": 0.83335644, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36914062, + "step": 10427, + "time_per_iteration": 2.4388580322265625 + }, + { + "auxiliary_loss_clip": 0.01057837, + "auxiliary_loss_mlp": 0.01037951, + "balance_loss_clip": 1.01329875, + "balance_loss_mlp": 1.01932955, + "epoch": 0.6269652788215843, + "flos": 18255978529920.0, + "grad_norm": 1.7991189765076165, + "language_loss": 0.70047635, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.72143424, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38476562, + "step": 10428, + "time_per_iteration": 2.325345277786255 + }, + { + "auxiliary_loss_clip": 0.01053583, + "auxiliary_loss_mlp": 0.01041867, + "balance_loss_clip": 1.01652288, + "balance_loss_mlp": 1.01727784, + "epoch": 0.6270254020742522, + "flos": 11764612085760.0, + "grad_norm": 1.7053976134569133, + "language_loss": 0.73133922, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.75229371, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 10429, + "time_per_iteration": 2.359295606613159 + }, + { + "auxiliary_loss_clip": 0.01056685, + "auxiliary_loss_mlp": 0.01043443, + "balance_loss_clip": 1.0177536, + "balance_loss_mlp": 1.01812685, + "epoch": 0.6270855253269202, + "flos": 23474551115520.0, + "grad_norm": 1.753916059204552, + "language_loss": 0.80374253, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82474387, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 10430, + "time_per_iteration": 2.3883137702941895 + }, + { + "auxiliary_loss_clip": 0.01011688, + "auxiliary_loss_mlp": 0.01002794, + "balance_loss_clip": 1.00012374, + "balance_loss_mlp": 1.00432491, + "epoch": 0.6271456485795882, + "flos": 70061219856000.0, + "grad_norm": 0.7664526577515368, + "language_loss": 0.59165394, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61179876, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.07373047, + "step": 10431, + "time_per_iteration": 3.1403627395629883 + }, + { + "auxiliary_loss_clip": 0.01011062, + "auxiliary_loss_mlp": 0.01003666, + "balance_loss_clip": 1.00101936, + "balance_loss_mlp": 1.00403404, + "epoch": 0.6272057718322561, + "flos": 65153059102080.0, + "grad_norm": 1.0806094753837518, + "language_loss": 0.63857388, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65872115, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.0703125, + "step": 10432, + "time_per_iteration": 4.477612257003784 + }, + { + "auxiliary_loss_clip": 0.01052355, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.01621819, + "balance_loss_mlp": 1.01605701, + "epoch": 0.6272658950849241, + "flos": 24388193151360.0, + "grad_norm": 1.8316199503847987, + "language_loss": 0.65702039, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67792654, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 10433, + "time_per_iteration": 2.3769307136535645 + }, + { + "auxiliary_loss_clip": 0.01056969, + "auxiliary_loss_mlp": 0.01043559, + "balance_loss_clip": 1.01603353, + "balance_loss_mlp": 1.01807606, + "epoch": 0.627326018337592, + "flos": 17965082148480.0, + "grad_norm": 1.9986950389610383, + "language_loss": 0.62956542, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.65057075, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.38867188, + "step": 10434, + "time_per_iteration": 2.3534419536590576 + }, + { + "auxiliary_loss_clip": 0.0105367, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.01909912, + "balance_loss_mlp": 1.0161953, + "epoch": 0.6273861415902601, + "flos": 20229059905920.0, + "grad_norm": 1.5515433091497373, + "language_loss": 0.85719723, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.8781631, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 10435, + "time_per_iteration": 2.3625059127807617 + }, + { + "auxiliary_loss_clip": 0.01009773, + "auxiliary_loss_mlp": 0.01006333, + "balance_loss_clip": 1.00392473, + "balance_loss_mlp": 1.00257277, + "epoch": 0.627446264842928, + "flos": 64950144946560.0, + "grad_norm": 0.740617752959928, + "language_loss": 0.61582357, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63598466, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.07226562, + "step": 10436, + "time_per_iteration": 3.043268918991089 + }, + { + "auxiliary_loss_clip": 0.01053481, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.01927924, + "balance_loss_mlp": 1.01628053, + "epoch": 0.627506388095596, + "flos": 23583200866560.0, + "grad_norm": 1.5189686527278299, + "language_loss": 0.78301316, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.80399501, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 10437, + "time_per_iteration": 2.415388822555542 + }, + { + "auxiliary_loss_clip": 0.01008386, + "auxiliary_loss_mlp": 0.01008459, + "balance_loss_clip": 1.00574076, + "balance_loss_mlp": 1.00155377, + "epoch": 0.6275665113482639, + "flos": 67580396444160.0, + "grad_norm": 0.8074312465670727, + "language_loss": 0.54463273, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56480122, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.06835938, + "step": 10438, + "time_per_iteration": 3.0252771377563477 + }, + { + "auxiliary_loss_clip": 0.01052569, + "auxiliary_loss_mlp": 0.01051344, + "balance_loss_clip": 1.02759743, + "balance_loss_mlp": 1.01583183, + "epoch": 0.6276266346009319, + "flos": 27635674308480.0, + "grad_norm": 1.7261284999480881, + "language_loss": 0.85100013, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.87203932, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 10439, + "time_per_iteration": 2.444892644882202 + }, + { + "auxiliary_loss_clip": 0.01055415, + "auxiliary_loss_mlp": 0.01048412, + "balance_loss_clip": 1.02272272, + "balance_loss_mlp": 1.01671672, + "epoch": 0.6276867578535998, + "flos": 22745075834880.0, + "grad_norm": 2.0096707658799473, + "language_loss": 0.80846441, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.8295027, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 10440, + "time_per_iteration": 2.408198595046997 + }, + { + "auxiliary_loss_clip": 0.01049708, + "auxiliary_loss_mlp": 0.01040628, + "balance_loss_clip": 1.01961112, + "balance_loss_mlp": 1.01554275, + "epoch": 0.6277468811062679, + "flos": 24643059143040.0, + "grad_norm": 1.3685712035651625, + "language_loss": 0.75532925, + "learning_rate": 1.285694725799337e-06, + "loss": 0.77623266, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34179688, + "step": 10441, + "time_per_iteration": 2.417717456817627 + }, + { + "auxiliary_loss_clip": 0.0105282, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.0141511, + "balance_loss_mlp": 1.0166347, + "epoch": 0.6278070043589358, + "flos": 19678060828800.0, + "grad_norm": 1.8209256091517885, + "language_loss": 0.72795004, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74885219, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 10442, + "time_per_iteration": 2.39794921875 + }, + { + "auxiliary_loss_clip": 0.01053879, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.01959133, + "balance_loss_mlp": 1.01737309, + "epoch": 0.6278671276116038, + "flos": 22120898814720.0, + "grad_norm": 1.43718143971416, + "language_loss": 0.72102112, + "learning_rate": 1.284967229712762e-06, + "loss": 0.74198306, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36523438, + "step": 10443, + "time_per_iteration": 2.4080426692962646 + }, + { + "auxiliary_loss_clip": 0.01054367, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_clip": 1.02305126, + "balance_loss_mlp": 1.01752281, + "epoch": 0.6279272508642717, + "flos": 23037473404800.0, + "grad_norm": 2.821861614455931, + "language_loss": 0.74392879, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.76494288, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 10444, + "time_per_iteration": 2.429173707962036 + }, + { + "auxiliary_loss_clip": 0.01054691, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.01567388, + "balance_loss_mlp": 1.01859331, + "epoch": 0.6279873741169397, + "flos": 19823194817280.0, + "grad_norm": 2.4534369808144603, + "language_loss": 0.73415929, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.75510859, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 10445, + "time_per_iteration": 2.364154100418091 + }, + { + "auxiliary_loss_clip": 0.01055024, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.0174185, + "balance_loss_mlp": 1.01835918, + "epoch": 0.6280474973696077, + "flos": 23914247178240.0, + "grad_norm": 1.4741886975908554, + "language_loss": 0.70280182, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.72377872, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 10446, + "time_per_iteration": 2.407290458679199 + }, + { + "auxiliary_loss_clip": 0.01060123, + "auxiliary_loss_mlp": 0.01043379, + "balance_loss_clip": 1.01698589, + "balance_loss_mlp": 1.01957321, + "epoch": 0.6281076206222757, + "flos": 17967002273280.0, + "grad_norm": 1.7463897778696198, + "language_loss": 0.75137603, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.77241111, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40625, + "step": 10447, + "time_per_iteration": 2.3524186611175537 + }, + { + "auxiliary_loss_clip": 0.01016698, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.03497016, + "balance_loss_mlp": 1.00966191, + "epoch": 0.6281677438749437, + "flos": 66775229602560.0, + "grad_norm": 0.7054279770741533, + "language_loss": 0.52502251, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54556543, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.0703125, + "step": 10448, + "time_per_iteration": 2.8806471824645996 + }, + { + "auxiliary_loss_clip": 0.01059647, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.02275622, + "balance_loss_mlp": 1.0213449, + "epoch": 0.6282278671276116, + "flos": 11655368841600.0, + "grad_norm": 2.830898432129441, + "language_loss": 0.92825508, + "learning_rate": 1.282785392633079e-06, + "loss": 0.94935089, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3828125, + "step": 10449, + "time_per_iteration": 2.3643429279327393 + }, + { + "auxiliary_loss_clip": 0.01057464, + "auxiliary_loss_mlp": 0.01041239, + "balance_loss_clip": 1.01781464, + "balance_loss_mlp": 1.01971829, + "epoch": 0.6282879903802796, + "flos": 42739939140480.0, + "grad_norm": 1.5561178109140146, + "language_loss": 0.61468542, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.63567245, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37890625, + "step": 10450, + "time_per_iteration": 2.5989434719085693 + }, + { + "auxiliary_loss_clip": 0.01055232, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.01080024, + "balance_loss_mlp": 1.01957202, + "epoch": 0.6283481136329475, + "flos": 20008234356480.0, + "grad_norm": 1.563206262617694, + "language_loss": 0.77598721, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79688674, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 10451, + "time_per_iteration": 2.3993911743164062 + }, + { + "auxiliary_loss_clip": 0.01056985, + "auxiliary_loss_mlp": 0.01043561, + "balance_loss_clip": 1.01617908, + "balance_loss_mlp": 1.01855242, + "epoch": 0.6284082368856155, + "flos": 21903459667200.0, + "grad_norm": 1.58192440240751, + "language_loss": 0.78396046, + "learning_rate": 1.281694841064566e-06, + "loss": 0.80496585, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38476562, + "step": 10452, + "time_per_iteration": 2.394521713256836 + }, + { + "auxiliary_loss_clip": 0.01058212, + "auxiliary_loss_mlp": 0.01041539, + "balance_loss_clip": 1.01618314, + "balance_loss_mlp": 1.02112639, + "epoch": 0.6284683601382834, + "flos": 25482999565440.0, + "grad_norm": 1.6904674355651164, + "language_loss": 0.7409513, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.76194876, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 10453, + "time_per_iteration": 2.427011251449585 + }, + { + "auxiliary_loss_clip": 0.01056691, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.01515222, + "balance_loss_mlp": 1.01933801, + "epoch": 0.6285284833909515, + "flos": 16537937702400.0, + "grad_norm": 1.786575101646722, + "language_loss": 0.82026047, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.84123552, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 10454, + "time_per_iteration": 2.4013924598693848 + }, + { + "auxiliary_loss_clip": 0.01058138, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.01300478, + "balance_loss_mlp": 1.02113366, + "epoch": 0.6285886066436194, + "flos": 22819580409600.0, + "grad_norm": 1.7827179244417677, + "language_loss": 0.83663845, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.85758567, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 10455, + "time_per_iteration": 2.4187216758728027 + }, + { + "auxiliary_loss_clip": 0.0105777, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.01327038, + "balance_loss_mlp": 1.01970339, + "epoch": 0.6286487298962874, + "flos": 24714631163520.0, + "grad_norm": 1.698544133404817, + "language_loss": 0.8316716, + "learning_rate": 1.280241153705706e-06, + "loss": 0.85263175, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 10456, + "time_per_iteration": 2.4711849689483643 + }, + { + "auxiliary_loss_clip": 0.0105985, + "auxiliary_loss_mlp": 0.01042978, + "balance_loss_clip": 1.01970792, + "balance_loss_mlp": 1.02109921, + "epoch": 0.6287088531489553, + "flos": 20739769407360.0, + "grad_norm": 1.5139654517395458, + "language_loss": 0.73406839, + "learning_rate": 1.27987780006486e-06, + "loss": 0.75509667, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.38671875, + "step": 10457, + "time_per_iteration": 2.463101863861084 + }, + { + "auxiliary_loss_clip": 0.01059125, + "auxiliary_loss_mlp": 0.01053001, + "balance_loss_clip": 1.02479625, + "balance_loss_mlp": 1.01884246, + "epoch": 0.6287689764016233, + "flos": 23069663544960.0, + "grad_norm": 1.648426880299542, + "language_loss": 0.82034886, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.84147012, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40234375, + "step": 10458, + "time_per_iteration": 3.6649014949798584 + }, + { + "auxiliary_loss_clip": 0.01058686, + "auxiliary_loss_mlp": 0.01055018, + "balance_loss_clip": 1.02739704, + "balance_loss_mlp": 1.02063644, + "epoch": 0.6288290996542913, + "flos": 32232304200960.0, + "grad_norm": 2.5718742965523185, + "language_loss": 0.62158442, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.64272147, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38085938, + "step": 10459, + "time_per_iteration": 2.4933125972747803 + }, + { + "auxiliary_loss_clip": 0.01057026, + "auxiliary_loss_mlp": 0.01048576, + "balance_loss_clip": 1.02435231, + "balance_loss_mlp": 1.01956487, + "epoch": 0.6288892229069593, + "flos": 24640266234240.0, + "grad_norm": 1.6340825148321925, + "language_loss": 0.79700089, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81805688, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 10460, + "time_per_iteration": 2.4513556957244873 + }, + { + "auxiliary_loss_clip": 0.01055886, + "auxiliary_loss_mlp": 0.01041298, + "balance_loss_clip": 1.01700282, + "balance_loss_mlp": 1.01850057, + "epoch": 0.6289493461596273, + "flos": 17857375004160.0, + "grad_norm": 1.7633119819204586, + "language_loss": 0.74899995, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76997173, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 10461, + "time_per_iteration": 2.3525476455688477 + }, + { + "auxiliary_loss_clip": 0.01053891, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_clip": 1.02108824, + "balance_loss_mlp": 1.01775479, + "epoch": 0.6290094694122952, + "flos": 22344307804800.0, + "grad_norm": 2.344253162065434, + "language_loss": 0.7150377, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.73603696, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36132812, + "step": 10462, + "time_per_iteration": 3.876101016998291 + }, + { + "auxiliary_loss_clip": 0.01052281, + "auxiliary_loss_mlp": 0.01039869, + "balance_loss_clip": 1.01929379, + "balance_loss_mlp": 1.01832938, + "epoch": 0.6290695926649632, + "flos": 28401179978880.0, + "grad_norm": 2.795749666282322, + "language_loss": 0.73279643, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.7537179, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 10463, + "time_per_iteration": 3.9579579830169678 + }, + { + "auxiliary_loss_clip": 0.01053142, + "auxiliary_loss_mlp": 0.01048715, + "balance_loss_clip": 1.02496898, + "balance_loss_mlp": 1.0180912, + "epoch": 0.6291297159176311, + "flos": 21504437205120.0, + "grad_norm": 1.6134864650126743, + "language_loss": 0.73152447, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.75254303, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34960938, + "step": 10464, + "time_per_iteration": 2.425267219543457 + }, + { + "auxiliary_loss_clip": 0.01053794, + "auxiliary_loss_mlp": 0.01048925, + "balance_loss_clip": 1.02539313, + "balance_loss_mlp": 1.01770878, + "epoch": 0.6291898391702991, + "flos": 12202492757760.0, + "grad_norm": 1.6915103754745773, + "language_loss": 0.69903266, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.72005981, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36132812, + "step": 10465, + "time_per_iteration": 2.4495770931243896 + }, + { + "auxiliary_loss_clip": 0.01011358, + "auxiliary_loss_mlp": 0.01008228, + "balance_loss_clip": 1.0056051, + "balance_loss_mlp": 1.00404191, + "epoch": 0.629249962422967, + "flos": 69296168033280.0, + "grad_norm": 0.6986120146895876, + "language_loss": 0.59844089, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61863685, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.07324219, + "step": 10466, + "time_per_iteration": 3.146167039871216 + }, + { + "auxiliary_loss_clip": 0.01051672, + "auxiliary_loss_mlp": 0.01046333, + "balance_loss_clip": 1.02226484, + "balance_loss_mlp": 1.01566672, + "epoch": 0.6293100856756351, + "flos": 40076310516480.0, + "grad_norm": 2.394223377683595, + "language_loss": 0.66434979, + "learning_rate": 1.276245767820154e-06, + "loss": 0.68532991, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 10467, + "time_per_iteration": 2.5486841201782227 + }, + { + "auxiliary_loss_clip": 0.01010641, + "auxiliary_loss_mlp": 0.01013611, + "balance_loss_clip": 1.01043987, + "balance_loss_mlp": 1.00329638, + "epoch": 0.629370208928303, + "flos": 67498141547520.0, + "grad_norm": 0.8048984627070126, + "language_loss": 0.57073814, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.59098065, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.03173828, + "router_z_loss_mlp": 0.07324219, + "step": 10468, + "time_per_iteration": 2.8294243812561035 + }, + { + "auxiliary_loss_clip": 0.01009079, + "auxiliary_loss_mlp": 0.01015092, + "balance_loss_clip": 1.01246977, + "balance_loss_mlp": 1.00193238, + "epoch": 0.629430332180971, + "flos": 60657154081920.0, + "grad_norm": 0.739913754539508, + "language_loss": 0.58109212, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.60133386, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.07128906, + "step": 10469, + "time_per_iteration": 3.0360817909240723 + }, + { + "auxiliary_loss_clip": 0.01009871, + "auxiliary_loss_mlp": 0.01007228, + "balance_loss_clip": 1.00451005, + "balance_loss_mlp": 1.00264573, + "epoch": 0.6294904554336389, + "flos": 66866107605120.0, + "grad_norm": 0.703491905310845, + "language_loss": 0.52211142, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54228246, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.07226562, + "step": 10470, + "time_per_iteration": 3.0835890769958496 + }, + { + "auxiliary_loss_clip": 0.01053895, + "auxiliary_loss_mlp": 0.0104868, + "balance_loss_clip": 1.02375364, + "balance_loss_mlp": 1.01759386, + "epoch": 0.6295505786863069, + "flos": 42521138449920.0, + "grad_norm": 1.6144861135913025, + "language_loss": 0.75775194, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.77877772, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36328125, + "step": 10471, + "time_per_iteration": 3.938607931137085 + }, + { + "auxiliary_loss_clip": 0.01056017, + "auxiliary_loss_mlp": 0.0104546, + "balance_loss_clip": 1.02102208, + "balance_loss_mlp": 1.01863527, + "epoch": 0.629610701938975, + "flos": 17383184651520.0, + "grad_norm": 1.9395791442557382, + "language_loss": 0.64412063, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.66513538, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 10472, + "time_per_iteration": 2.390970468521118 + }, + { + "auxiliary_loss_clip": 0.01057684, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_clip": 1.02546144, + "balance_loss_mlp": 1.01915741, + "epoch": 0.6296708251916429, + "flos": 24241802353920.0, + "grad_norm": 1.7524912168304718, + "language_loss": 0.70282477, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.72391397, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38476562, + "step": 10473, + "time_per_iteration": 2.48785662651062 + }, + { + "auxiliary_loss_clip": 0.01055035, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.0184586, + "balance_loss_mlp": 1.01867068, + "epoch": 0.6297309484443109, + "flos": 19277607000960.0, + "grad_norm": 1.821317644657953, + "language_loss": 0.75403148, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.77500176, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 10474, + "time_per_iteration": 2.452282190322876 + }, + { + "auxiliary_loss_clip": 0.01058243, + "auxiliary_loss_mlp": 0.01046318, + "balance_loss_clip": 1.02229714, + "balance_loss_mlp": 1.02045536, + "epoch": 0.6297910716969788, + "flos": 30661422220800.0, + "grad_norm": 1.5237340592522297, + "language_loss": 0.67511237, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.69615805, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37695312, + "step": 10475, + "time_per_iteration": 2.4870405197143555 + }, + { + "auxiliary_loss_clip": 0.01054688, + "auxiliary_loss_mlp": 0.01039442, + "balance_loss_clip": 1.01748407, + "balance_loss_mlp": 1.01932383, + "epoch": 0.6298511949496468, + "flos": 14422305778560.0, + "grad_norm": 1.9113048626642806, + "language_loss": 0.91791385, + "learning_rate": 1.272979284940101e-06, + "loss": 0.93885517, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 10476, + "time_per_iteration": 2.3698110580444336 + }, + { + "auxiliary_loss_clip": 0.01057304, + "auxiliary_loss_mlp": 0.01039764, + "balance_loss_clip": 1.01746058, + "balance_loss_mlp": 1.02109349, + "epoch": 0.6299113182023147, + "flos": 23513025300480.0, + "grad_norm": 1.6251481137368493, + "language_loss": 0.76254541, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.78351605, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36328125, + "step": 10477, + "time_per_iteration": 2.4155051708221436 + }, + { + "auxiliary_loss_clip": 0.01058476, + "auxiliary_loss_mlp": 0.01044838, + "balance_loss_clip": 1.01920772, + "balance_loss_mlp": 1.02040648, + "epoch": 0.6299714414549827, + "flos": 22673399080320.0, + "grad_norm": 1.6076283558823985, + "language_loss": 0.71111661, + "learning_rate": 1.272253702758138e-06, + "loss": 0.73214972, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 10478, + "time_per_iteration": 2.4416775703430176 + }, + { + "auxiliary_loss_clip": 0.01063007, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.01561499, + "balance_loss_mlp": 1.02336788, + "epoch": 0.6300315647076506, + "flos": 14500860071040.0, + "grad_norm": 2.1915359402828005, + "language_loss": 0.68881106, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.70985973, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 10479, + "time_per_iteration": 2.368682861328125 + }, + { + "auxiliary_loss_clip": 0.01060045, + "auxiliary_loss_mlp": 0.01045256, + "balance_loss_clip": 1.02086568, + "balance_loss_mlp": 1.02331889, + "epoch": 0.6300916879603187, + "flos": 21870606211200.0, + "grad_norm": 2.0207525384906595, + "language_loss": 0.74573326, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.76678622, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 10480, + "time_per_iteration": 2.4304986000061035 + }, + { + "auxiliary_loss_clip": 0.01060621, + "auxiliary_loss_mlp": 0.01044569, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.02192116, + "epoch": 0.6301518112129866, + "flos": 21833004810240.0, + "grad_norm": 2.2120851552590852, + "language_loss": 0.79497635, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.8160283, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 10481, + "time_per_iteration": 2.4402544498443604 + }, + { + "auxiliary_loss_clip": 0.01022973, + "auxiliary_loss_mlp": 0.01015026, + "balance_loss_clip": 1.01216543, + "balance_loss_mlp": 1.01493406, + "epoch": 0.6302119344656546, + "flos": 44331872670720.0, + "grad_norm": 0.9083362502747326, + "language_loss": 0.61883426, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63921422, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.08007812, + "step": 10482, + "time_per_iteration": 2.7955269813537598 + }, + { + "auxiliary_loss_clip": 0.01064539, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.01773417, + "balance_loss_mlp": 1.02337503, + "epoch": 0.6302720577183225, + "flos": 11217139056000.0, + "grad_norm": 1.9172514035561095, + "language_loss": 0.83510238, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85620379, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.41210938, + "step": 10483, + "time_per_iteration": 2.365884780883789 + }, + { + "auxiliary_loss_clip": 0.01058403, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.01899576, + "balance_loss_mlp": 1.02251577, + "epoch": 0.6303321809709905, + "flos": 27963683331840.0, + "grad_norm": 1.578495553820476, + "language_loss": 0.73583865, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75684506, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 10484, + "time_per_iteration": 2.4635939598083496 + }, + { + "auxiliary_loss_clip": 0.01059049, + "auxiliary_loss_mlp": 0.01041296, + "balance_loss_clip": 1.01743042, + "balance_loss_mlp": 1.02204704, + "epoch": 0.6303923042236586, + "flos": 28219491930240.0, + "grad_norm": 1.9394742949490609, + "language_loss": 0.75932324, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.78032672, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 10485, + "time_per_iteration": 2.4925811290740967 + }, + { + "auxiliary_loss_clip": 0.01060747, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.01590765, + "balance_loss_mlp": 1.02097344, + "epoch": 0.6304524274763265, + "flos": 27629948845440.0, + "grad_norm": 2.2885531716100216, + "language_loss": 0.83050287, + "learning_rate": 1.269352478979093e-06, + "loss": 0.85152119, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3984375, + "step": 10486, + "time_per_iteration": 2.469379425048828 + }, + { + "auxiliary_loss_clip": 0.01060114, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_clip": 1.02225661, + "balance_loss_mlp": 1.02235532, + "epoch": 0.6305125507289945, + "flos": 17310355822080.0, + "grad_norm": 1.6577055337564814, + "language_loss": 0.64971542, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.67078483, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 10487, + "time_per_iteration": 2.38136887550354 + }, + { + "auxiliary_loss_clip": 0.01058696, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02542114, + "balance_loss_mlp": 1.02231264, + "epoch": 0.6305726739816624, + "flos": 25807203250560.0, + "grad_norm": 1.5249336694278446, + "language_loss": 0.68031275, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.70139509, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 10488, + "time_per_iteration": 2.484880208969116 + }, + { + "auxiliary_loss_clip": 0.01060665, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_clip": 1.01823139, + "balance_loss_mlp": 1.02157807, + "epoch": 0.6306327972343304, + "flos": 21796415838720.0, + "grad_norm": 1.7640174873254941, + "language_loss": 0.68793708, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.70898521, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 10489, + "time_per_iteration": 2.394228935241699 + }, + { + "auxiliary_loss_clip": 0.01064499, + "auxiliary_loss_mlp": 0.01049731, + "balance_loss_clip": 1.02035785, + "balance_loss_mlp": 1.02262259, + "epoch": 0.6306929204869983, + "flos": 20776323467520.0, + "grad_norm": 1.6846791024082155, + "language_loss": 0.70727706, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.72841936, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.41796875, + "step": 10490, + "time_per_iteration": 2.4332456588745117 + }, + { + "auxiliary_loss_clip": 0.01060073, + "auxiliary_loss_mlp": 0.01041951, + "balance_loss_clip": 1.01685739, + "balance_loss_mlp": 1.02151918, + "epoch": 0.6307530437396663, + "flos": 23653236787200.0, + "grad_norm": 2.3280290715327485, + "language_loss": 0.79564822, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.81666839, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 10491, + "time_per_iteration": 2.390270948410034 + }, + { + "auxiliary_loss_clip": 0.01058649, + "auxiliary_loss_mlp": 0.01045979, + "balance_loss_clip": 1.02148187, + "balance_loss_mlp": 1.02146268, + "epoch": 0.6308131669923343, + "flos": 24717808097280.0, + "grad_norm": 1.9004198919657853, + "language_loss": 0.5614301, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58247638, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37304688, + "step": 10492, + "time_per_iteration": 2.4452359676361084 + }, + { + "auxiliary_loss_clip": 0.01058756, + "auxiliary_loss_mlp": 0.01053787, + "balance_loss_clip": 1.02782369, + "balance_loss_mlp": 1.01990211, + "epoch": 0.6308732902450023, + "flos": 22564295481600.0, + "grad_norm": 1.8457126176544847, + "language_loss": 0.65630382, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.67742926, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38867188, + "step": 10493, + "time_per_iteration": 2.409545660018921 + }, + { + "auxiliary_loss_clip": 0.01055988, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02021205, + "balance_loss_mlp": 1.01877761, + "epoch": 0.6309334134976702, + "flos": 24643059143040.0, + "grad_norm": 1.3559619580988254, + "language_loss": 0.83363169, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.85463047, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 10494, + "time_per_iteration": 2.5296878814697266 + }, + { + "auxiliary_loss_clip": 0.01057304, + "auxiliary_loss_mlp": 0.01043621, + "balance_loss_clip": 1.01884985, + "balance_loss_mlp": 1.01968336, + "epoch": 0.6309935367503382, + "flos": 41426332035840.0, + "grad_norm": 2.005156788594954, + "language_loss": 0.80845928, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.82946861, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 10495, + "time_per_iteration": 2.5734477043151855 + }, + { + "auxiliary_loss_clip": 0.01057239, + "auxiliary_loss_mlp": 0.01046128, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.01974177, + "epoch": 0.6310536600030061, + "flos": 15118124641920.0, + "grad_norm": 2.108429437271186, + "language_loss": 0.71154284, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.73257649, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 10496, + "time_per_iteration": 2.4166693687438965 + }, + { + "auxiliary_loss_clip": 0.01057818, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.01675463, + "balance_loss_mlp": 1.01867187, + "epoch": 0.6311137832556741, + "flos": 15230719376640.0, + "grad_norm": 2.7306489989212883, + "language_loss": 0.81760329, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.83861566, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 10497, + "time_per_iteration": 2.3821651935577393 + }, + { + "auxiliary_loss_clip": 0.01053585, + "auxiliary_loss_mlp": 0.01045334, + "balance_loss_clip": 1.02223158, + "balance_loss_mlp": 1.01746309, + "epoch": 0.6311739065083422, + "flos": 22017555590400.0, + "grad_norm": 2.474269916083689, + "language_loss": 0.7512368, + "learning_rate": 1.265003970256247e-06, + "loss": 0.77222598, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 10498, + "time_per_iteration": 3.779094696044922 + }, + { + "auxiliary_loss_clip": 0.01057011, + "auxiliary_loss_mlp": 0.01044267, + "balance_loss_clip": 1.01905417, + "balance_loss_mlp": 1.01861262, + "epoch": 0.6312340297610101, + "flos": 22709673849600.0, + "grad_norm": 1.9519345462550528, + "language_loss": 0.7090286, + "learning_rate": 1.264641775364217e-06, + "loss": 0.73004138, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 10499, + "time_per_iteration": 2.393601179122925 + }, + { + "auxiliary_loss_clip": 0.01054701, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_clip": 1.01810098, + "balance_loss_mlp": 1.01833737, + "epoch": 0.6312941530136781, + "flos": 24278949907200.0, + "grad_norm": 1.8352079553884406, + "language_loss": 0.70537311, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72635198, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 10500, + "time_per_iteration": 2.430647850036621 + }, + { + "auxiliary_loss_clip": 0.01056102, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.01864147, + "epoch": 0.631354276266346, + "flos": 21724878729600.0, + "grad_norm": 1.8208991759970066, + "language_loss": 0.75078785, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.77180976, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 10501, + "time_per_iteration": 2.4120659828186035 + }, + { + "auxiliary_loss_clip": 0.01056364, + "auxiliary_loss_mlp": 0.01047465, + "balance_loss_clip": 1.01985645, + "balance_loss_mlp": 1.01923478, + "epoch": 0.631414399519014, + "flos": 24023944270080.0, + "grad_norm": 1.6894652518685347, + "language_loss": 0.76563752, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.78667581, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.37109375, + "step": 10502, + "time_per_iteration": 5.311256170272827 + }, + { + "auxiliary_loss_clip": 0.01059547, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.02037716, + "balance_loss_mlp": 1.02025938, + "epoch": 0.6314745227716819, + "flos": 24314666094720.0, + "grad_norm": 2.041255253660512, + "language_loss": 0.85934424, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.88040662, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 10503, + "time_per_iteration": 2.3994874954223633 + }, + { + "auxiliary_loss_clip": 0.0105555, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.01807547, + "balance_loss_mlp": 1.01736021, + "epoch": 0.6315346460243499, + "flos": 23365307871360.0, + "grad_norm": 1.6106642728098897, + "language_loss": 0.87061858, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.89161056, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 10504, + "time_per_iteration": 2.38594651222229 + }, + { + "auxiliary_loss_clip": 0.01058709, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.01557517, + "balance_loss_mlp": 1.01834202, + "epoch": 0.6315947692770179, + "flos": 20259469566720.0, + "grad_norm": 1.615880366153173, + "language_loss": 0.78233671, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.80337065, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.40234375, + "step": 10505, + "time_per_iteration": 2.3879497051239014 + }, + { + "auxiliary_loss_clip": 0.0105633, + "auxiliary_loss_mlp": 0.01050335, + "balance_loss_clip": 1.02280998, + "balance_loss_mlp": 1.01741135, + "epoch": 0.6316548925296859, + "flos": 25264652722560.0, + "grad_norm": 2.3236020822658956, + "language_loss": 0.82993865, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.85100526, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.38867188, + "step": 10506, + "time_per_iteration": 2.409675359725952 + }, + { + "auxiliary_loss_clip": 0.01056936, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.01635504, + "balance_loss_mlp": 1.01864076, + "epoch": 0.6317150157823538, + "flos": 22929452058240.0, + "grad_norm": 2.412397253348964, + "language_loss": 0.7513355, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.77232921, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10507, + "time_per_iteration": 2.4161272048950195 + }, + { + "auxiliary_loss_clip": 0.01058172, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.01472557, + "balance_loss_mlp": 1.01814461, + "epoch": 0.6317751390350218, + "flos": 22525995853440.0, + "grad_norm": 1.8161682835056638, + "language_loss": 0.69776565, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.71876824, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40039062, + "step": 10508, + "time_per_iteration": 2.3874475955963135 + }, + { + "auxiliary_loss_clip": 0.01055787, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.02105618, + "balance_loss_mlp": 1.01854467, + "epoch": 0.6318352622876897, + "flos": 23293631116800.0, + "grad_norm": 1.5799018146157635, + "language_loss": 0.7185716, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73961139, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.37109375, + "step": 10509, + "time_per_iteration": 2.4333319664001465 + }, + { + "auxiliary_loss_clip": 0.01054142, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.01490426, + "balance_loss_mlp": 1.01754665, + "epoch": 0.6318953855403577, + "flos": 20703040790400.0, + "grad_norm": 1.5790182368749008, + "language_loss": 0.80601323, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.82694173, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 10510, + "time_per_iteration": 2.384685754776001 + }, + { + "auxiliary_loss_clip": 0.01056788, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.01217175, + "balance_loss_mlp": 1.01882052, + "epoch": 0.6319555087930258, + "flos": 22818952005120.0, + "grad_norm": 1.5698929414431608, + "language_loss": 0.71819878, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.73914075, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 10511, + "time_per_iteration": 2.422070264816284 + }, + { + "auxiliary_loss_clip": 0.01053804, + "auxiliary_loss_mlp": 0.01039673, + "balance_loss_clip": 1.01742887, + "balance_loss_mlp": 1.01742816, + "epoch": 0.6320156320456937, + "flos": 19970004551040.0, + "grad_norm": 1.673445089512672, + "language_loss": 0.81074166, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.83167636, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 10512, + "time_per_iteration": 3.8560445308685303 + }, + { + "auxiliary_loss_clip": 0.01057516, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.0146277, + "balance_loss_mlp": 1.01923585, + "epoch": 0.6320757552983617, + "flos": 27012265338240.0, + "grad_norm": 1.838095516761923, + "language_loss": 0.71805704, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.73903632, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10513, + "time_per_iteration": 2.4122586250305176 + }, + { + "auxiliary_loss_clip": 0.01057121, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.0169121, + "balance_loss_mlp": 1.01879454, + "epoch": 0.6321358785510296, + "flos": 23694818083200.0, + "grad_norm": 1.8492465303466612, + "language_loss": 0.67334378, + "learning_rate": 1.259212205855459e-06, + "loss": 0.69434023, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 10514, + "time_per_iteration": 2.4429259300231934 + }, + { + "auxiliary_loss_clip": 0.01053115, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.01455641, + "balance_loss_mlp": 1.01700401, + "epoch": 0.6321960018036976, + "flos": 25994023269120.0, + "grad_norm": 1.8260708743162748, + "language_loss": 0.75634342, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.77725118, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 10515, + "time_per_iteration": 2.425358772277832 + }, + { + "auxiliary_loss_clip": 0.01052703, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.01272964, + "balance_loss_mlp": 1.01816416, + "epoch": 0.6322561250563655, + "flos": 22819894611840.0, + "grad_norm": 1.699331933619142, + "language_loss": 0.90944993, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.93032342, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 10516, + "time_per_iteration": 2.4276981353759766 + }, + { + "auxiliary_loss_clip": 0.01060821, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.01465952, + "balance_loss_mlp": 1.02043033, + "epoch": 0.6323162483090335, + "flos": 18987443758080.0, + "grad_norm": 1.6227092091999098, + "language_loss": 0.82418001, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84522951, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40429688, + "step": 10517, + "time_per_iteration": 2.3748738765716553 + }, + { + "auxiliary_loss_clip": 0.01055514, + "auxiliary_loss_mlp": 0.0104442, + "balance_loss_clip": 1.01929045, + "balance_loss_mlp": 1.01836443, + "epoch": 0.6323763715617015, + "flos": 19864147708800.0, + "grad_norm": 1.7659136192592444, + "language_loss": 0.79007316, + "learning_rate": 1.257765386189541e-06, + "loss": 0.81107253, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 10518, + "time_per_iteration": 2.3990297317504883 + }, + { + "auxiliary_loss_clip": 0.01053913, + "auxiliary_loss_mlp": 0.01038315, + "balance_loss_clip": 1.01553392, + "balance_loss_mlp": 1.01778698, + "epoch": 0.6324364948143695, + "flos": 22781629895040.0, + "grad_norm": 1.4631114007329078, + "language_loss": 0.86031818, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.88124049, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 10519, + "time_per_iteration": 2.419728994369507 + }, + { + "auxiliary_loss_clip": 0.01053135, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.01419175, + "balance_loss_mlp": 1.0177995, + "epoch": 0.6324966180670374, + "flos": 22234855092480.0, + "grad_norm": 1.5424131527180835, + "language_loss": 0.72885877, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74975419, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 10520, + "time_per_iteration": 2.405036211013794 + }, + { + "auxiliary_loss_clip": 0.01053524, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.01419342, + "balance_loss_mlp": 1.01764333, + "epoch": 0.6325567413197054, + "flos": 21688115201280.0, + "grad_norm": 2.3323249514853726, + "language_loss": 0.73421365, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.75511611, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.359375, + "step": 10521, + "time_per_iteration": 2.5349183082580566 + }, + { + "auxiliary_loss_clip": 0.01058426, + "auxiliary_loss_mlp": 0.01045504, + "balance_loss_clip": 1.01975441, + "balance_loss_mlp": 1.01986086, + "epoch": 0.6326168645723733, + "flos": 19936138665600.0, + "grad_norm": 1.762614424086139, + "language_loss": 0.73205447, + "learning_rate": 1.256319016853377e-06, + "loss": 0.75309378, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38476562, + "step": 10522, + "time_per_iteration": 2.481027603149414 + }, + { + "auxiliary_loss_clip": 0.01056966, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.01461697, + "balance_loss_mlp": 1.01907182, + "epoch": 0.6326769878250413, + "flos": 20229304285440.0, + "grad_norm": 2.1729622907488473, + "language_loss": 0.82683325, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.84778881, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 10523, + "time_per_iteration": 2.472660541534424 + }, + { + "auxiliary_loss_clip": 0.01055055, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.01366544, + "balance_loss_mlp": 1.01804137, + "epoch": 0.6327371110777094, + "flos": 20774752456320.0, + "grad_norm": 2.0092956726918474, + "language_loss": 0.74123406, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76215649, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 10524, + "time_per_iteration": 2.3481218814849854 + }, + { + "auxiliary_loss_clip": 0.01060062, + "auxiliary_loss_mlp": 0.01043392, + "balance_loss_clip": 1.01628327, + "balance_loss_mlp": 1.01943243, + "epoch": 0.6327972343303773, + "flos": 30335228588160.0, + "grad_norm": 3.872404379504013, + "language_loss": 0.85970545, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.88074005, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.40625, + "step": 10525, + "time_per_iteration": 2.4433298110961914 + }, + { + "auxiliary_loss_clip": 0.01054021, + "auxiliary_loss_mlp": 0.01037042, + "balance_loss_clip": 1.01324844, + "balance_loss_mlp": 1.01735353, + "epoch": 0.6328573575830453, + "flos": 17091310752000.0, + "grad_norm": 1.7460469272368473, + "language_loss": 0.67551553, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.69642621, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 10526, + "time_per_iteration": 2.3412821292877197 + }, + { + "auxiliary_loss_clip": 0.01058969, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.01633048, + "balance_loss_mlp": 1.02014089, + "epoch": 0.6329174808357132, + "flos": 25045956766080.0, + "grad_norm": 1.472329205523203, + "language_loss": 0.74434125, + "learning_rate": 1.254511689796244e-06, + "loss": 0.76535547, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 10527, + "time_per_iteration": 2.4166500568389893 + }, + { + "auxiliary_loss_clip": 0.01055915, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.01190233, + "balance_loss_mlp": 1.01911139, + "epoch": 0.6329776040883812, + "flos": 16835886178560.0, + "grad_norm": 2.5911715549930885, + "language_loss": 0.73128641, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.75219512, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 10528, + "time_per_iteration": 2.3505101203918457 + }, + { + "auxiliary_loss_clip": 0.01054705, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.01649249, + "balance_loss_mlp": 1.01801729, + "epoch": 0.6330377273410491, + "flos": 13515855482880.0, + "grad_norm": 2.2639938485597595, + "language_loss": 0.68142581, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.70238769, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 10529, + "time_per_iteration": 2.352111339569092 + }, + { + "auxiliary_loss_clip": 0.01055881, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.01425481, + "balance_loss_mlp": 1.01942825, + "epoch": 0.6330978505937171, + "flos": 21537884154240.0, + "grad_norm": 1.8514395983552554, + "language_loss": 0.76685131, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.78779423, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 10530, + "time_per_iteration": 2.375882625579834 + }, + { + "auxiliary_loss_clip": 0.01056578, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.01429486, + "balance_loss_mlp": 1.01985288, + "epoch": 0.6331579738463851, + "flos": 25008320453760.0, + "grad_norm": 1.5418929883501036, + "language_loss": 0.74405503, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.76499224, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 10531, + "time_per_iteration": 2.401207447052002 + }, + { + "auxiliary_loss_clip": 0.01052821, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.01143754, + "balance_loss_mlp": 1.01750541, + "epoch": 0.6332180970990531, + "flos": 14975120246400.0, + "grad_norm": 2.171939464199027, + "language_loss": 0.81234908, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.83321834, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 10532, + "time_per_iteration": 2.3500888347625732 + }, + { + "auxiliary_loss_clip": 0.01053006, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.01456892, + "balance_loss_mlp": 1.01743817, + "epoch": 0.633278220351721, + "flos": 22705973245440.0, + "grad_norm": 1.5595244319799706, + "language_loss": 0.7590155, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.7798996, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.35546875, + "step": 10533, + "time_per_iteration": 2.4066314697265625 + }, + { + "auxiliary_loss_clip": 0.01056769, + "auxiliary_loss_mlp": 0.01044771, + "balance_loss_clip": 1.01717353, + "balance_loss_mlp": 1.01757097, + "epoch": 0.633338343604389, + "flos": 12602143624320.0, + "grad_norm": 2.585295148606703, + "language_loss": 0.7909925, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.8120079, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.39257812, + "step": 10534, + "time_per_iteration": 2.322382926940918 + }, + { + "auxiliary_loss_clip": 0.01053699, + "auxiliary_loss_mlp": 0.01036457, + "balance_loss_clip": 1.01347387, + "balance_loss_mlp": 1.01742733, + "epoch": 0.6333984668570569, + "flos": 25958865663360.0, + "grad_norm": 3.0806473951071687, + "language_loss": 0.86417824, + "learning_rate": 1.251621437204777e-06, + "loss": 0.88507974, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 10535, + "time_per_iteration": 2.4390971660614014 + }, + { + "auxiliary_loss_clip": 0.01054881, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_clip": 1.01284266, + "balance_loss_mlp": 1.01773, + "epoch": 0.6334585901097249, + "flos": 23658124377600.0, + "grad_norm": 1.6944717437555805, + "language_loss": 0.77515024, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.79607695, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 10536, + "time_per_iteration": 2.372629404067993 + }, + { + "auxiliary_loss_clip": 0.01054679, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.01281655, + "balance_loss_mlp": 1.01864171, + "epoch": 0.633518713362393, + "flos": 28759424106240.0, + "grad_norm": 1.5594888429898282, + "language_loss": 0.60995543, + "learning_rate": 1.250899157568855e-06, + "loss": 0.63085318, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 10537, + "time_per_iteration": 3.739100933074951 + }, + { + "auxiliary_loss_clip": 0.01009515, + "auxiliary_loss_mlp": 0.01009023, + "balance_loss_clip": 1.00616169, + "balance_loss_mlp": 1.00238681, + "epoch": 0.6335788366150609, + "flos": 70417264567680.0, + "grad_norm": 0.7816503288013247, + "language_loss": 0.52549839, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54568374, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.07128906, + "step": 10538, + "time_per_iteration": 3.1288058757781982 + }, + { + "auxiliary_loss_clip": 0.01057219, + "auxiliary_loss_mlp": 0.01039964, + "balance_loss_clip": 1.01278448, + "balance_loss_mlp": 1.01803088, + "epoch": 0.6336389598677289, + "flos": 23730953207040.0, + "grad_norm": 1.7738693655818196, + "language_loss": 0.8453328, + "learning_rate": 1.250176991556848e-06, + "loss": 0.86630464, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39257812, + "step": 10539, + "time_per_iteration": 2.3866984844207764 + }, + { + "auxiliary_loss_clip": 0.01054823, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.01229489, + "balance_loss_mlp": 1.01653445, + "epoch": 0.6336990831203968, + "flos": 29275440134400.0, + "grad_norm": 1.8011083627853715, + "language_loss": 0.87470877, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.89562982, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 10540, + "time_per_iteration": 3.7856457233428955 + }, + { + "auxiliary_loss_clip": 0.01050724, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.0129776, + "balance_loss_mlp": 1.01639295, + "epoch": 0.6337592063730648, + "flos": 29095532565120.0, + "grad_norm": 1.6224240515001154, + "language_loss": 0.73206109, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.75290871, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34375, + "step": 10541, + "time_per_iteration": 2.4198968410491943 + }, + { + "auxiliary_loss_clip": 0.01056912, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_clip": 1.01642537, + "balance_loss_mlp": 1.01822734, + "epoch": 0.6338193296257327, + "flos": 34705272556800.0, + "grad_norm": 2.4875098565167955, + "language_loss": 0.86519837, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.88619268, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 10542, + "time_per_iteration": 3.8619327545166016 + }, + { + "auxiliary_loss_clip": 0.01053973, + "auxiliary_loss_mlp": 0.01040241, + "balance_loss_clip": 1.01461148, + "balance_loss_mlp": 1.01779199, + "epoch": 0.6338794528784008, + "flos": 16686737383680.0, + "grad_norm": 1.617053777984366, + "language_loss": 0.78768694, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80862904, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36132812, + "step": 10543, + "time_per_iteration": 2.367506742477417 + }, + { + "auxiliary_loss_clip": 0.01049853, + "auxiliary_loss_mlp": 0.01038118, + "balance_loss_clip": 1.01677966, + "balance_loss_mlp": 1.01523864, + "epoch": 0.6339395761310687, + "flos": 22345494791040.0, + "grad_norm": 1.7817373425375305, + "language_loss": 0.74420047, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.76508021, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34570312, + "step": 10544, + "time_per_iteration": 2.381906747817993 + }, + { + "auxiliary_loss_clip": 0.01057382, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.01556277, + "balance_loss_mlp": 1.01897502, + "epoch": 0.6339996993837367, + "flos": 18550819895040.0, + "grad_norm": 9.688946921100003, + "language_loss": 0.70072532, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.72169542, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 10545, + "time_per_iteration": 2.403141736984253 + }, + { + "auxiliary_loss_clip": 0.01053256, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02150083, + "balance_loss_mlp": 1.01660395, + "epoch": 0.6340598226364046, + "flos": 12968661744000.0, + "grad_norm": 1.9416617481841005, + "language_loss": 0.72370696, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.744708, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3671875, + "step": 10546, + "time_per_iteration": 2.3363473415374756 + }, + { + "auxiliary_loss_clip": 0.01052793, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.01605928, + "balance_loss_mlp": 1.01754987, + "epoch": 0.6341199458890726, + "flos": 26686769932800.0, + "grad_norm": 1.266248590185462, + "language_loss": 0.7871117, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80802101, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 10547, + "time_per_iteration": 2.403912305831909 + }, + { + "auxiliary_loss_clip": 0.01055132, + "auxiliary_loss_mlp": 0.01040656, + "balance_loss_clip": 1.01568174, + "balance_loss_mlp": 1.01637781, + "epoch": 0.6341800691417405, + "flos": 18733275993600.0, + "grad_norm": 1.8153097709876018, + "language_loss": 0.6403628, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.66132069, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38671875, + "step": 10548, + "time_per_iteration": 2.3602781295776367 + }, + { + "auxiliary_loss_clip": 0.01052165, + "auxiliary_loss_mlp": 0.01037673, + "balance_loss_clip": 1.01503539, + "balance_loss_mlp": 1.01649857, + "epoch": 0.6342401923944085, + "flos": 26248260856320.0, + "grad_norm": 1.6812121579971906, + "language_loss": 0.63377231, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.65467072, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 10549, + "time_per_iteration": 2.4006459712982178 + }, + { + "auxiliary_loss_clip": 0.01054324, + "auxiliary_loss_mlp": 0.0103797, + "balance_loss_clip": 1.01490295, + "balance_loss_mlp": 1.01632118, + "epoch": 0.6343003156470765, + "flos": 24679787760000.0, + "grad_norm": 1.5354918677252463, + "language_loss": 0.75391632, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.77483928, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.38085938, + "step": 10550, + "time_per_iteration": 2.4139156341552734 + }, + { + "auxiliary_loss_clip": 0.0101031, + "auxiliary_loss_mlp": 0.01013722, + "balance_loss_clip": 1.01094401, + "balance_loss_mlp": 1.00304782, + "epoch": 0.6343604388997445, + "flos": 69802164501120.0, + "grad_norm": 0.7056183024957036, + "language_loss": 0.57742912, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59766942, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.07275391, + "step": 10551, + "time_per_iteration": 4.424536943435669 + }, + { + "auxiliary_loss_clip": 0.01051725, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.01283336, + "balance_loss_mlp": 1.01615512, + "epoch": 0.6344205621524125, + "flos": 21981315732480.0, + "grad_norm": 1.886352403430425, + "language_loss": 0.6792829, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.70014191, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35546875, + "step": 10552, + "time_per_iteration": 2.3620798587799072 + }, + { + "auxiliary_loss_clip": 0.01055806, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.01093936, + "balance_loss_mlp": 1.01705837, + "epoch": 0.6344806854050804, + "flos": 20447825685120.0, + "grad_norm": 1.6843795925748004, + "language_loss": 0.83854735, + "learning_rate": 1.24512502014147e-06, + "loss": 0.85947537, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 10553, + "time_per_iteration": 2.4274709224700928 + }, + { + "auxiliary_loss_clip": 0.0105595, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.01778984, + "balance_loss_mlp": 1.01743221, + "epoch": 0.6345408086577484, + "flos": 40509163952640.0, + "grad_norm": 2.0576263251712663, + "language_loss": 0.56958491, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.59055817, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38671875, + "step": 10554, + "time_per_iteration": 2.516509532928467 + }, + { + "auxiliary_loss_clip": 0.01055926, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.00866508, + "balance_loss_mlp": 1.01850128, + "epoch": 0.6346009319104163, + "flos": 21360245823360.0, + "grad_norm": 2.3870921251722503, + "language_loss": 0.71880019, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.73968357, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 10555, + "time_per_iteration": 2.3956048488616943 + }, + { + "auxiliary_loss_clip": 0.01012684, + "auxiliary_loss_mlp": 0.01004095, + "balance_loss_clip": 1.00152016, + "balance_loss_mlp": 1.00535035, + "epoch": 0.6346610551630844, + "flos": 71362433427840.0, + "grad_norm": 0.7784149961114011, + "language_loss": 0.55477673, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.5749445, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.07324219, + "step": 10556, + "time_per_iteration": 2.939636468887329 + }, + { + "auxiliary_loss_clip": 0.01055445, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.0108304, + "balance_loss_mlp": 1.01748538, + "epoch": 0.6347211784157523, + "flos": 25410310381440.0, + "grad_norm": 1.69413589183333, + "language_loss": 0.69599134, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.71692526, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.37890625, + "step": 10557, + "time_per_iteration": 2.502399206161499 + }, + { + "auxiliary_loss_clip": 0.01053686, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.01362062, + "balance_loss_mlp": 1.0177139, + "epoch": 0.6347813016684203, + "flos": 15741812903040.0, + "grad_norm": 1.9706525508964654, + "language_loss": 0.71255255, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.73346508, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 10558, + "time_per_iteration": 2.3516550064086914 + }, + { + "auxiliary_loss_clip": 0.01054155, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.0096122, + "balance_loss_mlp": 1.01747298, + "epoch": 0.6348414249210882, + "flos": 21463868338560.0, + "grad_norm": 1.4658797116392277, + "language_loss": 0.78971624, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.81060851, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3671875, + "step": 10559, + "time_per_iteration": 2.4370269775390625 + }, + { + "auxiliary_loss_clip": 0.01056048, + "auxiliary_loss_mlp": 0.0103811, + "balance_loss_clip": 1.01329088, + "balance_loss_mlp": 1.01773882, + "epoch": 0.6349015481737562, + "flos": 21651980077440.0, + "grad_norm": 2.626856065089491, + "language_loss": 0.69926584, + "learning_rate": 1.242601136020078e-06, + "loss": 0.72020745, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 10560, + "time_per_iteration": 2.3701963424682617 + }, + { + "auxiliary_loss_clip": 0.01054869, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.01746821, + "balance_loss_mlp": 1.0175786, + "epoch": 0.6349616714264241, + "flos": 22194041846400.0, + "grad_norm": 1.7074778445425454, + "language_loss": 0.78159529, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.80255508, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 10561, + "time_per_iteration": 2.4509217739105225 + }, + { + "auxiliary_loss_clip": 0.01056445, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.02005172, + "balance_loss_mlp": 1.01805532, + "epoch": 0.6350217946790921, + "flos": 25409193217920.0, + "grad_norm": 1.9893474569242409, + "language_loss": 0.74144012, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.76245475, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 10562, + "time_per_iteration": 2.402327299118042 + }, + { + "auxiliary_loss_clip": 0.01055385, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.01686072, + "balance_loss_mlp": 1.01761675, + "epoch": 0.63508191793176, + "flos": 19717931468160.0, + "grad_norm": 2.231562273283751, + "language_loss": 0.81310129, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83406794, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 10563, + "time_per_iteration": 2.399550676345825 + }, + { + "auxiliary_loss_clip": 0.01056006, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_clip": 1.01619911, + "balance_loss_mlp": 1.01852775, + "epoch": 0.6351420411844281, + "flos": 18185942609280.0, + "grad_norm": 2.1024285440910986, + "language_loss": 0.82359749, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.84458232, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.375, + "step": 10564, + "time_per_iteration": 2.369825601577759 + }, + { + "auxiliary_loss_clip": 0.01054781, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.01433444, + "balance_loss_mlp": 1.01801491, + "epoch": 0.6352021644370961, + "flos": 33725190470400.0, + "grad_norm": 1.5309668660752853, + "language_loss": 0.73571897, + "learning_rate": 1.240799222993407e-06, + "loss": 0.75664794, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 10565, + "time_per_iteration": 2.519329786300659 + }, + { + "auxiliary_loss_clip": 0.01056496, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_clip": 1.01742959, + "balance_loss_mlp": 1.01784301, + "epoch": 0.635262287689764, + "flos": 20373774958080.0, + "grad_norm": 1.9521500880999239, + "language_loss": 0.70212841, + "learning_rate": 1.240438926700324e-06, + "loss": 0.72313219, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 10566, + "time_per_iteration": 2.3518264293670654 + }, + { + "auxiliary_loss_clip": 0.01053668, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.01555657, + "balance_loss_mlp": 1.01790631, + "epoch": 0.635322410942432, + "flos": 27524231648640.0, + "grad_norm": 1.5007559089902194, + "language_loss": 0.70460021, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.72552735, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35742188, + "step": 10567, + "time_per_iteration": 2.4497287273406982 + }, + { + "auxiliary_loss_clip": 0.01052326, + "auxiliary_loss_mlp": 0.01040545, + "balance_loss_clip": 1.01791954, + "balance_loss_mlp": 1.01757312, + "epoch": 0.6353825341950999, + "flos": 21542527365120.0, + "grad_norm": 1.6417184777805978, + "language_loss": 0.85837978, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.87930852, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 10568, + "time_per_iteration": 2.402979612350464 + }, + { + "auxiliary_loss_clip": 0.01055846, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.01480126, + "balance_loss_mlp": 1.01743841, + "epoch": 0.635442657447768, + "flos": 31758393139200.0, + "grad_norm": 1.8956404866322525, + "language_loss": 0.85324699, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.87420201, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 10569, + "time_per_iteration": 2.485971212387085 + }, + { + "auxiliary_loss_clip": 0.01052976, + "auxiliary_loss_mlp": 0.01040939, + "balance_loss_clip": 1.01770484, + "balance_loss_mlp": 1.01706934, + "epoch": 0.6355027807004359, + "flos": 19827803116800.0, + "grad_norm": 1.6344510681161528, + "language_loss": 0.70225686, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.72319603, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 10570, + "time_per_iteration": 2.3852434158325195 + }, + { + "auxiliary_loss_clip": 0.0105606, + "auxiliary_loss_mlp": 0.01044986, + "balance_loss_clip": 1.01810431, + "balance_loss_mlp": 1.01669359, + "epoch": 0.6355629039531039, + "flos": 30371084421120.0, + "grad_norm": 2.106011072210145, + "language_loss": 0.6788125, + "learning_rate": 1.2386378775476e-06, + "loss": 0.69982296, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 10571, + "time_per_iteration": 2.4619498252868652 + }, + { + "auxiliary_loss_clip": 0.01056274, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.01593471, + "balance_loss_mlp": 1.01814806, + "epoch": 0.6356230272057718, + "flos": 17931076617600.0, + "grad_norm": 1.7978328884585786, + "language_loss": 0.72235072, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.74331784, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38085938, + "step": 10572, + "time_per_iteration": 2.3491668701171875 + }, + { + "auxiliary_loss_clip": 0.01051926, + "auxiliary_loss_mlp": 0.01039292, + "balance_loss_clip": 1.01729822, + "balance_loss_mlp": 1.01603866, + "epoch": 0.6356831504584398, + "flos": 25374629105280.0, + "grad_norm": 1.4752368432872431, + "language_loss": 0.82069641, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.84160858, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.359375, + "step": 10573, + "time_per_iteration": 2.427277088165283 + }, + { + "auxiliary_loss_clip": 0.01054825, + "auxiliary_loss_mlp": 0.01048049, + "balance_loss_clip": 1.02191865, + "balance_loss_mlp": 1.01622045, + "epoch": 0.6357432737111077, + "flos": 46498548735360.0, + "grad_norm": 1.6301358510327695, + "language_loss": 0.69979638, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.72082508, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 10574, + "time_per_iteration": 2.5732688903808594 + }, + { + "auxiliary_loss_clip": 0.01053273, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.01333213, + "balance_loss_mlp": 1.01630473, + "epoch": 0.6358033969637757, + "flos": 17273417736960.0, + "grad_norm": 2.39940966853445, + "language_loss": 0.88575089, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.90666133, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 10575, + "time_per_iteration": 2.333828926086426 + }, + { + "auxiliary_loss_clip": 0.0105286, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02236807, + "balance_loss_mlp": 1.01688266, + "epoch": 0.6358635202164437, + "flos": 27124301491200.0, + "grad_norm": 2.2832010559061007, + "language_loss": 0.72966629, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.75065213, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 10576, + "time_per_iteration": 3.6443793773651123 + }, + { + "auxiliary_loss_clip": 0.01054978, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.01781392, + "balance_loss_mlp": 1.0167551, + "epoch": 0.6359236434691117, + "flos": 27524022180480.0, + "grad_norm": 1.8639850820859512, + "language_loss": 0.70064229, + "learning_rate": 1.236477571455085e-06, + "loss": 0.721632, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 10577, + "time_per_iteration": 2.4302561283111572 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.01455891, + "balance_loss_mlp": 1.01777601, + "epoch": 0.6359837667217797, + "flos": 39346730501760.0, + "grad_norm": 1.6344513759150767, + "language_loss": 0.73614562, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.75706291, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 10578, + "time_per_iteration": 2.5098342895507812 + }, + { + "auxiliary_loss_clip": 0.01010246, + "auxiliary_loss_mlp": 0.01008768, + "balance_loss_clip": 1.00639558, + "balance_loss_mlp": 1.00305974, + "epoch": 0.6360438899744476, + "flos": 56411017994880.0, + "grad_norm": 0.7098777113153529, + "language_loss": 0.54563677, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56582689, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07177734, + "step": 10579, + "time_per_iteration": 3.1269760131835938 + }, + { + "auxiliary_loss_clip": 0.01054342, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.0141747, + "balance_loss_mlp": 1.01687992, + "epoch": 0.6361040132271156, + "flos": 24971940950400.0, + "grad_norm": 1.5932020620655667, + "language_loss": 0.78243554, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.80336905, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 10580, + "time_per_iteration": 3.902066707611084 + }, + { + "auxiliary_loss_clip": 0.01053445, + "auxiliary_loss_mlp": 0.01038031, + "balance_loss_clip": 1.01424837, + "balance_loss_mlp": 1.01643038, + "epoch": 0.6361641364797835, + "flos": 23258054574720.0, + "grad_norm": 1.8558693793394638, + "language_loss": 0.67887533, + "learning_rate": 1.235037946268301e-06, + "loss": 0.69979012, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 10581, + "time_per_iteration": 3.899576187133789 + }, + { + "auxiliary_loss_clip": 0.01053544, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.01250923, + "balance_loss_mlp": 1.01650929, + "epoch": 0.6362242597324516, + "flos": 25993325041920.0, + "grad_norm": 1.4794772774774314, + "language_loss": 0.69208682, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.71296644, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.37109375, + "step": 10582, + "time_per_iteration": 2.435685873031616 + }, + { + "auxiliary_loss_clip": 0.01054172, + "auxiliary_loss_mlp": 0.01041708, + "balance_loss_clip": 1.01926064, + "balance_loss_mlp": 1.01683831, + "epoch": 0.6362843829851195, + "flos": 25702044635520.0, + "grad_norm": 1.7033765305013842, + "language_loss": 0.85580683, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.87676561, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37304688, + "step": 10583, + "time_per_iteration": 2.3920769691467285 + }, + { + "auxiliary_loss_clip": 0.01053805, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.01378489, + "balance_loss_mlp": 1.01770902, + "epoch": 0.6363445062377875, + "flos": 20521841500800.0, + "grad_norm": 1.6304133558948084, + "language_loss": 0.76501834, + "learning_rate": 1.233958531908538e-06, + "loss": 0.78593826, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 10584, + "time_per_iteration": 2.4013376235961914 + }, + { + "auxiliary_loss_clip": 0.01055925, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.01959646, + "balance_loss_mlp": 1.01737285, + "epoch": 0.6364046294904554, + "flos": 19462786185600.0, + "grad_norm": 2.0157955998808155, + "language_loss": 0.74359572, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.76461315, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 10585, + "time_per_iteration": 2.348602294921875 + }, + { + "auxiliary_loss_clip": 0.01053522, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.0143671, + "balance_loss_mlp": 1.01682401, + "epoch": 0.6364647527431234, + "flos": 20994844867200.0, + "grad_norm": 1.7627335178819241, + "language_loss": 0.83564305, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.85654974, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 10586, + "time_per_iteration": 2.4007129669189453 + }, + { + "auxiliary_loss_clip": 0.01053381, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01085877, + "balance_loss_mlp": 1.0177424, + "epoch": 0.6365248759957913, + "flos": 25769741495040.0, + "grad_norm": 1.4538629842312503, + "language_loss": 0.73724508, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.75811452, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 10587, + "time_per_iteration": 2.401477098464966 + }, + { + "auxiliary_loss_clip": 0.01054192, + "auxiliary_loss_mlp": 0.01037469, + "balance_loss_clip": 1.01387727, + "balance_loss_mlp": 1.0169481, + "epoch": 0.6365849992484593, + "flos": 22454493655680.0, + "grad_norm": 2.021686373013303, + "language_loss": 0.77966654, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.80058318, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 10588, + "time_per_iteration": 2.4241607189178467 + }, + { + "auxiliary_loss_clip": 0.01053325, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.0099287, + "balance_loss_mlp": 1.01797986, + "epoch": 0.6366451225011273, + "flos": 19024696045440.0, + "grad_norm": 1.3792874206452939, + "language_loss": 0.80564713, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82652754, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35351562, + "step": 10589, + "time_per_iteration": 2.371394157409668 + }, + { + "auxiliary_loss_clip": 0.01053409, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.01456928, + "balance_loss_mlp": 1.0171504, + "epoch": 0.6367052457537953, + "flos": 25227225878400.0, + "grad_norm": 3.7049579184520467, + "language_loss": 0.68489027, + "learning_rate": 1.231800487863257e-06, + "loss": 0.70581067, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 10590, + "time_per_iteration": 3.834165573120117 + }, + { + "auxiliary_loss_clip": 0.01058435, + "auxiliary_loss_mlp": 0.01043515, + "balance_loss_clip": 1.01688397, + "balance_loss_mlp": 1.01860523, + "epoch": 0.6367653690064633, + "flos": 19207431434880.0, + "grad_norm": 2.3495765803875326, + "language_loss": 0.79849184, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81951135, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3984375, + "step": 10591, + "time_per_iteration": 2.3824236392974854 + }, + { + "auxiliary_loss_clip": 0.01054569, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.00821209, + "balance_loss_mlp": 1.01862824, + "epoch": 0.6368254922591312, + "flos": 23545774022400.0, + "grad_norm": 1.4267203044658878, + "language_loss": 0.89747477, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91832358, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 10592, + "time_per_iteration": 2.416938066482544 + }, + { + "auxiliary_loss_clip": 0.01051486, + "auxiliary_loss_mlp": 0.01036208, + "balance_loss_clip": 1.01423752, + "balance_loss_mlp": 1.01735365, + "epoch": 0.6368856155117992, + "flos": 26466153851520.0, + "grad_norm": 1.3554144470817984, + "language_loss": 0.69443345, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.71531045, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 10593, + "time_per_iteration": 2.3927080631256104 + }, + { + "auxiliary_loss_clip": 0.0105279, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.01357436, + "balance_loss_mlp": 1.0173254, + "epoch": 0.6369457387644671, + "flos": 33691045294080.0, + "grad_norm": 1.8689730736205863, + "language_loss": 0.64861178, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.66950333, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 10594, + "time_per_iteration": 2.4740400314331055 + }, + { + "auxiliary_loss_clip": 0.01010527, + "auxiliary_loss_mlp": 0.01015672, + "balance_loss_clip": 1.01306093, + "balance_loss_mlp": 1.00321436, + "epoch": 0.6370058620171352, + "flos": 70905140173440.0, + "grad_norm": 0.7723935863087253, + "language_loss": 0.54724157, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56750351, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.07324219, + "step": 10595, + "time_per_iteration": 3.1471354961395264 + }, + { + "auxiliary_loss_clip": 0.01055949, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.01602578, + "balance_loss_mlp": 1.01798737, + "epoch": 0.6370659852698031, + "flos": 21140886551040.0, + "grad_norm": 1.6666198966593575, + "language_loss": 0.68110383, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.70208323, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 10596, + "time_per_iteration": 2.384780168533325 + }, + { + "auxiliary_loss_clip": 0.01054751, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.01618385, + "balance_loss_mlp": 1.01734018, + "epoch": 0.6371261085224711, + "flos": 20192261466240.0, + "grad_norm": 2.09522073756666, + "language_loss": 0.8106634, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.83160877, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 10597, + "time_per_iteration": 2.365744113922119 + }, + { + "auxiliary_loss_clip": 0.01056301, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.0155468, + "balance_loss_mlp": 1.01927042, + "epoch": 0.637186231775139, + "flos": 19682494571520.0, + "grad_norm": 1.7837036168454625, + "language_loss": 0.7522254, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77316439, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.37109375, + "step": 10598, + "time_per_iteration": 2.3837478160858154 + }, + { + "auxiliary_loss_clip": 0.01053949, + "auxiliary_loss_mlp": 0.01036319, + "balance_loss_clip": 1.0127635, + "balance_loss_mlp": 1.01660025, + "epoch": 0.637246355027807, + "flos": 13070573602560.0, + "grad_norm": 2.2546418351519475, + "language_loss": 0.69410825, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.71501094, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37304688, + "step": 10599, + "time_per_iteration": 2.324079751968384 + }, + { + "auxiliary_loss_clip": 0.01055104, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.01303196, + "balance_loss_mlp": 1.0176084, + "epoch": 0.6373064782804749, + "flos": 18221693708160.0, + "grad_norm": 1.9253765385793242, + "language_loss": 0.82289577, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.84382987, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 10600, + "time_per_iteration": 2.3559093475341797 + }, + { + "auxiliary_loss_clip": 0.01053734, + "auxiliary_loss_mlp": 0.01043095, + "balance_loss_clip": 1.01959836, + "balance_loss_mlp": 1.01728725, + "epoch": 0.637366601533143, + "flos": 24497331661440.0, + "grad_norm": 1.4630287045656956, + "language_loss": 0.80345476, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82442307, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 10601, + "time_per_iteration": 2.400719404220581 + }, + { + "auxiliary_loss_clip": 0.01056373, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.01521182, + "balance_loss_mlp": 1.0189724, + "epoch": 0.6374267247858109, + "flos": 26357853214080.0, + "grad_norm": 1.9223051188917244, + "language_loss": 0.68614793, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.70709622, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 10602, + "time_per_iteration": 2.4397361278533936 + }, + { + "auxiliary_loss_clip": 0.01053274, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.01319885, + "balance_loss_mlp": 1.01749349, + "epoch": 0.6374868480384789, + "flos": 20370807492480.0, + "grad_norm": 1.697791208248418, + "language_loss": 0.80390084, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.82479239, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 10603, + "time_per_iteration": 2.3665642738342285 + }, + { + "auxiliary_loss_clip": 0.01054334, + "auxiliary_loss_mlp": 0.01041314, + "balance_loss_clip": 1.01551723, + "balance_loss_mlp": 1.01737845, + "epoch": 0.6375469712911469, + "flos": 20995193980800.0, + "grad_norm": 1.999363034791124, + "language_loss": 0.78986752, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.81082398, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 10604, + "time_per_iteration": 2.38584566116333 + }, + { + "auxiliary_loss_clip": 0.01055169, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.00942171, + "balance_loss_mlp": 1.01718986, + "epoch": 0.6376070945438148, + "flos": 19714824357120.0, + "grad_norm": 1.601346196488208, + "language_loss": 0.78348655, + "learning_rate": 1.226409972197281e-06, + "loss": 0.80437708, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 10605, + "time_per_iteration": 2.389240264892578 + }, + { + "auxiliary_loss_clip": 0.01055318, + "auxiliary_loss_mlp": 0.01041264, + "balance_loss_clip": 1.0153718, + "balance_loss_mlp": 1.01775038, + "epoch": 0.6376672177964828, + "flos": 21505694014080.0, + "grad_norm": 1.6322736209049107, + "language_loss": 0.66693819, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.68790394, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 10606, + "time_per_iteration": 2.379565477371216 + }, + { + "auxiliary_loss_clip": 0.01053013, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.01643062, + "balance_loss_mlp": 1.01760197, + "epoch": 0.6377273410491507, + "flos": 18842868351360.0, + "grad_norm": 1.5385206171560668, + "language_loss": 0.76431644, + "learning_rate": 1.225691734459971e-06, + "loss": 0.78523403, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 10607, + "time_per_iteration": 2.353355646133423 + }, + { + "auxiliary_loss_clip": 0.01054772, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.01520693, + "balance_loss_mlp": 1.01722693, + "epoch": 0.6377874643018188, + "flos": 53061138086400.0, + "grad_norm": 1.8270057650195868, + "language_loss": 0.66970658, + "learning_rate": 1.225332659627278e-06, + "loss": 0.69065046, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 10608, + "time_per_iteration": 2.683927536010742 + }, + { + "auxiliary_loss_clip": 0.01009675, + "auxiliary_loss_mlp": 0.01002365, + "balance_loss_clip": 1.00018311, + "balance_loss_mlp": 1.00241137, + "epoch": 0.6378475875544867, + "flos": 65131972640640.0, + "grad_norm": 0.7134789297696591, + "language_loss": 0.51892561, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53904599, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.07275391, + "step": 10609, + "time_per_iteration": 2.96268367767334 + }, + { + "auxiliary_loss_clip": 0.01051812, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.01520276, + "balance_loss_mlp": 1.01757264, + "epoch": 0.6379077108071547, + "flos": 23001652483200.0, + "grad_norm": 1.5450428879717857, + "language_loss": 0.75435358, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.7752257, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34179688, + "step": 10610, + "time_per_iteration": 2.3828418254852295 + }, + { + "auxiliary_loss_clip": 0.01009651, + "auxiliary_loss_mlp": 0.01002363, + "balance_loss_clip": 0.99997842, + "balance_loss_mlp": 1.00231504, + "epoch": 0.6379678340598226, + "flos": 67598201237760.0, + "grad_norm": 0.8385822727671279, + "language_loss": 0.63262922, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65274936, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.07324219, + "step": 10611, + "time_per_iteration": 3.0573079586029053 + }, + { + "auxiliary_loss_clip": 0.01054229, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.01793635, + "balance_loss_mlp": 1.0167197, + "epoch": 0.6380279573124906, + "flos": 29678756693760.0, + "grad_norm": 1.8553445063427902, + "language_loss": 0.73624122, + "learning_rate": 1.223896654187282e-06, + "loss": 0.75720239, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.375, + "step": 10612, + "time_per_iteration": 2.4269909858703613 + }, + { + "auxiliary_loss_clip": 0.01009373, + "auxiliary_loss_mlp": 0.01008336, + "balance_loss_clip": 1.00594032, + "balance_loss_mlp": 1.0021081, + "epoch": 0.6380880805651585, + "flos": 66480981730560.0, + "grad_norm": 0.7204601007870353, + "language_loss": 0.58019388, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.600371, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.07275391, + "step": 10613, + "time_per_iteration": 2.9454524517059326 + }, + { + "auxiliary_loss_clip": 0.01054966, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.01481223, + "balance_loss_mlp": 1.01715517, + "epoch": 0.6381482038178266, + "flos": 23913863153280.0, + "grad_norm": 1.8488710244293527, + "language_loss": 0.77190042, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.79284525, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 10614, + "time_per_iteration": 2.3909289836883545 + }, + { + "auxiliary_loss_clip": 0.01052852, + "auxiliary_loss_mlp": 0.01040033, + "balance_loss_clip": 1.01608372, + "balance_loss_mlp": 1.01721048, + "epoch": 0.6382083270704945, + "flos": 24241907088000.0, + "grad_norm": 1.9351499339242786, + "language_loss": 0.80545157, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.82638037, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35546875, + "step": 10615, + "time_per_iteration": 3.6266438961029053 + }, + { + "auxiliary_loss_clip": 0.01009821, + "auxiliary_loss_mlp": 0.01002153, + "balance_loss_clip": 0.99990022, + "balance_loss_mlp": 1.00257933, + "epoch": 0.6382684503231625, + "flos": 70771736401920.0, + "grad_norm": 0.6581695666093357, + "language_loss": 0.55737668, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57749641, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.07226562, + "step": 10616, + "time_per_iteration": 3.1206557750701904 + }, + { + "auxiliary_loss_clip": 0.01053684, + "auxiliary_loss_mlp": 0.01041923, + "balance_loss_clip": 1.01963103, + "balance_loss_mlp": 1.01715922, + "epoch": 0.6383285735758305, + "flos": 16543907544960.0, + "grad_norm": 1.7650089907929127, + "language_loss": 0.85297406, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.8739301, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36523438, + "step": 10617, + "time_per_iteration": 2.404188871383667 + }, + { + "auxiliary_loss_clip": 0.0105487, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.01564527, + "balance_loss_mlp": 1.01723599, + "epoch": 0.6383886968284984, + "flos": 14426809344000.0, + "grad_norm": 1.8911769906679654, + "language_loss": 0.88774347, + "learning_rate": 1.221743529196936e-06, + "loss": 0.9086988, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 10618, + "time_per_iteration": 2.344275951385498 + }, + { + "auxiliary_loss_clip": 0.01054902, + "auxiliary_loss_mlp": 0.01040899, + "balance_loss_clip": 1.01851189, + "balance_loss_mlp": 1.01845264, + "epoch": 0.6384488200811664, + "flos": 17928737556480.0, + "grad_norm": 1.749997948713539, + "language_loss": 0.74908483, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.77004278, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36523438, + "step": 10619, + "time_per_iteration": 2.3621978759765625 + }, + { + "auxiliary_loss_clip": 0.01058841, + "auxiliary_loss_mlp": 0.01045404, + "balance_loss_clip": 1.0174737, + "balance_loss_mlp": 1.01840949, + "epoch": 0.6385089433338343, + "flos": 18514580037120.0, + "grad_norm": 2.1160464467817217, + "language_loss": 0.77392125, + "learning_rate": 1.221026056814193e-06, + "loss": 0.79496372, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40429688, + "step": 10620, + "time_per_iteration": 3.6905622482299805 + }, + { + "auxiliary_loss_clip": 0.01052905, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.0128088, + "balance_loss_mlp": 1.01640725, + "epoch": 0.6385690665865024, + "flos": 24752476944000.0, + "grad_norm": 2.9232748885131916, + "language_loss": 0.72212791, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.74301887, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 10621, + "time_per_iteration": 3.794584035873413 + }, + { + "auxiliary_loss_clip": 0.01049548, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01078987, + "balance_loss_mlp": 1.01580131, + "epoch": 0.6386291898391703, + "flos": 20119537370880.0, + "grad_norm": 1.6100612540441017, + "language_loss": 0.79087341, + "learning_rate": 1.220308702586529e-06, + "loss": 0.81169093, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33789062, + "step": 10622, + "time_per_iteration": 2.381413459777832 + }, + { + "auxiliary_loss_clip": 0.0105134, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.01608658, + "balance_loss_mlp": 1.0164696, + "epoch": 0.6386893130918383, + "flos": 16866505307520.0, + "grad_norm": 1.8028721696702565, + "language_loss": 0.75806749, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.7789548, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34960938, + "step": 10623, + "time_per_iteration": 2.348745107650757 + }, + { + "auxiliary_loss_clip": 0.01049956, + "auxiliary_loss_mlp": 0.01029948, + "balance_loss_clip": 1.00932503, + "balance_loss_mlp": 1.01612282, + "epoch": 0.6387494363445062, + "flos": 22965168245760.0, + "grad_norm": 1.4462931949638416, + "language_loss": 0.77092171, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.79172081, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.33789062, + "step": 10624, + "time_per_iteration": 2.4097609519958496 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01038881, + "balance_loss_clip": 1.01611245, + "balance_loss_mlp": 1.01716065, + "epoch": 0.6388095595971742, + "flos": 22856588317440.0, + "grad_norm": 1.704357042494605, + "language_loss": 0.8184998, + "learning_rate": 1.21923289302382e-06, + "loss": 0.83942372, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 10625, + "time_per_iteration": 2.3822128772735596 + }, + { + "auxiliary_loss_clip": 0.01054648, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.015872, + "balance_loss_mlp": 1.01845527, + "epoch": 0.6388696828498421, + "flos": 17310565290240.0, + "grad_norm": 1.9290037254456298, + "language_loss": 0.73599392, + "learning_rate": 1.218874349031654e-06, + "loss": 0.7569409, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 10626, + "time_per_iteration": 2.3626840114593506 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.0104254, + "balance_loss_clip": 1.01681459, + "balance_loss_mlp": 1.01728415, + "epoch": 0.6389298061025102, + "flos": 17127690255360.0, + "grad_norm": 1.7086403001885544, + "language_loss": 0.7418834, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.76285231, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 10627, + "time_per_iteration": 2.4111955165863037 + }, + { + "auxiliary_loss_clip": 0.01057413, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.01038718, + "balance_loss_mlp": 1.01879215, + "epoch": 0.6389899293551781, + "flos": 27709690124160.0, + "grad_norm": 2.063970073771208, + "language_loss": 0.69109917, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.71203303, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 10628, + "time_per_iteration": 2.468275785446167 + }, + { + "auxiliary_loss_clip": 0.01050788, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.01068497, + "balance_loss_mlp": 1.01635194, + "epoch": 0.6390500526078461, + "flos": 21214623075840.0, + "grad_norm": 1.9806997635833794, + "language_loss": 0.69359159, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.71441865, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 10629, + "time_per_iteration": 2.3919527530670166 + }, + { + "auxiliary_loss_clip": 0.01057499, + "auxiliary_loss_mlp": 0.01044842, + "balance_loss_clip": 1.01694751, + "balance_loss_mlp": 1.01861835, + "epoch": 0.6391101758605141, + "flos": 21579954209280.0, + "grad_norm": 1.508144503193366, + "language_loss": 0.76485586, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.78587925, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38867188, + "step": 10630, + "time_per_iteration": 2.4242208003997803 + }, + { + "auxiliary_loss_clip": 0.01053424, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.01266813, + "balance_loss_mlp": 1.01806498, + "epoch": 0.639170299113182, + "flos": 19899479871360.0, + "grad_norm": 1.4929711276250475, + "language_loss": 0.70923376, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.73010325, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.35351562, + "step": 10631, + "time_per_iteration": 3.773270606994629 + }, + { + "auxiliary_loss_clip": 0.01013836, + "auxiliary_loss_mlp": 0.01004072, + "balance_loss_clip": 1.00171161, + "balance_loss_mlp": 1.0066359, + "epoch": 0.63923042236585, + "flos": 69874434748800.0, + "grad_norm": 0.7701817058709713, + "language_loss": 0.63134599, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65152502, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07226562, + "step": 10632, + "time_per_iteration": 3.075253963470459 + }, + { + "auxiliary_loss_clip": 0.01051536, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.01543725, + "balance_loss_mlp": 1.01663542, + "epoch": 0.639290545618518, + "flos": 22673713282560.0, + "grad_norm": 2.4893384341823404, + "language_loss": 0.68404341, + "learning_rate": 1.216365371217893e-06, + "loss": 0.70494461, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34960938, + "step": 10633, + "time_per_iteration": 2.3674769401550293 + }, + { + "auxiliary_loss_clip": 0.01053071, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.0125947, + "balance_loss_mlp": 1.01757765, + "epoch": 0.639350668871186, + "flos": 19828152230400.0, + "grad_norm": 1.926916928585002, + "language_loss": 0.82910407, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84998322, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 10634, + "time_per_iteration": 2.3737244606018066 + }, + { + "auxiliary_loss_clip": 0.01052563, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.0147028, + "balance_loss_mlp": 1.01647067, + "epoch": 0.6394107921238539, + "flos": 20552425718400.0, + "grad_norm": 1.4361987849612483, + "language_loss": 0.75948489, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.78040087, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 10635, + "time_per_iteration": 2.3667232990264893 + }, + { + "auxiliary_loss_clip": 0.01053787, + "auxiliary_loss_mlp": 0.01040594, + "balance_loss_clip": 1.01792085, + "balance_loss_mlp": 1.01791143, + "epoch": 0.6394709153765219, + "flos": 25773826124160.0, + "grad_norm": 1.6008064540546543, + "language_loss": 0.7299993, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.75094306, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 10636, + "time_per_iteration": 2.4134092330932617 + }, + { + "auxiliary_loss_clip": 0.01055584, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.01516318, + "balance_loss_mlp": 1.01776516, + "epoch": 0.6395310386291898, + "flos": 17529191424000.0, + "grad_norm": 1.8053433036710436, + "language_loss": 0.74995255, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.77089989, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 10637, + "time_per_iteration": 2.363415002822876 + }, + { + "auxiliary_loss_clip": 0.010541, + "auxiliary_loss_mlp": 0.01046037, + "balance_loss_clip": 1.02235031, + "balance_loss_mlp": 1.0167141, + "epoch": 0.6395911618818578, + "flos": 18587234309760.0, + "grad_norm": 2.5926863534588414, + "language_loss": 0.79344386, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.81444526, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 10638, + "time_per_iteration": 2.420346736907959 + }, + { + "auxiliary_loss_clip": 0.01052331, + "auxiliary_loss_mlp": 0.0103891, + "balance_loss_clip": 1.01447248, + "balance_loss_mlp": 1.01652098, + "epoch": 0.6396512851345257, + "flos": 28365289234560.0, + "grad_norm": 1.5124309398215467, + "language_loss": 0.82738215, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.84829462, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 10639, + "time_per_iteration": 2.4191524982452393 + }, + { + "auxiliary_loss_clip": 0.01011023, + "auxiliary_loss_mlp": 0.01003252, + "balance_loss_clip": 1.001261, + "balance_loss_mlp": 1.00392365, + "epoch": 0.6397114083871938, + "flos": 70720903595520.0, + "grad_norm": 0.8320940677187404, + "language_loss": 0.59150916, + "learning_rate": 1.21385784946359e-06, + "loss": 0.6116519, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.07128906, + "step": 10640, + "time_per_iteration": 2.9737091064453125 + }, + { + "auxiliary_loss_clip": 0.01050203, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.01454735, + "balance_loss_mlp": 1.01619995, + "epoch": 0.6397715316398617, + "flos": 18141777872640.0, + "grad_norm": 1.7307083880800425, + "language_loss": 0.78962648, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.81047511, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 10641, + "time_per_iteration": 2.3401100635528564 + }, + { + "auxiliary_loss_clip": 0.01055904, + "auxiliary_loss_mlp": 0.01043673, + "balance_loss_clip": 1.01661253, + "balance_loss_mlp": 1.01682508, + "epoch": 0.6398316548925297, + "flos": 25738319404800.0, + "grad_norm": 1.7720164104084586, + "language_loss": 0.65601408, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.67700988, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 10642, + "time_per_iteration": 2.4264872074127197 + }, + { + "auxiliary_loss_clip": 0.01009963, + "auxiliary_loss_mlp": 0.0100258, + "balance_loss_clip": 1.00051725, + "balance_loss_mlp": 1.00282574, + "epoch": 0.6398917781451977, + "flos": 71211399419520.0, + "grad_norm": 1.0035766970766313, + "language_loss": 0.56111306, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58123845, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.07128906, + "step": 10643, + "time_per_iteration": 2.9874422550201416 + }, + { + "auxiliary_loss_clip": 0.01055457, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.01511145, + "balance_loss_mlp": 1.01737249, + "epoch": 0.6399519013978656, + "flos": 20520794160000.0, + "grad_norm": 1.8211860127869288, + "language_loss": 0.77781874, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79878056, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 10644, + "time_per_iteration": 2.429062604904175 + }, + { + "auxiliary_loss_clip": 0.01052905, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.02005279, + "balance_loss_mlp": 1.01715517, + "epoch": 0.6400120246505336, + "flos": 24459730260480.0, + "grad_norm": 1.4375778847104521, + "language_loss": 0.8322382, + "learning_rate": 1.212067656542203e-06, + "loss": 0.85319865, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 10645, + "time_per_iteration": 2.3978002071380615 + }, + { + "auxiliary_loss_clip": 0.01055956, + "auxiliary_loss_mlp": 0.01042703, + "balance_loss_clip": 1.01799119, + "balance_loss_mlp": 1.01750565, + "epoch": 0.6400721479032015, + "flos": 28364835386880.0, + "grad_norm": 1.8606091758658874, + "language_loss": 0.75443137, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.77541798, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 10646, + "time_per_iteration": 2.4393293857574463 + }, + { + "auxiliary_loss_clip": 0.01053159, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.02249014, + "balance_loss_mlp": 1.01613033, + "epoch": 0.6401322711558696, + "flos": 17815723885440.0, + "grad_norm": 2.0004776007126277, + "language_loss": 0.813142, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.83414042, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 10647, + "time_per_iteration": 2.323615789413452 + }, + { + "auxiliary_loss_clip": 0.01052843, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.01777518, + "balance_loss_mlp": 1.01817369, + "epoch": 0.6401923944085375, + "flos": 26029669633920.0, + "grad_norm": 1.7499279505412735, + "language_loss": 0.76826632, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.78918725, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 10648, + "time_per_iteration": 2.4515225887298584 + }, + { + "auxiliary_loss_clip": 0.01053441, + "auxiliary_loss_mlp": 0.01042019, + "balance_loss_clip": 1.01816487, + "balance_loss_mlp": 1.01631796, + "epoch": 0.6402525176612055, + "flos": 23585330459520.0, + "grad_norm": 1.8370955993018379, + "language_loss": 0.79389489, + "learning_rate": 1.210636039936138e-06, + "loss": 0.8148495, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 10649, + "time_per_iteration": 2.390174150466919 + }, + { + "auxiliary_loss_clip": 0.01054136, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.02014709, + "balance_loss_mlp": 1.01708424, + "epoch": 0.6403126409138734, + "flos": 18040424595840.0, + "grad_norm": 1.7017725699080744, + "language_loss": 0.77053905, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.7915253, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 10650, + "time_per_iteration": 2.4473276138305664 + }, + { + "auxiliary_loss_clip": 0.01053579, + "auxiliary_loss_mlp": 0.010462, + "balance_loss_clip": 1.02220309, + "balance_loss_mlp": 1.01722991, + "epoch": 0.6403727641665414, + "flos": 21978453000960.0, + "grad_norm": 1.5087404024015796, + "language_loss": 0.712767, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.73376483, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 10651, + "time_per_iteration": 2.366281509399414 + }, + { + "auxiliary_loss_clip": 0.01053701, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.01440394, + "balance_loss_mlp": 1.01770568, + "epoch": 0.6404328874192093, + "flos": 24894503821440.0, + "grad_norm": 2.555140152421492, + "language_loss": 0.65466464, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.675578, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 10652, + "time_per_iteration": 2.5446150302886963 + }, + { + "auxiliary_loss_clip": 0.01054734, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.01456666, + "balance_loss_mlp": 1.01851678, + "epoch": 0.6404930106718774, + "flos": 17596399524480.0, + "grad_norm": 1.8003078557787768, + "language_loss": 0.80441046, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.82533526, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36328125, + "step": 10653, + "time_per_iteration": 2.318232536315918 + }, + { + "auxiliary_loss_clip": 0.01058634, + "auxiliary_loss_mlp": 0.01043136, + "balance_loss_clip": 1.01556253, + "balance_loss_mlp": 1.01761961, + "epoch": 0.6405531339245453, + "flos": 20156824569600.0, + "grad_norm": 2.1589398323501854, + "language_loss": 0.73159534, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.75261301, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.41015625, + "step": 10654, + "time_per_iteration": 2.4215598106384277 + }, + { + "auxiliary_loss_clip": 0.01055533, + "auxiliary_loss_mlp": 0.01046545, + "balance_loss_clip": 1.02089095, + "balance_loss_mlp": 1.01787949, + "epoch": 0.6406132571772133, + "flos": 21941270536320.0, + "grad_norm": 1.7208725452161113, + "language_loss": 0.7370075, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75802827, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 10655, + "time_per_iteration": 3.6894776821136475 + }, + { + "auxiliary_loss_clip": 0.01054686, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.01250887, + "balance_loss_mlp": 1.01685095, + "epoch": 0.6406733804298813, + "flos": 28766720580480.0, + "grad_norm": 1.9432084490104047, + "language_loss": 0.84308749, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.86402196, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 10656, + "time_per_iteration": 2.4234580993652344 + }, + { + "auxiliary_loss_clip": 0.0105462, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.01433516, + "balance_loss_mlp": 1.01701593, + "epoch": 0.6407335036825492, + "flos": 17456222949120.0, + "grad_norm": 2.7073119739660623, + "language_loss": 0.73978353, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.7607199, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 10657, + "time_per_iteration": 2.3537533283233643 + }, + { + "auxiliary_loss_clip": 0.01053939, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.02166939, + "balance_loss_mlp": 1.01739037, + "epoch": 0.6407936269352172, + "flos": 22124250305280.0, + "grad_norm": 1.9475887052450147, + "language_loss": 0.78454375, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.80552977, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 10658, + "time_per_iteration": 2.4106826782226562 + }, + { + "auxiliary_loss_clip": 0.01054628, + "auxiliary_loss_mlp": 0.01039686, + "balance_loss_clip": 1.01474714, + "balance_loss_mlp": 1.01623309, + "epoch": 0.6408537501878852, + "flos": 23109569095680.0, + "grad_norm": 1.6646400793514755, + "language_loss": 0.77166903, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.7926122, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3828125, + "step": 10659, + "time_per_iteration": 3.7069573402404785 + }, + { + "auxiliary_loss_clip": 0.01055338, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.01168704, + "balance_loss_mlp": 1.01826179, + "epoch": 0.6409138734405532, + "flos": 16471497651840.0, + "grad_norm": 2.5744518849273654, + "language_loss": 0.78555185, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80646062, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 10660, + "time_per_iteration": 3.84342098236084 + }, + { + "auxiliary_loss_clip": 0.01058872, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.02132618, + "balance_loss_mlp": 1.01926017, + "epoch": 0.6409739966932211, + "flos": 22776986684160.0, + "grad_norm": 1.9754310464487264, + "language_loss": 0.69699097, + "learning_rate": 1.206344067135727e-06, + "loss": 0.71807903, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39453125, + "step": 10661, + "time_per_iteration": 2.380420207977295 + }, + { + "auxiliary_loss_clip": 0.0105255, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.01512742, + "balance_loss_mlp": 1.01785946, + "epoch": 0.6410341199458891, + "flos": 25150975735680.0, + "grad_norm": 1.566880225688396, + "language_loss": 0.766415, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78730643, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 10662, + "time_per_iteration": 2.400042772293091 + }, + { + "auxiliary_loss_clip": 0.01054208, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.01440287, + "balance_loss_mlp": 1.01721311, + "epoch": 0.641094243198557, + "flos": 27045153705600.0, + "grad_norm": 2.544351464232849, + "language_loss": 0.71007144, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.73099732, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 10663, + "time_per_iteration": 2.4289908409118652 + }, + { + "auxiliary_loss_clip": 0.01056272, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.01549721, + "balance_loss_mlp": 1.01792169, + "epoch": 0.641154366451225, + "flos": 25373372296320.0, + "grad_norm": 1.8729769079934668, + "language_loss": 0.68965578, + "learning_rate": 1.205271750169389e-06, + "loss": 0.71064746, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.3828125, + "step": 10664, + "time_per_iteration": 2.403374671936035 + }, + { + "auxiliary_loss_clip": 0.01053273, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.01465058, + "balance_loss_mlp": 1.01735592, + "epoch": 0.6412144897038929, + "flos": 25152232544640.0, + "grad_norm": 1.7627688978215696, + "language_loss": 0.67152077, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.69243711, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 10665, + "time_per_iteration": 2.4104065895080566 + }, + { + "auxiliary_loss_clip": 0.01052678, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.01312363, + "balance_loss_mlp": 1.01731217, + "epoch": 0.641274612956561, + "flos": 23439637889280.0, + "grad_norm": 1.8658452993328458, + "language_loss": 0.64999056, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.67087507, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 10666, + "time_per_iteration": 2.389007568359375 + }, + { + "auxiliary_loss_clip": 0.01054399, + "auxiliary_loss_mlp": 0.01043106, + "balance_loss_clip": 1.01831055, + "balance_loss_mlp": 1.01716471, + "epoch": 0.6413347362092289, + "flos": 19426476504960.0, + "grad_norm": 1.646074272396001, + "language_loss": 0.72778642, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.74876148, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 10667, + "time_per_iteration": 2.41025447845459 + }, + { + "auxiliary_loss_clip": 0.01059378, + "auxiliary_loss_mlp": 0.01047361, + "balance_loss_clip": 1.01929951, + "balance_loss_mlp": 1.01849008, + "epoch": 0.6413948594618969, + "flos": 17195771139840.0, + "grad_norm": 2.797411574128994, + "language_loss": 0.79323852, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.8143059, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40820312, + "step": 10668, + "time_per_iteration": 2.3776769638061523 + }, + { + "auxiliary_loss_clip": 0.01056034, + "auxiliary_loss_mlp": 0.01040755, + "balance_loss_clip": 1.01718736, + "balance_loss_mlp": 1.01885843, + "epoch": 0.6414549827145648, + "flos": 22268790800640.0, + "grad_norm": 1.6522261830969793, + "language_loss": 0.69097501, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.71194291, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37109375, + "step": 10669, + "time_per_iteration": 2.4285190105438232 + }, + { + "auxiliary_loss_clip": 0.01059013, + "auxiliary_loss_mlp": 0.01047294, + "balance_loss_clip": 1.01950669, + "balance_loss_mlp": 1.01917839, + "epoch": 0.6415151059672328, + "flos": 19639342264320.0, + "grad_norm": 1.7723180267559417, + "language_loss": 0.79479444, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.81585753, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 10670, + "time_per_iteration": 3.8522849082946777 + }, + { + "auxiliary_loss_clip": 0.01056987, + "auxiliary_loss_mlp": 0.01042497, + "balance_loss_clip": 1.01507878, + "balance_loss_mlp": 1.01845694, + "epoch": 0.6415752292199008, + "flos": 14864969306880.0, + "grad_norm": 2.9585137338296885, + "language_loss": 0.91009593, + "learning_rate": 1.20277073264638e-06, + "loss": 0.93109083, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 10671, + "time_per_iteration": 2.341731548309326 + }, + { + "auxiliary_loss_clip": 0.01052726, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.01184702, + "balance_loss_mlp": 1.01808846, + "epoch": 0.6416353524725688, + "flos": 13734725996160.0, + "grad_norm": 4.996627810404741, + "language_loss": 0.70782137, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.72869098, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 10672, + "time_per_iteration": 2.3723559379577637 + }, + { + "auxiliary_loss_clip": 0.01057629, + "auxiliary_loss_mlp": 0.01044643, + "balance_loss_clip": 1.01621222, + "balance_loss_mlp": 1.01736414, + "epoch": 0.6416954757252368, + "flos": 24533780987520.0, + "grad_norm": 2.2783872948137547, + "language_loss": 0.75390226, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.77492499, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.40234375, + "step": 10673, + "time_per_iteration": 2.390533208847046 + }, + { + "auxiliary_loss_clip": 0.01054922, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.01393366, + "balance_loss_mlp": 1.01728511, + "epoch": 0.6417555989779047, + "flos": 27708747517440.0, + "grad_norm": 2.1857987204806304, + "language_loss": 0.71395153, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.73489869, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 10674, + "time_per_iteration": 2.4211432933807373 + }, + { + "auxiliary_loss_clip": 0.01057691, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.01694036, + "balance_loss_mlp": 1.01853454, + "epoch": 0.6418157222305727, + "flos": 20555637563520.0, + "grad_norm": 1.86127577495924, + "language_loss": 0.68764055, + "learning_rate": 1.201342244560338e-06, + "loss": 0.70864767, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 10675, + "time_per_iteration": 2.3760430812835693 + }, + { + "auxiliary_loss_clip": 0.01054251, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_clip": 1.01756454, + "balance_loss_mlp": 1.01711011, + "epoch": 0.6418758454832406, + "flos": 22600430605440.0, + "grad_norm": 1.9630571367314473, + "language_loss": 0.67968976, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.70065546, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 10676, + "time_per_iteration": 2.551560878753662 + }, + { + "auxiliary_loss_clip": 0.01056227, + "auxiliary_loss_mlp": 0.01040718, + "balance_loss_clip": 1.015481, + "balance_loss_mlp": 1.01858425, + "epoch": 0.6419359687359086, + "flos": 27374035512960.0, + "grad_norm": 2.2638541269874692, + "language_loss": 0.76676083, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.78773028, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 10677, + "time_per_iteration": 2.437696933746338 + }, + { + "auxiliary_loss_clip": 0.01009876, + "auxiliary_loss_mlp": 0.0101811, + "balance_loss_clip": 1.01574957, + "balance_loss_mlp": 1.00221074, + "epoch": 0.6419960919885765, + "flos": 67248791550720.0, + "grad_norm": 0.8556270219672066, + "language_loss": 0.60833019, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62861001, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07666016, + "step": 10678, + "time_per_iteration": 3.1020407676696777 + }, + { + "auxiliary_loss_clip": 0.01053049, + "auxiliary_loss_mlp": 0.01044937, + "balance_loss_clip": 1.02058268, + "balance_loss_mlp": 1.01705718, + "epoch": 0.6420562152412446, + "flos": 19900841414400.0, + "grad_norm": 1.750339292614293, + "language_loss": 0.68475562, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.7057355, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.359375, + "step": 10679, + "time_per_iteration": 2.371885299682617 + }, + { + "auxiliary_loss_clip": 0.01057117, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.01758385, + "balance_loss_mlp": 1.01858175, + "epoch": 0.6421163384939125, + "flos": 24789031004160.0, + "grad_norm": 1.699589616678148, + "language_loss": 0.74394464, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.76494062, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38476562, + "step": 10680, + "time_per_iteration": 2.4071402549743652 + }, + { + "auxiliary_loss_clip": 0.01053144, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.0136714, + "balance_loss_mlp": 1.01689231, + "epoch": 0.6421764617465805, + "flos": 25591649316480.0, + "grad_norm": 2.0390462060793606, + "language_loss": 0.69125843, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.71215564, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 10681, + "time_per_iteration": 2.4107799530029297 + }, + { + "auxiliary_loss_clip": 0.01052854, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.01549006, + "balance_loss_mlp": 1.01629424, + "epoch": 0.6422365849992484, + "flos": 14133923015040.0, + "grad_norm": 1.6734649179945302, + "language_loss": 0.75446945, + "learning_rate": 1.198843556910427e-06, + "loss": 0.77538538, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 10682, + "time_per_iteration": 2.334526538848877 + }, + { + "auxiliary_loss_clip": 0.01050962, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.01315141, + "balance_loss_mlp": 1.0157485, + "epoch": 0.6422967082519164, + "flos": 22382781989760.0, + "grad_norm": 1.55992840760253, + "language_loss": 0.79874563, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81962186, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 10683, + "time_per_iteration": 2.400954484939575 + }, + { + "auxiliary_loss_clip": 0.01053875, + "auxiliary_loss_mlp": 0.01044048, + "balance_loss_clip": 1.01833451, + "balance_loss_mlp": 1.01613474, + "epoch": 0.6423568315045844, + "flos": 14647041400320.0, + "grad_norm": 1.6955508467256435, + "language_loss": 0.68951005, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.71048927, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 10684, + "time_per_iteration": 2.3624110221862793 + }, + { + "auxiliary_loss_clip": 0.01052769, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.01396525, + "balance_loss_mlp": 1.01598036, + "epoch": 0.6424169547572524, + "flos": 26832706882560.0, + "grad_norm": 2.126061593480188, + "language_loss": 0.73062062, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.75152707, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 10685, + "time_per_iteration": 2.5211782455444336 + }, + { + "auxiliary_loss_clip": 0.01052381, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.01641345, + "balance_loss_mlp": 1.01642394, + "epoch": 0.6424770780099204, + "flos": 22706427093120.0, + "grad_norm": 1.6148825060397416, + "language_loss": 0.75902009, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77994347, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 10686, + "time_per_iteration": 2.3691868782043457 + }, + { + "auxiliary_loss_clip": 0.01057265, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.02233708, + "balance_loss_mlp": 1.01790047, + "epoch": 0.6425372012625883, + "flos": 28468422990720.0, + "grad_norm": 2.179395057425765, + "language_loss": 0.69784379, + "learning_rate": 1.197059691144867e-06, + "loss": 0.71892869, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.39257812, + "step": 10687, + "time_per_iteration": 2.4287612438201904 + }, + { + "auxiliary_loss_clip": 0.01055453, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.01503706, + "balance_loss_mlp": 1.01735544, + "epoch": 0.6425973245152563, + "flos": 29350398556800.0, + "grad_norm": 1.8218903750570488, + "language_loss": 0.67328203, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.69422823, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.38085938, + "step": 10688, + "time_per_iteration": 2.4716384410858154 + }, + { + "auxiliary_loss_clip": 0.01053591, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.01866603, + "balance_loss_mlp": 1.01607394, + "epoch": 0.6426574477679242, + "flos": 16429602153600.0, + "grad_norm": 1.7036518508961833, + "language_loss": 0.74397719, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.76494169, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 10689, + "time_per_iteration": 2.4085705280303955 + }, + { + "auxiliary_loss_clip": 0.01051974, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.01225734, + "balance_loss_mlp": 1.01583719, + "epoch": 0.6427175710205922, + "flos": 21834820200960.0, + "grad_norm": 2.193113961251425, + "language_loss": 0.73283064, + "learning_rate": 1.195989736948226e-06, + "loss": 0.75370026, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 10690, + "time_per_iteration": 2.395763635635376 + }, + { + "auxiliary_loss_clip": 0.01051792, + "auxiliary_loss_mlp": 0.01040189, + "balance_loss_clip": 1.01540542, + "balance_loss_mlp": 1.01596475, + "epoch": 0.6427776942732601, + "flos": 17785628426880.0, + "grad_norm": 1.7830753943888145, + "language_loss": 0.78103065, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80195045, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 10691, + "time_per_iteration": 2.3645246028900146 + }, + { + "auxiliary_loss_clip": 0.01055384, + "auxiliary_loss_mlp": 0.01039385, + "balance_loss_clip": 1.01164472, + "balance_loss_mlp": 1.01674926, + "epoch": 0.6428378175259282, + "flos": 15084991895040.0, + "grad_norm": 1.6204300017191564, + "language_loss": 0.7551949, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.77614248, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38671875, + "step": 10692, + "time_per_iteration": 2.354276180267334 + }, + { + "auxiliary_loss_clip": 0.01053239, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.01617575, + "balance_loss_mlp": 1.01689506, + "epoch": 0.6428979407785961, + "flos": 23840650298880.0, + "grad_norm": 1.9530625566787807, + "language_loss": 0.6293633, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.65029705, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 10693, + "time_per_iteration": 2.3854289054870605 + }, + { + "auxiliary_loss_clip": 0.010556, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.01370001, + "balance_loss_mlp": 1.01649117, + "epoch": 0.6429580640312641, + "flos": 32925469800960.0, + "grad_norm": 1.6536205758790061, + "language_loss": 0.61994201, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.640903, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 10694, + "time_per_iteration": 2.459805488586426 + }, + { + "auxiliary_loss_clip": 0.01056276, + "auxiliary_loss_mlp": 0.01041451, + "balance_loss_clip": 1.01629817, + "balance_loss_mlp": 1.01781714, + "epoch": 0.643018187283932, + "flos": 21067324583040.0, + "grad_norm": 1.504407988310007, + "language_loss": 0.80979681, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.83077407, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38476562, + "step": 10695, + "time_per_iteration": 3.580958366394043 + }, + { + "auxiliary_loss_clip": 0.01054903, + "auxiliary_loss_mlp": 0.01045051, + "balance_loss_clip": 1.01746631, + "balance_loss_mlp": 1.01708186, + "epoch": 0.6430783105366, + "flos": 26723428727040.0, + "grad_norm": 1.7353823201177376, + "language_loss": 0.74842203, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.76942152, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.37695312, + "step": 10696, + "time_per_iteration": 2.4017081260681152 + }, + { + "auxiliary_loss_clip": 0.01052038, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.00971532, + "balance_loss_mlp": 1.01667142, + "epoch": 0.643138433789268, + "flos": 23695690867200.0, + "grad_norm": 1.6477296287006251, + "language_loss": 0.76333517, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.78418195, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 10697, + "time_per_iteration": 2.4084744453430176 + }, + { + "auxiliary_loss_clip": 0.01053377, + "auxiliary_loss_mlp": 0.01039519, + "balance_loss_clip": 1.01646376, + "balance_loss_mlp": 1.01693583, + "epoch": 0.643198557041936, + "flos": 34200812188800.0, + "grad_norm": 1.4906432920631827, + "language_loss": 0.6720891, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.69301808, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 10698, + "time_per_iteration": 3.9717938899993896 + }, + { + "auxiliary_loss_clip": 0.0100969, + "auxiliary_loss_mlp": 0.01004272, + "balance_loss_clip": 1.00209081, + "balance_loss_mlp": 1.00244236, + "epoch": 0.643258680294604, + "flos": 67623059992320.0, + "grad_norm": 0.8624979824899943, + "language_loss": 0.6359002, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65603983, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.07226562, + "step": 10699, + "time_per_iteration": 2.9856224060058594 + }, + { + "auxiliary_loss_clip": 0.01052825, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01269412, + "balance_loss_mlp": 1.01709771, + "epoch": 0.6433188035472719, + "flos": 25184981266560.0, + "grad_norm": 1.7671468828838948, + "language_loss": 0.70035625, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.72122163, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.35742188, + "step": 10700, + "time_per_iteration": 3.7813150882720947 + }, + { + "auxiliary_loss_clip": 0.01055144, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.00848031, + "balance_loss_mlp": 1.01742578, + "epoch": 0.6433789267999399, + "flos": 24972394798080.0, + "grad_norm": 1.6634902475705713, + "language_loss": 0.74155116, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.7624476, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37695312, + "step": 10701, + "time_per_iteration": 2.4258172512054443 + }, + { + "auxiliary_loss_clip": 0.01055013, + "auxiliary_loss_mlp": 0.01040611, + "balance_loss_clip": 1.01384902, + "balance_loss_mlp": 1.01651549, + "epoch": 0.6434390500526078, + "flos": 17565082168320.0, + "grad_norm": 2.1375158074004377, + "language_loss": 0.83228147, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.85323769, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38476562, + "step": 10702, + "time_per_iteration": 2.3326172828674316 + }, + { + "auxiliary_loss_clip": 0.01052458, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.01410317, + "balance_loss_mlp": 1.01666069, + "epoch": 0.6434991733052758, + "flos": 20842728606720.0, + "grad_norm": 2.0747672869938913, + "language_loss": 0.76191789, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.7828154, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35742188, + "step": 10703, + "time_per_iteration": 2.363732099533081 + }, + { + "auxiliary_loss_clip": 0.01010596, + "auxiliary_loss_mlp": 0.01001486, + "balance_loss_clip": 0.99922103, + "balance_loss_mlp": 1.00305915, + "epoch": 0.6435592965579437, + "flos": 66091210778880.0, + "grad_norm": 0.6945155037601164, + "language_loss": 0.54729277, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56741357, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.07519531, + "step": 10704, + "time_per_iteration": 3.0140440464019775 + }, + { + "auxiliary_loss_clip": 0.01052889, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.01218653, + "balance_loss_mlp": 1.01629162, + "epoch": 0.6436194198106118, + "flos": 23767716735360.0, + "grad_norm": 1.6281130896651437, + "language_loss": 0.78319055, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.80405295, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.36523438, + "step": 10705, + "time_per_iteration": 2.4040493965148926 + }, + { + "auxiliary_loss_clip": 0.01052926, + "auxiliary_loss_mlp": 0.01040967, + "balance_loss_clip": 1.0180192, + "balance_loss_mlp": 1.01659822, + "epoch": 0.6436795430632797, + "flos": 20229269374080.0, + "grad_norm": 1.888773325305717, + "language_loss": 0.8063305, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.82726943, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 10706, + "time_per_iteration": 2.3540918827056885 + }, + { + "auxiliary_loss_clip": 0.01052227, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.01396537, + "balance_loss_mlp": 1.01605856, + "epoch": 0.6437396663159477, + "flos": 20300841394560.0, + "grad_norm": 1.838388547099875, + "language_loss": 0.81572294, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.83661687, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 10707, + "time_per_iteration": 2.3948333263397217 + }, + { + "auxiliary_loss_clip": 0.01054303, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.01642239, + "balance_loss_mlp": 1.01744175, + "epoch": 0.6437997895686156, + "flos": 23877448738560.0, + "grad_norm": 1.8660772370602432, + "language_loss": 0.86525905, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.88619882, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 10708, + "time_per_iteration": 2.4702959060668945 + }, + { + "auxiliary_loss_clip": 0.01058243, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_clip": 1.01764095, + "balance_loss_mlp": 1.01843274, + "epoch": 0.6438599128212836, + "flos": 18988281630720.0, + "grad_norm": 3.2966283248168, + "language_loss": 0.67608833, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.69711095, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 10709, + "time_per_iteration": 3.814629077911377 + }, + { + "auxiliary_loss_clip": 0.01052714, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.01861072, + "balance_loss_mlp": 1.01615, + "epoch": 0.6439200360739517, + "flos": 24095236999680.0, + "grad_norm": 1.6438641321722702, + "language_loss": 0.81361133, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.83456832, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 10710, + "time_per_iteration": 2.395677328109741 + }, + { + "auxiliary_loss_clip": 0.01052841, + "auxiliary_loss_mlp": 0.01038785, + "balance_loss_clip": 1.0136323, + "balance_loss_mlp": 1.01610041, + "epoch": 0.6439801593266196, + "flos": 31900873864320.0, + "grad_norm": 1.8083886125286188, + "language_loss": 0.6709764, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.69189262, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 10711, + "time_per_iteration": 2.5016355514526367 + }, + { + "auxiliary_loss_clip": 0.01054665, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.01914263, + "balance_loss_mlp": 1.01777828, + "epoch": 0.6440402825792876, + "flos": 27124650604800.0, + "grad_norm": 1.641758298556535, + "language_loss": 0.79769158, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.8186599, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36914062, + "step": 10712, + "time_per_iteration": 2.4183201789855957 + }, + { + "auxiliary_loss_clip": 0.01054815, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.01197124, + "balance_loss_mlp": 1.01650321, + "epoch": 0.6441004058319555, + "flos": 20666661287040.0, + "grad_norm": 1.7981348117655414, + "language_loss": 0.83995491, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.86086869, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 10713, + "time_per_iteration": 2.3674724102020264 + }, + { + "auxiliary_loss_clip": 0.01051097, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.01750064, + "balance_loss_mlp": 1.01646471, + "epoch": 0.6441605290846235, + "flos": 26024956600320.0, + "grad_norm": 1.557684890340087, + "language_loss": 0.78950322, + "learning_rate": 1.187440012188684e-06, + "loss": 0.81040585, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 10714, + "time_per_iteration": 2.4062259197235107 + }, + { + "auxiliary_loss_clip": 0.01051863, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.01479375, + "balance_loss_mlp": 1.01637661, + "epoch": 0.6442206523372914, + "flos": 24898344071040.0, + "grad_norm": 1.5237721180283321, + "language_loss": 0.8197763, + "learning_rate": 1.187084157517583e-06, + "loss": 0.84066415, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 10715, + "time_per_iteration": 2.4270832538604736 + }, + { + "auxiliary_loss_clip": 0.01053142, + "auxiliary_loss_mlp": 0.01041288, + "balance_loss_clip": 1.01730299, + "balance_loss_mlp": 1.01516271, + "epoch": 0.6442807755899594, + "flos": 25155130187520.0, + "grad_norm": 1.9108719552172138, + "language_loss": 0.82817119, + "learning_rate": 1.186728333672332e-06, + "loss": 0.84911549, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38085938, + "step": 10716, + "time_per_iteration": 2.419827699661255 + }, + { + "auxiliary_loss_clip": 0.01055216, + "auxiliary_loss_mlp": 0.01044894, + "balance_loss_clip": 1.01779795, + "balance_loss_mlp": 1.01651645, + "epoch": 0.6443408988426274, + "flos": 27343276738560.0, + "grad_norm": 1.8757148115109004, + "language_loss": 0.79551828, + "learning_rate": 1.186372540666424e-06, + "loss": 0.81651938, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38671875, + "step": 10717, + "time_per_iteration": 2.415886163711548 + }, + { + "auxiliary_loss_clip": 0.01052067, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.014642, + "balance_loss_mlp": 1.01691508, + "epoch": 0.6444010220952954, + "flos": 27927094360320.0, + "grad_norm": 2.9504539086100205, + "language_loss": 0.68914402, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.71004641, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 10718, + "time_per_iteration": 2.411648750305176 + }, + { + "auxiliary_loss_clip": 0.01009957, + "auxiliary_loss_mlp": 0.01010483, + "balance_loss_clip": 1.00758588, + "balance_loss_mlp": 1.00239456, + "epoch": 0.6444611453479633, + "flos": 71212167469440.0, + "grad_norm": 0.7712524103471421, + "language_loss": 0.49760127, + "learning_rate": 1.185661047226603e-06, + "loss": 0.5178057, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.07568359, + "step": 10719, + "time_per_iteration": 3.1737265586853027 + }, + { + "auxiliary_loss_clip": 0.01055814, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.01983488, + "balance_loss_mlp": 1.01812685, + "epoch": 0.6445212686006313, + "flos": 22704192766080.0, + "grad_norm": 1.682722298092546, + "language_loss": 0.78767395, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80868006, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 10720, + "time_per_iteration": 2.3661375045776367 + }, + { + "auxiliary_loss_clip": 0.01054318, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.01585507, + "balance_loss_mlp": 1.01670694, + "epoch": 0.6445813918532992, + "flos": 21177754813440.0, + "grad_norm": 1.9505274983071488, + "language_loss": 0.78128159, + "learning_rate": 1.18494967730604e-06, + "loss": 0.80224323, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 10721, + "time_per_iteration": 2.387746810913086 + }, + { + "auxiliary_loss_clip": 0.01052756, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.01627731, + "balance_loss_mlp": 1.01553679, + "epoch": 0.6446415151059672, + "flos": 25190741640960.0, + "grad_norm": 2.0513060507288063, + "language_loss": 0.73878652, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75973314, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 10722, + "time_per_iteration": 2.388833999633789 + }, + { + "auxiliary_loss_clip": 0.01053136, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.0102396, + "balance_loss_mlp": 1.01675332, + "epoch": 0.6447016383586353, + "flos": 25301032225920.0, + "grad_norm": 1.4792525898387812, + "language_loss": 0.79362714, + "learning_rate": 1.184238431012635e-06, + "loss": 0.81449652, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 10723, + "time_per_iteration": 2.414485454559326 + }, + { + "auxiliary_loss_clip": 0.01057727, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.01952934, + "balance_loss_mlp": 1.01840436, + "epoch": 0.6447617616113032, + "flos": 27702079447680.0, + "grad_norm": 1.5103253222744428, + "language_loss": 0.59427392, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.61529553, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.39453125, + "step": 10724, + "time_per_iteration": 2.4422607421875 + }, + { + "auxiliary_loss_clip": 0.01054386, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.01350439, + "balance_loss_mlp": 1.01806796, + "epoch": 0.6448218848639712, + "flos": 23037997075200.0, + "grad_norm": 1.68644636235414, + "language_loss": 0.84755027, + "learning_rate": 1.183527308454271e-06, + "loss": 0.86844927, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 10725, + "time_per_iteration": 2.391510486602783 + }, + { + "auxiliary_loss_clip": 0.01055559, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.01499808, + "balance_loss_mlp": 1.01725495, + "epoch": 0.6448820081166391, + "flos": 24495027511680.0, + "grad_norm": 1.9892069472440201, + "language_loss": 0.82797849, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.8489356, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 10726, + "time_per_iteration": 2.4205148220062256 + }, + { + "auxiliary_loss_clip": 0.01056457, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.01704335, + "balance_loss_mlp": 1.01771736, + "epoch": 0.6449421313693071, + "flos": 22418183975040.0, + "grad_norm": 1.7069441959826102, + "language_loss": 0.82297122, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.84395808, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 10727, + "time_per_iteration": 2.393566131591797 + }, + { + "auxiliary_loss_clip": 0.01058732, + "auxiliary_loss_mlp": 0.01038891, + "balance_loss_clip": 1.01223564, + "balance_loss_mlp": 1.01750565, + "epoch": 0.645002254621975, + "flos": 20224800720000.0, + "grad_norm": 1.7841363261415206, + "language_loss": 0.80189377, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.82287002, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.41210938, + "step": 10728, + "time_per_iteration": 2.3540499210357666 + }, + { + "auxiliary_loss_clip": 0.01055426, + "auxiliary_loss_mlp": 0.0104362, + "balance_loss_clip": 1.01621377, + "balance_loss_mlp": 1.01744628, + "epoch": 0.645062377874643, + "flos": 27854195708160.0, + "grad_norm": 1.7961420948594051, + "language_loss": 0.75625938, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.77724981, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37890625, + "step": 10729, + "time_per_iteration": 2.432633638381958 + }, + { + "auxiliary_loss_clip": 0.01054392, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.01731515, + "balance_loss_mlp": 1.01686764, + "epoch": 0.645122501127311, + "flos": 25300333998720.0, + "grad_norm": 1.6011409930579357, + "language_loss": 0.67619675, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.69716311, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 10730, + "time_per_iteration": 2.395230770111084 + }, + { + "auxiliary_loss_clip": 0.01055907, + "auxiliary_loss_mlp": 0.01040148, + "balance_loss_clip": 1.0150423, + "balance_loss_mlp": 1.01763082, + "epoch": 0.645182624379979, + "flos": 18806349202560.0, + "grad_norm": 1.4671281571022448, + "language_loss": 0.64338934, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.66434985, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 10731, + "time_per_iteration": 2.3703746795654297 + }, + { + "auxiliary_loss_clip": 0.01053643, + "auxiliary_loss_mlp": 0.01041254, + "balance_loss_clip": 1.0159694, + "balance_loss_mlp": 1.01715302, + "epoch": 0.6452427476326469, + "flos": 18331216243200.0, + "grad_norm": 1.800895069199503, + "language_loss": 0.69714987, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.71809882, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 10732, + "time_per_iteration": 2.3427488803863525 + }, + { + "auxiliary_loss_clip": 0.01053508, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.01679516, + "balance_loss_mlp": 1.01802886, + "epoch": 0.6453028708853149, + "flos": 22783619842560.0, + "grad_norm": 1.5554288016469293, + "language_loss": 0.76663816, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.78757781, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 10733, + "time_per_iteration": 2.4185116291046143 + }, + { + "auxiliary_loss_clip": 0.01056726, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.01731586, + "balance_loss_mlp": 1.01836801, + "epoch": 0.6453629941379828, + "flos": 23945005952640.0, + "grad_norm": 2.6930892556651402, + "language_loss": 0.68514895, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.70612955, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38476562, + "step": 10734, + "time_per_iteration": 3.7996668815612793 + }, + { + "auxiliary_loss_clip": 0.01053781, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.01778495, + "balance_loss_mlp": 1.01878285, + "epoch": 0.6454231173906508, + "flos": 17675407664640.0, + "grad_norm": 2.9105899314277166, + "language_loss": 0.75481194, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.77576637, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34960938, + "step": 10735, + "time_per_iteration": 2.3712570667266846 + }, + { + "auxiliary_loss_clip": 0.01055001, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.01539695, + "balance_loss_mlp": 1.01810312, + "epoch": 0.6454832406433189, + "flos": 23291710992000.0, + "grad_norm": 1.7073602188211963, + "language_loss": 0.75933772, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.78027594, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 10736, + "time_per_iteration": 2.392698049545288 + }, + { + "auxiliary_loss_clip": 0.01058739, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.01578188, + "balance_loss_mlp": 1.02004504, + "epoch": 0.6455433638959868, + "flos": 20156161253760.0, + "grad_norm": 1.900999164393352, + "language_loss": 0.72134382, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.74232841, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38671875, + "step": 10737, + "time_per_iteration": 2.391960859298706 + }, + { + "auxiliary_loss_clip": 0.01011817, + "auxiliary_loss_mlp": 0.01002632, + "balance_loss_clip": 1.00027215, + "balance_loss_mlp": 1.00413668, + "epoch": 0.6456034871486548, + "flos": 66529510387200.0, + "grad_norm": 0.795175811814308, + "language_loss": 0.5848195, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60496402, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07714844, + "step": 10738, + "time_per_iteration": 4.379191637039185 + }, + { + "auxiliary_loss_clip": 0.01053243, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.012254, + "balance_loss_mlp": 1.01744652, + "epoch": 0.6456636104013227, + "flos": 24204969002880.0, + "grad_norm": 1.6325777095879264, + "language_loss": 0.75423586, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.77512997, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35742188, + "step": 10739, + "time_per_iteration": 3.8138628005981445 + }, + { + "auxiliary_loss_clip": 0.01055794, + "auxiliary_loss_mlp": 0.01040912, + "balance_loss_clip": 1.01546085, + "balance_loss_mlp": 1.01777565, + "epoch": 0.6457237336539907, + "flos": 23622931860480.0, + "grad_norm": 2.150783004085481, + "language_loss": 0.73029488, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.75126195, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 10740, + "time_per_iteration": 2.3869235515594482 + }, + { + "auxiliary_loss_clip": 0.01012066, + "auxiliary_loss_mlp": 0.01019412, + "balance_loss_clip": 1.01665783, + "balance_loss_mlp": 1.00446248, + "epoch": 0.6457838569066586, + "flos": 65842454275200.0, + "grad_norm": 0.6717043207987834, + "language_loss": 0.55340743, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57372224, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.07617188, + "step": 10741, + "time_per_iteration": 3.0664541721343994 + }, + { + "auxiliary_loss_clip": 0.01052078, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.01781857, + "balance_loss_mlp": 1.01730251, + "epoch": 0.6458439801593266, + "flos": 22380896776320.0, + "grad_norm": 1.6226447133758133, + "language_loss": 0.81991458, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.84082884, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 10742, + "time_per_iteration": 2.3851029872894287 + }, + { + "auxiliary_loss_clip": 0.01051921, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.0123105, + "balance_loss_mlp": 1.01692116, + "epoch": 0.6459041034119946, + "flos": 24788123308800.0, + "grad_norm": 1.5506538038782571, + "language_loss": 0.82652557, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.8474018, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34960938, + "step": 10743, + "time_per_iteration": 2.4102864265441895 + }, + { + "auxiliary_loss_clip": 0.01053756, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.01677227, + "balance_loss_mlp": 1.01748621, + "epoch": 0.6459642266646626, + "flos": 18324583084800.0, + "grad_norm": 1.7463949148069677, + "language_loss": 0.72941077, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.75035518, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 10744, + "time_per_iteration": 2.3483335971832275 + }, + { + "auxiliary_loss_clip": 0.01052256, + "auxiliary_loss_mlp": 0.01039819, + "balance_loss_clip": 1.01584637, + "balance_loss_mlp": 1.01625538, + "epoch": 0.6460243499173305, + "flos": 43579670094720.0, + "grad_norm": 1.6235906214319749, + "language_loss": 0.68572462, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.70664543, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 10745, + "time_per_iteration": 2.5791451930999756 + }, + { + "auxiliary_loss_clip": 0.01052343, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.01580834, + "balance_loss_mlp": 1.01653504, + "epoch": 0.6460844731699985, + "flos": 19243042888320.0, + "grad_norm": 4.0771560974304455, + "language_loss": 0.75030899, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.77121782, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 10746, + "time_per_iteration": 2.3682284355163574 + }, + { + "auxiliary_loss_clip": 0.01055678, + "auxiliary_loss_mlp": 0.01039029, + "balance_loss_clip": 1.01698756, + "balance_loss_mlp": 1.01829195, + "epoch": 0.6461445964226664, + "flos": 27452135957760.0, + "grad_norm": 1.407197156894725, + "language_loss": 0.67873442, + "learning_rate": 1.175713157660413e-06, + "loss": 0.69968146, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.37304688, + "step": 10747, + "time_per_iteration": 2.470689296722412 + }, + { + "auxiliary_loss_clip": 0.01052606, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.01769567, + "epoch": 0.6462047196753344, + "flos": 20294662083840.0, + "grad_norm": 1.5512036110985203, + "language_loss": 0.68282259, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.70376855, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34960938, + "step": 10748, + "time_per_iteration": 2.3981056213378906 + }, + { + "auxiliary_loss_clip": 0.01056987, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_clip": 1.01756263, + "balance_loss_mlp": 1.01873374, + "epoch": 0.6462648429280025, + "flos": 22017241388160.0, + "grad_norm": 1.8297628943502924, + "language_loss": 0.77267909, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.79368258, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10749, + "time_per_iteration": 3.8550045490264893 + }, + { + "auxiliary_loss_clip": 0.01053691, + "auxiliary_loss_mlp": 0.01041064, + "balance_loss_clip": 1.01768696, + "balance_loss_mlp": 1.01655889, + "epoch": 0.6463249661806704, + "flos": 27779935512960.0, + "grad_norm": 1.717767441695848, + "language_loss": 0.78332275, + "learning_rate": 1.17464876058473e-06, + "loss": 0.80427027, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 10750, + "time_per_iteration": 2.4255354404449463 + }, + { + "auxiliary_loss_clip": 0.0105581, + "auxiliary_loss_mlp": 0.01044097, + "balance_loss_clip": 1.01974297, + "balance_loss_mlp": 1.01764882, + "epoch": 0.6463850894333384, + "flos": 22049606085120.0, + "grad_norm": 2.212409144374465, + "language_loss": 0.70545793, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.726457, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 10751, + "time_per_iteration": 2.4009346961975098 + }, + { + "auxiliary_loss_clip": 0.01054199, + "auxiliary_loss_mlp": 0.01047769, + "balance_loss_clip": 1.02459455, + "balance_loss_mlp": 1.01710939, + "epoch": 0.6464452126860063, + "flos": 21105170363520.0, + "grad_norm": 2.114633290230461, + "language_loss": 0.72967124, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.75069094, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 10752, + "time_per_iteration": 2.370039463043213 + }, + { + "auxiliary_loss_clip": 0.01054393, + "auxiliary_loss_mlp": 0.01046104, + "balance_loss_clip": 1.01930606, + "balance_loss_mlp": 1.01659274, + "epoch": 0.6465053359386743, + "flos": 16027298023680.0, + "grad_norm": 1.7612873787385885, + "language_loss": 0.79361367, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.81461859, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.37890625, + "step": 10753, + "time_per_iteration": 2.367999315261841 + }, + { + "auxiliary_loss_clip": 0.01052386, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_clip": 1.02625108, + "balance_loss_mlp": 1.01682591, + "epoch": 0.6465654591913422, + "flos": 23397707479680.0, + "grad_norm": 1.6362025251436505, + "language_loss": 0.85716891, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.8781901, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35546875, + "step": 10754, + "time_per_iteration": 2.4041528701782227 + }, + { + "auxiliary_loss_clip": 0.01053089, + "auxiliary_loss_mlp": 0.01045076, + "balance_loss_clip": 1.02106738, + "balance_loss_mlp": 1.01722264, + "epoch": 0.6466255824440102, + "flos": 15376377035520.0, + "grad_norm": 3.1817582807084706, + "language_loss": 0.60957265, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.63055432, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 10755, + "time_per_iteration": 2.346297264099121 + }, + { + "auxiliary_loss_clip": 0.01052884, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.02036846, + "balance_loss_mlp": 1.01653504, + "epoch": 0.6466857056966782, + "flos": 16251928911360.0, + "grad_norm": 2.0630861501193425, + "language_loss": 0.69222093, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.7131952, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 10756, + "time_per_iteration": 2.336540699005127 + }, + { + "auxiliary_loss_clip": 0.01057236, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.01786733, + "epoch": 0.6467458289493462, + "flos": 21177196231680.0, + "grad_norm": 3.0634835665401465, + "language_loss": 0.7641927, + "learning_rate": 1.172166263444844e-06, + "loss": 0.78523582, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.39453125, + "step": 10757, + "time_per_iteration": 2.356574296951294 + }, + { + "auxiliary_loss_clip": 0.01052525, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.01823473, + "balance_loss_mlp": 1.01738441, + "epoch": 0.6468059522020141, + "flos": 17967316475520.0, + "grad_norm": 1.548720198723808, + "language_loss": 0.75781214, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.77875543, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 10758, + "time_per_iteration": 2.363762140274048 + }, + { + "auxiliary_loss_clip": 0.0105371, + "auxiliary_loss_mlp": 0.01051472, + "balance_loss_clip": 1.02530515, + "balance_loss_mlp": 1.0169245, + "epoch": 0.6468660754546821, + "flos": 17889320764800.0, + "grad_norm": 1.8491373520255292, + "language_loss": 0.68865347, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70970523, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 10759, + "time_per_iteration": 2.3759570121765137 + }, + { + "auxiliary_loss_clip": 0.01055926, + "auxiliary_loss_mlp": 0.01040885, + "balance_loss_clip": 1.01707864, + "balance_loss_mlp": 1.01778293, + "epoch": 0.64692619870735, + "flos": 22599906935040.0, + "grad_norm": 1.4806612172451579, + "language_loss": 0.765571, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.78653914, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 10760, + "time_per_iteration": 2.4225521087646484 + }, + { + "auxiliary_loss_clip": 0.01052047, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.01488686, + "balance_loss_mlp": 1.01672626, + "epoch": 0.646986321960018, + "flos": 49598940867840.0, + "grad_norm": 1.5112768083666424, + "language_loss": 0.66126728, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.68215448, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35351562, + "step": 10761, + "time_per_iteration": 2.613661527633667 + }, + { + "auxiliary_loss_clip": 0.01054146, + "auxiliary_loss_mlp": 0.01040465, + "balance_loss_clip": 1.01581228, + "balance_loss_mlp": 1.01722002, + "epoch": 0.6470464452126861, + "flos": 21907369739520.0, + "grad_norm": 1.9644075105599186, + "language_loss": 0.71024525, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.7311914, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 10762, + "time_per_iteration": 2.409022808074951 + }, + { + "auxiliary_loss_clip": 0.01056518, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.01988125, + "balance_loss_mlp": 1.01837718, + "epoch": 0.647106568465354, + "flos": 18105363457920.0, + "grad_norm": 1.8577301819439889, + "language_loss": 0.83436453, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.855371, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38085938, + "step": 10763, + "time_per_iteration": 2.352147102355957 + }, + { + "auxiliary_loss_clip": 0.01009766, + "auxiliary_loss_mlp": 0.01008341, + "balance_loss_clip": 1.00595677, + "balance_loss_mlp": 1.00233459, + "epoch": 0.647166691718022, + "flos": 69476773829760.0, + "grad_norm": 0.7101647323556208, + "language_loss": 0.57929057, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59947163, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.07421875, + "step": 10764, + "time_per_iteration": 3.2071871757507324 + }, + { + "auxiliary_loss_clip": 0.01051894, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.01480865, + "balance_loss_mlp": 1.01674795, + "epoch": 0.6472268149706899, + "flos": 34093733448960.0, + "grad_norm": 1.8034147837497108, + "language_loss": 0.61609912, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.63698763, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 10765, + "time_per_iteration": 2.483252763748169 + }, + { + "auxiliary_loss_clip": 0.01052273, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.01179504, + "balance_loss_mlp": 1.01748705, + "epoch": 0.6472869382233579, + "flos": 28109969395200.0, + "grad_norm": 1.780535568082008, + "language_loss": 0.63800842, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65886569, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34765625, + "step": 10766, + "time_per_iteration": 2.44814395904541 + }, + { + "auxiliary_loss_clip": 0.01054837, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.01256084, + "balance_loss_mlp": 1.01816034, + "epoch": 0.6473470614760258, + "flos": 22491047715840.0, + "grad_norm": 1.6303592850671424, + "language_loss": 0.77162993, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.79253471, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 10767, + "time_per_iteration": 2.3859012126922607 + }, + { + "auxiliary_loss_clip": 0.01052623, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.01195371, + "balance_loss_mlp": 1.01787627, + "epoch": 0.6474071847286939, + "flos": 14538042535680.0, + "grad_norm": 1.9618581802708077, + "language_loss": 0.7883839, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.80925846, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 10768, + "time_per_iteration": 2.364060163497925 + }, + { + "auxiliary_loss_clip": 0.01053795, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.01005888, + "balance_loss_mlp": 1.01806331, + "epoch": 0.6474673079813618, + "flos": 24097052390400.0, + "grad_norm": 2.0090715355000746, + "language_loss": 0.721825, + "learning_rate": 1.167914135250663e-06, + "loss": 0.7426765, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35742188, + "step": 10769, + "time_per_iteration": 2.4069390296936035 + }, + { + "auxiliary_loss_clip": 0.01053166, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.01266026, + "balance_loss_mlp": 1.01855993, + "epoch": 0.6475274312340298, + "flos": 14975294803200.0, + "grad_norm": 1.8178753569146218, + "language_loss": 0.7480287, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.76890707, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34570312, + "step": 10770, + "time_per_iteration": 2.4079084396362305 + }, + { + "auxiliary_loss_clip": 0.01056294, + "auxiliary_loss_mlp": 0.01036837, + "balance_loss_clip": 1.01104045, + "balance_loss_mlp": 1.01817322, + "epoch": 0.6475875544866977, + "flos": 25044176286720.0, + "grad_norm": 1.8954301642959925, + "language_loss": 0.74890387, + "learning_rate": 1.167205888330325e-06, + "loss": 0.76983517, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 10771, + "time_per_iteration": 2.425462007522583 + }, + { + "auxiliary_loss_clip": 0.01055454, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.0175004, + "balance_loss_mlp": 1.0194943, + "epoch": 0.6476476777393657, + "flos": 16471218360960.0, + "grad_norm": 1.9796592032778333, + "language_loss": 0.74976975, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.77073807, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 10772, + "time_per_iteration": 2.4447150230407715 + }, + { + "auxiliary_loss_clip": 0.01052213, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.0155201, + "balance_loss_mlp": 1.01731837, + "epoch": 0.6477078009920336, + "flos": 25811078411520.0, + "grad_norm": 1.5387638023143058, + "language_loss": 0.83863413, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85950649, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.34960938, + "step": 10773, + "time_per_iteration": 2.4091789722442627 + }, + { + "auxiliary_loss_clip": 0.01050884, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.00823641, + "balance_loss_mlp": 1.0170126, + "epoch": 0.6477679242447016, + "flos": 17675163285120.0, + "grad_norm": 1.6100838379248177, + "language_loss": 0.79435909, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.81516922, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 10774, + "time_per_iteration": 3.6323893070220947 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.01786137, + "epoch": 0.6478280474973696, + "flos": 21031259281920.0, + "grad_norm": 2.269975489431806, + "language_loss": 0.70842749, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.72937626, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 10775, + "time_per_iteration": 2.401000738143921 + }, + { + "auxiliary_loss_clip": 0.01055355, + "auxiliary_loss_mlp": 0.01043381, + "balance_loss_clip": 1.01856196, + "balance_loss_mlp": 1.01849294, + "epoch": 0.6478881707500376, + "flos": 21615844953600.0, + "grad_norm": 2.59157063435956, + "language_loss": 0.67743123, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.69841856, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 10776, + "time_per_iteration": 2.372556447982788 + }, + { + "auxiliary_loss_clip": 0.01054842, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.01134026, + "balance_loss_mlp": 1.01717687, + "epoch": 0.6479482940027056, + "flos": 18441576650880.0, + "grad_norm": 2.4722359899951556, + "language_loss": 0.80916715, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.83007073, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 10777, + "time_per_iteration": 2.3571970462799072 + }, + { + "auxiliary_loss_clip": 0.01054595, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.01598144, + "balance_loss_mlp": 1.01832831, + "epoch": 0.6480084172553735, + "flos": 22162968869760.0, + "grad_norm": 1.8850138482697545, + "language_loss": 0.74282831, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.7637682, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 10778, + "time_per_iteration": 3.702528238296509 + }, + { + "auxiliary_loss_clip": 0.01052423, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.0150038, + "balance_loss_mlp": 1.01671982, + "epoch": 0.6480685405080415, + "flos": 24315085031040.0, + "grad_norm": 1.4987539021289757, + "language_loss": 0.7923708, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.81326962, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 10779, + "time_per_iteration": 3.7859139442443848 + }, + { + "auxiliary_loss_clip": 0.01011707, + "auxiliary_loss_mlp": 0.01002646, + "balance_loss_clip": 1.00051248, + "balance_loss_mlp": 1.00422502, + "epoch": 0.6481286637607094, + "flos": 59888017630080.0, + "grad_norm": 0.7202543686298508, + "language_loss": 0.59475619, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61489969, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.07470703, + "step": 10780, + "time_per_iteration": 3.0126631259918213 + }, + { + "auxiliary_loss_clip": 0.01053554, + "auxiliary_loss_mlp": 0.01045251, + "balance_loss_clip": 1.02156377, + "balance_loss_mlp": 1.01742435, + "epoch": 0.6481887870133775, + "flos": 25482999565440.0, + "grad_norm": 1.9196869839371726, + "language_loss": 0.80842417, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.82941222, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36132812, + "step": 10781, + "time_per_iteration": 2.403019905090332 + }, + { + "auxiliary_loss_clip": 0.01055093, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.01652539, + "balance_loss_mlp": 1.01843333, + "epoch": 0.6482489102660454, + "flos": 19929400773120.0, + "grad_norm": 3.285919599521553, + "language_loss": 0.8038035, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.82476008, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 10782, + "time_per_iteration": 2.346817970275879 + }, + { + "auxiliary_loss_clip": 0.01053973, + "auxiliary_loss_mlp": 0.01041108, + "balance_loss_clip": 1.01738548, + "balance_loss_mlp": 1.01712894, + "epoch": 0.6483090335187134, + "flos": 26978259807360.0, + "grad_norm": 2.173204223796254, + "language_loss": 0.66183722, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.68278807, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 10783, + "time_per_iteration": 2.4247231483459473 + }, + { + "auxiliary_loss_clip": 0.01055424, + "auxiliary_loss_mlp": 0.01045485, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.0176506, + "epoch": 0.6483691567713813, + "flos": 25076925008640.0, + "grad_norm": 1.7895460703469552, + "language_loss": 0.89709628, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.91810524, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 10784, + "time_per_iteration": 2.4067225456237793 + }, + { + "auxiliary_loss_clip": 0.01053308, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.01389635, + "balance_loss_mlp": 1.01759315, + "epoch": 0.6484292800240493, + "flos": 16105084266240.0, + "grad_norm": 2.1743876122555927, + "language_loss": 0.75526309, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.77615684, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35742188, + "step": 10785, + "time_per_iteration": 2.3301734924316406 + }, + { + "auxiliary_loss_clip": 0.01051925, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.016873, + "balance_loss_mlp": 1.01665425, + "epoch": 0.6484894032767172, + "flos": 28839130473600.0, + "grad_norm": 1.4619784339513133, + "language_loss": 0.70022833, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.72113961, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 10786, + "time_per_iteration": 2.44413685798645 + }, + { + "auxiliary_loss_clip": 0.01051906, + "auxiliary_loss_mlp": 0.01043339, + "balance_loss_clip": 1.02121329, + "balance_loss_mlp": 1.01546884, + "epoch": 0.6485495265293852, + "flos": 30225740964480.0, + "grad_norm": 2.0407075140378, + "language_loss": 0.72299671, + "learning_rate": 1.161544469455041e-06, + "loss": 0.74394912, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 10787, + "time_per_iteration": 2.4500064849853516 + }, + { + "auxiliary_loss_clip": 0.01055739, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.01476169, + "balance_loss_mlp": 1.01707757, + "epoch": 0.6486096497820532, + "flos": 20081202831360.0, + "grad_norm": 2.0061588118475417, + "language_loss": 0.85293061, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.87388843, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 10788, + "time_per_iteration": 2.3468503952026367 + }, + { + "auxiliary_loss_clip": 0.01053959, + "auxiliary_loss_mlp": 0.01040898, + "balance_loss_clip": 1.01609111, + "balance_loss_mlp": 1.0170908, + "epoch": 0.6486697730347212, + "flos": 17128109191680.0, + "grad_norm": 1.8566565403983104, + "language_loss": 0.7852869, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.80623543, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 10789, + "time_per_iteration": 3.799407958984375 + }, + { + "auxiliary_loss_clip": 0.01050838, + "auxiliary_loss_mlp": 0.01038869, + "balance_loss_clip": 1.01657724, + "balance_loss_mlp": 1.01577413, + "epoch": 0.6487298962873892, + "flos": 38910351018240.0, + "grad_norm": 1.6775657055038795, + "language_loss": 0.76645803, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34960938, + "step": 10790, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.01052741, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.01619005, + "balance_loss_mlp": 1.01747978, + "epoch": 0.6487900195400571, + "flos": 11947033272960.0, + "grad_norm": 2.1058852646791837, + "language_loss": 0.60890162, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62980747, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 10791, + "time_per_iteration": 2.4384970664978027 + }, + { + "auxiliary_loss_clip": 0.01052028, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_clip": 1.02188623, + "balance_loss_mlp": 1.01596367, + "epoch": 0.6488501427927251, + "flos": 22343400109440.0, + "grad_norm": 1.8345734828988036, + "language_loss": 0.86987901, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.89083391, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36132812, + "step": 10792, + "time_per_iteration": 2.3898937702178955 + }, + { + "auxiliary_loss_clip": 0.01055897, + "auxiliary_loss_mlp": 0.01042675, + "balance_loss_clip": 1.01894021, + "balance_loss_mlp": 1.01770473, + "epoch": 0.648910266045393, + "flos": 22235204206080.0, + "grad_norm": 1.8819855135312773, + "language_loss": 0.79001176, + "learning_rate": 1.159423532850735e-06, + "loss": 0.81099743, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 10793, + "time_per_iteration": 2.440857410430908 + }, + { + "auxiliary_loss_clip": 0.01054212, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.01818788, + "balance_loss_mlp": 1.01679039, + "epoch": 0.6489703892980611, + "flos": 25300089619200.0, + "grad_norm": 2.3236880751140045, + "language_loss": 0.76088244, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.78185892, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 10794, + "time_per_iteration": 2.4491820335388184 + }, + { + "auxiliary_loss_clip": 0.01052852, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.01581037, + "balance_loss_mlp": 1.01597214, + "epoch": 0.649030512550729, + "flos": 24570753984000.0, + "grad_norm": 1.63262525907455, + "language_loss": 0.71187222, + "learning_rate": 1.158716808837621e-06, + "loss": 0.73279095, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 10795, + "time_per_iteration": 2.549916982650757 + }, + { + "auxiliary_loss_clip": 0.0105524, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.0176605, + "balance_loss_mlp": 1.01783586, + "epoch": 0.649090635803397, + "flos": 26243652556800.0, + "grad_norm": 1.7588386041057074, + "language_loss": 0.54977721, + "learning_rate": 1.158363494676679e-06, + "loss": 0.57077622, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.375, + "step": 10796, + "time_per_iteration": 2.4991137981414795 + }, + { + "auxiliary_loss_clip": 0.01054273, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.01630974, + "balance_loss_mlp": 1.01722825, + "epoch": 0.6491507590560649, + "flos": 24936189851520.0, + "grad_norm": 2.56010857797697, + "language_loss": 0.78931248, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.81024933, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 10797, + "time_per_iteration": 2.6582343578338623 + }, + { + "auxiliary_loss_clip": 0.01052453, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.01655138, + "balance_loss_mlp": 1.01812005, + "epoch": 0.6492108823087329, + "flos": 19498781664000.0, + "grad_norm": 1.986628273593499, + "language_loss": 0.71138167, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.73230726, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.34375, + "step": 10798, + "time_per_iteration": 2.3892533779144287 + }, + { + "auxiliary_loss_clip": 0.0105228, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.01171517, + "balance_loss_mlp": 1.01654768, + "epoch": 0.6492710055614008, + "flos": 19718280581760.0, + "grad_norm": 1.6933737037440477, + "language_loss": 0.77758974, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79845417, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 10799, + "time_per_iteration": 2.34598708152771 + }, + { + "auxiliary_loss_clip": 0.01054183, + "auxiliary_loss_mlp": 0.01042247, + "balance_loss_clip": 1.01598549, + "balance_loss_mlp": 1.01655781, + "epoch": 0.6493311288140688, + "flos": 24315853080960.0, + "grad_norm": 1.8082726795302015, + "language_loss": 0.72255343, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74351776, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37695312, + "step": 10800, + "time_per_iteration": 2.4152514934539795 + }, + { + "auxiliary_loss_clip": 0.01012254, + "auxiliary_loss_mlp": 0.01009801, + "balance_loss_clip": 1.00763106, + "balance_loss_mlp": 1.004632, + "epoch": 0.6493912520667368, + "flos": 70931465205120.0, + "grad_norm": 0.7659409592913372, + "language_loss": 0.60339117, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62361169, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.07617188, + "step": 10801, + "time_per_iteration": 3.119224786758423 + }, + { + "auxiliary_loss_clip": 0.01058505, + "auxiliary_loss_mlp": 0.01040451, + "balance_loss_clip": 1.01379609, + "balance_loss_mlp": 1.01935887, + "epoch": 0.6494513753194048, + "flos": 25336608768000.0, + "grad_norm": 1.5580560758472957, + "language_loss": 0.7928952, + "learning_rate": 1.156244280393614e-06, + "loss": 0.81388474, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 10802, + "time_per_iteration": 2.4118871688842773 + }, + { + "auxiliary_loss_clip": 0.01054805, + "auxiliary_loss_mlp": 0.01043636, + "balance_loss_clip": 1.01767218, + "balance_loss_mlp": 1.01696563, + "epoch": 0.6495114985720728, + "flos": 24680800189440.0, + "grad_norm": 1.5252889481577423, + "language_loss": 0.75296843, + "learning_rate": 1.155891189918541e-06, + "loss": 0.77395284, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 10803, + "time_per_iteration": 2.392454147338867 + }, + { + "auxiliary_loss_clip": 0.01054249, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.01424599, + "balance_loss_mlp": 1.01690984, + "epoch": 0.6495716218247407, + "flos": 23650269321600.0, + "grad_norm": 2.3120280784356235, + "language_loss": 0.70802343, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72896159, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 10804, + "time_per_iteration": 2.372480869293213 + }, + { + "auxiliary_loss_clip": 0.0105427, + "auxiliary_loss_mlp": 0.01038932, + "balance_loss_clip": 1.01441038, + "balance_loss_mlp": 1.01816607, + "epoch": 0.6496317450774087, + "flos": 22345075854720.0, + "grad_norm": 1.7655794729103709, + "language_loss": 0.7335664, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.75449848, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36132812, + "step": 10805, + "time_per_iteration": 2.368354320526123 + }, + { + "auxiliary_loss_clip": 0.01054707, + "auxiliary_loss_mlp": 0.0104193, + "balance_loss_clip": 1.01693177, + "balance_loss_mlp": 1.01687789, + "epoch": 0.6496918683300766, + "flos": 30517335573120.0, + "grad_norm": 2.0946999363031416, + "language_loss": 0.67294276, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.69390917, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 10806, + "time_per_iteration": 2.458112955093384 + }, + { + "auxiliary_loss_clip": 0.01056394, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.01186657, + "balance_loss_mlp": 1.01743197, + "epoch": 0.6497519915827447, + "flos": 12458161710720.0, + "grad_norm": 1.953209290468795, + "language_loss": 0.80800068, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.82894415, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 10807, + "time_per_iteration": 2.345942735671997 + }, + { + "auxiliary_loss_clip": 0.01012555, + "auxiliary_loss_mlp": 0.01016534, + "balance_loss_clip": 1.01367259, + "balance_loss_mlp": 1.00509548, + "epoch": 0.6498121148354126, + "flos": 69090075077760.0, + "grad_norm": 0.7943385853823598, + "language_loss": 0.59015685, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.61044776, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.07421875, + "step": 10808, + "time_per_iteration": 3.1716926097869873 + }, + { + "auxiliary_loss_clip": 0.01053774, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.01162612, + "balance_loss_mlp": 1.01777697, + "epoch": 0.6498722380880806, + "flos": 36895827991680.0, + "grad_norm": 2.3949854073535537, + "language_loss": 0.64320111, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.6640991, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 10809, + "time_per_iteration": 2.50059175491333 + }, + { + "auxiliary_loss_clip": 0.01053574, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.01747513, + "balance_loss_mlp": 1.0180676, + "epoch": 0.6499323613407485, + "flos": 29016629159040.0, + "grad_norm": 1.4860361677129648, + "language_loss": 0.82337165, + "learning_rate": 1.153420453586008e-06, + "loss": 0.84430873, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 10810, + "time_per_iteration": 2.499319076538086 + }, + { + "auxiliary_loss_clip": 0.01052879, + "auxiliary_loss_mlp": 0.01037498, + "balance_loss_clip": 1.01459801, + "balance_loss_mlp": 1.01728308, + "epoch": 0.6499924845934165, + "flos": 20118245650560.0, + "grad_norm": 1.5157854985562063, + "language_loss": 0.72673565, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.74763942, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 10811, + "time_per_iteration": 2.3682758808135986 + }, + { + "auxiliary_loss_clip": 0.01054061, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.0109992, + "balance_loss_mlp": 1.01849842, + "epoch": 0.6500526078460844, + "flos": 24420313468800.0, + "grad_norm": 1.4352515154730583, + "language_loss": 0.78567165, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.80654573, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 10812, + "time_per_iteration": 2.429349184036255 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01038271, + "balance_loss_clip": 1.01329708, + "balance_loss_mlp": 1.01695597, + "epoch": 0.6501127310987524, + "flos": 23329905886080.0, + "grad_norm": 1.6975651038743764, + "language_loss": 0.85679889, + "learning_rate": 1.152362047854413e-06, + "loss": 0.877729, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 10813, + "time_per_iteration": 3.654006004333496 + }, + { + "auxiliary_loss_clip": 0.01054623, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.01440537, + "balance_loss_mlp": 1.01806223, + "epoch": 0.6501728543514204, + "flos": 18696826667520.0, + "grad_norm": 1.7236413506071793, + "language_loss": 0.81134665, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.83226347, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36523438, + "step": 10814, + "time_per_iteration": 2.3598430156707764 + }, + { + "auxiliary_loss_clip": 0.01055487, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.01578593, + "balance_loss_mlp": 1.01737833, + "epoch": 0.6502329776040884, + "flos": 44198191474560.0, + "grad_norm": 1.6140048157093179, + "language_loss": 0.66163272, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.6825999, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 10815, + "time_per_iteration": 2.5559487342834473 + }, + { + "auxiliary_loss_clip": 0.01057345, + "auxiliary_loss_mlp": 0.01042923, + "balance_loss_clip": 1.0149684, + "balance_loss_mlp": 1.01870847, + "epoch": 0.6502931008567564, + "flos": 14573863457280.0, + "grad_norm": 1.943309925440172, + "language_loss": 0.77384222, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.79484493, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38671875, + "step": 10816, + "time_per_iteration": 2.361140727996826 + }, + { + "auxiliary_loss_clip": 0.010544, + "auxiliary_loss_mlp": 0.01041617, + "balance_loss_clip": 1.01696491, + "balance_loss_mlp": 1.01764655, + "epoch": 0.6503532241094243, + "flos": 21394006974720.0, + "grad_norm": 1.6930775684579886, + "language_loss": 0.73744267, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75840288, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 10817, + "time_per_iteration": 3.825584650039673 + }, + { + "auxiliary_loss_clip": 0.01054045, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.01540089, + "balance_loss_mlp": 1.01673055, + "epoch": 0.6504133473620923, + "flos": 74738256209280.0, + "grad_norm": 1.4441933597132044, + "language_loss": 0.72427166, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74522364, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 10818, + "time_per_iteration": 4.228999853134155 + }, + { + "auxiliary_loss_clip": 0.01056469, + "auxiliary_loss_mlp": 0.01042907, + "balance_loss_clip": 1.01579881, + "balance_loss_mlp": 1.01771092, + "epoch": 0.6504734706147602, + "flos": 19712415473280.0, + "grad_norm": 2.0167462666421727, + "language_loss": 0.66551143, + "learning_rate": 1.150246104600249e-06, + "loss": 0.6865052, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38671875, + "step": 10819, + "time_per_iteration": 2.3784759044647217 + }, + { + "auxiliary_loss_clip": 0.01055548, + "auxiliary_loss_mlp": 0.01044781, + "balance_loss_clip": 1.01887727, + "balance_loss_mlp": 1.01717901, + "epoch": 0.6505335938674283, + "flos": 25555688749440.0, + "grad_norm": 1.8658747469555255, + "language_loss": 0.84263122, + "learning_rate": 1.14989356009286e-06, + "loss": 0.86363459, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 10820, + "time_per_iteration": 2.4128024578094482 + }, + { + "auxiliary_loss_clip": 0.01057633, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.01566827, + "balance_loss_mlp": 1.01774764, + "epoch": 0.6505937171200962, + "flos": 17820471830400.0, + "grad_norm": 2.068466120624152, + "language_loss": 0.79893208, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.81993324, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3984375, + "step": 10821, + "time_per_iteration": 2.3682940006256104 + }, + { + "auxiliary_loss_clip": 0.01051738, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.01067734, + "balance_loss_mlp": 1.01651025, + "epoch": 0.6506538403727642, + "flos": 20667080223360.0, + "grad_norm": 1.4371886423491413, + "language_loss": 0.80632854, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82718146, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 10822, + "time_per_iteration": 2.353161573410034 + }, + { + "auxiliary_loss_clip": 0.01053419, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.01533771, + "balance_loss_mlp": 1.01583409, + "epoch": 0.6507139636254321, + "flos": 11720831374080.0, + "grad_norm": 1.8946028144907963, + "language_loss": 0.87778509, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89873731, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.375, + "step": 10823, + "time_per_iteration": 2.3475732803344727 + }, + { + "auxiliary_loss_clip": 0.010543, + "auxiliary_loss_mlp": 0.01046177, + "balance_loss_clip": 1.01972461, + "balance_loss_mlp": 1.01634932, + "epoch": 0.6507740868781001, + "flos": 26760506457600.0, + "grad_norm": 2.8027049062445655, + "language_loss": 0.67927575, + "learning_rate": 1.148483704558183e-06, + "loss": 0.70028055, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 10824, + "time_per_iteration": 2.3966002464294434 + }, + { + "auxiliary_loss_clip": 0.01055083, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.01794612, + "balance_loss_mlp": 1.01626194, + "epoch": 0.650834210130768, + "flos": 16470799424640.0, + "grad_norm": 7.717703101173519, + "language_loss": 0.88988507, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.91086233, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38671875, + "step": 10825, + "time_per_iteration": 2.3522849082946777 + }, + { + "auxiliary_loss_clip": 0.010569, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_clip": 1.0141443, + "balance_loss_mlp": 1.01737392, + "epoch": 0.650894333383436, + "flos": 17127725166720.0, + "grad_norm": 2.386447377950689, + "language_loss": 0.74998832, + "learning_rate": 1.147778970474885e-06, + "loss": 0.77100021, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.39453125, + "step": 10826, + "time_per_iteration": 2.3244056701660156 + }, + { + "auxiliary_loss_clip": 0.01052451, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.01493919, + "balance_loss_mlp": 1.01666927, + "epoch": 0.650954456636104, + "flos": 18733241082240.0, + "grad_norm": 2.0653776747881785, + "language_loss": 0.71008837, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.73101044, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35742188, + "step": 10827, + "time_per_iteration": 2.355087995529175 + }, + { + "auxiliary_loss_clip": 0.01054807, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.01951623, + "balance_loss_mlp": 1.01726246, + "epoch": 0.651014579888772, + "flos": 24527287474560.0, + "grad_norm": 1.9168299355887195, + "language_loss": 0.77912331, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.80010486, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 10828, + "time_per_iteration": 3.916597366333008 + }, + { + "auxiliary_loss_clip": 0.01053263, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.01674139, + "balance_loss_mlp": 1.0165906, + "epoch": 0.65107470314144, + "flos": 24059939748480.0, + "grad_norm": 2.298963801721368, + "language_loss": 0.90059984, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.92154527, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 10829, + "time_per_iteration": 2.370250701904297 + }, + { + "auxiliary_loss_clip": 0.01009248, + "auxiliary_loss_mlp": 0.01003689, + "balance_loss_clip": 1.00080407, + "balance_loss_mlp": 1.0018847, + "epoch": 0.6511348263941079, + "flos": 72477139317120.0, + "grad_norm": 0.6487071572043749, + "language_loss": 0.55510783, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57523721, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.02880859, + "router_z_loss_mlp": 0.07373047, + "step": 10830, + "time_per_iteration": 3.1682636737823486 + }, + { + "auxiliary_loss_clip": 0.01056096, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.01672339, + "balance_loss_mlp": 1.01717949, + "epoch": 0.6511949496467759, + "flos": 23366564680320.0, + "grad_norm": 1.8691742682468517, + "language_loss": 0.75837529, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.77936924, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38867188, + "step": 10831, + "time_per_iteration": 2.3737268447875977 + }, + { + "auxiliary_loss_clip": 0.01009399, + "auxiliary_loss_mlp": 0.01008161, + "balance_loss_clip": 1.00544274, + "balance_loss_mlp": 1.00206065, + "epoch": 0.6512550728994438, + "flos": 67329824549760.0, + "grad_norm": 0.6468390732548982, + "language_loss": 0.51084518, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53102076, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.07324219, + "step": 10832, + "time_per_iteration": 3.1326940059661865 + }, + { + "auxiliary_loss_clip": 0.01055941, + "auxiliary_loss_mlp": 0.01044532, + "balance_loss_clip": 1.01692355, + "balance_loss_mlp": 1.01686358, + "epoch": 0.6513151961521119, + "flos": 21140642171520.0, + "grad_norm": 2.7371950685804918, + "language_loss": 0.85071003, + "learning_rate": 1.145313419848316e-06, + "loss": 0.87171471, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.390625, + "step": 10833, + "time_per_iteration": 2.3562026023864746 + }, + { + "auxiliary_loss_clip": 0.01055041, + "auxiliary_loss_mlp": 0.01037942, + "balance_loss_clip": 1.01232433, + "balance_loss_mlp": 1.01713872, + "epoch": 0.6513753194047798, + "flos": 15157925458560.0, + "grad_norm": 2.9030836898556043, + "language_loss": 0.84866822, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.86959809, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 10834, + "time_per_iteration": 2.3514082431793213 + }, + { + "auxiliary_loss_clip": 0.01057013, + "auxiliary_loss_mlp": 0.01043798, + "balance_loss_clip": 1.01884723, + "balance_loss_mlp": 1.01775813, + "epoch": 0.6514354426574478, + "flos": 30225322028160.0, + "grad_norm": 1.3972494887904792, + "language_loss": 0.77849197, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79950011, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39257812, + "step": 10835, + "time_per_iteration": 2.428652286529541 + }, + { + "auxiliary_loss_clip": 0.01056418, + "auxiliary_loss_mlp": 0.01043919, + "balance_loss_clip": 1.01726377, + "balance_loss_mlp": 1.01848257, + "epoch": 0.6514955659101157, + "flos": 24204480243840.0, + "grad_norm": 1.532486729260784, + "language_loss": 0.78667426, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.80767763, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 10836, + "time_per_iteration": 2.4050674438476562 + }, + { + "auxiliary_loss_clip": 0.01055808, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.01382971, + "balance_loss_mlp": 1.01829147, + "epoch": 0.6515556891627837, + "flos": 12377163623040.0, + "grad_norm": 1.895021463482281, + "language_loss": 0.82713223, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84808028, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 10837, + "time_per_iteration": 2.3338091373443604 + }, + { + "auxiliary_loss_clip": 0.01053446, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.01210141, + "balance_loss_mlp": 1.01712751, + "epoch": 0.6516158124154516, + "flos": 49599359804160.0, + "grad_norm": 2.0284265369776344, + "language_loss": 0.59868884, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61960632, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 10838, + "time_per_iteration": 2.642843008041382 + }, + { + "auxiliary_loss_clip": 0.01008796, + "auxiliary_loss_mlp": 0.01009198, + "balance_loss_clip": 1.00656331, + "balance_loss_mlp": 1.00158143, + "epoch": 0.6516759356681197, + "flos": 59699731334400.0, + "grad_norm": 0.7565750832514271, + "language_loss": 0.6106931, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63087308, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.07226562, + "step": 10839, + "time_per_iteration": 3.060102701187134 + }, + { + "auxiliary_loss_clip": 0.01054037, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.00953531, + "balance_loss_mlp": 1.01756358, + "epoch": 0.6517360589207876, + "flos": 37449305775360.0, + "grad_norm": 1.533482435905043, + "language_loss": 0.68293023, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70379823, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 10840, + "time_per_iteration": 2.56447172164917 + }, + { + "auxiliary_loss_clip": 0.01054962, + "auxiliary_loss_mlp": 0.01038342, + "balance_loss_clip": 1.01417816, + "balance_loss_mlp": 1.01720476, + "epoch": 0.6517961821734556, + "flos": 25373721409920.0, + "grad_norm": 1.9917287178557868, + "language_loss": 0.75145662, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.77238965, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 10841, + "time_per_iteration": 2.4008235931396484 + }, + { + "auxiliary_loss_clip": 0.01056452, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.01160622, + "balance_loss_mlp": 1.01850867, + "epoch": 0.6518563054261236, + "flos": 28765743062400.0, + "grad_norm": 1.4708041940498449, + "language_loss": 0.63665831, + "learning_rate": 1.142145760331648e-06, + "loss": 0.65758967, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 10842, + "time_per_iteration": 2.4356653690338135 + }, + { + "auxiliary_loss_clip": 0.01010079, + "auxiliary_loss_mlp": 0.0100809, + "balance_loss_clip": 1.00576591, + "balance_loss_mlp": 1.00279379, + "epoch": 0.6519164286787915, + "flos": 68921725034880.0, + "grad_norm": 0.8262203834906235, + "language_loss": 0.56176281, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58194453, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.07324219, + "step": 10843, + "time_per_iteration": 2.8479809761047363 + }, + { + "auxiliary_loss_clip": 0.0105766, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.01718593, + "balance_loss_mlp": 1.0185113, + "epoch": 0.6519765519314595, + "flos": 20441087792640.0, + "grad_norm": 3.0742250346746425, + "language_loss": 0.83656204, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.85759181, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.390625, + "step": 10844, + "time_per_iteration": 2.3978476524353027 + }, + { + "auxiliary_loss_clip": 0.01054919, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.01190925, + "balance_loss_mlp": 1.01755881, + "epoch": 0.6520366751841274, + "flos": 28401703649280.0, + "grad_norm": 1.8590370575435202, + "language_loss": 0.61440438, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.63533175, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 10845, + "time_per_iteration": 2.4270389080047607 + }, + { + "auxiliary_loss_clip": 0.01053957, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.01510525, + "balance_loss_mlp": 1.01772237, + "epoch": 0.6520967984367955, + "flos": 22272316848000.0, + "grad_norm": 1.6485110342784552, + "language_loss": 0.80947173, + "learning_rate": 1.140738756857194e-06, + "loss": 0.83042765, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36328125, + "step": 10846, + "time_per_iteration": 2.3721299171447754 + }, + { + "auxiliary_loss_clip": 0.01010062, + "auxiliary_loss_mlp": 0.01001878, + "balance_loss_clip": 0.99932724, + "balance_loss_mlp": 1.0026989, + "epoch": 0.6521569216894634, + "flos": 68913309530880.0, + "grad_norm": 0.7061262379449746, + "language_loss": 0.60234952, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62246889, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.07373047, + "step": 10847, + "time_per_iteration": 3.1255948543548584 + }, + { + "auxiliary_loss_clip": 0.01056176, + "auxiliary_loss_mlp": 0.01045294, + "balance_loss_clip": 1.01925921, + "balance_loss_mlp": 1.0177654, + "epoch": 0.6522170449421314, + "flos": 29129293716480.0, + "grad_norm": 1.9688649551604427, + "language_loss": 0.81930685, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.8403216, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 10848, + "time_per_iteration": 2.461585521697998 + }, + { + "auxiliary_loss_clip": 0.01053698, + "auxiliary_loss_mlp": 0.01046121, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.01681352, + "epoch": 0.6522771681947993, + "flos": 26650704631680.0, + "grad_norm": 2.2284534242375775, + "language_loss": 0.76329631, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.78429449, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 10849, + "time_per_iteration": 2.421409845352173 + }, + { + "auxiliary_loss_clip": 0.01053444, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02322435, + "balance_loss_mlp": 1.01757956, + "epoch": 0.6523372914474673, + "flos": 25738563784320.0, + "grad_norm": 1.5799737942617464, + "language_loss": 0.69317603, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.71417856, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 10850, + "time_per_iteration": 2.393721580505371 + }, + { + "auxiliary_loss_clip": 0.01054864, + "auxiliary_loss_mlp": 0.01043372, + "balance_loss_clip": 1.0180397, + "balance_loss_mlp": 1.01850986, + "epoch": 0.6523974147001352, + "flos": 24826178557440.0, + "grad_norm": 1.5839390083016187, + "language_loss": 0.67862773, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.69961005, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 10851, + "time_per_iteration": 2.3956663608551025 + }, + { + "auxiliary_loss_clip": 0.01056415, + "auxiliary_loss_mlp": 0.0105025, + "balance_loss_clip": 1.02565742, + "balance_loss_mlp": 1.01645088, + "epoch": 0.6524575379528033, + "flos": 26316586120320.0, + "grad_norm": 2.219100611821956, + "language_loss": 0.75660157, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.77766824, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.40039062, + "step": 10852, + "time_per_iteration": 2.406606912612915 + }, + { + "auxiliary_loss_clip": 0.0105741, + "auxiliary_loss_mlp": 0.0105085, + "balance_loss_clip": 1.02319336, + "balance_loss_mlp": 1.0175463, + "epoch": 0.6525176612054712, + "flos": 19493300580480.0, + "grad_norm": 2.3591754115656514, + "language_loss": 0.68156993, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.70265257, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 10853, + "time_per_iteration": 3.6012465953826904 + }, + { + "auxiliary_loss_clip": 0.0100973, + "auxiliary_loss_mlp": 0.01010705, + "balance_loss_clip": 1.00860679, + "balance_loss_mlp": 1.00235808, + "epoch": 0.6525777844581392, + "flos": 71703534211200.0, + "grad_norm": 0.7303666065794024, + "language_loss": 0.63113761, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65134203, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.07373047, + "step": 10854, + "time_per_iteration": 3.10831880569458 + }, + { + "auxiliary_loss_clip": 0.01055495, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_clip": 1.0218302, + "balance_loss_mlp": 1.01704836, + "epoch": 0.6526379077108072, + "flos": 26651856706560.0, + "grad_norm": 1.799005469093403, + "language_loss": 0.78347981, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.80451483, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 10855, + "time_per_iteration": 2.4424846172332764 + }, + { + "auxiliary_loss_clip": 0.01050339, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.01811528, + "balance_loss_mlp": 1.01473308, + "epoch": 0.6526980309634751, + "flos": 22819266207360.0, + "grad_norm": 2.078824012189562, + "language_loss": 0.8019731, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.8228873, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 10856, + "time_per_iteration": 3.8704025745391846 + }, + { + "auxiliary_loss_clip": 0.01053356, + "auxiliary_loss_mlp": 0.01044398, + "balance_loss_clip": 1.01897097, + "balance_loss_mlp": 1.01560926, + "epoch": 0.6527581542161431, + "flos": 28363822957440.0, + "grad_norm": 1.8947831298706803, + "language_loss": 0.74254572, + "learning_rate": 1.136872187988815e-06, + "loss": 0.76352328, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 10857, + "time_per_iteration": 3.8371284008026123 + }, + { + "auxiliary_loss_clip": 0.01054927, + "auxiliary_loss_mlp": 0.0105357, + "balance_loss_clip": 1.02867913, + "balance_loss_mlp": 1.01732707, + "epoch": 0.652818277468811, + "flos": 18368224151040.0, + "grad_norm": 2.84322490143623, + "language_loss": 0.64806604, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.66915107, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37695312, + "step": 10858, + "time_per_iteration": 2.3932607173919678 + }, + { + "auxiliary_loss_clip": 0.01052178, + "auxiliary_loss_mlp": 0.01047495, + "balance_loss_clip": 1.02385581, + "balance_loss_mlp": 1.01667356, + "epoch": 0.6528784007214791, + "flos": 18035327537280.0, + "grad_norm": 1.7028701505147006, + "language_loss": 0.79627031, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.81726706, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 10859, + "time_per_iteration": 2.32818341255188 + }, + { + "auxiliary_loss_clip": 0.01055147, + "auxiliary_loss_mlp": 0.01046749, + "balance_loss_clip": 1.02122688, + "balance_loss_mlp": 1.01644051, + "epoch": 0.652938523974147, + "flos": 22380931687680.0, + "grad_norm": 1.5592536190391166, + "language_loss": 0.68833625, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70935524, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 10860, + "time_per_iteration": 2.3756582736968994 + }, + { + "auxiliary_loss_clip": 0.01055723, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_clip": 1.01888037, + "balance_loss_mlp": 1.01694453, + "epoch": 0.652998647226815, + "flos": 16763092260480.0, + "grad_norm": 1.8854254635591694, + "language_loss": 0.68607676, + "learning_rate": 1.135467143909712e-06, + "loss": 0.70707935, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 10861, + "time_per_iteration": 2.342540979385376 + }, + { + "auxiliary_loss_clip": 0.01056078, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.0156846, + "balance_loss_mlp": 1.01766777, + "epoch": 0.6530587704794829, + "flos": 35771065764480.0, + "grad_norm": 1.6633586769561004, + "language_loss": 0.66415519, + "learning_rate": 1.135115964814572e-06, + "loss": 0.68513596, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38476562, + "step": 10862, + "time_per_iteration": 2.4994595050811768 + }, + { + "auxiliary_loss_clip": 0.01052439, + "auxiliary_loss_mlp": 0.01044685, + "balance_loss_clip": 1.02086675, + "balance_loss_mlp": 1.01606095, + "epoch": 0.6531188937321509, + "flos": 19315173490560.0, + "grad_norm": 3.8362502703751677, + "language_loss": 0.7799868, + "learning_rate": 1.13476481851592e-06, + "loss": 0.80095804, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 10863, + "time_per_iteration": 2.3574490547180176 + }, + { + "auxiliary_loss_clip": 0.01053895, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.01327407, + "balance_loss_mlp": 1.01698899, + "epoch": 0.6531790169848188, + "flos": 22892653618560.0, + "grad_norm": 1.754641195639989, + "language_loss": 0.75449103, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.77539831, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37109375, + "step": 10864, + "time_per_iteration": 2.3847403526306152 + }, + { + "auxiliary_loss_clip": 0.01053348, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.02062833, + "balance_loss_mlp": 1.01717329, + "epoch": 0.6532391402374869, + "flos": 29562426443520.0, + "grad_norm": 1.6469026409168777, + "language_loss": 0.87563455, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.89660406, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36132812, + "step": 10865, + "time_per_iteration": 2.43021821975708 + }, + { + "auxiliary_loss_clip": 0.01057038, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.01520467, + "balance_loss_mlp": 1.01737428, + "epoch": 0.6532992634901548, + "flos": 23104541859840.0, + "grad_norm": 1.6352807888459828, + "language_loss": 0.82381558, + "learning_rate": 1.133711576532051e-06, + "loss": 0.84480226, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39648438, + "step": 10866, + "time_per_iteration": 2.3758327960968018 + }, + { + "auxiliary_loss_clip": 0.01053299, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.01610184, + "balance_loss_mlp": 1.01641703, + "epoch": 0.6533593867428228, + "flos": 26066153871360.0, + "grad_norm": 1.4173117049431991, + "language_loss": 0.8317157, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.85264206, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 10867, + "time_per_iteration": 3.9364686012268066 + }, + { + "auxiliary_loss_clip": 0.01053763, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.01081097, + "balance_loss_mlp": 1.01575303, + "epoch": 0.6534195099954908, + "flos": 21211481053440.0, + "grad_norm": 1.671957066998913, + "language_loss": 0.81966102, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.84055626, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 10868, + "time_per_iteration": 2.3885602951049805 + }, + { + "auxiliary_loss_clip": 0.01055695, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_clip": 1.01932478, + "balance_loss_mlp": 1.01749218, + "epoch": 0.6534796332481587, + "flos": 19645556486400.0, + "grad_norm": 2.149068195606952, + "language_loss": 0.81982821, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.84084082, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 10869, + "time_per_iteration": 2.3572838306427 + }, + { + "auxiliary_loss_clip": 0.01056108, + "auxiliary_loss_mlp": 0.01038677, + "balance_loss_clip": 1.01463223, + "balance_loss_mlp": 1.01791143, + "epoch": 0.6535397565008267, + "flos": 24021395740800.0, + "grad_norm": 3.0202915512863564, + "language_loss": 0.73001981, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.75096762, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3828125, + "step": 10870, + "time_per_iteration": 2.409684658050537 + }, + { + "auxiliary_loss_clip": 0.01056414, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02316785, + "balance_loss_mlp": 1.0191102, + "epoch": 0.6535998797534947, + "flos": 24601757137920.0, + "grad_norm": 2.4298286629920667, + "language_loss": 0.7597698, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.78081691, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 10871, + "time_per_iteration": 2.3893327713012695 + }, + { + "auxiliary_loss_clip": 0.01052118, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.01134765, + "balance_loss_mlp": 1.0173496, + "epoch": 0.6536600030061627, + "flos": 23363143367040.0, + "grad_norm": 1.6236961443726945, + "language_loss": 0.5666132, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.58748066, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 10872, + "time_per_iteration": 2.3857245445251465 + }, + { + "auxiliary_loss_clip": 0.01053958, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.01663446, + "balance_loss_mlp": 1.01742339, + "epoch": 0.6537201262588306, + "flos": 23877344004480.0, + "grad_norm": 1.7761890507169438, + "language_loss": 0.76036984, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.78131902, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 10873, + "time_per_iteration": 2.3857414722442627 + }, + { + "auxiliary_loss_clip": 0.01055518, + "auxiliary_loss_mlp": 0.01043382, + "balance_loss_clip": 1.01866949, + "balance_loss_mlp": 1.0184834, + "epoch": 0.6537802495114986, + "flos": 24353559216000.0, + "grad_norm": 1.5016789792219662, + "language_loss": 0.76069772, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.78168672, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 10874, + "time_per_iteration": 2.4362268447875977 + }, + { + "auxiliary_loss_clip": 0.01053684, + "auxiliary_loss_mlp": 0.01037964, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.01740479, + "epoch": 0.6538403727641665, + "flos": 27995768737920.0, + "grad_norm": 1.4996858374638242, + "language_loss": 0.8291434, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.85005987, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 10875, + "time_per_iteration": 2.4080471992492676 + }, + { + "auxiliary_loss_clip": 0.01053589, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.0189054, + "balance_loss_mlp": 1.01658022, + "epoch": 0.6539004960168345, + "flos": 27562356720000.0, + "grad_norm": 1.86530938985815, + "language_loss": 0.7062211, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72719431, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 10876, + "time_per_iteration": 2.4242231845855713 + }, + { + "auxiliary_loss_clip": 0.01054263, + "auxiliary_loss_mlp": 0.01042673, + "balance_loss_clip": 1.01737714, + "balance_loss_mlp": 1.01730323, + "epoch": 0.6539606192695024, + "flos": 14529419429760.0, + "grad_norm": 1.9702038324060185, + "language_loss": 0.80256522, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.82353461, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 10877, + "time_per_iteration": 2.32517409324646 + }, + { + "auxiliary_loss_clip": 0.01054814, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.01445508, + "balance_loss_mlp": 1.0169276, + "epoch": 0.6540207425221705, + "flos": 21615286371840.0, + "grad_norm": 2.1518569150335534, + "language_loss": 0.80450457, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82543856, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 10878, + "time_per_iteration": 2.396343946456909 + }, + { + "auxiliary_loss_clip": 0.01054054, + "auxiliary_loss_mlp": 0.01038247, + "balance_loss_clip": 1.01309419, + "balance_loss_mlp": 1.01743579, + "epoch": 0.6540808657748384, + "flos": 17668215924480.0, + "grad_norm": 1.953428167092928, + "language_loss": 0.85477471, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.87569773, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 10879, + "time_per_iteration": 2.353224039077759 + }, + { + "auxiliary_loss_clip": 0.01056609, + "auxiliary_loss_mlp": 0.01036749, + "balance_loss_clip": 1.01036823, + "balance_loss_mlp": 1.01719689, + "epoch": 0.6541409890275064, + "flos": 14537414131200.0, + "grad_norm": 3.2230934059481253, + "language_loss": 0.74054599, + "learning_rate": 1.128800362199601e-06, + "loss": 0.76147962, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39453125, + "step": 10880, + "time_per_iteration": 2.3977901935577393 + }, + { + "auxiliary_loss_clip": 0.01052563, + "auxiliary_loss_mlp": 0.01034577, + "balance_loss_clip": 1.01227307, + "balance_loss_mlp": 1.01713121, + "epoch": 0.6542011122801744, + "flos": 17164349049600.0, + "grad_norm": 1.858943996426421, + "language_loss": 0.85518855, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.87605989, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 10881, + "time_per_iteration": 2.3558406829833984 + }, + { + "auxiliary_loss_clip": 0.01057324, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.01300192, + "balance_loss_mlp": 1.01859117, + "epoch": 0.6542612355328423, + "flos": 18185628407040.0, + "grad_norm": 4.504819140392659, + "language_loss": 0.7913698, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.81232864, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 10882, + "time_per_iteration": 2.335016965866089 + }, + { + "auxiliary_loss_clip": 0.01056662, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.013767, + "balance_loss_mlp": 1.01808083, + "epoch": 0.6543213587855103, + "flos": 19791423613440.0, + "grad_norm": 2.1893033138058113, + "language_loss": 0.83097523, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.85193551, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 10883, + "time_per_iteration": 2.3662497997283936 + }, + { + "auxiliary_loss_clip": 0.01055696, + "auxiliary_loss_mlp": 0.01046238, + "balance_loss_clip": 1.02034557, + "balance_loss_mlp": 1.01812446, + "epoch": 0.6543814820381783, + "flos": 21104053200000.0, + "grad_norm": 3.2152067760635017, + "language_loss": 0.86441487, + "learning_rate": 1.127398345803988e-06, + "loss": 0.88543421, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 10884, + "time_per_iteration": 2.361074447631836 + }, + { + "auxiliary_loss_clip": 0.01056349, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.01481342, + "balance_loss_mlp": 1.019014, + "epoch": 0.6544416052908463, + "flos": 20192994604800.0, + "grad_norm": 3.629881508952694, + "language_loss": 0.81829488, + "learning_rate": 1.127047924394715e-06, + "loss": 0.83926582, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 10885, + "time_per_iteration": 2.396135091781616 + }, + { + "auxiliary_loss_clip": 0.01053308, + "auxiliary_loss_mlp": 0.01039152, + "balance_loss_clip": 1.01454687, + "balance_loss_mlp": 1.01705599, + "epoch": 0.6545017285435142, + "flos": 23367123262080.0, + "grad_norm": 1.672135285596585, + "language_loss": 0.72619742, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74712199, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 10886, + "time_per_iteration": 2.3702552318573 + }, + { + "auxiliary_loss_clip": 0.01053905, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.01229739, + "balance_loss_mlp": 1.017241, + "epoch": 0.6545618517961822, + "flos": 19133729821440.0, + "grad_norm": 1.7211594343936427, + "language_loss": 0.78939897, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.81030303, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 10887, + "time_per_iteration": 2.3636722564697266 + }, + { + "auxiliary_loss_clip": 0.01053661, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.01293254, + "balance_loss_mlp": 1.01715231, + "epoch": 0.6546219750488501, + "flos": 14937763224960.0, + "grad_norm": 2.2734524463173442, + "language_loss": 0.7960279, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.8169387, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 10888, + "time_per_iteration": 2.325827121734619 + }, + { + "auxiliary_loss_clip": 0.01051919, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.01372635, + "balance_loss_mlp": 1.01653302, + "epoch": 0.6546820983015181, + "flos": 36319027553280.0, + "grad_norm": 2.1289576948138493, + "language_loss": 0.67559969, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.6964922, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 10889, + "time_per_iteration": 2.498112440109253 + }, + { + "auxiliary_loss_clip": 0.01053956, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.01513004, + "balance_loss_mlp": 1.01654184, + "epoch": 0.654742221554186, + "flos": 20410433752320.0, + "grad_norm": 1.4094993274914456, + "language_loss": 0.80036259, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82129604, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 10890, + "time_per_iteration": 2.351438522338867 + }, + { + "auxiliary_loss_clip": 0.01053781, + "auxiliary_loss_mlp": 0.0104284, + "balance_loss_clip": 1.01829529, + "balance_loss_mlp": 1.01634085, + "epoch": 0.6548023448068541, + "flos": 24862488238080.0, + "grad_norm": 2.1029633713808686, + "language_loss": 0.66346121, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.68442738, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 10891, + "time_per_iteration": 2.4404098987579346 + }, + { + "auxiliary_loss_clip": 0.01053497, + "auxiliary_loss_mlp": 0.01048485, + "balance_loss_clip": 1.02424979, + "balance_loss_mlp": 1.01628685, + "epoch": 0.654862468059522, + "flos": 21426685873920.0, + "grad_norm": 1.8861935014433036, + "language_loss": 0.80268615, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.82370597, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 10892, + "time_per_iteration": 3.5707638263702393 + }, + { + "auxiliary_loss_clip": 0.01056025, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_clip": 1.0186162, + "balance_loss_mlp": 1.01756763, + "epoch": 0.65492259131219, + "flos": 26576653904640.0, + "grad_norm": 1.8254655954455277, + "language_loss": 0.79119468, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.81220579, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38476562, + "step": 10893, + "time_per_iteration": 2.4414963722229004 + }, + { + "auxiliary_loss_clip": 0.01057282, + "auxiliary_loss_mlp": 0.01045731, + "balance_loss_clip": 1.0186466, + "balance_loss_mlp": 1.01797593, + "epoch": 0.6549827145648579, + "flos": 21500422398720.0, + "grad_norm": 1.5908157098753875, + "language_loss": 0.7148149, + "learning_rate": 1.123895622914766e-06, + "loss": 0.73584509, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39257812, + "step": 10894, + "time_per_iteration": 2.36429500579834 + }, + { + "auxiliary_loss_clip": 0.01055959, + "auxiliary_loss_mlp": 0.01046278, + "balance_loss_clip": 1.01995635, + "balance_loss_mlp": 1.01716232, + "epoch": 0.6550428378175259, + "flos": 22593378510720.0, + "grad_norm": 2.5921001848732086, + "language_loss": 0.64239269, + "learning_rate": 1.123545533127549e-06, + "loss": 0.66341507, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38671875, + "step": 10895, + "time_per_iteration": 3.6776013374328613 + }, + { + "auxiliary_loss_clip": 0.01052834, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.01752448, + "balance_loss_mlp": 1.01600385, + "epoch": 0.655102961070194, + "flos": 12822969173760.0, + "grad_norm": 1.7941781488503206, + "language_loss": 0.80018985, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.82111812, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3671875, + "step": 10896, + "time_per_iteration": 2.3542048931121826 + }, + { + "auxiliary_loss_clip": 0.0105345, + "auxiliary_loss_mlp": 0.01041238, + "balance_loss_clip": 1.01614475, + "balance_loss_mlp": 1.01740742, + "epoch": 0.6551630843228619, + "flos": 24789903788160.0, + "grad_norm": 1.43851306082348, + "language_loss": 0.71638358, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.7373305, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 10897, + "time_per_iteration": 3.8449649810791016 + }, + { + "auxiliary_loss_clip": 0.01055199, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.01426244, + "balance_loss_mlp": 1.01681566, + "epoch": 0.6552232075755299, + "flos": 16723605646080.0, + "grad_norm": 1.883190156750911, + "language_loss": 0.76374972, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.7847048, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 10898, + "time_per_iteration": 2.3632802963256836 + }, + { + "auxiliary_loss_clip": 0.01053863, + "auxiliary_loss_mlp": 0.01040831, + "balance_loss_clip": 1.01853895, + "balance_loss_mlp": 1.01736808, + "epoch": 0.6552833308281978, + "flos": 22015425997440.0, + "grad_norm": 2.702339383351021, + "language_loss": 0.75170422, + "learning_rate": 1.122145506463827e-06, + "loss": 0.77265114, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36523438, + "step": 10899, + "time_per_iteration": 2.3497908115386963 + }, + { + "auxiliary_loss_clip": 0.01053618, + "auxiliary_loss_mlp": 0.01038825, + "balance_loss_clip": 1.01617503, + "balance_loss_mlp": 1.01682305, + "epoch": 0.6553434540808658, + "flos": 24862243858560.0, + "grad_norm": 2.1313179106792144, + "language_loss": 0.57240915, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.59333348, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 10900, + "time_per_iteration": 2.3976008892059326 + }, + { + "auxiliary_loss_clip": 0.01054494, + "auxiliary_loss_mlp": 0.01043719, + "balance_loss_clip": 1.01666999, + "balance_loss_mlp": 1.01764059, + "epoch": 0.6554035773335337, + "flos": 23219964414720.0, + "grad_norm": 4.18235754323933, + "language_loss": 0.78035593, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.80133808, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.36914062, + "step": 10901, + "time_per_iteration": 2.3664395809173584 + }, + { + "auxiliary_loss_clip": 0.0105292, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.01305532, + "balance_loss_mlp": 1.01668859, + "epoch": 0.6554637005862017, + "flos": 22782502679040.0, + "grad_norm": 1.8648306804526138, + "language_loss": 0.74666953, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.76756942, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 10902, + "time_per_iteration": 2.448197364807129 + }, + { + "auxiliary_loss_clip": 0.01052551, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.01399136, + "balance_loss_mlp": 1.01742744, + "epoch": 0.6555238238388696, + "flos": 21506147861760.0, + "grad_norm": 1.7724073784425207, + "language_loss": 0.68918109, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.7100755, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 10903, + "time_per_iteration": 2.362318754196167 + }, + { + "auxiliary_loss_clip": 0.0105386, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.0157392, + "balance_loss_mlp": 1.01672411, + "epoch": 0.6555839470915377, + "flos": 30518138534400.0, + "grad_norm": 1.7697872251891265, + "language_loss": 0.67786908, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.6988132, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 10904, + "time_per_iteration": 2.4458484649658203 + }, + { + "auxiliary_loss_clip": 0.01053715, + "auxiliary_loss_mlp": 0.01041699, + "balance_loss_clip": 1.01574671, + "balance_loss_mlp": 1.01647234, + "epoch": 0.6556440703442056, + "flos": 24641837245440.0, + "grad_norm": 1.7964741925287844, + "language_loss": 0.91474867, + "learning_rate": 1.120046465383464e-06, + "loss": 0.9357028, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37304688, + "step": 10905, + "time_per_iteration": 2.3655476570129395 + }, + { + "auxiliary_loss_clip": 0.0105067, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.01428556, + "balance_loss_mlp": 1.01585603, + "epoch": 0.6557041935968736, + "flos": 23731337232000.0, + "grad_norm": 1.7031375079309898, + "language_loss": 0.77000517, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.79086959, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 10906, + "time_per_iteration": 2.379581928253174 + }, + { + "auxiliary_loss_clip": 0.01055003, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.01600063, + "balance_loss_mlp": 1.01758504, + "epoch": 0.6557643168495415, + "flos": 11102135437440.0, + "grad_norm": 3.203707656998057, + "language_loss": 0.76722348, + "learning_rate": 1.119347051825267e-06, + "loss": 0.78816646, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 10907, + "time_per_iteration": 3.7495715618133545 + }, + { + "auxiliary_loss_clip": 0.0105414, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.01268888, + "balance_loss_mlp": 1.01671576, + "epoch": 0.6558244401022095, + "flos": 30189710574720.0, + "grad_norm": 1.3931173120940865, + "language_loss": 0.72902644, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74994123, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 10908, + "time_per_iteration": 2.435680866241455 + }, + { + "auxiliary_loss_clip": 0.01053414, + "auxiliary_loss_mlp": 0.01038025, + "balance_loss_clip": 1.0149219, + "balance_loss_mlp": 1.01724505, + "epoch": 0.6558845633548775, + "flos": 17930099099520.0, + "grad_norm": 2.4092058942148538, + "language_loss": 0.83502561, + "learning_rate": 1.118647771844861e-06, + "loss": 0.85593998, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 10909, + "time_per_iteration": 2.35548734664917 + }, + { + "auxiliary_loss_clip": 0.01053342, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.01306188, + "balance_loss_mlp": 1.0161798, + "epoch": 0.6559446866075455, + "flos": 21903180376320.0, + "grad_norm": 2.4595225433419077, + "language_loss": 0.6644873, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.68540311, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 10910, + "time_per_iteration": 2.3611481189727783 + }, + { + "auxiliary_loss_clip": 0.01057648, + "auxiliary_loss_mlp": 0.01040517, + "balance_loss_clip": 1.01166892, + "balance_loss_mlp": 1.01799583, + "epoch": 0.6560048098602135, + "flos": 14127359679360.0, + "grad_norm": 2.768977876119546, + "language_loss": 0.77378666, + "learning_rate": 1.117948625548313e-06, + "loss": 0.79476833, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.39648438, + "step": 10911, + "time_per_iteration": 2.306070566177368 + }, + { + "auxiliary_loss_clip": 0.01049543, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.0116576, + "balance_loss_mlp": 1.01526022, + "epoch": 0.6560649331128814, + "flos": 18806558670720.0, + "grad_norm": 1.6868363606378773, + "language_loss": 0.76185125, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.78266442, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.34179688, + "step": 10912, + "time_per_iteration": 2.3844969272613525 + }, + { + "auxiliary_loss_clip": 0.01057482, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.01794648, + "balance_loss_mlp": 1.01772666, + "epoch": 0.6561250563655494, + "flos": 17052731832960.0, + "grad_norm": 1.842829019046689, + "language_loss": 0.78754532, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.80857158, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 10913, + "time_per_iteration": 2.330132484436035 + }, + { + "auxiliary_loss_clip": 0.01049366, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.01140141, + "balance_loss_mlp": 1.01583683, + "epoch": 0.6561851796182173, + "flos": 22636565729280.0, + "grad_norm": 1.6942201478969052, + "language_loss": 0.72311091, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.74392927, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.3359375, + "step": 10914, + "time_per_iteration": 2.387188196182251 + }, + { + "auxiliary_loss_clip": 0.01052785, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.00910366, + "balance_loss_mlp": 1.01666594, + "epoch": 0.6562453028708853, + "flos": 19238364766080.0, + "grad_norm": 1.6568745222252, + "language_loss": 0.75020367, + "learning_rate": 1.116550734430958e-06, + "loss": 0.77106601, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 10915, + "time_per_iteration": 2.3345999717712402 + }, + { + "auxiliary_loss_clip": 0.01051076, + "auxiliary_loss_mlp": 0.01036583, + "balance_loss_clip": 1.0134325, + "balance_loss_mlp": 1.0164355, + "epoch": 0.6563054261235532, + "flos": 23800290900480.0, + "grad_norm": 2.004224316972854, + "language_loss": 0.80660868, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.8274852, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34570312, + "step": 10916, + "time_per_iteration": 2.403728723526001 + }, + { + "auxiliary_loss_clip": 0.01053267, + "auxiliary_loss_mlp": 0.0103945, + "balance_loss_clip": 1.01787305, + "balance_loss_mlp": 1.0173831, + "epoch": 0.6563655493762213, + "flos": 19239167727360.0, + "grad_norm": 1.7967780130513291, + "language_loss": 0.77458829, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.79551548, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.359375, + "step": 10917, + "time_per_iteration": 2.3653388023376465 + }, + { + "auxiliary_loss_clip": 0.0105015, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.00940859, + "balance_loss_mlp": 1.01535964, + "epoch": 0.6564256726288892, + "flos": 25555269813120.0, + "grad_norm": 1.8337976947019927, + "language_loss": 0.72035462, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.74117517, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 10918, + "time_per_iteration": 2.4084508419036865 + }, + { + "auxiliary_loss_clip": 0.01050156, + "auxiliary_loss_mlp": 0.01033479, + "balance_loss_clip": 1.01308179, + "balance_loss_mlp": 1.01622021, + "epoch": 0.6564857958815572, + "flos": 22199522929920.0, + "grad_norm": 1.5341689138688268, + "language_loss": 0.76743597, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78827226, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33984375, + "step": 10919, + "time_per_iteration": 2.3554093837738037 + }, + { + "auxiliary_loss_clip": 0.01009078, + "auxiliary_loss_mlp": 0.01001902, + "balance_loss_clip": 0.999542, + "balance_loss_mlp": 1.00179625, + "epoch": 0.6565459191342251, + "flos": 58120470627840.0, + "grad_norm": 0.7225237045151639, + "language_loss": 0.53100681, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55111659, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07275391, + "step": 10920, + "time_per_iteration": 3.043001413345337 + }, + { + "auxiliary_loss_clip": 0.01052192, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.01443291, + "balance_loss_mlp": 1.01667356, + "epoch": 0.6566060423868931, + "flos": 30808336688640.0, + "grad_norm": 1.3962137759036009, + "language_loss": 0.66480821, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.68570018, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 10921, + "time_per_iteration": 2.45133376121521 + }, + { + "auxiliary_loss_clip": 0.010507, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.01303685, + "balance_loss_mlp": 1.01556206, + "epoch": 0.6566661656395612, + "flos": 23366320300800.0, + "grad_norm": 1.8158656213809317, + "language_loss": 0.82414806, + "learning_rate": 1.114105715254205e-06, + "loss": 0.8450191, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3515625, + "step": 10922, + "time_per_iteration": 2.401548385620117 + }, + { + "auxiliary_loss_clip": 0.01053029, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.01589084, + "balance_loss_mlp": 1.01666355, + "epoch": 0.6567262888922291, + "flos": 25734514066560.0, + "grad_norm": 2.2849731962286604, + "language_loss": 0.72738314, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.74831814, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 10923, + "time_per_iteration": 2.4093282222747803 + }, + { + "auxiliary_loss_clip": 0.01055359, + "auxiliary_loss_mlp": 0.01039858, + "balance_loss_clip": 1.01452613, + "balance_loss_mlp": 1.01827335, + "epoch": 0.6567864121448971, + "flos": 17122907399040.0, + "grad_norm": 2.0504406935467143, + "language_loss": 0.81592751, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.83687967, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 10924, + "time_per_iteration": 2.358916997909546 + }, + { + "auxiliary_loss_clip": 0.01050552, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.01648891, + "balance_loss_mlp": 1.01575649, + "epoch": 0.656846535397565, + "flos": 22418218886400.0, + "grad_norm": 1.8437444048092415, + "language_loss": 0.73723346, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.75811315, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 10925, + "time_per_iteration": 2.370487689971924 + }, + { + "auxiliary_loss_clip": 0.01053315, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.01226211, + "balance_loss_mlp": 1.01723742, + "epoch": 0.656906658650233, + "flos": 17703792466560.0, + "grad_norm": 2.7257720635469727, + "language_loss": 0.7370615, + "learning_rate": 1.112709300197942e-06, + "loss": 0.75793993, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 10926, + "time_per_iteration": 2.3539621829986572 + }, + { + "auxiliary_loss_clip": 0.01054367, + "auxiliary_loss_mlp": 0.01040985, + "balance_loss_clip": 1.01615345, + "balance_loss_mlp": 1.01676393, + "epoch": 0.6569667819029009, + "flos": 21174193854720.0, + "grad_norm": 1.724360005698026, + "language_loss": 0.73916948, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.76012301, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 10927, + "time_per_iteration": 2.3523316383361816 + }, + { + "auxiliary_loss_clip": 0.01008783, + "auxiliary_loss_mlp": 0.01005012, + "balance_loss_clip": 1.00286603, + "balance_loss_mlp": 1.00157428, + "epoch": 0.6570269051555689, + "flos": 68758330538880.0, + "grad_norm": 0.7299083684526992, + "language_loss": 0.64463818, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66477621, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.07226562, + "step": 10928, + "time_per_iteration": 3.041449546813965 + }, + { + "auxiliary_loss_clip": 0.01052851, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.01261544, + "balance_loss_mlp": 1.01679599, + "epoch": 0.6570870284082369, + "flos": 26318192042880.0, + "grad_norm": 1.5912710905572895, + "language_loss": 0.78762591, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80851531, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 10929, + "time_per_iteration": 2.418849468231201 + }, + { + "auxiliary_loss_clip": 0.01052894, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.0130111, + "balance_loss_mlp": 1.01713538, + "epoch": 0.6571471516609049, + "flos": 26173616636160.0, + "grad_norm": 1.6396423609876543, + "language_loss": 0.66619676, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.68710911, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.35742188, + "step": 10930, + "time_per_iteration": 2.4309372901916504 + }, + { + "auxiliary_loss_clip": 0.01052698, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_clip": 1.0129981, + "balance_loss_mlp": 1.01586652, + "epoch": 0.6572072749135728, + "flos": 20375206323840.0, + "grad_norm": 1.7537098013232442, + "language_loss": 0.71959436, + "learning_rate": 1.110964538515258e-06, + "loss": 0.74048948, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 10931, + "time_per_iteration": 2.349529266357422 + }, + { + "auxiliary_loss_clip": 0.01052146, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.01257038, + "balance_loss_mlp": 1.01545143, + "epoch": 0.6572673981662408, + "flos": 17127794989440.0, + "grad_norm": 2.1255689868781733, + "language_loss": 0.70263046, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.7235055, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 10932, + "time_per_iteration": 3.5594820976257324 + }, + { + "auxiliary_loss_clip": 0.01051768, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.01322317, + "balance_loss_mlp": 1.01618099, + "epoch": 0.6573275214189087, + "flos": 41273692104960.0, + "grad_norm": 1.7189496006406522, + "language_loss": 0.81332016, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.83417886, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.35546875, + "step": 10933, + "time_per_iteration": 2.5212414264678955 + }, + { + "auxiliary_loss_clip": 0.01053441, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.01656377, + "balance_loss_mlp": 1.01697063, + "epoch": 0.6573876446715767, + "flos": 22889127571200.0, + "grad_norm": 1.8250006476761995, + "language_loss": 0.75405931, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.77500385, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36523438, + "step": 10934, + "time_per_iteration": 2.405470371246338 + }, + { + "auxiliary_loss_clip": 0.01051166, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.01507425, + "balance_loss_mlp": 1.01576376, + "epoch": 0.6574477679242448, + "flos": 44016468514560.0, + "grad_norm": 1.64415481347668, + "language_loss": 0.76994187, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.7908237, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 10935, + "time_per_iteration": 3.818411350250244 + }, + { + "auxiliary_loss_clip": 0.01053235, + "auxiliary_loss_mlp": 0.01042393, + "balance_loss_clip": 1.01608324, + "balance_loss_mlp": 1.01586652, + "epoch": 0.6575078911769127, + "flos": 24570369959040.0, + "grad_norm": 1.683741949834991, + "language_loss": 0.80072856, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.82168478, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 10936, + "time_per_iteration": 2.4135582447052 + }, + { + "auxiliary_loss_clip": 0.01049466, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.01390159, + "balance_loss_mlp": 1.01527667, + "epoch": 0.6575680144295807, + "flos": 20922958644480.0, + "grad_norm": 1.901685194596564, + "language_loss": 0.70595843, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.7268098, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34179688, + "step": 10937, + "time_per_iteration": 3.7711799144744873 + }, + { + "auxiliary_loss_clip": 0.010507, + "auxiliary_loss_mlp": 0.01037381, + "balance_loss_clip": 1.01388526, + "balance_loss_mlp": 1.01532888, + "epoch": 0.6576281376822486, + "flos": 10924881131520.0, + "grad_norm": 2.387956708532646, + "language_loss": 0.70245135, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.72333217, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 10938, + "time_per_iteration": 2.403691053390503 + }, + { + "auxiliary_loss_clip": 0.01053009, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.01450157, + "balance_loss_mlp": 1.01609755, + "epoch": 0.6576882609349166, + "flos": 19280539555200.0, + "grad_norm": 1.9705309228256223, + "language_loss": 0.72391266, + "learning_rate": 1.108174673550927e-06, + "loss": 0.74482095, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36914062, + "step": 10939, + "time_per_iteration": 2.379563570022583 + }, + { + "auxiliary_loss_clip": 0.0105295, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.01451135, + "balance_loss_mlp": 1.01580095, + "epoch": 0.6577483841875845, + "flos": 20219773484160.0, + "grad_norm": 3.773742666089784, + "language_loss": 0.79613531, + "learning_rate": 1.107826092473037e-06, + "loss": 0.81704915, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 10940, + "time_per_iteration": 2.3426568508148193 + }, + { + "auxiliary_loss_clip": 0.01053958, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.01390302, + "balance_loss_mlp": 1.01594949, + "epoch": 0.6578085074402525, + "flos": 34749646761600.0, + "grad_norm": 2.6251113442083196, + "language_loss": 0.70015579, + "learning_rate": 1.107477545226471e-06, + "loss": 0.7210815, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38085938, + "step": 10941, + "time_per_iteration": 2.5188167095184326 + }, + { + "auxiliary_loss_clip": 0.01049687, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.01105118, + "balance_loss_mlp": 1.01470423, + "epoch": 0.6578686306929205, + "flos": 23470047550080.0, + "grad_norm": 1.7584490587122854, + "language_loss": 0.70199746, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.72283036, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 10942, + "time_per_iteration": 2.414778709411621 + }, + { + "auxiliary_loss_clip": 0.01056749, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.01291215, + "balance_loss_mlp": 1.01759684, + "epoch": 0.6579287539455885, + "flos": 18076105872000.0, + "grad_norm": 2.082827533394438, + "language_loss": 0.72336876, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.74431694, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 10943, + "time_per_iteration": 2.3819007873535156 + }, + { + "auxiliary_loss_clip": 0.010509, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.01369786, + "balance_loss_mlp": 1.01527548, + "epoch": 0.6579888771982564, + "flos": 28660025865600.0, + "grad_norm": 1.7508207743363633, + "language_loss": 0.60904765, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.62991738, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 10944, + "time_per_iteration": 2.441410541534424 + }, + { + "auxiliary_loss_clip": 0.01055082, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.01986217, + "balance_loss_mlp": 1.01668715, + "epoch": 0.6580490004509244, + "flos": 25045363272960.0, + "grad_norm": 1.4993276569153515, + "language_loss": 0.73571789, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.75672507, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 10945, + "time_per_iteration": 2.3921141624450684 + }, + { + "auxiliary_loss_clip": 0.01051686, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.01306832, + "balance_loss_mlp": 1.01593733, + "epoch": 0.6581091237035923, + "flos": 43507085644800.0, + "grad_norm": 1.5646281810448457, + "language_loss": 0.70832735, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72921467, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35742188, + "step": 10946, + "time_per_iteration": 2.5434823036193848 + }, + { + "auxiliary_loss_clip": 0.01053255, + "auxiliary_loss_mlp": 0.01034849, + "balance_loss_clip": 1.01199675, + "balance_loss_mlp": 1.01729488, + "epoch": 0.6581692469562603, + "flos": 22414413548160.0, + "grad_norm": 1.8205855876643469, + "language_loss": 0.83194196, + "learning_rate": 1.105386972944934e-06, + "loss": 0.85282302, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 10947, + "time_per_iteration": 3.7467236518859863 + }, + { + "auxiliary_loss_clip": 0.01053671, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.01284885, + "balance_loss_mlp": 1.01650763, + "epoch": 0.6582293702089284, + "flos": 24858717811200.0, + "grad_norm": 1.6085025705062304, + "language_loss": 0.77908909, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79997897, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37109375, + "step": 10948, + "time_per_iteration": 2.3765974044799805 + }, + { + "auxiliary_loss_clip": 0.01051944, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.01231301, + "balance_loss_mlp": 1.01639676, + "epoch": 0.6582894934615963, + "flos": 23038555656960.0, + "grad_norm": 1.9903554031842552, + "language_loss": 0.80499327, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.82586831, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 10949, + "time_per_iteration": 2.360328197479248 + }, + { + "auxiliary_loss_clip": 0.010091, + "auxiliary_loss_mlp": 0.0100449, + "balance_loss_clip": 1.00208199, + "balance_loss_mlp": 1.00166845, + "epoch": 0.6583496167142643, + "flos": 72548432046720.0, + "grad_norm": 0.7324140752680341, + "language_loss": 0.61844182, + "learning_rate": 1.104342144597323e-06, + "loss": 0.6385777, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.07421875, + "step": 10950, + "time_per_iteration": 3.089111804962158 + }, + { + "auxiliary_loss_clip": 0.01049647, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.01461005, + "balance_loss_mlp": 1.01500785, + "epoch": 0.6584097399669322, + "flos": 13078009722240.0, + "grad_norm": 2.341140602960788, + "language_loss": 0.68222904, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.70308119, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34765625, + "step": 10951, + "time_per_iteration": 2.3374369144439697 + }, + { + "auxiliary_loss_clip": 0.01051236, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.01401067, + "balance_loss_mlp": 1.01657748, + "epoch": 0.6584698632196002, + "flos": 28691936714880.0, + "grad_norm": 1.322642626373734, + "language_loss": 0.77318543, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.79406166, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34570312, + "step": 10952, + "time_per_iteration": 2.451864242553711 + }, + { + "auxiliary_loss_clip": 0.0105196, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.01437473, + "balance_loss_mlp": 1.01702571, + "epoch": 0.6585299864722681, + "flos": 14318403972480.0, + "grad_norm": 1.7553921040810019, + "language_loss": 0.74745941, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.76833349, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34960938, + "step": 10953, + "time_per_iteration": 2.3536643981933594 + }, + { + "auxiliary_loss_clip": 0.01051288, + "auxiliary_loss_mlp": 0.01039283, + "balance_loss_clip": 1.01569104, + "balance_loss_mlp": 1.01584816, + "epoch": 0.6585901097249361, + "flos": 26796676492800.0, + "grad_norm": 1.8806439409830178, + "language_loss": 0.79280996, + "learning_rate": 1.102949515683546e-06, + "loss": 0.8137157, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 10954, + "time_per_iteration": 2.38666033744812 + }, + { + "auxiliary_loss_clip": 0.01053919, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.01441455, + "balance_loss_mlp": 1.01725554, + "epoch": 0.658650232977604, + "flos": 18732158830080.0, + "grad_norm": 2.0434802285520814, + "language_loss": 0.71030712, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.73123252, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 10955, + "time_per_iteration": 2.3292224407196045 + }, + { + "auxiliary_loss_clip": 0.01050368, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.01571918, + "balance_loss_mlp": 1.01551068, + "epoch": 0.6587103562302721, + "flos": 24752302387200.0, + "grad_norm": 1.866510187607943, + "language_loss": 0.81850755, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.83938372, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 10956, + "time_per_iteration": 2.3850834369659424 + }, + { + "auxiliary_loss_clip": 0.01053414, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.01480412, + "balance_loss_mlp": 1.01755619, + "epoch": 0.65877047948294, + "flos": 22345040943360.0, + "grad_norm": 2.295498705213328, + "language_loss": 0.83506829, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.85597658, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 10957, + "time_per_iteration": 2.3747761249542236 + }, + { + "auxiliary_loss_clip": 0.01052023, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.0157764, + "balance_loss_mlp": 1.0167532, + "epoch": 0.658830602735608, + "flos": 45178971788160.0, + "grad_norm": 1.5916049244810861, + "language_loss": 0.77391624, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.79480028, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3515625, + "step": 10958, + "time_per_iteration": 2.5576913356781006 + }, + { + "auxiliary_loss_clip": 0.01051865, + "auxiliary_loss_mlp": 0.01041848, + "balance_loss_clip": 1.01870942, + "balance_loss_mlp": 1.01665354, + "epoch": 0.6588907259882759, + "flos": 19900597034880.0, + "grad_norm": 1.5332520199329662, + "language_loss": 0.76263595, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.78357315, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 10959, + "time_per_iteration": 2.372605800628662 + }, + { + "auxiliary_loss_clip": 0.01053159, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.01214075, + "balance_loss_mlp": 1.01674628, + "epoch": 0.6589508492409439, + "flos": 24132628932480.0, + "grad_norm": 1.7638883857446737, + "language_loss": 0.66060436, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.68148613, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 10960, + "time_per_iteration": 2.370884656906128 + }, + { + "auxiliary_loss_clip": 0.01055611, + "auxiliary_loss_mlp": 0.01042093, + "balance_loss_clip": 1.01717842, + "balance_loss_mlp": 1.01770711, + "epoch": 0.659010972493612, + "flos": 18221938087680.0, + "grad_norm": 4.165001894199809, + "language_loss": 0.83000278, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.8509798, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 10961, + "time_per_iteration": 2.352940320968628 + }, + { + "auxiliary_loss_clip": 0.01052651, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01341331, + "balance_loss_mlp": 1.01716447, + "epoch": 0.6590710957462799, + "flos": 27598771134720.0, + "grad_norm": 1.6740917085539997, + "language_loss": 0.75781333, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.7786904, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 10962, + "time_per_iteration": 2.4125282764434814 + }, + { + "auxiliary_loss_clip": 0.01053771, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.01334715, + "balance_loss_mlp": 1.0168066, + "epoch": 0.6591312189989479, + "flos": 20301923646720.0, + "grad_norm": 1.8625958978669943, + "language_loss": 0.81050873, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.83140528, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37109375, + "step": 10963, + "time_per_iteration": 2.3857223987579346 + }, + { + "auxiliary_loss_clip": 0.01049406, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.00779212, + "balance_loss_mlp": 1.01440763, + "epoch": 0.6591913422516158, + "flos": 12312120026880.0, + "grad_norm": 1.6316626684218902, + "language_loss": 0.79739797, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.81819201, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34960938, + "step": 10964, + "time_per_iteration": 2.3310420513153076 + }, + { + "auxiliary_loss_clip": 0.0105286, + "auxiliary_loss_mlp": 0.01039753, + "balance_loss_clip": 1.01629257, + "balance_loss_mlp": 1.01609814, + "epoch": 0.6592514655042838, + "flos": 25883418481920.0, + "grad_norm": 1.7041331202188874, + "language_loss": 0.75081921, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.77174532, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 10965, + "time_per_iteration": 2.4712212085723877 + }, + { + "auxiliary_loss_clip": 0.01055009, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.01139295, + "balance_loss_mlp": 1.01725268, + "epoch": 0.6593115887569517, + "flos": 14062769930880.0, + "grad_norm": 2.192735635203252, + "language_loss": 0.75114125, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.77205801, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 10966, + "time_per_iteration": 2.367060422897339 + }, + { + "auxiliary_loss_clip": 0.01051857, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.01452422, + "balance_loss_mlp": 1.01619565, + "epoch": 0.6593717120096197, + "flos": 24716760756480.0, + "grad_norm": 1.5462423954098592, + "language_loss": 0.7806654, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.80157137, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35742188, + "step": 10967, + "time_per_iteration": 2.4023942947387695 + }, + { + "auxiliary_loss_clip": 0.01009121, + "auxiliary_loss_mlp": 0.01002781, + "balance_loss_clip": 1.00020659, + "balance_loss_mlp": 1.00183713, + "epoch": 0.6594318352622877, + "flos": 55554772101120.0, + "grad_norm": 0.6957618228155965, + "language_loss": 0.48576456, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50588363, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.07275391, + "step": 10968, + "time_per_iteration": 2.9842116832733154 + }, + { + "auxiliary_loss_clip": 0.01053429, + "auxiliary_loss_mlp": 0.01037523, + "balance_loss_clip": 1.01406288, + "balance_loss_mlp": 1.01655889, + "epoch": 0.6594919585149557, + "flos": 17455978569600.0, + "grad_norm": 2.0316048677157936, + "language_loss": 0.80807054, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.82898009, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 10969, + "time_per_iteration": 2.353447437286377 + }, + { + "auxiliary_loss_clip": 0.01051803, + "auxiliary_loss_mlp": 0.01035746, + "balance_loss_clip": 1.01351309, + "balance_loss_mlp": 1.01593232, + "epoch": 0.6595520817676236, + "flos": 18222252289920.0, + "grad_norm": 1.8692972874235732, + "language_loss": 0.68200767, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.70288312, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 10970, + "time_per_iteration": 2.329214334487915 + }, + { + "auxiliary_loss_clip": 0.01051819, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.01010013, + "balance_loss_mlp": 1.01676023, + "epoch": 0.6596122050202916, + "flos": 22198685057280.0, + "grad_norm": 1.5379416264348598, + "language_loss": 0.78184575, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.80270636, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.34960938, + "step": 10971, + "time_per_iteration": 2.368194103240967 + }, + { + "auxiliary_loss_clip": 0.01055264, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.01359773, + "balance_loss_mlp": 1.01739848, + "epoch": 0.6596723282729595, + "flos": 14172955781760.0, + "grad_norm": 2.5114955286695224, + "language_loss": 0.72969651, + "learning_rate": 1.096689432978629e-06, + "loss": 0.75062865, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 10972, + "time_per_iteration": 3.531177043914795 + }, + { + "auxiliary_loss_clip": 0.01052442, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.00963712, + "balance_loss_mlp": 1.01630497, + "epoch": 0.6597324515256275, + "flos": 30551934597120.0, + "grad_norm": 3.483721229483422, + "language_loss": 0.56522918, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.586092, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 10973, + "time_per_iteration": 2.4305319786071777 + }, + { + "auxiliary_loss_clip": 0.01056006, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.01668632, + "balance_loss_mlp": 1.01743579, + "epoch": 0.6597925747782956, + "flos": 17638888515840.0, + "grad_norm": 1.9094726533963842, + "language_loss": 0.79976726, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.82073903, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38476562, + "step": 10974, + "time_per_iteration": 2.321925640106201 + }, + { + "auxiliary_loss_clip": 0.01054466, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.01384974, + "balance_loss_mlp": 1.01662982, + "epoch": 0.6598526980309635, + "flos": 22818044309760.0, + "grad_norm": 2.2315298059667596, + "language_loss": 0.70118642, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.72211981, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 10975, + "time_per_iteration": 5.159539461135864 + }, + { + "auxiliary_loss_clip": 0.01052063, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.01209974, + "balance_loss_mlp": 1.01568377, + "epoch": 0.6599128212836315, + "flos": 21067010380800.0, + "grad_norm": 1.6984704357896858, + "language_loss": 0.71947908, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.74034613, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 10976, + "time_per_iteration": 2.362541437149048 + }, + { + "auxiliary_loss_clip": 0.01050316, + "auxiliary_loss_mlp": 0.01037291, + "balance_loss_clip": 1.01482022, + "balance_loss_mlp": 1.01553404, + "epoch": 0.6599729445362994, + "flos": 22162445199360.0, + "grad_norm": 1.6513543481654722, + "language_loss": 0.68433475, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.7052108, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 10977, + "time_per_iteration": 2.3722870349884033 + }, + { + "auxiliary_loss_clip": 0.01054666, + "auxiliary_loss_mlp": 0.01042395, + "balance_loss_clip": 1.01672935, + "balance_loss_mlp": 1.01674461, + "epoch": 0.6600330677889674, + "flos": 18149109258240.0, + "grad_norm": 2.0388476031754723, + "language_loss": 0.8262496, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.84722012, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 10978, + "time_per_iteration": 2.400414228439331 + }, + { + "auxiliary_loss_clip": 0.0105538, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.01572943, + "balance_loss_mlp": 1.01671243, + "epoch": 0.6600931910416353, + "flos": 18149144169600.0, + "grad_norm": 2.1193792882188616, + "language_loss": 0.68270773, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.70366883, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 10979, + "time_per_iteration": 2.3367092609405518 + }, + { + "auxiliary_loss_clip": 0.01053341, + "auxiliary_loss_mlp": 0.01041077, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.01628292, + "epoch": 0.6601533142943034, + "flos": 17419773623040.0, + "grad_norm": 2.2172033147694092, + "language_loss": 0.74270737, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.76365161, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 10980, + "time_per_iteration": 2.3616421222686768 + }, + { + "auxiliary_loss_clip": 0.01050735, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.01521456, + "balance_loss_mlp": 1.01621246, + "epoch": 0.6602134375469713, + "flos": 28218339855360.0, + "grad_norm": 1.5874318777085463, + "language_loss": 0.74207604, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.76295245, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 10981, + "time_per_iteration": 2.4377834796905518 + }, + { + "auxiliary_loss_clip": 0.0105431, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.01527798, + "balance_loss_mlp": 1.01675153, + "epoch": 0.6602735607996393, + "flos": 29416943341440.0, + "grad_norm": 1.8519774688071646, + "language_loss": 0.70182443, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.72277123, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 10982, + "time_per_iteration": 2.4352056980133057 + }, + { + "auxiliary_loss_clip": 0.01052217, + "auxiliary_loss_mlp": 0.01037372, + "balance_loss_clip": 1.01517498, + "balance_loss_mlp": 1.0164938, + "epoch": 0.6603336840523072, + "flos": 18587059752960.0, + "grad_norm": 1.6188509616678113, + "language_loss": 0.70595336, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.7268492, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35742188, + "step": 10983, + "time_per_iteration": 2.354893922805786 + }, + { + "auxiliary_loss_clip": 0.01052704, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.01417875, + "balance_loss_mlp": 1.01612127, + "epoch": 0.6603938073049752, + "flos": 33253478824320.0, + "grad_norm": 1.791988521132254, + "language_loss": 0.71985877, + "learning_rate": 1.092522205413239e-06, + "loss": 0.74077994, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 10984, + "time_per_iteration": 2.5645902156829834 + }, + { + "auxiliary_loss_clip": 0.01052814, + "auxiliary_loss_mlp": 0.01038567, + "balance_loss_clip": 1.01387918, + "balance_loss_mlp": 1.01677692, + "epoch": 0.6604539305576431, + "flos": 17383324296960.0, + "grad_norm": 1.6499117730196402, + "language_loss": 0.84855783, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.86947161, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 10985, + "time_per_iteration": 2.409233808517456 + }, + { + "auxiliary_loss_clip": 0.01054168, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.01367831, + "balance_loss_mlp": 1.01633811, + "epoch": 0.6605140538103111, + "flos": 21250094883840.0, + "grad_norm": 2.347029977403867, + "language_loss": 0.75266331, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.77360046, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 10986, + "time_per_iteration": 3.8403308391571045 + }, + { + "auxiliary_loss_clip": 0.01050977, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.01272058, + "balance_loss_mlp": 1.0159204, + "epoch": 0.6605741770629792, + "flos": 13880837502720.0, + "grad_norm": 1.7389573446518238, + "language_loss": 0.80381495, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.82467866, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 10987, + "time_per_iteration": 2.3704097270965576 + }, + { + "auxiliary_loss_clip": 0.01008407, + "auxiliary_loss_mlp": 0.01002825, + "balance_loss_clip": 1.0002383, + "balance_loss_mlp": 1.00096047, + "epoch": 0.6606343003156471, + "flos": 69312436727040.0, + "grad_norm": 0.8056466248847116, + "language_loss": 0.54119754, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56130993, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.07470703, + "step": 10988, + "time_per_iteration": 3.1228384971618652 + }, + { + "auxiliary_loss_clip": 0.01051587, + "auxiliary_loss_mlp": 0.01045615, + "balance_loss_clip": 1.02340603, + "balance_loss_mlp": 1.01635671, + "epoch": 0.6606944235683151, + "flos": 27271146136320.0, + "grad_norm": 2.4243254624798167, + "language_loss": 0.78070533, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.80167729, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 10989, + "time_per_iteration": 2.470628023147583 + }, + { + "auxiliary_loss_clip": 0.01052795, + "auxiliary_loss_mlp": 0.01039678, + "balance_loss_clip": 1.01564574, + "balance_loss_mlp": 1.01677346, + "epoch": 0.660754546820983, + "flos": 13771943372160.0, + "grad_norm": 2.2924281566128655, + "language_loss": 0.7832368, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.80416155, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 10990, + "time_per_iteration": 2.3420610427856445 + }, + { + "auxiliary_loss_clip": 0.01054883, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.01676059, + "balance_loss_mlp": 1.01700115, + "epoch": 0.660814670073651, + "flos": 15704316236160.0, + "grad_norm": 1.9415244563694911, + "language_loss": 0.61760986, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.63858223, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 10991, + "time_per_iteration": 2.3624494075775146 + }, + { + "auxiliary_loss_clip": 0.01056415, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.01970387, + "balance_loss_mlp": 1.01775122, + "epoch": 0.6608747933263189, + "flos": 20848977740160.0, + "grad_norm": 2.76579974510713, + "language_loss": 0.70288944, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.72391087, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 10992, + "time_per_iteration": 2.356637716293335 + }, + { + "auxiliary_loss_clip": 0.01053224, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.01666057, + "balance_loss_mlp": 1.01536238, + "epoch": 0.660934916578987, + "flos": 20631049833600.0, + "grad_norm": 1.6692457210310343, + "language_loss": 0.88315344, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.90410376, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 10993, + "time_per_iteration": 2.376451253890991 + }, + { + "auxiliary_loss_clip": 0.01058203, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.01587415, + "balance_loss_mlp": 1.01810646, + "epoch": 0.6609950398316549, + "flos": 25112571373440.0, + "grad_norm": 1.704756258372959, + "language_loss": 0.67169654, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69271636, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.40234375, + "step": 10994, + "time_per_iteration": 2.3987278938293457 + }, + { + "auxiliary_loss_clip": 0.01056889, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.01756525, + "balance_loss_mlp": 1.01815343, + "epoch": 0.6610551630843229, + "flos": 18660202784640.0, + "grad_norm": 1.6564119628171163, + "language_loss": 0.78127825, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.80226731, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38671875, + "step": 10995, + "time_per_iteration": 2.3313136100769043 + }, + { + "auxiliary_loss_clip": 0.01054413, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.0137918, + "balance_loss_mlp": 1.0169785, + "epoch": 0.6611152863369908, + "flos": 23257077056640.0, + "grad_norm": 1.9644766004032226, + "language_loss": 0.75808704, + "learning_rate": 1.088359933123053e-06, + "loss": 0.77899849, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.375, + "step": 10996, + "time_per_iteration": 2.353569507598877 + }, + { + "auxiliary_loss_clip": 0.01054851, + "auxiliary_loss_mlp": 0.01046794, + "balance_loss_clip": 1.02175951, + "balance_loss_mlp": 1.01749444, + "epoch": 0.6611754095896588, + "flos": 22158744595200.0, + "grad_norm": 1.9069974854158243, + "language_loss": 0.69787759, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71889406, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 10997, + "time_per_iteration": 2.3434255123138428 + }, + { + "auxiliary_loss_clip": 0.01056138, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.01166892, + "balance_loss_mlp": 1.01812291, + "epoch": 0.6612355328423267, + "flos": 13990360037760.0, + "grad_norm": 1.8801047148020151, + "language_loss": 0.69401163, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.7149415, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 10998, + "time_per_iteration": 2.3429505825042725 + }, + { + "auxiliary_loss_clip": 0.01008624, + "auxiliary_loss_mlp": 0.01006127, + "balance_loss_clip": 1.00349295, + "balance_loss_mlp": 1.00120711, + "epoch": 0.6612956560949947, + "flos": 61450660529280.0, + "grad_norm": 0.6572434108159338, + "language_loss": 0.51242673, + "learning_rate": 1.087320141976297e-06, + "loss": 0.5325743, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.07421875, + "step": 10999, + "time_per_iteration": 2.9681572914123535 + }, + { + "auxiliary_loss_clip": 0.01054965, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.0184406, + "balance_loss_mlp": 1.01608419, + "epoch": 0.6613557793476627, + "flos": 21615565662720.0, + "grad_norm": 2.263205087962403, + "language_loss": 0.7114495, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73245466, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38867188, + "step": 11000, + "time_per_iteration": 2.3870296478271484 + }, + { + "auxiliary_loss_clip": 0.01050859, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.016029, + "balance_loss_mlp": 1.01564121, + "epoch": 0.6614159026003307, + "flos": 34018740115200.0, + "grad_norm": 1.8063003811644438, + "language_loss": 0.66129017, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.68218696, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 11001, + "time_per_iteration": 2.4838876724243164 + }, + { + "auxiliary_loss_clip": 0.0105201, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.01913428, + "balance_loss_mlp": 1.01588488, + "epoch": 0.6614760258529987, + "flos": 24096144695040.0, + "grad_norm": 1.8163447648323026, + "language_loss": 0.73503041, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75598562, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 11002, + "time_per_iteration": 2.3817479610443115 + }, + { + "auxiliary_loss_clip": 0.01052859, + "auxiliary_loss_mlp": 0.01042931, + "balance_loss_clip": 1.0167768, + "balance_loss_mlp": 1.0160234, + "epoch": 0.6615361491056666, + "flos": 14902884910080.0, + "grad_norm": 2.2757079346919453, + "language_loss": 0.7978884, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.81884629, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 11003, + "time_per_iteration": 2.33282470703125 + }, + { + "auxiliary_loss_clip": 0.01058194, + "auxiliary_loss_mlp": 0.01047453, + "balance_loss_clip": 1.01623237, + "balance_loss_mlp": 1.01796997, + "epoch": 0.6615962723583346, + "flos": 15303967142400.0, + "grad_norm": 2.236055847937065, + "language_loss": 0.70581049, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.72686696, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.40234375, + "step": 11004, + "time_per_iteration": 2.371011972427368 + }, + { + "auxiliary_loss_clip": 0.01055964, + "auxiliary_loss_mlp": 0.01042609, + "balance_loss_clip": 1.01564419, + "balance_loss_mlp": 1.01709247, + "epoch": 0.6616563956110025, + "flos": 18731635159680.0, + "grad_norm": 2.1013190084760893, + "language_loss": 0.71520829, + "learning_rate": 1.085241494478132e-06, + "loss": 0.73619401, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38867188, + "step": 11005, + "time_per_iteration": 2.335397243499756 + }, + { + "auxiliary_loss_clip": 0.01054271, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.01458502, + "balance_loss_mlp": 1.0174129, + "epoch": 0.6617165188636706, + "flos": 24494015082240.0, + "grad_norm": 1.9630766430702902, + "language_loss": 0.78858519, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80952442, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 11006, + "time_per_iteration": 2.4194066524505615 + }, + { + "auxiliary_loss_clip": 0.01054434, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_clip": 1.01806891, + "balance_loss_mlp": 1.01729155, + "epoch": 0.6617766421163385, + "flos": 22378662449280.0, + "grad_norm": 1.7701009770813798, + "language_loss": 0.77308661, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.79406846, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 11007, + "time_per_iteration": 2.3893160820007324 + }, + { + "auxiliary_loss_clip": 0.01053751, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.01305246, + "balance_loss_mlp": 1.01678944, + "epoch": 0.6618367653690065, + "flos": 20849361765120.0, + "grad_norm": 1.6711976655336938, + "language_loss": 0.78928673, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.81021601, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36914062, + "step": 11008, + "time_per_iteration": 2.35719895362854 + }, + { + "auxiliary_loss_clip": 0.01056952, + "auxiliary_loss_mlp": 0.01045625, + "balance_loss_clip": 1.01601338, + "balance_loss_mlp": 1.01687372, + "epoch": 0.6618968886216744, + "flos": 17711368231680.0, + "grad_norm": 2.005208322110249, + "language_loss": 0.82779467, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.84882045, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40039062, + "step": 11009, + "time_per_iteration": 2.32336688041687 + }, + { + "auxiliary_loss_clip": 0.01009027, + "auxiliary_loss_mlp": 0.0100297, + "balance_loss_clip": 1.00025165, + "balance_loss_mlp": 1.00156367, + "epoch": 0.6619570118743424, + "flos": 67032155364480.0, + "grad_norm": 1.0249942177720932, + "language_loss": 0.67454445, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69466448, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.07470703, + "step": 11010, + "time_per_iteration": 2.9407904148101807 + }, + { + "auxiliary_loss_clip": 0.01053683, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.0167129, + "balance_loss_mlp": 1.01554489, + "epoch": 0.6620171351270103, + "flos": 18659923493760.0, + "grad_norm": 1.6766997569195867, + "language_loss": 0.72627485, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.7472499, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38085938, + "step": 11011, + "time_per_iteration": 3.610153913497925 + }, + { + "auxiliary_loss_clip": 0.01054407, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.01586723, + "balance_loss_mlp": 1.01767731, + "epoch": 0.6620772583796783, + "flos": 24169357549440.0, + "grad_norm": 2.0170854952038093, + "language_loss": 0.73461086, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.75555646, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 11012, + "time_per_iteration": 2.394791603088379 + }, + { + "auxiliary_loss_clip": 0.01050571, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.01086545, + "balance_loss_mlp": 1.01614761, + "epoch": 0.6621373816323463, + "flos": 23622408190080.0, + "grad_norm": 1.5772244000080036, + "language_loss": 0.80569404, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.826527, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 11013, + "time_per_iteration": 2.382826328277588 + }, + { + "auxiliary_loss_clip": 0.01054248, + "auxiliary_loss_mlp": 0.01037565, + "balance_loss_clip": 1.01322198, + "balance_loss_mlp": 1.01725733, + "epoch": 0.6621975048850143, + "flos": 18441227537280.0, + "grad_norm": 1.8677204766154925, + "language_loss": 0.72084004, + "learning_rate": 1.082125865538971e-06, + "loss": 0.74175817, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36914062, + "step": 11014, + "time_per_iteration": 3.7482316493988037 + }, + { + "auxiliary_loss_clip": 0.01051237, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.01480055, + "balance_loss_mlp": 1.01645398, + "epoch": 0.6622576281376823, + "flos": 14063014310400.0, + "grad_norm": 1.915870147983513, + "language_loss": 0.78297246, + "learning_rate": 1.081779858400137e-06, + "loss": 0.80386162, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34765625, + "step": 11015, + "time_per_iteration": 3.7315332889556885 + }, + { + "auxiliary_loss_clip": 0.0105251, + "auxiliary_loss_mlp": 0.01037042, + "balance_loss_clip": 1.01303315, + "balance_loss_mlp": 1.01607943, + "epoch": 0.6623177513903502, + "flos": 17018028074880.0, + "grad_norm": 1.7712905860677401, + "language_loss": 0.83531785, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.85621339, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 11016, + "time_per_iteration": 2.3255972862243652 + }, + { + "auxiliary_loss_clip": 0.01053263, + "auxiliary_loss_mlp": 0.01038098, + "balance_loss_clip": 1.01340985, + "balance_loss_mlp": 1.01613665, + "epoch": 0.6623778746430182, + "flos": 17270170980480.0, + "grad_norm": 1.937628600669207, + "language_loss": 0.7223087, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.74322236, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 11017, + "time_per_iteration": 2.3364663124084473 + }, + { + "auxiliary_loss_clip": 0.0105263, + "auxiliary_loss_mlp": 0.01049474, + "balance_loss_clip": 1.02316415, + "balance_loss_mlp": 1.01601243, + "epoch": 0.6624379978956861, + "flos": 48791016028800.0, + "grad_norm": 1.8590390368919154, + "language_loss": 0.7810185, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.8020395, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3671875, + "step": 11018, + "time_per_iteration": 2.597356081008911 + }, + { + "auxiliary_loss_clip": 0.01054518, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_clip": 1.01899314, + "balance_loss_mlp": 1.01676583, + "epoch": 0.6624981211483542, + "flos": 18951448279680.0, + "grad_norm": 2.269058083104104, + "language_loss": 0.84312022, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.86413437, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.37695312, + "step": 11019, + "time_per_iteration": 2.3729348182678223 + }, + { + "auxiliary_loss_clip": 0.01051971, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01261258, + "balance_loss_mlp": 1.01621604, + "epoch": 0.6625582444010221, + "flos": 23255506045440.0, + "grad_norm": 1.734133460513638, + "language_loss": 0.72696048, + "learning_rate": 1.080050345253328e-06, + "loss": 0.74783969, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 11020, + "time_per_iteration": 2.3715970516204834 + }, + { + "auxiliary_loss_clip": 0.01056308, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.01443732, + "balance_loss_mlp": 1.01686239, + "epoch": 0.6626183676536901, + "flos": 21393832417920.0, + "grad_norm": 1.6973368451313842, + "language_loss": 0.73517621, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.75615358, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 11021, + "time_per_iteration": 2.3756909370422363 + }, + { + "auxiliary_loss_clip": 0.01054954, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.01987159, + "balance_loss_mlp": 1.01764214, + "epoch": 0.662678490906358, + "flos": 14570511966720.0, + "grad_norm": 3.248210328346275, + "language_loss": 0.84570658, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.86670637, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 11022, + "time_per_iteration": 2.318394660949707 + }, + { + "auxiliary_loss_clip": 0.01058358, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.01236272, + "balance_loss_mlp": 1.01758289, + "epoch": 0.662738614159026, + "flos": 15991581836160.0, + "grad_norm": 2.245351747554217, + "language_loss": 0.74408782, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.76509184, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40820312, + "step": 11023, + "time_per_iteration": 2.389620065689087 + }, + { + "auxiliary_loss_clip": 0.01052202, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.01636541, + "balance_loss_mlp": 1.0159421, + "epoch": 0.6627987374116939, + "flos": 19535335724160.0, + "grad_norm": 1.6669599124674837, + "language_loss": 0.75515962, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77609026, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 11024, + "time_per_iteration": 2.366286516189575 + }, + { + "auxiliary_loss_clip": 0.01053902, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.0160116, + "balance_loss_mlp": 1.01664138, + "epoch": 0.662858860664362, + "flos": 15702012086400.0, + "grad_norm": 3.4564825527258787, + "language_loss": 0.72003198, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.74098337, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 11025, + "time_per_iteration": 3.805116891860962 + }, + { + "auxiliary_loss_clip": 0.01054636, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.01736295, + "balance_loss_mlp": 1.01776481, + "epoch": 0.6629189839170299, + "flos": 20153333433600.0, + "grad_norm": 1.5783810565648915, + "language_loss": 0.80002558, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.82099676, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 11026, + "time_per_iteration": 2.3950438499450684 + }, + { + "auxiliary_loss_clip": 0.01054084, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.01401663, + "balance_loss_mlp": 1.01750684, + "epoch": 0.6629791071696979, + "flos": 20914579918080.0, + "grad_norm": 1.699493919176788, + "language_loss": 0.76714694, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78806698, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 11027, + "time_per_iteration": 2.368635654449463 + }, + { + "auxiliary_loss_clip": 0.01053796, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.01238751, + "balance_loss_mlp": 1.01656163, + "epoch": 0.6630392304223659, + "flos": 20845940451840.0, + "grad_norm": 2.3760086183588216, + "language_loss": 0.71846992, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.73939002, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 11028, + "time_per_iteration": 2.3263754844665527 + }, + { + "auxiliary_loss_clip": 0.01051837, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.01644015, + "balance_loss_mlp": 1.01573384, + "epoch": 0.6630993536750338, + "flos": 20994775044480.0, + "grad_norm": 1.8073190190692723, + "language_loss": 0.80768323, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.82858908, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36132812, + "step": 11029, + "time_per_iteration": 2.3579559326171875 + }, + { + "auxiliary_loss_clip": 0.01054279, + "auxiliary_loss_mlp": 0.0103953, + "balance_loss_clip": 1.01306593, + "balance_loss_mlp": 1.01634121, + "epoch": 0.6631594769277018, + "flos": 18258073211520.0, + "grad_norm": 1.969291156411193, + "language_loss": 0.78064871, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.80158675, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 11030, + "time_per_iteration": 2.3274123668670654 + }, + { + "auxiliary_loss_clip": 0.01056142, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.01413655, + "balance_loss_mlp": 1.01725447, + "epoch": 0.6632196001803697, + "flos": 17819564135040.0, + "grad_norm": 3.505748101806825, + "language_loss": 0.77175272, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.79271555, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38867188, + "step": 11031, + "time_per_iteration": 2.3321754932403564 + }, + { + "auxiliary_loss_clip": 0.01055615, + "auxiliary_loss_mlp": 0.01038293, + "balance_loss_clip": 1.01359248, + "balance_loss_mlp": 1.01764846, + "epoch": 0.6632797234330378, + "flos": 12669561192960.0, + "grad_norm": 2.8878250999856085, + "language_loss": 0.77050829, + "learning_rate": 1.075903075048228e-06, + "loss": 0.7914474, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38085938, + "step": 11032, + "time_per_iteration": 2.3173999786376953 + }, + { + "auxiliary_loss_clip": 0.01050869, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.01486838, + "balance_loss_mlp": 1.0150857, + "epoch": 0.6633398466857057, + "flos": 23583654714240.0, + "grad_norm": 1.9021182024802024, + "language_loss": 0.81679559, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.8376894, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 11033, + "time_per_iteration": 2.3874671459198 + }, + { + "auxiliary_loss_clip": 0.01053996, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.01514196, + "balance_loss_mlp": 1.0157187, + "epoch": 0.6633999699383737, + "flos": 20630630897280.0, + "grad_norm": 1.6312835047773901, + "language_loss": 0.81573606, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.83668959, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 11034, + "time_per_iteration": 2.350402593612671 + }, + { + "auxiliary_loss_clip": 0.01052898, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.0123229, + "balance_loss_mlp": 1.01681638, + "epoch": 0.6634600931910416, + "flos": 21796066725120.0, + "grad_norm": 1.562458206269915, + "language_loss": 0.76819491, + "learning_rate": 1.074867045054166e-06, + "loss": 0.7890777, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 11035, + "time_per_iteration": 2.3610806465148926 + }, + { + "auxiliary_loss_clip": 0.01053953, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.01342869, + "balance_loss_mlp": 1.01585829, + "epoch": 0.6635202164437096, + "flos": 18731914450560.0, + "grad_norm": 1.8680490459639247, + "language_loss": 0.83906519, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85997856, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.38085938, + "step": 11036, + "time_per_iteration": 2.3151466846466064 + }, + { + "auxiliary_loss_clip": 0.01008777, + "auxiliary_loss_mlp": 0.01002892, + "balance_loss_clip": 1.00050831, + "balance_loss_mlp": 1.00134587, + "epoch": 0.6635803396963775, + "flos": 60219482878080.0, + "grad_norm": 0.7752356146725335, + "language_loss": 0.52385253, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54396921, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.07421875, + "step": 11037, + "time_per_iteration": 2.9919838905334473 + }, + { + "auxiliary_loss_clip": 0.01056221, + "auxiliary_loss_mlp": 0.01038951, + "balance_loss_clip": 1.01357162, + "balance_loss_mlp": 1.01842523, + "epoch": 0.6636404629490456, + "flos": 29165812865280.0, + "grad_norm": 1.7308428446831448, + "language_loss": 0.80135077, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.82230246, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 11038, + "time_per_iteration": 2.419877290725708 + }, + { + "auxiliary_loss_clip": 0.01054239, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.01738632, + "epoch": 0.6637005862017135, + "flos": 38906231477760.0, + "grad_norm": 2.3483036259570005, + "language_loss": 0.65533406, + "learning_rate": 1.073486162925716e-06, + "loss": 0.67631525, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.36914062, + "step": 11039, + "time_per_iteration": 2.508627414703369 + }, + { + "auxiliary_loss_clip": 0.01055323, + "auxiliary_loss_mlp": 0.01036713, + "balance_loss_clip": 1.01129746, + "balance_loss_mlp": 1.01682949, + "epoch": 0.6637607094543815, + "flos": 22782258299520.0, + "grad_norm": 1.634375009208824, + "language_loss": 0.65410435, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.67502475, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 11040, + "time_per_iteration": 2.382423162460327 + }, + { + "auxiliary_loss_clip": 0.01051056, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.01882839, + "balance_loss_mlp": 1.01507556, + "epoch": 0.6638208327070495, + "flos": 18113113779840.0, + "grad_norm": 2.752992661402511, + "language_loss": 0.72869432, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.74961823, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.359375, + "step": 11041, + "time_per_iteration": 2.3562893867492676 + }, + { + "auxiliary_loss_clip": 0.01052071, + "auxiliary_loss_mlp": 0.01043923, + "balance_loss_clip": 1.01875842, + "balance_loss_mlp": 1.01555276, + "epoch": 0.6638809559597174, + "flos": 29423576499840.0, + "grad_norm": 2.335265886249534, + "language_loss": 0.63769978, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.6586597, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 11042, + "time_per_iteration": 2.4152917861938477 + }, + { + "auxiliary_loss_clip": 0.0105608, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.01127768, + "balance_loss_mlp": 1.01629186, + "epoch": 0.6639410792123854, + "flos": 28071495210240.0, + "grad_norm": 1.9004996666444258, + "language_loss": 0.69648266, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.71743822, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.3984375, + "step": 11043, + "time_per_iteration": 2.4085865020751953 + }, + { + "auxiliary_loss_clip": 0.01050426, + "auxiliary_loss_mlp": 0.01036771, + "balance_loss_clip": 1.0144912, + "balance_loss_mlp": 1.01609886, + "epoch": 0.6640012024650533, + "flos": 25555199990400.0, + "grad_norm": 1.5380082793740195, + "language_loss": 0.84339392, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86426592, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 11044, + "time_per_iteration": 2.381227970123291 + }, + { + "auxiliary_loss_clip": 0.01052423, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.01296806, + "balance_loss_mlp": 1.01571822, + "epoch": 0.6640613257177214, + "flos": 14866051559040.0, + "grad_norm": 2.0578606047597385, + "language_loss": 0.71234465, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.73324585, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 11045, + "time_per_iteration": 2.3562262058258057 + }, + { + "auxiliary_loss_clip": 0.01054499, + "auxiliary_loss_mlp": 0.01039439, + "balance_loss_clip": 1.01450086, + "balance_loss_mlp": 1.01680017, + "epoch": 0.6641214489703893, + "flos": 23219999326080.0, + "grad_norm": 1.520629315522978, + "language_loss": 0.65145737, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.67239678, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37695312, + "step": 11046, + "time_per_iteration": 2.3806560039520264 + }, + { + "auxiliary_loss_clip": 0.01052834, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.01454115, + "balance_loss_mlp": 1.01606131, + "epoch": 0.6641815722230573, + "flos": 37741109852160.0, + "grad_norm": 2.4924034493364275, + "language_loss": 0.72004569, + "learning_rate": 1.070726085914088e-06, + "loss": 0.74096155, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 11047, + "time_per_iteration": 2.51143217086792 + }, + { + "auxiliary_loss_clip": 0.01053681, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.01208663, + "balance_loss_mlp": 1.01685143, + "epoch": 0.6642416954757252, + "flos": 17930168922240.0, + "grad_norm": 1.9681979442625266, + "language_loss": 0.77844352, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79935831, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 11048, + "time_per_iteration": 2.3412768840789795 + }, + { + "auxiliary_loss_clip": 0.01008603, + "auxiliary_loss_mlp": 0.01006602, + "balance_loss_clip": 1.00434875, + "balance_loss_mlp": 1.00136185, + "epoch": 0.6643018187283932, + "flos": 51992829394560.0, + "grad_norm": 0.7512400024556176, + "language_loss": 0.55066133, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57081336, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.07226562, + "step": 11049, + "time_per_iteration": 3.016592502593994 + }, + { + "auxiliary_loss_clip": 0.01053016, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01149035, + "balance_loss_mlp": 1.01601672, + "epoch": 0.6643619419810611, + "flos": 30225356939520.0, + "grad_norm": 1.5633864817236542, + "language_loss": 0.65155321, + "learning_rate": 1.069691638104648e-06, + "loss": 0.67243487, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 11050, + "time_per_iteration": 2.4380106925964355 + }, + { + "auxiliary_loss_clip": 0.0105196, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.01503325, + "balance_loss_mlp": 1.01641452, + "epoch": 0.6644220652337292, + "flos": 22965028600320.0, + "grad_norm": 2.2382984307500586, + "language_loss": 0.79854983, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.81945479, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 11051, + "time_per_iteration": 3.6065726280212402 + }, + { + "auxiliary_loss_clip": 0.01053516, + "auxiliary_loss_mlp": 0.01041823, + "balance_loss_clip": 1.01794541, + "balance_loss_mlp": 1.01738071, + "epoch": 0.6644821884863971, + "flos": 21141165841920.0, + "grad_norm": 1.6669988972452632, + "language_loss": 0.86320388, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.88415724, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 11052, + "time_per_iteration": 2.351734161376953 + }, + { + "auxiliary_loss_clip": 0.01055672, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.01766634, + "balance_loss_mlp": 1.01776612, + "epoch": 0.6645423117390651, + "flos": 20191807618560.0, + "grad_norm": 2.16655708495554, + "language_loss": 0.76051271, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.78151178, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 11053, + "time_per_iteration": 2.359241247177124 + }, + { + "auxiliary_loss_clip": 0.01051789, + "auxiliary_loss_mlp": 0.01036481, + "balance_loss_clip": 1.01182806, + "balance_loss_mlp": 1.0161196, + "epoch": 0.6646024349917331, + "flos": 24350836129920.0, + "grad_norm": 1.615278128809129, + "language_loss": 0.80435836, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.82524109, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 11054, + "time_per_iteration": 3.7380881309509277 + }, + { + "auxiliary_loss_clip": 0.01051638, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.01099026, + "balance_loss_mlp": 1.01637602, + "epoch": 0.664662558244401, + "flos": 18805720798080.0, + "grad_norm": 1.595842642680796, + "language_loss": 0.74626446, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76711971, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 11055, + "time_per_iteration": 3.7174551486968994 + }, + { + "auxiliary_loss_clip": 0.01052847, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.01443779, + "balance_loss_mlp": 1.01613605, + "epoch": 0.664722681497069, + "flos": 18951797393280.0, + "grad_norm": 1.9269800948912357, + "language_loss": 0.73933256, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.76026398, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 11056, + "time_per_iteration": 2.355121612548828 + }, + { + "auxiliary_loss_clip": 0.0105236, + "auxiliary_loss_mlp": 0.01039102, + "balance_loss_clip": 1.01478302, + "balance_loss_mlp": 1.01592112, + "epoch": 0.6647828047497369, + "flos": 19570318773120.0, + "grad_norm": 1.978106573746429, + "language_loss": 0.70822358, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.72913826, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 11057, + "time_per_iteration": 2.3549771308898926 + }, + { + "auxiliary_loss_clip": 0.01054194, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.0126102, + "balance_loss_mlp": 1.01716018, + "epoch": 0.664842928002405, + "flos": 23148322571520.0, + "grad_norm": 1.7775560859634052, + "language_loss": 0.81240928, + "learning_rate": 1.066934663776291e-06, + "loss": 0.83331537, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 11058, + "time_per_iteration": 2.3633978366851807 + }, + { + "auxiliary_loss_clip": 0.01009222, + "auxiliary_loss_mlp": 0.01002792, + "balance_loss_clip": 1.00058699, + "balance_loss_mlp": 1.00205207, + "epoch": 0.6649030512550729, + "flos": 65241844289280.0, + "grad_norm": 0.7903196064692106, + "language_loss": 0.62720037, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64732051, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.07177734, + "step": 11059, + "time_per_iteration": 2.8993260860443115 + }, + { + "auxiliary_loss_clip": 0.01052696, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.01721048, + "balance_loss_mlp": 1.01613545, + "epoch": 0.6649631745077409, + "flos": 20193727743360.0, + "grad_norm": 1.4298423642113622, + "language_loss": 0.79701805, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81794596, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 11060, + "time_per_iteration": 2.389570713043213 + }, + { + "auxiliary_loss_clip": 0.01052562, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.01264572, + "balance_loss_mlp": 1.01631558, + "epoch": 0.6650232977604088, + "flos": 17237596815360.0, + "grad_norm": 1.6681991528514222, + "language_loss": 0.79207468, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81296217, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 11061, + "time_per_iteration": 2.345914840698242 + }, + { + "auxiliary_loss_clip": 0.01052293, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.01250744, + "balance_loss_mlp": 1.01685119, + "epoch": 0.6650834210130768, + "flos": 10006316593920.0, + "grad_norm": 2.269524265448418, + "language_loss": 0.57289517, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59377438, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 11062, + "time_per_iteration": 2.3277463912963867 + }, + { + "auxiliary_loss_clip": 0.01054157, + "auxiliary_loss_mlp": 0.01038952, + "balance_loss_clip": 1.01023436, + "balance_loss_mlp": 1.01516688, + "epoch": 0.6651435442657447, + "flos": 10451319183360.0, + "grad_norm": 1.7834484340318153, + "language_loss": 0.77057225, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.79150331, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.390625, + "step": 11063, + "time_per_iteration": 2.3249144554138184 + }, + { + "auxiliary_loss_clip": 0.01052718, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.01542401, + "balance_loss_mlp": 1.01702499, + "epoch": 0.6652036675184128, + "flos": 22343190641280.0, + "grad_norm": 2.085207738086411, + "language_loss": 0.71247017, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.733383, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 11064, + "time_per_iteration": 2.4113166332244873 + }, + { + "auxiliary_loss_clip": 0.01008169, + "auxiliary_loss_mlp": 0.01004797, + "balance_loss_clip": 1.00249636, + "balance_loss_mlp": 1.00101471, + "epoch": 0.6652637907710807, + "flos": 52906995100800.0, + "grad_norm": 0.8616491667351515, + "language_loss": 0.63150954, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65163922, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.07128906, + "step": 11065, + "time_per_iteration": 4.334945201873779 + }, + { + "auxiliary_loss_clip": 0.01052664, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.01207304, + "balance_loss_mlp": 1.01692581, + "epoch": 0.6653239140237487, + "flos": 23103738898560.0, + "grad_norm": 2.0781541315325645, + "language_loss": 0.63631618, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.65719789, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35742188, + "step": 11066, + "time_per_iteration": 2.359642505645752 + }, + { + "auxiliary_loss_clip": 0.01053912, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.0127697, + "balance_loss_mlp": 1.01639295, + "epoch": 0.6653840372764167, + "flos": 25958167436160.0, + "grad_norm": 1.5381060837561717, + "language_loss": 0.70299721, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72391355, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 11067, + "time_per_iteration": 2.413130521774292 + }, + { + "auxiliary_loss_clip": 0.01008448, + "auxiliary_loss_mlp": 0.01003363, + "balance_loss_clip": 1.00062096, + "balance_loss_mlp": 1.00095749, + "epoch": 0.6654441605290846, + "flos": 66039051340800.0, + "grad_norm": 0.9267257122259847, + "language_loss": 0.72269142, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74280953, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.07519531, + "step": 11068, + "time_per_iteration": 2.9948508739471436 + }, + { + "auxiliary_loss_clip": 0.01008748, + "auxiliary_loss_mlp": 0.01010429, + "balance_loss_clip": 1.00825977, + "balance_loss_mlp": 1.00143588, + "epoch": 0.6655042837817526, + "flos": 65192371159680.0, + "grad_norm": 0.7120506108063273, + "language_loss": 0.57849514, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59868693, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.07324219, + "step": 11069, + "time_per_iteration": 3.1411874294281006 + }, + { + "auxiliary_loss_clip": 0.01008244, + "auxiliary_loss_mlp": 0.01002791, + "balance_loss_clip": 1.0003351, + "balance_loss_mlp": 1.00093198, + "epoch": 0.6655644070344205, + "flos": 69005411297280.0, + "grad_norm": 0.7530776933455405, + "language_loss": 0.6358887, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65599906, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.07324219, + "step": 11070, + "time_per_iteration": 3.0696499347686768 + }, + { + "auxiliary_loss_clip": 0.01051162, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.01495767, + "balance_loss_mlp": 1.01521778, + "epoch": 0.6656245302870886, + "flos": 36314209785600.0, + "grad_norm": 2.08031918678009, + "language_loss": 0.5993371, + "learning_rate": 1.062459413096116e-06, + "loss": 0.62022048, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 11071, + "time_per_iteration": 2.5099880695343018 + }, + { + "auxiliary_loss_clip": 0.0105343, + "auxiliary_loss_mlp": 0.01038722, + "balance_loss_clip": 1.01551175, + "balance_loss_mlp": 1.01711905, + "epoch": 0.6656846535397565, + "flos": 21793867309440.0, + "grad_norm": 1.662710577623812, + "language_loss": 0.73561239, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75653386, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 11072, + "time_per_iteration": 2.36806321144104 + }, + { + "auxiliary_loss_clip": 0.01053382, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.01420236, + "balance_loss_mlp": 1.01663649, + "epoch": 0.6657447767924245, + "flos": 37486104215040.0, + "grad_norm": 1.693904306827611, + "language_loss": 0.71574211, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.7366665, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3671875, + "step": 11073, + "time_per_iteration": 2.526862382888794 + }, + { + "auxiliary_loss_clip": 0.01055249, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.01447999, + "balance_loss_mlp": 1.01761627, + "epoch": 0.6658049000450924, + "flos": 16836724051200.0, + "grad_norm": 2.4404081932689503, + "language_loss": 0.56646264, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58741218, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 11074, + "time_per_iteration": 2.3248791694641113 + }, + { + "auxiliary_loss_clip": 0.01052271, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01446795, + "balance_loss_mlp": 1.0170902, + "epoch": 0.6658650232977604, + "flos": 33509566713600.0, + "grad_norm": 1.4626357704944923, + "language_loss": 0.7277298, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74862289, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 11075, + "time_per_iteration": 2.4545867443084717 + }, + { + "auxiliary_loss_clip": 0.01050715, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.01618838, + "balance_loss_mlp": 1.01602411, + "epoch": 0.6659251465504283, + "flos": 37704800171520.0, + "grad_norm": 1.7338755664049061, + "language_loss": 0.66984951, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.69073641, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34570312, + "step": 11076, + "time_per_iteration": 2.512529134750366 + }, + { + "auxiliary_loss_clip": 0.01051812, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.01143384, + "balance_loss_mlp": 1.0157268, + "epoch": 0.6659852698030964, + "flos": 24892444051200.0, + "grad_norm": 1.719472356063621, + "language_loss": 0.76245737, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.78332663, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 11077, + "time_per_iteration": 2.3991341590881348 + }, + { + "auxiliary_loss_clip": 0.01053247, + "auxiliary_loss_mlp": 0.0103945, + "balance_loss_clip": 1.01467848, + "balance_loss_mlp": 1.01680899, + "epoch": 0.6660453930557643, + "flos": 24351674002560.0, + "grad_norm": 1.5606811678236454, + "language_loss": 0.67498875, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.6959157, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 11078, + "time_per_iteration": 2.380706548690796 + }, + { + "auxiliary_loss_clip": 0.01053631, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.01536798, + "balance_loss_mlp": 1.01606512, + "epoch": 0.6661055163084323, + "flos": 10597046664960.0, + "grad_norm": 2.1465229995504096, + "language_loss": 0.71546638, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.73640472, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 11079, + "time_per_iteration": 2.331461191177368 + }, + { + "auxiliary_loss_clip": 0.01052769, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01234448, + "balance_loss_mlp": 1.01683378, + "epoch": 0.6661656395611003, + "flos": 24056448612480.0, + "grad_norm": 1.534315362453059, + "language_loss": 0.81357062, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.83444846, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 11080, + "time_per_iteration": 2.3926966190338135 + }, + { + "auxiliary_loss_clip": 0.01050203, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.01491213, + "balance_loss_mlp": 1.01525557, + "epoch": 0.6662257628137682, + "flos": 23035169255040.0, + "grad_norm": 1.7969901882229995, + "language_loss": 0.7913422, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.81220627, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34960938, + "step": 11081, + "time_per_iteration": 2.3693289756774902 + }, + { + "auxiliary_loss_clip": 0.01054494, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.01369154, + "balance_loss_mlp": 1.01721632, + "epoch": 0.6662858860664362, + "flos": 24753279905280.0, + "grad_norm": 1.8425349681182936, + "language_loss": 0.81548941, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.8364284, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 11082, + "time_per_iteration": 2.4081456661224365 + }, + { + "auxiliary_loss_clip": 0.01050977, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.01141, + "balance_loss_mlp": 1.01590967, + "epoch": 0.6663460093191041, + "flos": 20008094711040.0, + "grad_norm": 1.5705430900448303, + "language_loss": 0.84503484, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86588126, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 11083, + "time_per_iteration": 2.360698938369751 + }, + { + "auxiliary_loss_clip": 0.01055592, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.01795256, + "balance_loss_mlp": 1.01684666, + "epoch": 0.6664061325717722, + "flos": 17820436919040.0, + "grad_norm": 2.8497659526144754, + "language_loss": 0.87059861, + "learning_rate": 1.057990170638731e-06, + "loss": 0.89158875, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 11084, + "time_per_iteration": 2.3598005771636963 + }, + { + "auxiliary_loss_clip": 0.01053075, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.01181138, + "balance_loss_mlp": 1.01556349, + "epoch": 0.6664662558244401, + "flos": 18075931315200.0, + "grad_norm": 2.383218340047172, + "language_loss": 0.74446201, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.76537168, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 11085, + "time_per_iteration": 2.3618545532226562 + }, + { + "auxiliary_loss_clip": 0.01051472, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.01076818, + "balance_loss_mlp": 1.01533949, + "epoch": 0.6665263790771081, + "flos": 21573286139520.0, + "grad_norm": 1.7854271489477993, + "language_loss": 0.81906641, + "learning_rate": 1.057303129975894e-06, + "loss": 0.83992791, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 11086, + "time_per_iteration": 2.4040451049804688 + }, + { + "auxiliary_loss_clip": 0.01053144, + "auxiliary_loss_mlp": 0.01039279, + "balance_loss_clip": 1.01571155, + "balance_loss_mlp": 1.01575017, + "epoch": 0.666586502329776, + "flos": 24205492673280.0, + "grad_norm": 1.8908288651514884, + "language_loss": 0.76270235, + "learning_rate": 1.056959663258702e-06, + "loss": 0.78362656, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 11087, + "time_per_iteration": 2.3763089179992676 + }, + { + "auxiliary_loss_clip": 0.01052797, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.0122689, + "balance_loss_mlp": 1.01653194, + "epoch": 0.666646625582444, + "flos": 22199418195840.0, + "grad_norm": 2.0160489869812253, + "language_loss": 0.65974188, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.68062651, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 11088, + "time_per_iteration": 2.382625102996826 + }, + { + "auxiliary_loss_clip": 0.01054874, + "auxiliary_loss_mlp": 0.01040649, + "balance_loss_clip": 1.01574636, + "balance_loss_mlp": 1.01738501, + "epoch": 0.6667067488351119, + "flos": 18258945995520.0, + "grad_norm": 1.9049040515903986, + "language_loss": 0.6587162, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.67967141, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 11089, + "time_per_iteration": 2.3442752361297607 + }, + { + "auxiliary_loss_clip": 0.01053207, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.01250124, + "balance_loss_mlp": 1.01676106, + "epoch": 0.66676687208778, + "flos": 17235641779200.0, + "grad_norm": 2.209393072286546, + "language_loss": 0.82180685, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.84269792, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 11090, + "time_per_iteration": 3.643364906311035 + }, + { + "auxiliary_loss_clip": 0.01053616, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.01450443, + "balance_loss_mlp": 1.01620984, + "epoch": 0.6668269953404479, + "flos": 19751273683200.0, + "grad_norm": 2.0236698465470977, + "language_loss": 0.79260385, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.81353545, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 11091, + "time_per_iteration": 2.3484153747558594 + }, + { + "auxiliary_loss_clip": 0.01051364, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.01411128, + "balance_loss_mlp": 1.01527798, + "epoch": 0.6668871185931159, + "flos": 20557383131520.0, + "grad_norm": 1.7849590555252393, + "language_loss": 0.80407488, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.82495761, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 11092, + "time_per_iteration": 2.3365137577056885 + }, + { + "auxiliary_loss_clip": 0.0100965, + "auxiliary_loss_mlp": 0.01010033, + "balance_loss_clip": 1.00745785, + "balance_loss_mlp": 1.00209618, + "epoch": 0.6669472418457839, + "flos": 58085452667520.0, + "grad_norm": 0.756347656755789, + "language_loss": 0.57846516, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59866196, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.07568359, + "step": 11093, + "time_per_iteration": 3.0215201377868652 + }, + { + "auxiliary_loss_clip": 0.01051364, + "auxiliary_loss_mlp": 0.01039866, + "balance_loss_clip": 1.01687026, + "balance_loss_mlp": 1.01574564, + "epoch": 0.6670073650984518, + "flos": 26063989367040.0, + "grad_norm": 1.9316821396453496, + "language_loss": 0.77387172, + "learning_rate": 1.054556398252703e-06, + "loss": 0.79478401, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 11094, + "time_per_iteration": 3.704000473022461 + }, + { + "auxiliary_loss_clip": 0.01053477, + "auxiliary_loss_mlp": 0.01041711, + "balance_loss_clip": 1.01715398, + "balance_loss_mlp": 1.0164063, + "epoch": 0.6670674883511198, + "flos": 32415458526720.0, + "grad_norm": 1.768657619877568, + "language_loss": 0.74496132, + "learning_rate": 1.05421321798155e-06, + "loss": 0.76591313, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 11095, + "time_per_iteration": 3.828704595565796 + }, + { + "auxiliary_loss_clip": 0.01052712, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.0150404, + "balance_loss_mlp": 1.01741552, + "epoch": 0.6671276116037878, + "flos": 18036898548480.0, + "grad_norm": 2.0568076570125764, + "language_loss": 0.74117565, + "learning_rate": 1.053870073574727e-06, + "loss": 0.76207685, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 11096, + "time_per_iteration": 2.3367269039154053 + }, + { + "auxiliary_loss_clip": 0.01050022, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.0188787, + "balance_loss_mlp": 1.01610792, + "epoch": 0.6671877348564558, + "flos": 23765971167360.0, + "grad_norm": 3.4943703868276685, + "language_loss": 0.65780115, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.67869425, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33984375, + "step": 11097, + "time_per_iteration": 2.409106969833374 + }, + { + "auxiliary_loss_clip": 0.01053461, + "auxiliary_loss_mlp": 0.01042962, + "balance_loss_clip": 1.01802301, + "balance_loss_mlp": 1.01589191, + "epoch": 0.6672478581091237, + "flos": 20917442649600.0, + "grad_norm": 1.7491957289690647, + "language_loss": 0.77287662, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.79384089, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 11098, + "time_per_iteration": 2.439530611038208 + }, + { + "auxiliary_loss_clip": 0.01052613, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.01820588, + "balance_loss_mlp": 1.01595688, + "epoch": 0.6673079813617917, + "flos": 27854544821760.0, + "grad_norm": 1.5718982019815968, + "language_loss": 0.75541848, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.77635807, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 11099, + "time_per_iteration": 2.4369869232177734 + }, + { + "auxiliary_loss_clip": 0.01051946, + "auxiliary_loss_mlp": 0.01045399, + "balance_loss_clip": 1.02046037, + "balance_loss_mlp": 1.01616347, + "epoch": 0.6673681046144596, + "flos": 21615775130880.0, + "grad_norm": 1.9049178209630346, + "language_loss": 0.78646219, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80743563, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35742188, + "step": 11100, + "time_per_iteration": 2.3699584007263184 + }, + { + "auxiliary_loss_clip": 0.01051927, + "auxiliary_loss_mlp": 0.01045109, + "balance_loss_clip": 1.0221374, + "balance_loss_mlp": 1.0164187, + "epoch": 0.6674282278671276, + "flos": 20888743645440.0, + "grad_norm": 1.7542286929410327, + "language_loss": 0.60885721, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62982762, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 11101, + "time_per_iteration": 2.4126718044281006 + }, + { + "auxiliary_loss_clip": 0.01055771, + "auxiliary_loss_mlp": 0.01040464, + "balance_loss_clip": 1.01335621, + "balance_loss_mlp": 1.01658773, + "epoch": 0.6674883511197955, + "flos": 23623036594560.0, + "grad_norm": 1.632053212543779, + "language_loss": 0.72539759, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.74636, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 11102, + "time_per_iteration": 2.383998394012451 + }, + { + "auxiliary_loss_clip": 0.01053321, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.01369333, + "balance_loss_mlp": 1.01627111, + "epoch": 0.6675484743724636, + "flos": 19608653312640.0, + "grad_norm": 1.4899469736121012, + "language_loss": 0.84968531, + "learning_rate": 1.051469068021034e-06, + "loss": 0.87059951, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 11103, + "time_per_iteration": 2.3735787868499756 + }, + { + "auxiliary_loss_clip": 0.01054011, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.01222265, + "balance_loss_mlp": 1.01669574, + "epoch": 0.6676085976251315, + "flos": 14318578529280.0, + "grad_norm": 1.9408855611533244, + "language_loss": 0.79684138, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.81773543, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37304688, + "step": 11104, + "time_per_iteration": 3.774425506591797 + }, + { + "auxiliary_loss_clip": 0.01055294, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.01570046, + "balance_loss_mlp": 1.01751339, + "epoch": 0.6676687208777995, + "flos": 38103159317760.0, + "grad_norm": 1.696436883369857, + "language_loss": 0.58837366, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60931969, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37695312, + "step": 11105, + "time_per_iteration": 2.517803907394409 + }, + { + "auxiliary_loss_clip": 0.01056351, + "auxiliary_loss_mlp": 0.01042784, + "balance_loss_clip": 1.01685572, + "balance_loss_mlp": 1.01689148, + "epoch": 0.6677288441304675, + "flos": 23980617406080.0, + "grad_norm": 1.6624265309868647, + "language_loss": 0.74219739, + "learning_rate": 1.0504406049066e-06, + "loss": 0.76318872, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 11106, + "time_per_iteration": 2.403228282928467 + }, + { + "auxiliary_loss_clip": 0.01052442, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.0097301, + "balance_loss_mlp": 1.01672149, + "epoch": 0.6677889673831354, + "flos": 24169532106240.0, + "grad_norm": 1.814311614309522, + "language_loss": 0.78022659, + "learning_rate": 1.0500978558659e-06, + "loss": 0.80108726, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 11107, + "time_per_iteration": 2.3622937202453613 + }, + { + "auxiliary_loss_clip": 0.01050393, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.01321745, + "balance_loss_mlp": 1.01549673, + "epoch": 0.6678490906358034, + "flos": 22308556705920.0, + "grad_norm": 2.5874244894986638, + "language_loss": 0.90971595, + "learning_rate": 1.049755142845583e-06, + "loss": 0.93057501, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 11108, + "time_per_iteration": 2.3738913536071777 + }, + { + "auxiliary_loss_clip": 0.01050834, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.00984216, + "balance_loss_mlp": 1.01582289, + "epoch": 0.6679092138884714, + "flos": 36897399002880.0, + "grad_norm": 1.3501642786322143, + "language_loss": 0.83394641, + "learning_rate": 1.049412465858646e-06, + "loss": 0.85476536, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 11109, + "time_per_iteration": 2.4977805614471436 + }, + { + "auxiliary_loss_clip": 0.01052047, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.01609921, + "balance_loss_mlp": 1.01577854, + "epoch": 0.6679693371411394, + "flos": 18149318726400.0, + "grad_norm": 1.9493853812743032, + "language_loss": 0.71636319, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.73728096, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 11110, + "time_per_iteration": 2.3509953022003174 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01039367, + "balance_loss_clip": 1.01260495, + "balance_loss_mlp": 1.0160675, + "epoch": 0.6680294603938073, + "flos": 27196955763840.0, + "grad_norm": 1.89903469361097, + "language_loss": 0.74880332, + "learning_rate": 1.04872722003689e-06, + "loss": 0.76974142, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3828125, + "step": 11111, + "time_per_iteration": 2.4025425910949707 + }, + { + "auxiliary_loss_clip": 0.01051649, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.01330054, + "balance_loss_mlp": 1.01598597, + "epoch": 0.6680895836464753, + "flos": 21724250325120.0, + "grad_norm": 2.8022053310098465, + "language_loss": 0.67001224, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.6909073, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 11112, + "time_per_iteration": 2.366159677505493 + }, + { + "auxiliary_loss_clip": 0.01053112, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.01271582, + "balance_loss_mlp": 1.01683784, + "epoch": 0.6681497068991432, + "flos": 19645451752320.0, + "grad_norm": 2.3877243277458455, + "language_loss": 0.64102721, + "learning_rate": 1.048042118504569e-06, + "loss": 0.66192424, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 11113, + "time_per_iteration": 2.3433260917663574 + }, + { + "auxiliary_loss_clip": 0.01051613, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.01051044, + "balance_loss_mlp": 1.01591158, + "epoch": 0.6682098301518112, + "flos": 17418237523200.0, + "grad_norm": 1.9516651319348364, + "language_loss": 0.66727704, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68812537, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 11114, + "time_per_iteration": 2.320551633834839 + }, + { + "auxiliary_loss_clip": 0.01053731, + "auxiliary_loss_mlp": 0.01038421, + "balance_loss_clip": 1.01573551, + "balance_loss_mlp": 1.0168035, + "epoch": 0.6682699534044791, + "flos": 22597986810240.0, + "grad_norm": 1.577834355332142, + "language_loss": 0.79573482, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.81665635, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 11115, + "time_per_iteration": 2.3728630542755127 + }, + { + "auxiliary_loss_clip": 0.01052612, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.0095973, + "balance_loss_mlp": 1.01555336, + "epoch": 0.6683300766571472, + "flos": 24862523149440.0, + "grad_norm": 1.7020649569688744, + "language_loss": 0.80740649, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.82827282, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 11116, + "time_per_iteration": 2.3787641525268555 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.01563144, + "balance_loss_mlp": 1.01682806, + "epoch": 0.6683901999098151, + "flos": 27125383743360.0, + "grad_norm": 1.6017618841728283, + "language_loss": 0.79713601, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81809711, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37109375, + "step": 11117, + "time_per_iteration": 2.4083964824676514 + }, + { + "auxiliary_loss_clip": 0.01053527, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.011127, + "balance_loss_mlp": 1.0158962, + "epoch": 0.6684503231624831, + "flos": 20738023839360.0, + "grad_norm": 1.5704368987309416, + "language_loss": 0.66644192, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.68735081, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37695312, + "step": 11118, + "time_per_iteration": 2.3498356342315674 + }, + { + "auxiliary_loss_clip": 0.01051541, + "auxiliary_loss_mlp": 0.01033429, + "balance_loss_clip": 1.01111281, + "balance_loss_mlp": 1.01578665, + "epoch": 0.668510446415151, + "flos": 21761118587520.0, + "grad_norm": 1.4449181094383172, + "language_loss": 0.69689739, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71774709, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 11119, + "time_per_iteration": 2.376066207885742 + }, + { + "auxiliary_loss_clip": 0.01053698, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.01189923, + "balance_loss_mlp": 1.01684213, + "epoch": 0.668570569667819, + "flos": 30189920042880.0, + "grad_norm": 1.7133694913750732, + "language_loss": 0.68963921, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.71053553, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 11120, + "time_per_iteration": 2.414970874786377 + }, + { + "auxiliary_loss_clip": 0.01053265, + "auxiliary_loss_mlp": 0.01038104, + "balance_loss_clip": 1.01409578, + "balance_loss_mlp": 1.01645792, + "epoch": 0.668630692920487, + "flos": 24169497194880.0, + "grad_norm": 2.021859863721085, + "language_loss": 0.73561335, + "learning_rate": 1.045303157347638e-06, + "loss": 0.75652707, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 11121, + "time_per_iteration": 2.387606620788574 + }, + { + "auxiliary_loss_clip": 0.01055333, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.01443481, + "balance_loss_mlp": 1.01676726, + "epoch": 0.668690816173155, + "flos": 17456188037760.0, + "grad_norm": 2.617572073354759, + "language_loss": 0.7240622, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.74502265, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 11122, + "time_per_iteration": 2.331324815750122 + }, + { + "auxiliary_loss_clip": 0.01053602, + "auxiliary_loss_mlp": 0.01034439, + "balance_loss_clip": 1.00957143, + "balance_loss_mlp": 1.01708257, + "epoch": 0.668750939425823, + "flos": 25004061267840.0, + "grad_norm": 2.2109273225574593, + "language_loss": 0.72394323, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.74482363, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 11123, + "time_per_iteration": 2.3914716243743896 + }, + { + "auxiliary_loss_clip": 0.01055872, + "auxiliary_loss_mlp": 0.0104363, + "balance_loss_clip": 1.01751161, + "balance_loss_mlp": 1.01683235, + "epoch": 0.6688110626784909, + "flos": 24095655936000.0, + "grad_norm": 1.687601623462158, + "language_loss": 0.80513746, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.82613248, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 11124, + "time_per_iteration": 2.3770549297332764 + }, + { + "auxiliary_loss_clip": 0.01054241, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.0145632, + "balance_loss_mlp": 1.01687896, + "epoch": 0.6688711859311589, + "flos": 21758535146880.0, + "grad_norm": 2.0008967964980005, + "language_loss": 0.74675715, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76768947, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 11125, + "time_per_iteration": 2.387712001800537 + }, + { + "auxiliary_loss_clip": 0.01055385, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.01525736, + "balance_loss_mlp": 1.01827061, + "epoch": 0.6689313091838268, + "flos": 22928544362880.0, + "grad_norm": 2.779555637617207, + "language_loss": 0.68178427, + "learning_rate": 1.043592482774116e-06, + "loss": 0.70272917, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 11126, + "time_per_iteration": 2.3853566646575928 + }, + { + "auxiliary_loss_clip": 0.01052366, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.01262474, + "balance_loss_mlp": 1.01483047, + "epoch": 0.6689914324364948, + "flos": 20885112864000.0, + "grad_norm": 1.8247251928245596, + "language_loss": 0.71883649, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73971945, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 11127, + "time_per_iteration": 2.395620346069336 + }, + { + "auxiliary_loss_clip": 0.01055616, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.01788759, + "balance_loss_mlp": 1.01560271, + "epoch": 0.6690515556891627, + "flos": 22747100693760.0, + "grad_norm": 1.9505001727243227, + "language_loss": 0.81481642, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.83583283, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40039062, + "step": 11128, + "time_per_iteration": 2.37058687210083 + }, + { + "auxiliary_loss_clip": 0.01053569, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.01437581, + "balance_loss_mlp": 1.01570153, + "epoch": 0.6691116789418308, + "flos": 23330324822400.0, + "grad_norm": 1.7546993793741807, + "language_loss": 0.81760806, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.8385371, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 11129, + "time_per_iteration": 3.6070003509521484 + }, + { + "auxiliary_loss_clip": 0.01050959, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.01567459, + "balance_loss_mlp": 1.0162003, + "epoch": 0.6691718021944987, + "flos": 32445798364800.0, + "grad_norm": 1.7768678947667995, + "language_loss": 0.7158798, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.73676527, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 11130, + "time_per_iteration": 2.447805404663086 + }, + { + "auxiliary_loss_clip": 0.01050881, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.01608431, + "balance_loss_mlp": 1.01603341, + "epoch": 0.6692319254471667, + "flos": 23730499359360.0, + "grad_norm": 1.594577438066387, + "language_loss": 0.70763457, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72853798, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34960938, + "step": 11131, + "time_per_iteration": 2.4214019775390625 + }, + { + "auxiliary_loss_clip": 0.01052186, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.01366448, + "balance_loss_mlp": 1.01563537, + "epoch": 0.6692920486998346, + "flos": 14427053723520.0, + "grad_norm": 2.2815171772407488, + "language_loss": 0.67980152, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.70072091, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36523438, + "step": 11132, + "time_per_iteration": 2.31101393699646 + }, + { + "auxiliary_loss_clip": 0.0105276, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.01329339, + "balance_loss_mlp": 1.01578546, + "epoch": 0.6693521719525026, + "flos": 21506392241280.0, + "grad_norm": 1.6375899266586758, + "language_loss": 0.75883776, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.7797538, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36914062, + "step": 11133, + "time_per_iteration": 3.6703567504882812 + }, + { + "auxiliary_loss_clip": 0.0105752, + "auxiliary_loss_mlp": 0.01049725, + "balance_loss_clip": 1.02037537, + "balance_loss_mlp": 1.01839495, + "epoch": 0.6694122952051706, + "flos": 25405876638720.0, + "grad_norm": 2.0812098729123214, + "language_loss": 0.68439066, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.70546305, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.390625, + "step": 11134, + "time_per_iteration": 3.829972267150879 + }, + { + "auxiliary_loss_clip": 0.01057377, + "auxiliary_loss_mlp": 0.01046325, + "balance_loss_clip": 1.01620054, + "balance_loss_mlp": 1.01802063, + "epoch": 0.6694724184578386, + "flos": 25660672807680.0, + "grad_norm": 1.7955832102487468, + "language_loss": 0.78872031, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.80975729, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.39453125, + "step": 11135, + "time_per_iteration": 2.3893089294433594 + }, + { + "auxiliary_loss_clip": 0.01051146, + "auxiliary_loss_mlp": 0.01039558, + "balance_loss_clip": 1.0159905, + "balance_loss_mlp": 1.01625419, + "epoch": 0.6695325417105066, + "flos": 17708435677440.0, + "grad_norm": 2.130713397003565, + "language_loss": 0.75296158, + "learning_rate": 1.040173855277898e-06, + "loss": 0.77386868, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34765625, + "step": 11136, + "time_per_iteration": 2.3752248287200928 + }, + { + "auxiliary_loss_clip": 0.01056941, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.01394558, + "balance_loss_mlp": 1.01807249, + "epoch": 0.6695926649631745, + "flos": 24458962210560.0, + "grad_norm": 1.758038412284191, + "language_loss": 0.63325661, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.65422595, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 11137, + "time_per_iteration": 2.3739590644836426 + }, + { + "auxiliary_loss_clip": 0.01051563, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.01435232, + "balance_loss_mlp": 1.01584017, + "epoch": 0.6696527882158425, + "flos": 24278984818560.0, + "grad_norm": 1.9302855700072241, + "language_loss": 0.67662716, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.69753653, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35742188, + "step": 11138, + "time_per_iteration": 2.3812472820281982 + }, + { + "auxiliary_loss_clip": 0.01050198, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.0145061, + "balance_loss_mlp": 1.01510119, + "epoch": 0.6697129114685104, + "flos": 23001652483200.0, + "grad_norm": 1.886655235935033, + "language_loss": 0.74081814, + "learning_rate": 1.039148976175053e-06, + "loss": 0.76168704, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 11139, + "time_per_iteration": 2.369271993637085 + }, + { + "auxiliary_loss_clip": 0.010496, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.01452971, + "balance_loss_mlp": 1.01504755, + "epoch": 0.6697730347211784, + "flos": 22637019576960.0, + "grad_norm": 1.883661104118875, + "language_loss": 0.72291058, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.7437672, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 11140, + "time_per_iteration": 2.398655891418457 + }, + { + "auxiliary_loss_clip": 0.01054222, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.01155257, + "balance_loss_mlp": 1.0160116, + "epoch": 0.6698331579738463, + "flos": 28875963824640.0, + "grad_norm": 2.0742197059580003, + "language_loss": 0.76844776, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.78935945, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 11141, + "time_per_iteration": 2.4253556728363037 + }, + { + "auxiliary_loss_clip": 0.01054258, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_clip": 1.01698351, + "balance_loss_mlp": 1.01644993, + "epoch": 0.6698932812265144, + "flos": 24205946520960.0, + "grad_norm": 6.33558595047457, + "language_loss": 0.83553934, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.85650742, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 11142, + "time_per_iteration": 2.3848793506622314 + }, + { + "auxiliary_loss_clip": 0.01052136, + "auxiliary_loss_mlp": 0.01038742, + "balance_loss_clip": 1.01501942, + "balance_loss_mlp": 1.01593375, + "epoch": 0.6699534044791823, + "flos": 22089197433600.0, + "grad_norm": 2.170220690116343, + "language_loss": 0.70936733, + "learning_rate": 1.037782980862959e-06, + "loss": 0.73027617, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36132812, + "step": 11143, + "time_per_iteration": 2.386200189590454 + }, + { + "auxiliary_loss_clip": 0.01050335, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.01405406, + "balance_loss_mlp": 1.01531124, + "epoch": 0.6700135277318503, + "flos": 25191195488640.0, + "grad_norm": 1.4403435064305836, + "language_loss": 0.70899856, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.72985744, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 11144, + "time_per_iteration": 3.803497791290283 + }, + { + "auxiliary_loss_clip": 0.01052807, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.01163781, + "balance_loss_mlp": 1.01634526, + "epoch": 0.6700736509845182, + "flos": 23439079307520.0, + "grad_norm": 1.760328020760937, + "language_loss": 0.75283015, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.77371538, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 11145, + "time_per_iteration": 2.3802082538604736 + }, + { + "auxiliary_loss_clip": 0.01054294, + "auxiliary_loss_mlp": 0.01043082, + "balance_loss_clip": 1.01726127, + "balance_loss_mlp": 1.01614809, + "epoch": 0.6701337742371862, + "flos": 24388786644480.0, + "grad_norm": 1.611937137218705, + "language_loss": 0.72061288, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.74158669, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 11146, + "time_per_iteration": 2.4337122440338135 + }, + { + "auxiliary_loss_clip": 0.01049834, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.01596749, + "balance_loss_mlp": 1.01490247, + "epoch": 0.6701938974898543, + "flos": 14792768881920.0, + "grad_norm": 1.9523151288661005, + "language_loss": 0.79713207, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.8180095, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34960938, + "step": 11147, + "time_per_iteration": 2.350590944290161 + }, + { + "auxiliary_loss_clip": 0.01054404, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.01075125, + "balance_loss_mlp": 1.01755905, + "epoch": 0.6702540207425222, + "flos": 20153054142720.0, + "grad_norm": 1.7225347272687048, + "language_loss": 0.71577656, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.73667336, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 11148, + "time_per_iteration": 2.3787808418273926 + }, + { + "auxiliary_loss_clip": 0.01052853, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.01251245, + "balance_loss_mlp": 1.01578522, + "epoch": 0.6703141439951902, + "flos": 21213121887360.0, + "grad_norm": 1.7937357302727512, + "language_loss": 0.71856451, + "learning_rate": 1.035735082774636e-06, + "loss": 0.73946071, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 11149, + "time_per_iteration": 2.369661808013916 + }, + { + "auxiliary_loss_clip": 0.01055138, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.0122416, + "balance_loss_mlp": 1.01716614, + "epoch": 0.6703742672478581, + "flos": 23111419397760.0, + "grad_norm": 1.8041468063171908, + "language_loss": 0.74790531, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.76881218, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 11150, + "time_per_iteration": 2.3930537700653076 + }, + { + "auxiliary_loss_clip": 0.01052779, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.01020885, + "balance_loss_mlp": 1.01629984, + "epoch": 0.6704343905005261, + "flos": 22527811244160.0, + "grad_norm": 2.118659427203162, + "language_loss": 0.79482961, + "learning_rate": 1.035052742460671e-06, + "loss": 0.81570363, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 11151, + "time_per_iteration": 2.3773388862609863 + }, + { + "auxiliary_loss_clip": 0.01008369, + "auxiliary_loss_mlp": 0.0100662, + "balance_loss_clip": 1.00424826, + "balance_loss_mlp": 1.00149465, + "epoch": 0.670494513753194, + "flos": 64789473536640.0, + "grad_norm": 0.7936826860557125, + "language_loss": 0.55503136, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57518119, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.06884766, + "step": 11152, + "time_per_iteration": 3.1202144622802734 + }, + { + "auxiliary_loss_clip": 0.01054478, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.01430249, + "balance_loss_mlp": 1.01637793, + "epoch": 0.670554637005862, + "flos": 23510441859840.0, + "grad_norm": 1.5887262492988332, + "language_loss": 0.81628597, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.83722854, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 11153, + "time_per_iteration": 2.419187068939209 + }, + { + "auxiliary_loss_clip": 0.01053171, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.01593471, + "balance_loss_mlp": 1.01618218, + "epoch": 0.67061476025853, + "flos": 19462402160640.0, + "grad_norm": 1.6711944924875497, + "language_loss": 0.77149135, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.79242498, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 11154, + "time_per_iteration": 2.3821256160736084 + }, + { + "auxiliary_loss_clip": 0.01055409, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.01666629, + "balance_loss_mlp": 1.01792169, + "epoch": 0.670674883511198, + "flos": 20518978769280.0, + "grad_norm": 1.5347356468952325, + "language_loss": 0.76946187, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.79042071, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 11155, + "time_per_iteration": 2.3736956119537354 + }, + { + "auxiliary_loss_clip": 0.01054957, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.01433253, + "balance_loss_mlp": 1.01799202, + "epoch": 0.6707350067638659, + "flos": 25482790097280.0, + "grad_norm": 1.7510763414102284, + "language_loss": 0.8243506, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84529048, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 11156, + "time_per_iteration": 2.3882389068603516 + }, + { + "auxiliary_loss_clip": 0.01052999, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.01600337, + "balance_loss_mlp": 1.0165509, + "epoch": 0.6707951300165339, + "flos": 22272351759360.0, + "grad_norm": 1.7991300997342061, + "language_loss": 0.75378656, + "learning_rate": 1.033006600114165e-06, + "loss": 0.77471238, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 11157, + "time_per_iteration": 2.3758904933929443 + }, + { + "auxiliary_loss_clip": 0.01055462, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.01377833, + "balance_loss_mlp": 1.01744223, + "epoch": 0.6708552532692018, + "flos": 23983549960320.0, + "grad_norm": 1.5749311083830946, + "language_loss": 0.75156802, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.77251565, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 11158, + "time_per_iteration": 2.4210853576660156 + }, + { + "auxiliary_loss_clip": 0.01055314, + "auxiliary_loss_mlp": 0.01042307, + "balance_loss_clip": 1.01747561, + "balance_loss_mlp": 1.01711822, + "epoch": 0.6709153765218698, + "flos": 24936329496960.0, + "grad_norm": 1.4301384266384447, + "language_loss": 0.8235774, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.84455365, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 11159, + "time_per_iteration": 2.4076154232025146 + }, + { + "auxiliary_loss_clip": 0.0105368, + "auxiliary_loss_mlp": 0.01038601, + "balance_loss_clip": 1.01303077, + "balance_loss_mlp": 1.01708126, + "epoch": 0.6709754997745379, + "flos": 17529261246720.0, + "grad_norm": 1.5246328349125604, + "language_loss": 0.77541286, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.7963357, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36523438, + "step": 11160, + "time_per_iteration": 2.34653377532959 + }, + { + "auxiliary_loss_clip": 0.01051607, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.0145371, + "balance_loss_mlp": 1.01582122, + "epoch": 0.6710356230272058, + "flos": 22089790926720.0, + "grad_norm": 2.2475762981924934, + "language_loss": 0.74794281, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.76884139, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 11161, + "time_per_iteration": 2.371363401412964 + }, + { + "auxiliary_loss_clip": 0.01056884, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.01700151, + "balance_loss_mlp": 1.01771951, + "epoch": 0.6710957462798738, + "flos": 24205318116480.0, + "grad_norm": 1.7566980269132053, + "language_loss": 0.69537812, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.71638596, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 11162, + "time_per_iteration": 2.411726474761963 + }, + { + "auxiliary_loss_clip": 0.01053505, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.01712751, + "balance_loss_mlp": 1.01725471, + "epoch": 0.6711558695325417, + "flos": 19093091132160.0, + "grad_norm": 1.6708994586892005, + "language_loss": 0.70656508, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72750378, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 11163, + "time_per_iteration": 2.311664342880249 + }, + { + "auxiliary_loss_clip": 0.01051433, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.01671791, + "balance_loss_mlp": 1.01708364, + "epoch": 0.6712159927852097, + "flos": 25556666267520.0, + "grad_norm": 1.6454346898293588, + "language_loss": 0.76693583, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.78785086, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34375, + "step": 11164, + "time_per_iteration": 2.429870128631592 + }, + { + "auxiliary_loss_clip": 0.01052387, + "auxiliary_loss_mlp": 0.01043147, + "balance_loss_clip": 1.01955581, + "balance_loss_mlp": 1.01588988, + "epoch": 0.6712761160378776, + "flos": 22227942643200.0, + "grad_norm": 1.9043281867697093, + "language_loss": 0.66386366, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.68481898, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 11165, + "time_per_iteration": 2.350022077560425 + }, + { + "auxiliary_loss_clip": 0.0105075, + "auxiliary_loss_mlp": 0.01041027, + "balance_loss_clip": 1.01718545, + "balance_loss_mlp": 1.01548791, + "epoch": 0.6713362392905456, + "flos": 22454423832960.0, + "grad_norm": 1.951279135897549, + "language_loss": 0.73526412, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.75618196, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 11166, + "time_per_iteration": 2.4447855949401855 + }, + { + "auxiliary_loss_clip": 0.01051858, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.01255059, + "balance_loss_mlp": 1.01640475, + "epoch": 0.6713963625432136, + "flos": 25629006337920.0, + "grad_norm": 2.131204506991228, + "language_loss": 0.7834419, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.80431706, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 11167, + "time_per_iteration": 2.4093410968780518 + }, + { + "auxiliary_loss_clip": 0.01052535, + "auxiliary_loss_mlp": 0.01040723, + "balance_loss_clip": 1.01716757, + "balance_loss_mlp": 1.01592374, + "epoch": 0.6714564857958816, + "flos": 35005036423680.0, + "grad_norm": 1.8954778320817187, + "language_loss": 0.69598007, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71691263, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3671875, + "step": 11168, + "time_per_iteration": 2.5489211082458496 + }, + { + "auxiliary_loss_clip": 0.01054949, + "auxiliary_loss_mlp": 0.01048122, + "balance_loss_clip": 1.02065599, + "balance_loss_mlp": 1.01674628, + "epoch": 0.6715166090485495, + "flos": 26278914896640.0, + "grad_norm": 1.8276990544672413, + "language_loss": 0.74899304, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.7700237, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.3828125, + "step": 11169, + "time_per_iteration": 3.627204418182373 + }, + { + "auxiliary_loss_clip": 0.01055665, + "auxiliary_loss_mlp": 0.0104197, + "balance_loss_clip": 1.01654315, + "balance_loss_mlp": 1.0175432, + "epoch": 0.6715767323012175, + "flos": 15923256572160.0, + "grad_norm": 2.3078466258054102, + "language_loss": 0.76797116, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78894746, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 11170, + "time_per_iteration": 2.4076972007751465 + }, + { + "auxiliary_loss_clip": 0.01054488, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.01494229, + "balance_loss_mlp": 1.01630521, + "epoch": 0.6716368555538854, + "flos": 17490542682240.0, + "grad_norm": 4.208387216710968, + "language_loss": 0.76288652, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.78382611, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3828125, + "step": 11171, + "time_per_iteration": 2.3279902935028076 + }, + { + "auxiliary_loss_clip": 0.01053865, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_clip": 1.02056789, + "balance_loss_mlp": 1.01668251, + "epoch": 0.6716969788065534, + "flos": 16760648465280.0, + "grad_norm": 1.4750846901602184, + "language_loss": 0.87450647, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.89548898, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 11172, + "time_per_iteration": 2.363266706466675 + }, + { + "auxiliary_loss_clip": 0.01053085, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.01654696, + "balance_loss_mlp": 1.01615357, + "epoch": 0.6717571020592215, + "flos": 22708731242880.0, + "grad_norm": 1.655556468613651, + "language_loss": 0.64295065, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.66389418, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 11173, + "time_per_iteration": 3.8535830974578857 + }, + { + "auxiliary_loss_clip": 0.01056073, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.01850498, + "balance_loss_mlp": 1.0165751, + "epoch": 0.6718172253118894, + "flos": 18733101436800.0, + "grad_norm": 3.906373021860777, + "language_loss": 0.7291038, + "learning_rate": 1.02721637475002e-06, + "loss": 0.75011432, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39648438, + "step": 11174, + "time_per_iteration": 3.7550208568573 + }, + { + "auxiliary_loss_clip": 0.01051877, + "auxiliary_loss_mlp": 0.0103568, + "balance_loss_clip": 1.01274443, + "balance_loss_mlp": 1.01610827, + "epoch": 0.6718773485645574, + "flos": 15631627052160.0, + "grad_norm": 2.136773607954721, + "language_loss": 0.70541888, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.72629452, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35742188, + "step": 11175, + "time_per_iteration": 2.394313335418701 + }, + { + "auxiliary_loss_clip": 0.0105271, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.01601517, + "balance_loss_mlp": 1.01679444, + "epoch": 0.6719374718172253, + "flos": 19353752409600.0, + "grad_norm": 2.4315442964137053, + "language_loss": 0.75025302, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.77118099, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 11176, + "time_per_iteration": 2.3464784622192383 + }, + { + "auxiliary_loss_clip": 0.01053052, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.01234782, + "balance_loss_mlp": 1.01590788, + "epoch": 0.6719975950698933, + "flos": 21980233480320.0, + "grad_norm": 1.7800881199918708, + "language_loss": 0.73990482, + "learning_rate": 1.026195675108182e-06, + "loss": 0.76080692, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 11177, + "time_per_iteration": 2.401541233062744 + }, + { + "auxiliary_loss_clip": 0.01053307, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.01500964, + "balance_loss_mlp": 1.01640844, + "epoch": 0.6720577183225612, + "flos": 25226911676160.0, + "grad_norm": 1.9318392640061994, + "language_loss": 0.7752713, + "learning_rate": 1.025855515730551e-06, + "loss": 0.79624093, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.36914062, + "step": 11178, + "time_per_iteration": 2.392406702041626 + }, + { + "auxiliary_loss_clip": 0.01054293, + "auxiliary_loss_mlp": 0.01042432, + "balance_loss_clip": 1.01855397, + "balance_loss_mlp": 1.0165633, + "epoch": 0.6721178415752292, + "flos": 16944954865920.0, + "grad_norm": 1.757826512586554, + "language_loss": 0.71767688, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.73864412, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37695312, + "step": 11179, + "time_per_iteration": 2.3909482955932617 + }, + { + "auxiliary_loss_clip": 0.01053217, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.01376438, + "balance_loss_mlp": 1.01691258, + "epoch": 0.6721779648278972, + "flos": 21540362860800.0, + "grad_norm": 1.4887267934023418, + "language_loss": 0.75062823, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.77153337, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 11180, + "time_per_iteration": 2.3668999671936035 + }, + { + "auxiliary_loss_clip": 0.01053731, + "auxiliary_loss_mlp": 0.01036369, + "balance_loss_clip": 1.01213336, + "balance_loss_mlp": 1.0171535, + "epoch": 0.6722380880805652, + "flos": 22604235943680.0, + "grad_norm": 1.389827792963096, + "language_loss": 0.75930154, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.78020257, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11181, + "time_per_iteration": 2.4671754837036133 + }, + { + "auxiliary_loss_clip": 0.0105414, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.01219714, + "balance_loss_mlp": 1.01634991, + "epoch": 0.6722982113332331, + "flos": 15924338824320.0, + "grad_norm": 2.256772585669221, + "language_loss": 0.76061094, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.78151166, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37890625, + "step": 11182, + "time_per_iteration": 2.3252110481262207 + }, + { + "auxiliary_loss_clip": 0.01051224, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.01482153, + "balance_loss_mlp": 1.01548982, + "epoch": 0.6723583345859011, + "flos": 20595089266560.0, + "grad_norm": 1.780156157528948, + "language_loss": 0.70582354, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72672117, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 11183, + "time_per_iteration": 3.786536931991577 + }, + { + "auxiliary_loss_clip": 0.01052892, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.01423335, + "balance_loss_mlp": 1.01614904, + "epoch": 0.672418457838569, + "flos": 21724773995520.0, + "grad_norm": 2.4335847744494923, + "language_loss": 0.79111218, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.81203669, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 11184, + "time_per_iteration": 2.3463525772094727 + }, + { + "auxiliary_loss_clip": 0.01058395, + "auxiliary_loss_mlp": 0.01039578, + "balance_loss_clip": 1.0134716, + "balance_loss_mlp": 1.01889586, + "epoch": 0.672478581091237, + "flos": 21469314510720.0, + "grad_norm": 1.8171154627750272, + "language_loss": 0.6786105, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.69959021, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39453125, + "step": 11185, + "time_per_iteration": 2.425527811050415 + }, + { + "auxiliary_loss_clip": 0.01054328, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.01412034, + "balance_loss_mlp": 1.01714611, + "epoch": 0.6725387043439051, + "flos": 30845449330560.0, + "grad_norm": 2.223599457505325, + "language_loss": 0.81936312, + "learning_rate": 1.023135571620345e-06, + "loss": 0.84029835, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 11186, + "time_per_iteration": 2.4141592979431152 + }, + { + "auxiliary_loss_clip": 0.01052476, + "auxiliary_loss_mlp": 0.01039113, + "balance_loss_clip": 1.01567614, + "balance_loss_mlp": 1.0172627, + "epoch": 0.672598827596573, + "flos": 24054947424000.0, + "grad_norm": 1.648590536068683, + "language_loss": 0.81096184, + "learning_rate": 1.022795745163813e-06, + "loss": 0.83187771, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 11187, + "time_per_iteration": 2.444425106048584 + }, + { + "auxiliary_loss_clip": 0.010566, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_clip": 1.01622593, + "balance_loss_mlp": 1.0173372, + "epoch": 0.672658950849241, + "flos": 21870780768000.0, + "grad_norm": 2.1881549012276404, + "language_loss": 0.71703374, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73802495, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 11188, + "time_per_iteration": 2.358220100402832 + }, + { + "auxiliary_loss_clip": 0.01051154, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.01522827, + "balance_loss_mlp": 1.01617527, + "epoch": 0.6727190741019089, + "flos": 23220976844160.0, + "grad_norm": 1.8130962046668468, + "language_loss": 0.77149785, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.7923938, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34960938, + "step": 11189, + "time_per_iteration": 2.4324443340301514 + }, + { + "auxiliary_loss_clip": 0.01056552, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.01652586, + "balance_loss_mlp": 1.01683569, + "epoch": 0.6727791973545769, + "flos": 15777703647360.0, + "grad_norm": 1.9808707417688924, + "language_loss": 0.76791763, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.78894353, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.39648438, + "step": 11190, + "time_per_iteration": 2.3326523303985596 + }, + { + "auxiliary_loss_clip": 0.01052976, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.0160172, + "balance_loss_mlp": 1.01641476, + "epoch": 0.6728393206072448, + "flos": 21248838074880.0, + "grad_norm": 1.5945699223644034, + "language_loss": 0.7780419, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79899585, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3671875, + "step": 11191, + "time_per_iteration": 2.4357545375823975 + }, + { + "auxiliary_loss_clip": 0.01051176, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.01224899, + "balance_loss_mlp": 1.0164814, + "epoch": 0.6728994438599128, + "flos": 32121943793280.0, + "grad_norm": 1.7506412938171345, + "language_loss": 0.87019533, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.89106953, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.34765625, + "step": 11192, + "time_per_iteration": 2.4485106468200684 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.01586175, + "balance_loss_mlp": 1.01699913, + "epoch": 0.6729595671125808, + "flos": 23111244840960.0, + "grad_norm": 1.8373273135529498, + "language_loss": 0.76596963, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78694487, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 11193, + "time_per_iteration": 2.409313678741455 + }, + { + "auxiliary_loss_clip": 0.01052978, + "auxiliary_loss_mlp": 0.01040051, + "balance_loss_clip": 1.01560068, + "balance_loss_mlp": 1.01635408, + "epoch": 0.6730196903652488, + "flos": 14610522251520.0, + "grad_norm": 1.9033375931183367, + "language_loss": 0.79331046, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.81424069, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 11194, + "time_per_iteration": 2.4071316719055176 + }, + { + "auxiliary_loss_clip": 0.0105477, + "auxiliary_loss_mlp": 0.01039071, + "balance_loss_clip": 1.01519322, + "balance_loss_mlp": 1.01688087, + "epoch": 0.6730798136179167, + "flos": 21104856161280.0, + "grad_norm": 1.949109161589922, + "language_loss": 0.91388071, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.9348191, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37890625, + "step": 11195, + "time_per_iteration": 2.4333207607269287 + }, + { + "auxiliary_loss_clip": 0.01052383, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.01389146, + "balance_loss_mlp": 1.01574063, + "epoch": 0.6731399368705847, + "flos": 28984997600640.0, + "grad_norm": 1.7134904256146055, + "language_loss": 0.73134601, + "learning_rate": 1.019738976106662e-06, + "loss": 0.75225061, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 11196, + "time_per_iteration": 2.407958507537842 + }, + { + "auxiliary_loss_clip": 0.01008505, + "auxiliary_loss_mlp": 0.01003068, + "balance_loss_clip": 1.00079107, + "balance_loss_mlp": 1.00156784, + "epoch": 0.6732000601232526, + "flos": 64740386298240.0, + "grad_norm": 0.7951740180139502, + "language_loss": 0.56653446, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58665019, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.06933594, + "step": 11197, + "time_per_iteration": 2.929255247116089 + }, + { + "auxiliary_loss_clip": 0.01050724, + "auxiliary_loss_mlp": 0.01038087, + "balance_loss_clip": 1.01586652, + "balance_loss_mlp": 1.01575494, + "epoch": 0.6732601833759206, + "flos": 17200693641600.0, + "grad_norm": 2.0893327541967097, + "language_loss": 0.76616001, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.78704816, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 11198, + "time_per_iteration": 2.3472352027893066 + }, + { + "auxiliary_loss_clip": 0.01052776, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.01435161, + "balance_loss_mlp": 1.01544476, + "epoch": 0.6733203066285887, + "flos": 18657933546240.0, + "grad_norm": 2.153742943219092, + "language_loss": 0.83109432, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.85201466, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 11199, + "time_per_iteration": 2.3395397663116455 + }, + { + "auxiliary_loss_clip": 0.01054604, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.01259303, + "balance_loss_mlp": 1.01709652, + "epoch": 0.6733804298812566, + "flos": 35807864204160.0, + "grad_norm": 1.7069933053319137, + "language_loss": 0.72391117, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.74483633, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 11200, + "time_per_iteration": 2.4895339012145996 + }, + { + "auxiliary_loss_clip": 0.01054871, + "auxiliary_loss_mlp": 0.01040325, + "balance_loss_clip": 1.01530313, + "balance_loss_mlp": 1.01791716, + "epoch": 0.6734405531339246, + "flos": 61636714364160.0, + "grad_norm": 1.6422479235830079, + "language_loss": 0.65071422, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.67166615, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 11201, + "time_per_iteration": 2.732311487197876 + }, + { + "auxiliary_loss_clip": 0.01054782, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.01335979, + "balance_loss_mlp": 1.01666737, + "epoch": 0.6735006763865925, + "flos": 20521282919040.0, + "grad_norm": 1.8576787956246579, + "language_loss": 0.6496762, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.6706025, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38085938, + "step": 11202, + "time_per_iteration": 2.3703365325927734 + }, + { + "auxiliary_loss_clip": 0.0105395, + "auxiliary_loss_mlp": 0.01035433, + "balance_loss_clip": 1.01335597, + "balance_loss_mlp": 1.01760232, + "epoch": 0.6735607996392605, + "flos": 13917985056000.0, + "grad_norm": 2.401803121906537, + "language_loss": 0.75684547, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.77773935, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 11203, + "time_per_iteration": 2.348654270172119 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.01328993, + "balance_loss_mlp": 1.01684558, + "epoch": 0.6736209228919284, + "flos": 18806244468480.0, + "grad_norm": 1.7524300373947743, + "language_loss": 0.68837124, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.70933485, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 11204, + "time_per_iteration": 2.33200740814209 + }, + { + "auxiliary_loss_clip": 0.01055602, + "auxiliary_loss_mlp": 0.01044696, + "balance_loss_clip": 1.0196383, + "balance_loss_mlp": 1.01784599, + "epoch": 0.6736810461445965, + "flos": 20372169035520.0, + "grad_norm": 1.5906572749603662, + "language_loss": 0.74530828, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76631129, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 11205, + "time_per_iteration": 2.376124382019043 + }, + { + "auxiliary_loss_clip": 0.01049553, + "auxiliary_loss_mlp": 0.01035842, + "balance_loss_clip": 1.01300144, + "balance_loss_mlp": 1.01482725, + "epoch": 0.6737411693972644, + "flos": 30006242046720.0, + "grad_norm": 1.5708399003439901, + "language_loss": 0.73156106, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.752415, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 11206, + "time_per_iteration": 2.445042133331299 + }, + { + "auxiliary_loss_clip": 0.01058824, + "auxiliary_loss_mlp": 0.01043912, + "balance_loss_clip": 1.01654124, + "balance_loss_mlp": 1.01839113, + "epoch": 0.6738012926499324, + "flos": 25446166214400.0, + "grad_norm": 1.9174645953156422, + "language_loss": 0.68303955, + "learning_rate": 1.016007014855092e-06, + "loss": 0.70406687, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.40429688, + "step": 11207, + "time_per_iteration": 2.450312376022339 + }, + { + "auxiliary_loss_clip": 0.01052176, + "auxiliary_loss_mlp": 0.01041617, + "balance_loss_clip": 1.01868117, + "balance_loss_mlp": 1.01691866, + "epoch": 0.6738614159026003, + "flos": 20775834708480.0, + "grad_norm": 2.553395698014128, + "language_loss": 0.7439664, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76490438, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 11208, + "time_per_iteration": 2.3964240550994873 + }, + { + "auxiliary_loss_clip": 0.0105424, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.01412988, + "balance_loss_mlp": 1.01719582, + "epoch": 0.6739215391552683, + "flos": 19566059587200.0, + "grad_norm": 5.514860337447355, + "language_loss": 0.76861751, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78955001, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 11209, + "time_per_iteration": 3.5512943267822266 + }, + { + "auxiliary_loss_clip": 0.01049727, + "auxiliary_loss_mlp": 0.01034526, + "balance_loss_clip": 1.01145935, + "balance_loss_mlp": 1.01496768, + "epoch": 0.6739816624079362, + "flos": 24387075987840.0, + "grad_norm": 1.7109167790238478, + "language_loss": 0.67575395, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.6965965, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34570312, + "step": 11210, + "time_per_iteration": 2.400242567062378 + }, + { + "auxiliary_loss_clip": 0.01049826, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.01256609, + "balance_loss_mlp": 1.0153445, + "epoch": 0.6740417856606042, + "flos": 22527078105600.0, + "grad_norm": 6.295556336115029, + "language_loss": 0.80925012, + "learning_rate": 1.014651056529377e-06, + "loss": 0.83008969, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 11211, + "time_per_iteration": 2.3902089595794678 + }, + { + "auxiliary_loss_clip": 0.01050896, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.00891602, + "balance_loss_mlp": 1.01610374, + "epoch": 0.6741019089132723, + "flos": 25774279971840.0, + "grad_norm": 1.5331738405103945, + "language_loss": 0.77070451, + "learning_rate": 1.014312160327143e-06, + "loss": 0.79152459, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 11212, + "time_per_iteration": 3.783914566040039 + }, + { + "auxiliary_loss_clip": 0.01052622, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.01617384, + "balance_loss_mlp": 1.01603556, + "epoch": 0.6741620321659402, + "flos": 21104611781760.0, + "grad_norm": 2.272392810555447, + "language_loss": 0.79225612, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.81318307, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 11213, + "time_per_iteration": 3.7933425903320312 + }, + { + "auxiliary_loss_clip": 0.01053912, + "auxiliary_loss_mlp": 0.01039243, + "balance_loss_clip": 1.01329112, + "balance_loss_mlp": 1.01594782, + "epoch": 0.6742221554186082, + "flos": 20739385382400.0, + "grad_norm": 1.7533477792829248, + "language_loss": 0.69426835, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.71519989, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 11214, + "time_per_iteration": 2.3653347492218018 + }, + { + "auxiliary_loss_clip": 0.01052808, + "auxiliary_loss_mlp": 0.01040944, + "balance_loss_clip": 1.01731682, + "balance_loss_mlp": 1.01649487, + "epoch": 0.6742822786712761, + "flos": 37772776321920.0, + "grad_norm": 2.514347004734488, + "language_loss": 0.73163247, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.75257003, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 11215, + "time_per_iteration": 2.508117198944092 + }, + { + "auxiliary_loss_clip": 0.01053148, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.01477218, + "balance_loss_mlp": 1.01641965, + "epoch": 0.6743424019239441, + "flos": 37262520668160.0, + "grad_norm": 2.483328537531991, + "language_loss": 0.68133378, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.70224416, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 11216, + "time_per_iteration": 2.4973838329315186 + }, + { + "auxiliary_loss_clip": 0.01007918, + "auxiliary_loss_mlp": 0.01007348, + "balance_loss_clip": 1.00511837, + "balance_loss_mlp": 1.00111365, + "epoch": 0.674402525176612, + "flos": 65994011953920.0, + "grad_norm": 0.7531869504142047, + "language_loss": 0.56323338, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58338606, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.06835938, + "step": 11217, + "time_per_iteration": 3.0888004302978516 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.01040481, + "balance_loss_clip": 1.01543546, + "balance_loss_mlp": 1.01634264, + "epoch": 0.67446264842928, + "flos": 26460218920320.0, + "grad_norm": 1.8496034563233383, + "language_loss": 0.75967765, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.78060502, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 11218, + "time_per_iteration": 2.4399452209472656 + }, + { + "auxiliary_loss_clip": 0.01052507, + "auxiliary_loss_mlp": 0.01043139, + "balance_loss_clip": 1.0179497, + "balance_loss_mlp": 1.01618588, + "epoch": 0.674522771681948, + "flos": 23731267409280.0, + "grad_norm": 1.5400469968713306, + "language_loss": 0.66781509, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68877155, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 11219, + "time_per_iteration": 2.3657610416412354 + }, + { + "auxiliary_loss_clip": 0.01055392, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.01454699, + "balance_loss_mlp": 1.0168941, + "epoch": 0.674582894934616, + "flos": 24753175171200.0, + "grad_norm": 5.100870799851118, + "language_loss": 0.76080352, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.78174919, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38476562, + "step": 11220, + "time_per_iteration": 2.423861265182495 + }, + { + "auxiliary_loss_clip": 0.01054428, + "auxiliary_loss_mlp": 0.01039989, + "balance_loss_clip": 1.01484752, + "balance_loss_mlp": 1.01705027, + "epoch": 0.6746430181872839, + "flos": 24825480330240.0, + "grad_norm": 1.543893623072826, + "language_loss": 0.71649611, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.73744029, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 11221, + "time_per_iteration": 2.3897268772125244 + }, + { + "auxiliary_loss_clip": 0.01051359, + "auxiliary_loss_mlp": 0.01040336, + "balance_loss_clip": 1.01707792, + "balance_loss_mlp": 1.01619387, + "epoch": 0.6747031414399519, + "flos": 16872544972800.0, + "grad_norm": 2.281338816938866, + "language_loss": 0.59331489, + "learning_rate": 1.010925256180498e-06, + "loss": 0.61423182, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 11222, + "time_per_iteration": 2.3602776527404785 + }, + { + "auxiliary_loss_clip": 0.01052646, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.01488304, + "balance_loss_mlp": 1.01633692, + "epoch": 0.6747632646926198, + "flos": 22783794399360.0, + "grad_norm": 1.617190723229836, + "language_loss": 0.77581966, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.79675055, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 11223, + "time_per_iteration": 3.811629056930542 + }, + { + "auxiliary_loss_clip": 0.01054491, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.01515436, + "balance_loss_mlp": 1.01690674, + "epoch": 0.6748233879452878, + "flos": 20045102618880.0, + "grad_norm": 1.673007834055559, + "language_loss": 0.75642747, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77737963, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 11224, + "time_per_iteration": 2.365281343460083 + }, + { + "auxiliary_loss_clip": 0.01051037, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.01365995, + "balance_loss_mlp": 1.01593423, + "epoch": 0.6748835111979558, + "flos": 23001722305920.0, + "grad_norm": 1.6241218338514314, + "language_loss": 0.63629115, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65714943, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.3515625, + "step": 11225, + "time_per_iteration": 2.3551177978515625 + }, + { + "auxiliary_loss_clip": 0.01050001, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.01464486, + "balance_loss_mlp": 1.01591051, + "epoch": 0.6749436344506238, + "flos": 12196662560640.0, + "grad_norm": 1.7124872275994794, + "language_loss": 0.64428532, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66514784, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34179688, + "step": 11226, + "time_per_iteration": 2.338573694229126 + }, + { + "auxiliary_loss_clip": 0.01054104, + "auxiliary_loss_mlp": 0.01039771, + "balance_loss_clip": 1.01572669, + "balance_loss_mlp": 1.01711082, + "epoch": 0.6750037577032918, + "flos": 11872947634560.0, + "grad_norm": 2.1996949634505496, + "language_loss": 0.73554635, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.7564851, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 11227, + "time_per_iteration": 2.3238253593444824 + }, + { + "auxiliary_loss_clip": 0.01051943, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.01114774, + "balance_loss_mlp": 1.01623905, + "epoch": 0.6750638809559597, + "flos": 17018656479360.0, + "grad_norm": 2.0290034100689844, + "language_loss": 0.72095686, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.74181843, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 11228, + "time_per_iteration": 2.4230082035064697 + }, + { + "auxiliary_loss_clip": 0.01009732, + "auxiliary_loss_mlp": 0.01002175, + "balance_loss_clip": 0.9997192, + "balance_loss_mlp": 1.00286996, + "epoch": 0.6751240042086277, + "flos": 70947384785280.0, + "grad_norm": 0.7591488868751114, + "language_loss": 0.5330714, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55319047, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.06835938, + "step": 11229, + "time_per_iteration": 3.048752546310425 + }, + { + "auxiliary_loss_clip": 0.01052747, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.01099193, + "balance_loss_mlp": 1.01641679, + "epoch": 0.6751841274612956, + "flos": 22674027484800.0, + "grad_norm": 2.052975486513101, + "language_loss": 0.81281394, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.83368421, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 11230, + "time_per_iteration": 2.4306225776672363 + }, + { + "auxiliary_loss_clip": 0.01050639, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.01383841, + "balance_loss_mlp": 1.01652777, + "epoch": 0.6752442507139637, + "flos": 21287556639360.0, + "grad_norm": 1.5598429293008773, + "language_loss": 0.67060828, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.69147897, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.33984375, + "step": 11231, + "time_per_iteration": 2.3528716564178467 + }, + { + "auxiliary_loss_clip": 0.01055204, + "auxiliary_loss_mlp": 0.01043799, + "balance_loss_clip": 1.01647604, + "balance_loss_mlp": 1.01662707, + "epoch": 0.6753043739666316, + "flos": 28255661965440.0, + "grad_norm": 3.0749381550938386, + "language_loss": 0.68208766, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.70307767, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38476562, + "step": 11232, + "time_per_iteration": 2.427762746810913 + }, + { + "auxiliary_loss_clip": 0.01051437, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.01266837, + "balance_loss_mlp": 1.01607144, + "epoch": 0.6753644972192996, + "flos": 21359303216640.0, + "grad_norm": 1.5474324197997924, + "language_loss": 0.73380989, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.75467777, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 11233, + "time_per_iteration": 2.3637351989746094 + }, + { + "auxiliary_loss_clip": 0.01053385, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02027345, + "balance_loss_mlp": 1.01660776, + "epoch": 0.6754246204719675, + "flos": 26540763160320.0, + "grad_norm": 1.7190881463842758, + "language_loss": 0.77959764, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.80057788, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 11234, + "time_per_iteration": 2.4112913608551025 + }, + { + "auxiliary_loss_clip": 0.01052186, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.01274753, + "balance_loss_mlp": 1.0160284, + "epoch": 0.6754847437246355, + "flos": 25555514192640.0, + "grad_norm": 1.4980263651276857, + "language_loss": 0.76174974, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.78262603, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 11235, + "time_per_iteration": 2.3982231616973877 + }, + { + "auxiliary_loss_clip": 0.01009136, + "auxiliary_loss_mlp": 0.01004784, + "balance_loss_clip": 1.00220931, + "balance_loss_mlp": 1.0022217, + "epoch": 0.6755448669773034, + "flos": 59510502432000.0, + "grad_norm": 0.7978702145228411, + "language_loss": 0.515154, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53529322, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.06933594, + "step": 11236, + "time_per_iteration": 2.9726338386535645 + }, + { + "auxiliary_loss_clip": 0.01052986, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.01403999, + "balance_loss_mlp": 1.0172559, + "epoch": 0.6756049902299714, + "flos": 23293421648640.0, + "grad_norm": 2.251847424990654, + "language_loss": 0.77061641, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.79153585, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.35742188, + "step": 11237, + "time_per_iteration": 2.3756790161132812 + }, + { + "auxiliary_loss_clip": 0.01053566, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.01998305, + "balance_loss_mlp": 1.01745725, + "epoch": 0.6756651134826394, + "flos": 31574121649920.0, + "grad_norm": 1.9260401935067815, + "language_loss": 0.78808886, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.80907202, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 11238, + "time_per_iteration": 2.4313414096832275 + }, + { + "auxiliary_loss_clip": 0.0105598, + "auxiliary_loss_mlp": 0.01042293, + "balance_loss_clip": 1.01649618, + "balance_loss_mlp": 1.01728463, + "epoch": 0.6757252367353074, + "flos": 27271041402240.0, + "grad_norm": 1.701257224388742, + "language_loss": 0.68104428, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.70202702, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 11239, + "time_per_iteration": 2.3921566009521484 + }, + { + "auxiliary_loss_clip": 0.0105221, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.01348913, + "balance_loss_mlp": 1.01679671, + "epoch": 0.6757853599879754, + "flos": 16830125804160.0, + "grad_norm": 1.7479341775624346, + "language_loss": 0.83706176, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85794556, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 11240, + "time_per_iteration": 2.3573455810546875 + }, + { + "auxiliary_loss_clip": 0.01057452, + "auxiliary_loss_mlp": 0.01048263, + "balance_loss_clip": 1.01933074, + "balance_loss_mlp": 1.01769531, + "epoch": 0.6758454832406433, + "flos": 23218986896640.0, + "grad_norm": 2.0989220104304906, + "language_loss": 0.75970536, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.78076249, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.3984375, + "step": 11241, + "time_per_iteration": 2.3480899333953857 + }, + { + "auxiliary_loss_clip": 0.01054355, + "auxiliary_loss_mlp": 0.01040837, + "balance_loss_clip": 1.01555252, + "balance_loss_mlp": 1.01720202, + "epoch": 0.6759056064933113, + "flos": 16288622616960.0, + "grad_norm": 2.2666344290488536, + "language_loss": 0.81797755, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.83892941, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 11242, + "time_per_iteration": 2.3419365882873535 + }, + { + "auxiliary_loss_clip": 0.0105158, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.01485217, + "balance_loss_mlp": 1.01556087, + "epoch": 0.6759657297459792, + "flos": 25921089705600.0, + "grad_norm": 1.965817171642329, + "language_loss": 0.73769748, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.75859642, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36132812, + "step": 11243, + "time_per_iteration": 2.4012815952301025 + }, + { + "auxiliary_loss_clip": 0.01053362, + "auxiliary_loss_mlp": 0.01041304, + "balance_loss_clip": 1.01741409, + "balance_loss_mlp": 1.01702034, + "epoch": 0.6760258529986473, + "flos": 22999767269760.0, + "grad_norm": 1.746431872682607, + "language_loss": 0.73671883, + "learning_rate": 1.003487287162221e-06, + "loss": 0.75766551, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 11244, + "time_per_iteration": 2.3787403106689453 + }, + { + "auxiliary_loss_clip": 0.01053845, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.02049255, + "balance_loss_mlp": 1.01675367, + "epoch": 0.6760859762513152, + "flos": 20958290807040.0, + "grad_norm": 1.8975951159605597, + "language_loss": 0.86797643, + "learning_rate": 1.003149631190393e-06, + "loss": 0.88896346, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 11245, + "time_per_iteration": 2.3712878227233887 + }, + { + "auxiliary_loss_clip": 0.01056182, + "auxiliary_loss_mlp": 0.01040997, + "balance_loss_clip": 1.01605821, + "balance_loss_mlp": 1.01714551, + "epoch": 0.6761460995039832, + "flos": 23621814696960.0, + "grad_norm": 1.7636016090495583, + "language_loss": 0.75315988, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.77413166, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.390625, + "step": 11246, + "time_per_iteration": 2.3789920806884766 + }, + { + "auxiliary_loss_clip": 0.01052206, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.01219273, + "balance_loss_mlp": 1.0157702, + "epoch": 0.6762062227566511, + "flos": 20770004511360.0, + "grad_norm": 1.7842532650098828, + "language_loss": 0.88562119, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90649885, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 11247, + "time_per_iteration": 2.3573360443115234 + }, + { + "auxiliary_loss_clip": 0.01007944, + "auxiliary_loss_mlp": 0.01004182, + "balance_loss_clip": 1.00194085, + "balance_loss_mlp": 1.00108695, + "epoch": 0.6762663460093191, + "flos": 52814963157120.0, + "grad_norm": 0.8379559024183869, + "language_loss": 0.54075325, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56087452, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06835938, + "step": 11248, + "time_per_iteration": 4.254245758056641 + }, + { + "auxiliary_loss_clip": 0.01049191, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.01664805, + "balance_loss_mlp": 1.01621783, + "epoch": 0.676326469261987, + "flos": 23695167196800.0, + "grad_norm": 1.985554267292256, + "language_loss": 0.74695665, + "learning_rate": 1.001799385437761e-06, + "loss": 0.76783079, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33007812, + "step": 11249, + "time_per_iteration": 2.422506809234619 + }, + { + "auxiliary_loss_clip": 0.01053067, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.01981568, + "balance_loss_mlp": 1.01568723, + "epoch": 0.676386592514655, + "flos": 14062874664960.0, + "grad_norm": 2.3508218292613954, + "language_loss": 0.75480735, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.77580881, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37304688, + "step": 11250, + "time_per_iteration": 2.373190402984619 + }, + { + "auxiliary_loss_clip": 0.01054302, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_clip": 1.01902938, + "balance_loss_mlp": 1.01720679, + "epoch": 0.676446715767323, + "flos": 20411201802240.0, + "grad_norm": 2.474140781676039, + "language_loss": 0.77035689, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.79133081, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 11251, + "time_per_iteration": 2.3637707233428955 + }, + { + "auxiliary_loss_clip": 0.01052734, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.01264477, + "balance_loss_mlp": 1.01679325, + "epoch": 0.676506839019991, + "flos": 21287172614400.0, + "grad_norm": 1.5594920691893785, + "language_loss": 0.71216697, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.7330637, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.359375, + "step": 11252, + "time_per_iteration": 3.7509493827819824 + }, + { + "auxiliary_loss_clip": 0.01054972, + "auxiliary_loss_mlp": 0.01039842, + "balance_loss_clip": 1.01634562, + "balance_loss_mlp": 1.01746178, + "epoch": 0.676566962272659, + "flos": 29931248712960.0, + "grad_norm": 1.7012152859068883, + "language_loss": 0.68295085, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.70389897, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 11253, + "time_per_iteration": 3.8877933025360107 + }, + { + "auxiliary_loss_clip": 0.0105668, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_clip": 1.0169673, + "balance_loss_mlp": 1.01819921, + "epoch": 0.6766270855253269, + "flos": 17930238744960.0, + "grad_norm": 1.8017611255494455, + "language_loss": 0.78054392, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.80155408, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38476562, + "step": 11254, + "time_per_iteration": 2.352283239364624 + }, + { + "auxiliary_loss_clip": 0.01054465, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.01386535, + "balance_loss_mlp": 1.01736975, + "epoch": 0.6766872087779949, + "flos": 23103948366720.0, + "grad_norm": 1.8887426335313948, + "language_loss": 0.73382336, + "learning_rate": 9.997751526206835e-07, + "loss": 0.75474608, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 11255, + "time_per_iteration": 2.4247100353240967 + }, + { + "auxiliary_loss_clip": 0.01053473, + "auxiliary_loss_mlp": 0.01045862, + "balance_loss_clip": 1.01958871, + "balance_loss_mlp": 1.01622665, + "epoch": 0.6767473320306628, + "flos": 26211951175680.0, + "grad_norm": 1.9549020741031604, + "language_loss": 0.7714802, + "learning_rate": 9.994379131600828e-07, + "loss": 0.7924735, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37304688, + "step": 11256, + "time_per_iteration": 2.4050979614257812 + }, + { + "auxiliary_loss_clip": 0.01054935, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.01193953, + "balance_loss_mlp": 1.01843023, + "epoch": 0.6768074552833309, + "flos": 18367770303360.0, + "grad_norm": 2.0767070682364213, + "language_loss": 0.66376221, + "learning_rate": 9.991007116408965e-07, + "loss": 0.68468213, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 11257, + "time_per_iteration": 2.3848414421081543 + }, + { + "auxiliary_loss_clip": 0.01050617, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.01387405, + "balance_loss_mlp": 1.01533937, + "epoch": 0.6768675785359988, + "flos": 23038800036480.0, + "grad_norm": 1.594611908202567, + "language_loss": 0.7636444, + "learning_rate": 9.987635480759109e-07, + "loss": 0.78451592, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 11258, + "time_per_iteration": 2.360917806625366 + }, + { + "auxiliary_loss_clip": 0.01051723, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.01588309, + "balance_loss_mlp": 1.01611567, + "epoch": 0.6769277017886668, + "flos": 33035131981440.0, + "grad_norm": 1.5451505589981567, + "language_loss": 0.6802907, + "learning_rate": 9.984264224779127e-07, + "loss": 0.70121431, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35546875, + "step": 11259, + "time_per_iteration": 2.461118698120117 + }, + { + "auxiliary_loss_clip": 0.01054307, + "auxiliary_loss_mlp": 0.01038829, + "balance_loss_clip": 1.0150708, + "balance_loss_mlp": 1.01704264, + "epoch": 0.6769878250413347, + "flos": 20847406728960.0, + "grad_norm": 2.014787359444734, + "language_loss": 0.86561245, + "learning_rate": 9.980893348596839e-07, + "loss": 0.88654387, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37304688, + "step": 11260, + "time_per_iteration": 2.3591816425323486 + }, + { + "auxiliary_loss_clip": 0.01054857, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.01181448, + "balance_loss_mlp": 1.01662123, + "epoch": 0.6770479482940027, + "flos": 15595072992000.0, + "grad_norm": 2.4520220412066824, + "language_loss": 0.78611684, + "learning_rate": 9.977522852340081e-07, + "loss": 0.80704415, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 11261, + "time_per_iteration": 2.3850655555725098 + }, + { + "auxiliary_loss_clip": 0.01054355, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_clip": 1.01871967, + "balance_loss_mlp": 1.01611567, + "epoch": 0.6771080715466706, + "flos": 18620122677120.0, + "grad_norm": 1.8252450479222821, + "language_loss": 0.88763559, + "learning_rate": 9.97415273613666e-07, + "loss": 0.90866256, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.3828125, + "step": 11262, + "time_per_iteration": 2.346998691558838 + }, + { + "auxiliary_loss_clip": 0.01055462, + "auxiliary_loss_mlp": 0.01038132, + "balance_loss_clip": 1.01258588, + "balance_loss_mlp": 1.01787972, + "epoch": 0.6771681947993387, + "flos": 12494611036800.0, + "grad_norm": 1.9814405248972815, + "language_loss": 0.75111645, + "learning_rate": 9.97078300011439e-07, + "loss": 0.77205247, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 11263, + "time_per_iteration": 3.758420944213867 + }, + { + "auxiliary_loss_clip": 0.01053714, + "auxiliary_loss_mlp": 0.01047035, + "balance_loss_clip": 1.02028441, + "balance_loss_mlp": 1.01604259, + "epoch": 0.6772283180520066, + "flos": 22235867521920.0, + "grad_norm": 2.1154180733141854, + "language_loss": 0.69256043, + "learning_rate": 9.967413644401016e-07, + "loss": 0.71356797, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.37695312, + "step": 11264, + "time_per_iteration": 2.369351387023926 + }, + { + "auxiliary_loss_clip": 0.01053264, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.01458621, + "balance_loss_mlp": 1.01742959, + "epoch": 0.6772884413046746, + "flos": 16142231819520.0, + "grad_norm": 2.2737909251482455, + "language_loss": 0.7466321, + "learning_rate": 9.964044669124324e-07, + "loss": 0.76755488, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 11265, + "time_per_iteration": 2.3745946884155273 + }, + { + "auxiliary_loss_clip": 0.0105203, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.01329088, + "balance_loss_mlp": 1.01638627, + "epoch": 0.6773485645573426, + "flos": 19134742250880.0, + "grad_norm": 2.765439884532397, + "language_loss": 0.62763751, + "learning_rate": 9.96067607441207e-07, + "loss": 0.648525, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 11266, + "time_per_iteration": 2.348555564880371 + }, + { + "auxiliary_loss_clip": 0.01055355, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_clip": 1.0155201, + "balance_loss_mlp": 1.01764941, + "epoch": 0.6774086878100105, + "flos": 14136052608000.0, + "grad_norm": 2.059845012442261, + "language_loss": 0.72085965, + "learning_rate": 9.957307860391976e-07, + "loss": 0.74182606, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 11267, + "time_per_iteration": 2.352788209915161 + }, + { + "auxiliary_loss_clip": 0.01053946, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.01550162, + "balance_loss_mlp": 1.01749241, + "epoch": 0.6774688110626785, + "flos": 22196066705280.0, + "grad_norm": 1.9447508611679605, + "language_loss": 0.7184279, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73937643, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 11268, + "time_per_iteration": 2.3769571781158447 + }, + { + "auxiliary_loss_clip": 0.01054605, + "auxiliary_loss_mlp": 0.01040565, + "balance_loss_clip": 1.01679468, + "balance_loss_mlp": 1.01732993, + "epoch": 0.6775289343153464, + "flos": 23038834947840.0, + "grad_norm": 1.4935839834510718, + "language_loss": 0.77973628, + "learning_rate": 9.950572574939194e-07, + "loss": 0.80068803, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37304688, + "step": 11269, + "time_per_iteration": 2.3872597217559814 + }, + { + "auxiliary_loss_clip": 0.01054923, + "auxiliary_loss_mlp": 0.01044737, + "balance_loss_clip": 1.01774836, + "balance_loss_mlp": 1.01649261, + "epoch": 0.6775890575680145, + "flos": 18292602412800.0, + "grad_norm": 2.699809028304398, + "language_loss": 0.75388038, + "learning_rate": 9.94720550376189e-07, + "loss": 0.77487695, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38476562, + "step": 11270, + "time_per_iteration": 2.3385002613067627 + }, + { + "auxiliary_loss_clip": 0.01053778, + "auxiliary_loss_mlp": 0.01039292, + "balance_loss_clip": 1.01356673, + "balance_loss_mlp": 1.01712251, + "epoch": 0.6776491808206824, + "flos": 25335317047680.0, + "grad_norm": 1.7617927050073507, + "language_loss": 0.73063993, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75157058, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 11271, + "time_per_iteration": 2.4167957305908203 + }, + { + "auxiliary_loss_clip": 0.01054766, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.01445317, + "balance_loss_mlp": 1.01685202, + "epoch": 0.6777093040733504, + "flos": 26027121104640.0, + "grad_norm": 1.7515857738516651, + "language_loss": 0.68855506, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70949018, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37890625, + "step": 11272, + "time_per_iteration": 2.4052112102508545 + }, + { + "auxiliary_loss_clip": 0.01056218, + "auxiliary_loss_mlp": 0.01043533, + "balance_loss_clip": 1.01592433, + "balance_loss_mlp": 1.01702034, + "epoch": 0.6777694273260183, + "flos": 18002648638080.0, + "grad_norm": 1.8895045822361585, + "language_loss": 0.75351954, + "learning_rate": 9.937106577958481e-07, + "loss": 0.774517, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.390625, + "step": 11273, + "time_per_iteration": 2.363518238067627 + }, + { + "auxiliary_loss_clip": 0.01051835, + "auxiliary_loss_mlp": 0.01044938, + "balance_loss_clip": 1.02055943, + "balance_loss_mlp": 1.01611567, + "epoch": 0.6778295505786863, + "flos": 23439952091520.0, + "grad_norm": 1.7445604503654453, + "language_loss": 0.70837665, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72934437, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35742188, + "step": 11274, + "time_per_iteration": 2.3861422538757324 + }, + { + "auxiliary_loss_clip": 0.01056165, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.01400733, + "balance_loss_mlp": 1.01811242, + "epoch": 0.6778896738313542, + "flos": 19097420140800.0, + "grad_norm": 1.547533175675112, + "language_loss": 0.67390382, + "learning_rate": 9.930375868473093e-07, + "loss": 0.69487834, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38085938, + "step": 11275, + "time_per_iteration": 2.4139182567596436 + }, + { + "auxiliary_loss_clip": 0.01055886, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.01435518, + "balance_loss_mlp": 1.0188179, + "epoch": 0.6779497970840223, + "flos": 26102742842880.0, + "grad_norm": 2.179060574676903, + "language_loss": 0.73494768, + "learning_rate": 9.927011086428335e-07, + "loss": 0.75589257, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 11276, + "time_per_iteration": 2.4253876209259033 + }, + { + "auxiliary_loss_clip": 0.01053492, + "auxiliary_loss_mlp": 0.01042398, + "balance_loss_clip": 1.0160532, + "balance_loss_mlp": 1.01668525, + "epoch": 0.6780099203366902, + "flos": 19718210759040.0, + "grad_norm": 1.7095118807542862, + "language_loss": 0.77528596, + "learning_rate": 9.923646686352317e-07, + "loss": 0.79624486, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.36914062, + "step": 11277, + "time_per_iteration": 2.3601174354553223 + }, + { + "auxiliary_loss_clip": 0.01054054, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.01425159, + "balance_loss_mlp": 1.01632261, + "epoch": 0.6780700435893582, + "flos": 18213803740800.0, + "grad_norm": 2.579252165846449, + "language_loss": 0.85349035, + "learning_rate": 9.920282668372627e-07, + "loss": 0.87441659, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37695312, + "step": 11278, + "time_per_iteration": 2.3053858280181885 + }, + { + "auxiliary_loss_clip": 0.0105086, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.01296175, + "balance_loss_mlp": 1.01654005, + "epoch": 0.6781301668420262, + "flos": 25375013130240.0, + "grad_norm": 1.6275199152831021, + "language_loss": 0.7088905, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72974271, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34179688, + "step": 11279, + "time_per_iteration": 2.467942237854004 + }, + { + "auxiliary_loss_clip": 0.01055205, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.01404333, + "balance_loss_mlp": 1.01787639, + "epoch": 0.6781902900946941, + "flos": 24019405793280.0, + "grad_norm": 1.9048314269531397, + "language_loss": 0.75921476, + "learning_rate": 9.913555779212485e-07, + "loss": 0.7801646, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 11280, + "time_per_iteration": 2.373027801513672 + }, + { + "auxiliary_loss_clip": 0.01053004, + "auxiliary_loss_mlp": 0.01046597, + "balance_loss_clip": 1.02101493, + "balance_loss_mlp": 1.01571667, + "epoch": 0.6782504133473621, + "flos": 19645731043200.0, + "grad_norm": 1.933457490988833, + "language_loss": 0.71751374, + "learning_rate": 9.910192908287104e-07, + "loss": 0.73850977, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 11281, + "time_per_iteration": 2.3672471046447754 + }, + { + "auxiliary_loss_clip": 0.01051609, + "auxiliary_loss_mlp": 0.01040307, + "balance_loss_clip": 1.01778805, + "balance_loss_mlp": 1.01540756, + "epoch": 0.67831053660003, + "flos": 24931686286080.0, + "grad_norm": 1.574980703608313, + "language_loss": 0.65066314, + "learning_rate": 9.906830419968217e-07, + "loss": 0.67158228, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 11282, + "time_per_iteration": 2.4322304725646973 + }, + { + "auxiliary_loss_clip": 0.01056606, + "auxiliary_loss_mlp": 0.01050129, + "balance_loss_clip": 1.02331901, + "balance_loss_mlp": 1.01752567, + "epoch": 0.6783706598526981, + "flos": 31207149682560.0, + "grad_norm": 1.5794368277177209, + "language_loss": 0.75686443, + "learning_rate": 9.90346831438334e-07, + "loss": 0.77793181, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 11283, + "time_per_iteration": 2.437342643737793 + }, + { + "auxiliary_loss_clip": 0.01051822, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.01377225, + "balance_loss_mlp": 1.01679659, + "epoch": 0.678430783105366, + "flos": 35439949630080.0, + "grad_norm": 1.623192298622196, + "language_loss": 0.57496369, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59584117, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 11284, + "time_per_iteration": 2.4764459133148193 + }, + { + "auxiliary_loss_clip": 0.01052937, + "auxiliary_loss_mlp": 0.01037287, + "balance_loss_clip": 1.01383805, + "balance_loss_mlp": 1.01598859, + "epoch": 0.678490906358034, + "flos": 14427926507520.0, + "grad_norm": 2.0865480729638732, + "language_loss": 0.76822889, + "learning_rate": 9.896745251925535e-07, + "loss": 0.78913116, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 11285, + "time_per_iteration": 2.3642964363098145 + }, + { + "auxiliary_loss_clip": 0.01052086, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.01831532, + "balance_loss_mlp": 1.01640415, + "epoch": 0.6785510296107019, + "flos": 24310232352000.0, + "grad_norm": 1.9595682947868875, + "language_loss": 0.67616379, + "learning_rate": 9.893384295307557e-07, + "loss": 0.69710994, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 11286, + "time_per_iteration": 2.3742895126342773 + }, + { + "auxiliary_loss_clip": 0.01052049, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.01677799, + "balance_loss_mlp": 1.01536, + "epoch": 0.6786111528633699, + "flos": 26976095303040.0, + "grad_norm": 2.204643939786479, + "language_loss": 0.53921473, + "learning_rate": 9.890023721933447e-07, + "loss": 0.56014872, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 11287, + "time_per_iteration": 2.4239234924316406 + }, + { + "auxiliary_loss_clip": 0.01053209, + "auxiliary_loss_mlp": 0.01039124, + "balance_loss_clip": 1.01602173, + "balance_loss_mlp": 1.01763463, + "epoch": 0.6786712761160378, + "flos": 24316376751360.0, + "grad_norm": 1.518466352182726, + "language_loss": 0.77912945, + "learning_rate": 9.886663531930655e-07, + "loss": 0.80005276, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 11288, + "time_per_iteration": 3.675100803375244 + }, + { + "auxiliary_loss_clip": 0.01053751, + "auxiliary_loss_mlp": 0.01044296, + "balance_loss_clip": 1.01960802, + "balance_loss_mlp": 1.01668131, + "epoch": 0.6787313993687059, + "flos": 22929312412800.0, + "grad_norm": 2.472114699776041, + "language_loss": 0.74512041, + "learning_rate": 9.883303725426593e-07, + "loss": 0.76610088, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 11289, + "time_per_iteration": 2.428295135498047 + }, + { + "auxiliary_loss_clip": 0.01052362, + "auxiliary_loss_mlp": 0.01039552, + "balance_loss_clip": 1.01660419, + "balance_loss_mlp": 1.01623309, + "epoch": 0.6787915226213738, + "flos": 26867270995200.0, + "grad_norm": 1.4191528310959205, + "language_loss": 0.80967057, + "learning_rate": 9.879944302548682e-07, + "loss": 0.83058971, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36132812, + "step": 11290, + "time_per_iteration": 2.3952667713165283 + }, + { + "auxiliary_loss_clip": 0.0105147, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01262355, + "balance_loss_mlp": 1.01670134, + "epoch": 0.6788516458740418, + "flos": 20007885242880.0, + "grad_norm": 1.4823884987894118, + "language_loss": 0.75626892, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77712154, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34765625, + "step": 11291, + "time_per_iteration": 2.3917276859283447 + }, + { + "auxiliary_loss_clip": 0.01053293, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.01527095, + "balance_loss_mlp": 1.01712668, + "epoch": 0.6789117691267098, + "flos": 28725942245760.0, + "grad_norm": 2.3918937027607154, + "language_loss": 0.76264089, + "learning_rate": 9.873226608180785e-07, + "loss": 0.78356546, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 11292, + "time_per_iteration": 3.9469332695007324 + }, + { + "auxiliary_loss_clip": 0.01052127, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.01464844, + "balance_loss_mlp": 1.01591349, + "epoch": 0.6789718923793777, + "flos": 23402350690560.0, + "grad_norm": 2.296129383324774, + "language_loss": 0.85437715, + "learning_rate": 9.869868336945556e-07, + "loss": 0.87529814, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 11293, + "time_per_iteration": 2.3850936889648438 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_clip": 1.01641655, + "balance_loss_mlp": 1.0176754, + "epoch": 0.6790320156320457, + "flos": 20447825685120.0, + "grad_norm": 2.0774252407999225, + "language_loss": 0.80777556, + "learning_rate": 9.866510449845929e-07, + "loss": 0.82877862, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 11294, + "time_per_iteration": 2.3601133823394775 + }, + { + "auxiliary_loss_clip": 0.01052478, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.01342797, + "balance_loss_mlp": 1.01683903, + "epoch": 0.6790921388847136, + "flos": 24166145704320.0, + "grad_norm": 1.8230657832273485, + "language_loss": 0.79918033, + "learning_rate": 9.86315294700924e-07, + "loss": 0.82006478, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 11295, + "time_per_iteration": 2.392289400100708 + }, + { + "auxiliary_loss_clip": 0.0105049, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.01309061, + "balance_loss_mlp": 1.0164479, + "epoch": 0.6791522621373817, + "flos": 21907020625920.0, + "grad_norm": 1.7376730216707605, + "language_loss": 0.72434521, + "learning_rate": 9.859795828562823e-07, + "loss": 0.74518573, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 11296, + "time_per_iteration": 2.3648340702056885 + }, + { + "auxiliary_loss_clip": 0.01051624, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.01579261, + "balance_loss_mlp": 1.01652813, + "epoch": 0.6792123853900496, + "flos": 24825375596160.0, + "grad_norm": 1.6101181970844585, + "language_loss": 0.71841669, + "learning_rate": 9.856439094633949e-07, + "loss": 0.73931611, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 11297, + "time_per_iteration": 2.4036624431610107 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.01333082, + "balance_loss_mlp": 1.01636243, + "epoch": 0.6792725086427176, + "flos": 17565326547840.0, + "grad_norm": 1.9507210384189233, + "language_loss": 0.67675126, + "learning_rate": 9.853082745349918e-07, + "loss": 0.69766843, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37695312, + "step": 11298, + "time_per_iteration": 2.3248016834259033 + }, + { + "auxiliary_loss_clip": 0.01054078, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.01642966, + "balance_loss_mlp": 1.01756382, + "epoch": 0.6793326318953855, + "flos": 26940658406400.0, + "grad_norm": 1.9121428418216855, + "language_loss": 0.73110199, + "learning_rate": 9.84972678083801e-07, + "loss": 0.75202286, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36523438, + "step": 11299, + "time_per_iteration": 2.4153406620025635 + }, + { + "auxiliary_loss_clip": 0.01053092, + "auxiliary_loss_mlp": 0.01040864, + "balance_loss_clip": 1.01671267, + "balance_loss_mlp": 1.01679838, + "epoch": 0.6793927551480535, + "flos": 24317074978560.0, + "grad_norm": 1.4209445640763256, + "language_loss": 0.77907449, + "learning_rate": 9.846371201225488e-07, + "loss": 0.80001402, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 11300, + "time_per_iteration": 2.4375524520874023 + }, + { + "auxiliary_loss_clip": 0.01051945, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.0137862, + "balance_loss_mlp": 1.01626587, + "epoch": 0.6794528784007214, + "flos": 11435835012480.0, + "grad_norm": 2.1343495402906036, + "language_loss": 0.64181101, + "learning_rate": 9.843016006639577e-07, + "loss": 0.66270113, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 11301, + "time_per_iteration": 2.4139604568481445 + }, + { + "auxiliary_loss_clip": 0.01052558, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.01151466, + "balance_loss_mlp": 1.01651525, + "epoch": 0.6795130016533895, + "flos": 25228482687360.0, + "grad_norm": 1.804331287032592, + "language_loss": 0.83806932, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85892653, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.359375, + "step": 11302, + "time_per_iteration": 2.385406017303467 + }, + { + "auxiliary_loss_clip": 0.01052603, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.01627195, + "balance_loss_mlp": 1.01615071, + "epoch": 0.6795731249060574, + "flos": 18295430232960.0, + "grad_norm": 1.8396475543618434, + "language_loss": 0.71670312, + "learning_rate": 9.83630677305654e-07, + "loss": 0.73763376, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11303, + "time_per_iteration": 3.7659027576446533 + }, + { + "auxiliary_loss_clip": 0.01054309, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.0124383, + "balance_loss_mlp": 1.01661503, + "epoch": 0.6796332481587254, + "flos": 20299410028800.0, + "grad_norm": 2.434726827501631, + "language_loss": 0.71962571, + "learning_rate": 9.832952734313813e-07, + "loss": 0.74053669, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37695312, + "step": 11304, + "time_per_iteration": 2.3472208976745605 + }, + { + "auxiliary_loss_clip": 0.01054115, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.01615953, + "balance_loss_mlp": 1.01719534, + "epoch": 0.6796933714113934, + "flos": 23585714484480.0, + "grad_norm": 1.919155189417444, + "language_loss": 0.73916245, + "learning_rate": 9.829599081106536e-07, + "loss": 0.76010954, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 11305, + "time_per_iteration": 2.3813540935516357 + }, + { + "auxiliary_loss_clip": 0.01053604, + "auxiliary_loss_mlp": 0.01036424, + "balance_loss_clip": 1.01171196, + "balance_loss_mlp": 1.01684117, + "epoch": 0.6797534946640613, + "flos": 27118855319040.0, + "grad_norm": 2.1868048299752973, + "language_loss": 0.67097187, + "learning_rate": 9.826245813561882e-07, + "loss": 0.69187224, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 11306, + "time_per_iteration": 2.3829691410064697 + }, + { + "auxiliary_loss_clip": 0.01051347, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.01316977, + "balance_loss_mlp": 1.01591039, + "epoch": 0.6798136179167293, + "flos": 22126344986880.0, + "grad_norm": 1.5553161400698892, + "language_loss": 0.81303728, + "learning_rate": 9.822892931807021e-07, + "loss": 0.8339082, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 11307, + "time_per_iteration": 2.3684165477752686 + }, + { + "auxiliary_loss_clip": 0.0105324, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.01850343, + "balance_loss_mlp": 1.01688075, + "epoch": 0.6798737411693972, + "flos": 17487819596160.0, + "grad_norm": 1.6848261866720162, + "language_loss": 0.8996042, + "learning_rate": 9.819540435969066e-07, + "loss": 0.92054749, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 11308, + "time_per_iteration": 2.330977439880371 + }, + { + "auxiliary_loss_clip": 0.01052924, + "auxiliary_loss_mlp": 0.01038604, + "balance_loss_clip": 1.01449966, + "balance_loss_mlp": 1.01680899, + "epoch": 0.6799338644220653, + "flos": 22891187341440.0, + "grad_norm": 2.0323730365446466, + "language_loss": 0.72306037, + "learning_rate": 9.816188326175154e-07, + "loss": 0.74397564, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 11309, + "time_per_iteration": 2.3653910160064697 + }, + { + "auxiliary_loss_clip": 0.01052142, + "auxiliary_loss_mlp": 0.01042025, + "balance_loss_clip": 1.01672864, + "balance_loss_mlp": 1.01588535, + "epoch": 0.6799939876747332, + "flos": 23179430459520.0, + "grad_norm": 2.0673285615712387, + "language_loss": 0.85508621, + "learning_rate": 9.812836602552411e-07, + "loss": 0.87602782, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 11310, + "time_per_iteration": 2.3496391773223877 + }, + { + "auxiliary_loss_clip": 0.01052577, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.01139092, + "balance_loss_mlp": 1.01714158, + "epoch": 0.6800541109274012, + "flos": 19498921309440.0, + "grad_norm": 2.461221733558314, + "language_loss": 0.84251785, + "learning_rate": 9.80948526522792e-07, + "loss": 0.86337811, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 11311, + "time_per_iteration": 2.3542094230651855 + }, + { + "auxiliary_loss_clip": 0.01055588, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.01488972, + "balance_loss_mlp": 1.01657796, + "epoch": 0.6801142341800691, + "flos": 22276436388480.0, + "grad_norm": 1.7579058547909288, + "language_loss": 0.77063441, + "learning_rate": 9.806134314328767e-07, + "loss": 0.79160225, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 11312, + "time_per_iteration": 2.392918109893799 + }, + { + "auxiliary_loss_clip": 0.01009025, + "auxiliary_loss_mlp": 0.01004238, + "balance_loss_clip": 1.00174665, + "balance_loss_mlp": 1.00214028, + "epoch": 0.6801743574327371, + "flos": 68711547450240.0, + "grad_norm": 0.6599246814647423, + "language_loss": 0.57335389, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59348649, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.06933594, + "step": 11313, + "time_per_iteration": 3.1280665397644043 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.00877905, + "balance_loss_mlp": 1.0156343, + "epoch": 0.680234480685405, + "flos": 29459187953280.0, + "grad_norm": 2.087798068426426, + "language_loss": 0.69325662, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71410531, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 11314, + "time_per_iteration": 2.4193294048309326 + }, + { + "auxiliary_loss_clip": 0.0105117, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.01653981, + "balance_loss_mlp": 1.01615846, + "epoch": 0.6802946039380731, + "flos": 15916169566080.0, + "grad_norm": 1.8198767109516336, + "language_loss": 0.82100427, + "learning_rate": 9.796083781453972e-07, + "loss": 0.84188867, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34960938, + "step": 11315, + "time_per_iteration": 2.3320631980895996 + }, + { + "auxiliary_loss_clip": 0.01052958, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.01064742, + "balance_loss_mlp": 1.01656938, + "epoch": 0.680354727190741, + "flos": 22017555590400.0, + "grad_norm": 1.6326475927579498, + "language_loss": 0.70885432, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72973233, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 11316, + "time_per_iteration": 2.376171588897705 + }, + { + "auxiliary_loss_clip": 0.01052362, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.01585579, + "balance_loss_mlp": 1.01652789, + "epoch": 0.680414850443409, + "flos": 18440529310080.0, + "grad_norm": 2.1486012271944084, + "language_loss": 0.67608082, + "learning_rate": 9.789385360660003e-07, + "loss": 0.69699681, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 11317, + "time_per_iteration": 2.3507442474365234 + }, + { + "auxiliary_loss_clip": 0.01054259, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.01735544, + "epoch": 0.680474973696077, + "flos": 26357434277760.0, + "grad_norm": 1.427726714094424, + "language_loss": 0.76189744, + "learning_rate": 9.78603673098082e-07, + "loss": 0.7829442, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 11318, + "time_per_iteration": 2.4003381729125977 + }, + { + "auxiliary_loss_clip": 0.01049767, + "auxiliary_loss_mlp": 0.01035915, + "balance_loss_clip": 1.01270461, + "balance_loss_mlp": 1.01536596, + "epoch": 0.6805350969487449, + "flos": 18332123938560.0, + "grad_norm": 1.5612113322352652, + "language_loss": 0.68884349, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70970035, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34375, + "step": 11319, + "time_per_iteration": 2.366793394088745 + }, + { + "auxiliary_loss_clip": 0.01052697, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.01474094, + "balance_loss_mlp": 1.01586473, + "epoch": 0.6805952202014129, + "flos": 19936487779200.0, + "grad_norm": 1.9590695334177763, + "language_loss": 0.778593, + "learning_rate": 9.779340633692945e-07, + "loss": 0.79949653, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3671875, + "step": 11320, + "time_per_iteration": 2.341738224029541 + }, + { + "auxiliary_loss_clip": 0.01051464, + "auxiliary_loss_mlp": 0.01039399, + "balance_loss_clip": 1.01485419, + "balance_loss_mlp": 1.01578641, + "epoch": 0.6806553434540809, + "flos": 25223245983360.0, + "grad_norm": 1.9941067757150803, + "language_loss": 0.76120341, + "learning_rate": 9.77599316633817e-07, + "loss": 0.782112, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.35546875, + "step": 11321, + "time_per_iteration": 2.4126908779144287 + }, + { + "auxiliary_loss_clip": 0.01053287, + "auxiliary_loss_mlp": 0.01045625, + "balance_loss_clip": 1.02143741, + "balance_loss_mlp": 1.0170145, + "epoch": 0.6807154667067489, + "flos": 17784615997440.0, + "grad_norm": 1.8758312289261039, + "language_loss": 0.74230587, + "learning_rate": 9.772646086678758e-07, + "loss": 0.76329499, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 11322, + "time_per_iteration": 2.3454885482788086 + }, + { + "auxiliary_loss_clip": 0.01051834, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.01071644, + "balance_loss_mlp": 1.0158776, + "epoch": 0.6807755899594168, + "flos": 22198824702720.0, + "grad_norm": 1.573839321786856, + "language_loss": 0.7952615, + "learning_rate": 9.769299394841638e-07, + "loss": 0.816127, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 11323, + "time_per_iteration": 2.3848392963409424 + }, + { + "auxiliary_loss_clip": 0.01007802, + "auxiliary_loss_mlp": 0.01003447, + "balance_loss_clip": 1.00139701, + "balance_loss_mlp": 1.00098681, + "epoch": 0.6808357132120848, + "flos": 68628105567360.0, + "grad_norm": 0.7705674752663615, + "language_loss": 0.57208943, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59220195, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06835938, + "step": 11324, + "time_per_iteration": 2.795022964477539 + }, + { + "auxiliary_loss_clip": 0.0105355, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.0136652, + "balance_loss_mlp": 1.01712179, + "epoch": 0.6808958364647527, + "flos": 23842186398720.0, + "grad_norm": 2.007031615039834, + "language_loss": 0.69869846, + "learning_rate": 9.76260717514186e-07, + "loss": 0.71963531, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36328125, + "step": 11325, + "time_per_iteration": 2.4232065677642822 + }, + { + "auxiliary_loss_clip": 0.0105418, + "auxiliary_loss_mlp": 0.01037201, + "balance_loss_clip": 1.01264393, + "balance_loss_mlp": 1.01595712, + "epoch": 0.6809559597174207, + "flos": 17710774738560.0, + "grad_norm": 2.2087568090806293, + "language_loss": 0.71746671, + "learning_rate": 9.759261647532974e-07, + "loss": 0.73838055, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 11326, + "time_per_iteration": 2.3051655292510986 + }, + { + "auxiliary_loss_clip": 0.01053347, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.01666188, + "balance_loss_mlp": 1.01588583, + "epoch": 0.6810160829700886, + "flos": 22490803336320.0, + "grad_norm": 1.9710726230785698, + "language_loss": 0.7456708, + "learning_rate": 9.75591650825392e-07, + "loss": 0.76660872, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 11327, + "time_per_iteration": 2.372598648071289 + }, + { + "auxiliary_loss_clip": 0.0105078, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.01139617, + "balance_loss_mlp": 1.01547813, + "epoch": 0.6810762062227567, + "flos": 16832045928960.0, + "grad_norm": 1.8022847802693245, + "language_loss": 0.78302848, + "learning_rate": 9.752571757431526e-07, + "loss": 0.80388319, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 11328, + "time_per_iteration": 3.5912673473358154 + }, + { + "auxiliary_loss_clip": 0.01053214, + "auxiliary_loss_mlp": 0.01038345, + "balance_loss_clip": 1.01438427, + "balance_loss_mlp": 1.01614332, + "epoch": 0.6811363294754246, + "flos": 12713830663680.0, + "grad_norm": 1.9867205532036336, + "language_loss": 0.65883005, + "learning_rate": 9.74922739519265e-07, + "loss": 0.67974561, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 11329, + "time_per_iteration": 2.341482162475586 + }, + { + "auxiliary_loss_clip": 0.01054363, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.01377773, + "balance_loss_mlp": 1.017012, + "epoch": 0.6811964527280926, + "flos": 17711019118080.0, + "grad_norm": 1.9106282524070042, + "language_loss": 0.80071592, + "learning_rate": 9.745883421664096e-07, + "loss": 0.8216359, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 11330, + "time_per_iteration": 2.3177225589752197 + }, + { + "auxiliary_loss_clip": 0.010534, + "auxiliary_loss_mlp": 0.0103745, + "balance_loss_clip": 1.01242816, + "balance_loss_mlp": 1.01686656, + "epoch": 0.6812565759807605, + "flos": 24862313681280.0, + "grad_norm": 1.7928339999133578, + "language_loss": 0.64917552, + "learning_rate": 9.742539836972665e-07, + "loss": 0.670084, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 11331, + "time_per_iteration": 3.8097686767578125 + }, + { + "auxiliary_loss_clip": 0.01051712, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.01333761, + "balance_loss_mlp": 1.01602399, + "epoch": 0.6813166992334285, + "flos": 17165047276800.0, + "grad_norm": 1.689369831393743, + "language_loss": 0.73483318, + "learning_rate": 9.739196641245148e-07, + "loss": 0.75571734, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 11332, + "time_per_iteration": 3.730844020843506 + }, + { + "auxiliary_loss_clip": 0.01054633, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.01592517, + "balance_loss_mlp": 1.01661801, + "epoch": 0.6813768224860965, + "flos": 18842554149120.0, + "grad_norm": 2.4554504090549854, + "language_loss": 0.76488346, + "learning_rate": 9.735853834608326e-07, + "loss": 0.78582937, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 11333, + "time_per_iteration": 2.339313268661499 + }, + { + "auxiliary_loss_clip": 0.01055695, + "auxiliary_loss_mlp": 0.01036267, + "balance_loss_clip": 1.0105654, + "balance_loss_mlp": 1.01713085, + "epoch": 0.6814369457387645, + "flos": 24531651394560.0, + "grad_norm": 1.4305273344234692, + "language_loss": 0.72545427, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74637389, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 11334, + "time_per_iteration": 2.422800064086914 + }, + { + "auxiliary_loss_clip": 0.01050348, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.01433885, + "balance_loss_mlp": 1.01486874, + "epoch": 0.6814970689914325, + "flos": 18222007910400.0, + "grad_norm": 1.6455207138431267, + "language_loss": 0.8672936, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88817602, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35546875, + "step": 11335, + "time_per_iteration": 2.3310294151306152 + }, + { + "auxiliary_loss_clip": 0.01048957, + "auxiliary_loss_mlp": 0.01028379, + "balance_loss_clip": 1.00744617, + "balance_loss_mlp": 1.01486683, + "epoch": 0.6815571922441004, + "flos": 25227609903360.0, + "grad_norm": 1.8234258559114502, + "language_loss": 0.83389199, + "learning_rate": 9.725827750509542e-07, + "loss": 0.8546654, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33984375, + "step": 11336, + "time_per_iteration": 2.4328370094299316 + }, + { + "auxiliary_loss_clip": 0.01051431, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01173389, + "balance_loss_mlp": 1.0158205, + "epoch": 0.6816173154967684, + "flos": 19455280243200.0, + "grad_norm": 1.812462853137593, + "language_loss": 0.82813579, + "learning_rate": 9.72248650150294e-07, + "loss": 0.8489871, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35546875, + "step": 11337, + "time_per_iteration": 2.3259758949279785 + }, + { + "auxiliary_loss_clip": 0.01050418, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.01466107, + "balance_loss_mlp": 1.01571894, + "epoch": 0.6816774387494363, + "flos": 17930483124480.0, + "grad_norm": 1.5548121892171674, + "language_loss": 0.73011971, + "learning_rate": 9.719145642220673e-07, + "loss": 0.75098467, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34765625, + "step": 11338, + "time_per_iteration": 2.3799033164978027 + }, + { + "auxiliary_loss_clip": 0.01052779, + "auxiliary_loss_mlp": 0.01037984, + "balance_loss_clip": 1.01427341, + "balance_loss_mlp": 1.01705265, + "epoch": 0.6817375620021043, + "flos": 22232027272320.0, + "grad_norm": 1.4589737679692956, + "language_loss": 0.78951555, + "learning_rate": 9.715805172789435e-07, + "loss": 0.8104232, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 11339, + "time_per_iteration": 2.355992317199707 + }, + { + "auxiliary_loss_clip": 0.01053085, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.01859868, + "balance_loss_mlp": 1.01653409, + "epoch": 0.6817976852547722, + "flos": 25373232650880.0, + "grad_norm": 3.552380950005672, + "language_loss": 0.7151162, + "learning_rate": 9.712465093335901e-07, + "loss": 0.73607421, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11340, + "time_per_iteration": 2.4233591556549072 + }, + { + "auxiliary_loss_clip": 0.01055985, + "auxiliary_loss_mlp": 0.01044256, + "balance_loss_clip": 1.01938903, + "balance_loss_mlp": 1.01769495, + "epoch": 0.6818578085074403, + "flos": 22264880728320.0, + "grad_norm": 2.3550485434979884, + "language_loss": 0.84839368, + "learning_rate": 9.709125403986722e-07, + "loss": 0.86939609, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3828125, + "step": 11341, + "time_per_iteration": 2.3447048664093018 + }, + { + "auxiliary_loss_clip": 0.01053461, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.01941168, + "balance_loss_mlp": 1.01675189, + "epoch": 0.6819179317601082, + "flos": 19317128526720.0, + "grad_norm": 2.0356370174976095, + "language_loss": 0.6935879, + "learning_rate": 9.705786104868531e-07, + "loss": 0.71457374, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 11342, + "time_per_iteration": 3.799626350402832 + }, + { + "auxiliary_loss_clip": 0.01051757, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.01613879, + "balance_loss_mlp": 1.01549602, + "epoch": 0.6819780550127762, + "flos": 21103110593280.0, + "grad_norm": 1.6064830238785888, + "language_loss": 0.75567943, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77661073, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 11343, + "time_per_iteration": 2.359135150909424 + }, + { + "auxiliary_loss_clip": 0.0105454, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_clip": 1.01921368, + "balance_loss_mlp": 1.01740527, + "epoch": 0.6820381782654441, + "flos": 29715101285760.0, + "grad_norm": 1.6437877363621936, + "language_loss": 0.80945468, + "learning_rate": 9.699108677831639e-07, + "loss": 0.83044052, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 11344, + "time_per_iteration": 2.4701366424560547 + }, + { + "auxiliary_loss_clip": 0.01054017, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.01835322, + "balance_loss_mlp": 1.0168891, + "epoch": 0.6820983015181121, + "flos": 29240841110400.0, + "grad_norm": 3.3114406747545013, + "language_loss": 0.67253613, + "learning_rate": 9.695770550166136e-07, + "loss": 0.69351029, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 11345, + "time_per_iteration": 2.4269392490386963 + }, + { + "auxiliary_loss_clip": 0.01055092, + "auxiliary_loss_mlp": 0.01043908, + "balance_loss_clip": 1.01836133, + "balance_loss_mlp": 1.0175004, + "epoch": 0.6821584247707801, + "flos": 18871008773760.0, + "grad_norm": 2.340703834828303, + "language_loss": 0.65980124, + "learning_rate": 9.692432813238054e-07, + "loss": 0.6807912, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 11346, + "time_per_iteration": 2.3652656078338623 + }, + { + "auxiliary_loss_clip": 0.01053302, + "auxiliary_loss_mlp": 0.0103986, + "balance_loss_clip": 1.01448059, + "balance_loss_mlp": 1.01705098, + "epoch": 0.6822185480234481, + "flos": 21323517206400.0, + "grad_norm": 1.5298447802888782, + "language_loss": 0.79407704, + "learning_rate": 9.689095467173952e-07, + "loss": 0.81500864, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 11347, + "time_per_iteration": 2.3567261695861816 + }, + { + "auxiliary_loss_clip": 0.01008141, + "auxiliary_loss_mlp": 0.01002297, + "balance_loss_clip": 1.00005543, + "balance_loss_mlp": 1.00124049, + "epoch": 0.6822786712761161, + "flos": 63485434010880.0, + "grad_norm": 0.728293305653424, + "language_loss": 0.52573669, + "learning_rate": 9.685758512100378e-07, + "loss": 0.5458411, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06933594, + "step": 11348, + "time_per_iteration": 3.01145076751709 + }, + { + "auxiliary_loss_clip": 0.01051568, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.01508629, + "balance_loss_mlp": 1.01614904, + "epoch": 0.682338794528784, + "flos": 21067883164800.0, + "grad_norm": 1.770305780845218, + "language_loss": 0.80625159, + "learning_rate": 9.682421948143873e-07, + "loss": 0.82714558, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 11349, + "time_per_iteration": 2.3378827571868896 + }, + { + "auxiliary_loss_clip": 0.01057666, + "auxiliary_loss_mlp": 0.01040501, + "balance_loss_clip": 1.01006722, + "balance_loss_mlp": 1.01699567, + "epoch": 0.682398917781452, + "flos": 36281775265920.0, + "grad_norm": 2.1031734446507304, + "language_loss": 0.74915248, + "learning_rate": 9.67908577543096e-07, + "loss": 0.77013421, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.40820312, + "step": 11350, + "time_per_iteration": 2.4916343688964844 + }, + { + "auxiliary_loss_clip": 0.01052453, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.01522851, + "balance_loss_mlp": 1.01691389, + "epoch": 0.6824590410341199, + "flos": 24858159229440.0, + "grad_norm": 2.9763238195965145, + "language_loss": 0.80416662, + "learning_rate": 9.675749994088161e-07, + "loss": 0.82508421, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35546875, + "step": 11351, + "time_per_iteration": 2.3824033737182617 + }, + { + "auxiliary_loss_clip": 0.01051914, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.01219022, + "balance_loss_mlp": 1.01547277, + "epoch": 0.6825191642867879, + "flos": 22451386544640.0, + "grad_norm": 1.8477021283931088, + "language_loss": 0.74282789, + "learning_rate": 9.672414604241954e-07, + "loss": 0.76368457, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36523438, + "step": 11352, + "time_per_iteration": 2.4317715167999268 + }, + { + "auxiliary_loss_clip": 0.01054761, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.0150646, + "balance_loss_mlp": 1.01696849, + "epoch": 0.6825792875394558, + "flos": 29423087740800.0, + "grad_norm": 1.6703696153045267, + "language_loss": 0.80787569, + "learning_rate": 9.669079606018814e-07, + "loss": 0.8288188, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37890625, + "step": 11353, + "time_per_iteration": 2.4259681701660156 + }, + { + "auxiliary_loss_clip": 0.01051617, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.01039314, + "balance_loss_mlp": 1.01617074, + "epoch": 0.6826394107921239, + "flos": 18769969699200.0, + "grad_norm": 1.6783306293963058, + "language_loss": 0.795811, + "learning_rate": 9.665744999545218e-07, + "loss": 0.81665188, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 11354, + "time_per_iteration": 2.375084638595581 + }, + { + "auxiliary_loss_clip": 0.01052657, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.0166769, + "balance_loss_mlp": 1.0171026, + "epoch": 0.6826995340447918, + "flos": 16616666551680.0, + "grad_norm": 1.8896232226626437, + "language_loss": 0.62531334, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64622843, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 11355, + "time_per_iteration": 2.3192830085754395 + }, + { + "auxiliary_loss_clip": 0.01050132, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.01564682, + "balance_loss_mlp": 1.01498508, + "epoch": 0.6827596572974598, + "flos": 20847301994880.0, + "grad_norm": 1.9518129522602248, + "language_loss": 0.83647722, + "learning_rate": 9.659076962352398e-07, + "loss": 0.85736406, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 11356, + "time_per_iteration": 2.4157068729400635 + }, + { + "auxiliary_loss_clip": 0.01053777, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.01955652, + "balance_loss_mlp": 1.01665974, + "epoch": 0.6828197805501277, + "flos": 22746961048320.0, + "grad_norm": 1.8109027517291565, + "language_loss": 0.79695088, + "learning_rate": 9.655743531886052e-07, + "loss": 0.81794924, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37109375, + "step": 11357, + "time_per_iteration": 2.39264178276062 + }, + { + "auxiliary_loss_clip": 0.01008161, + "auxiliary_loss_mlp": 0.01004194, + "balance_loss_clip": 1.00204802, + "balance_loss_mlp": 1.00128829, + "epoch": 0.6828799038027957, + "flos": 71642854535040.0, + "grad_norm": 0.8266562495881349, + "language_loss": 0.59640837, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61653185, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06884766, + "step": 11358, + "time_per_iteration": 3.0967319011688232 + }, + { + "auxiliary_loss_clip": 0.01056333, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.01704788, + "balance_loss_mlp": 1.01758206, + "epoch": 0.6829400270554637, + "flos": 19828117319040.0, + "grad_norm": 1.6284598962430035, + "language_loss": 0.79510963, + "learning_rate": 9.64907784784544e-07, + "loss": 0.81609893, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 11359, + "time_per_iteration": 2.357759952545166 + }, + { + "auxiliary_loss_clip": 0.01053748, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.01674414, + "balance_loss_mlp": 1.01633477, + "epoch": 0.6830001503081317, + "flos": 21979570164480.0, + "grad_norm": 2.1086087243854847, + "language_loss": 0.83126533, + "learning_rate": 9.645745594523958e-07, + "loss": 0.85221446, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 11360, + "time_per_iteration": 2.389735698699951 + }, + { + "auxiliary_loss_clip": 0.0105665, + "auxiliary_loss_mlp": 0.01040918, + "balance_loss_clip": 1.01549053, + "balance_loss_mlp": 1.01923323, + "epoch": 0.6830602735607997, + "flos": 24315608701440.0, + "grad_norm": 1.5989442437006638, + "language_loss": 0.75674617, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77772182, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 11361, + "time_per_iteration": 2.3768551349639893 + }, + { + "auxiliary_loss_clip": 0.01007899, + "auxiliary_loss_mlp": 0.01002844, + "balance_loss_clip": 1.00059092, + "balance_loss_mlp": 1.00116277, + "epoch": 0.6831203968134676, + "flos": 57687268078080.0, + "grad_norm": 0.8634784272620708, + "language_loss": 0.59820902, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61831653, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.06738281, + "step": 11362, + "time_per_iteration": 3.1297991275787354 + }, + { + "auxiliary_loss_clip": 0.01055362, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.01350331, + "balance_loss_mlp": 1.01724708, + "epoch": 0.6831805200661356, + "flos": 14387671843200.0, + "grad_norm": 2.343762547009656, + "language_loss": 0.76665699, + "learning_rate": 9.635751190871074e-07, + "loss": 0.78760552, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38085938, + "step": 11363, + "time_per_iteration": 2.3158111572265625 + }, + { + "auxiliary_loss_clip": 0.01053648, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.01864266, + "balance_loss_mlp": 1.01655674, + "epoch": 0.6832406433188035, + "flos": 22819196384640.0, + "grad_norm": 3.936751934995813, + "language_loss": 0.90873158, + "learning_rate": 9.632420508845063e-07, + "loss": 0.9296875, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 11364, + "time_per_iteration": 2.4252119064331055 + }, + { + "auxiliary_loss_clip": 0.01050886, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.01016283, + "balance_loss_mlp": 1.01595521, + "epoch": 0.6833007665714715, + "flos": 17560892805120.0, + "grad_norm": 1.7910296885189234, + "language_loss": 0.89342546, + "learning_rate": 9.629090219958697e-07, + "loss": 0.91425067, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 11365, + "time_per_iteration": 2.353181838989258 + }, + { + "auxiliary_loss_clip": 0.01056667, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.01415062, + "balance_loss_mlp": 1.01705575, + "epoch": 0.6833608898241395, + "flos": 22445102499840.0, + "grad_norm": 2.2021279066629065, + "language_loss": 0.81566691, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83664703, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.39648438, + "step": 11366, + "time_per_iteration": 2.3499133586883545 + }, + { + "auxiliary_loss_clip": 0.01053915, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.01501787, + "balance_loss_mlp": 1.01619911, + "epoch": 0.6834210130768075, + "flos": 24533501696640.0, + "grad_norm": 1.6222430056311468, + "language_loss": 0.77719277, + "learning_rate": 9.622430822110062e-07, + "loss": 0.79814565, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37695312, + "step": 11367, + "time_per_iteration": 3.6529366970062256 + }, + { + "auxiliary_loss_clip": 0.01053966, + "auxiliary_loss_mlp": 0.0105029, + "balance_loss_clip": 1.02356291, + "balance_loss_mlp": 1.01701808, + "epoch": 0.6834811363294754, + "flos": 20046115048320.0, + "grad_norm": 1.6672231246096054, + "language_loss": 0.70510632, + "learning_rate": 9.619101713400312e-07, + "loss": 0.72614884, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.36914062, + "step": 11368, + "time_per_iteration": 2.369992971420288 + }, + { + "auxiliary_loss_clip": 0.01053455, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.01445317, + "balance_loss_mlp": 1.01644075, + "epoch": 0.6835412595821434, + "flos": 24789589585920.0, + "grad_norm": 1.876659628855962, + "language_loss": 0.74168968, + "learning_rate": 9.615772998335261e-07, + "loss": 0.76260245, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 11369, + "time_per_iteration": 2.3782598972320557 + }, + { + "auxiliary_loss_clip": 0.01052705, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.01558113, + "balance_loss_mlp": 1.0154742, + "epoch": 0.6836013828348113, + "flos": 19499340245760.0, + "grad_norm": 3.147366781903276, + "language_loss": 0.80007488, + "learning_rate": 9.612444677041138e-07, + "loss": 0.82101393, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37109375, + "step": 11370, + "time_per_iteration": 2.351530075073242 + }, + { + "auxiliary_loss_clip": 0.01007832, + "auxiliary_loss_mlp": 0.01003233, + "balance_loss_clip": 1.00070572, + "balance_loss_mlp": 1.00093269, + "epoch": 0.6836615060874793, + "flos": 58360706893440.0, + "grad_norm": 0.7499592677271231, + "language_loss": 0.59844542, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61855602, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.06933594, + "step": 11371, + "time_per_iteration": 5.550664186477661 + }, + { + "auxiliary_loss_clip": 0.01051212, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.01111591, + "balance_loss_mlp": 1.01614666, + "epoch": 0.6837216293401474, + "flos": 12166078343040.0, + "grad_norm": 1.8018436660183397, + "language_loss": 0.65006596, + "learning_rate": 9.605789216270511e-07, + "loss": 0.67091078, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 11372, + "time_per_iteration": 2.3247623443603516 + }, + { + "auxiliary_loss_clip": 0.01051522, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.013111, + "balance_loss_mlp": 1.01568675, + "epoch": 0.6837817525928153, + "flos": 22126484632320.0, + "grad_norm": 1.5755480603599943, + "language_loss": 0.72922432, + "learning_rate": 9.602462077046375e-07, + "loss": 0.75010109, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 11373, + "time_per_iteration": 2.373307704925537 + }, + { + "auxiliary_loss_clip": 0.01007293, + "auxiliary_loss_mlp": 0.01003694, + "balance_loss_clip": 1.00105965, + "balance_loss_mlp": 1.00047231, + "epoch": 0.6838418758454833, + "flos": 65002409118720.0, + "grad_norm": 1.23504323957892, + "language_loss": 0.56612694, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58623683, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.06835938, + "step": 11374, + "time_per_iteration": 3.1802003383636475 + }, + { + "auxiliary_loss_clip": 0.01054251, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.0113126, + "balance_loss_mlp": 1.01643813, + "epoch": 0.6839019990981512, + "flos": 21029827916160.0, + "grad_norm": 1.6616161163100196, + "language_loss": 0.74834883, + "learning_rate": 9.595808981551312e-07, + "loss": 0.7692529, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 11375, + "time_per_iteration": 2.411531686782837 + }, + { + "auxiliary_loss_clip": 0.01052545, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01398206, + "balance_loss_mlp": 1.01608419, + "epoch": 0.6839621223508192, + "flos": 24934409372160.0, + "grad_norm": 1.556788079524683, + "language_loss": 0.71406019, + "learning_rate": 9.592483025532651e-07, + "loss": 0.7349534, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36523438, + "step": 11376, + "time_per_iteration": 2.407729148864746 + }, + { + "auxiliary_loss_clip": 0.01054937, + "auxiliary_loss_mlp": 0.01041323, + "balance_loss_clip": 1.01502502, + "balance_loss_mlp": 1.01690638, + "epoch": 0.6840222456034871, + "flos": 26357643745920.0, + "grad_norm": 1.9219086966868084, + "language_loss": 0.7500093, + "learning_rate": 9.58915746416808e-07, + "loss": 0.77097189, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 11377, + "time_per_iteration": 2.389582633972168 + }, + { + "auxiliary_loss_clip": 0.01007625, + "auxiliary_loss_mlp": 0.01006377, + "balance_loss_clip": 1.00370705, + "balance_loss_mlp": 1.00080597, + "epoch": 0.6840823688561551, + "flos": 65984865177600.0, + "grad_norm": 0.7266071172549926, + "language_loss": 0.56967378, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58981371, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.06835938, + "step": 11378, + "time_per_iteration": 3.1209464073181152 + }, + { + "auxiliary_loss_clip": 0.01053253, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.0153991, + "balance_loss_mlp": 1.01580346, + "epoch": 0.684142492108823, + "flos": 21396520592640.0, + "grad_norm": 1.604804490683469, + "language_loss": 0.79544926, + "learning_rate": 9.58250752590561e-07, + "loss": 0.81640804, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.375, + "step": 11379, + "time_per_iteration": 2.359462022781372 + }, + { + "auxiliary_loss_clip": 0.01049984, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.00883377, + "balance_loss_mlp": 1.01616418, + "epoch": 0.6842026153614911, + "flos": 18800588828160.0, + "grad_norm": 2.085792465910839, + "language_loss": 0.70464754, + "learning_rate": 9.57918314925988e-07, + "loss": 0.72545135, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33984375, + "step": 11380, + "time_per_iteration": 2.376437187194824 + }, + { + "auxiliary_loss_clip": 0.01051609, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.01169217, + "balance_loss_mlp": 1.01505899, + "epoch": 0.684262738614159, + "flos": 19645381929600.0, + "grad_norm": 2.129986725740952, + "language_loss": 0.79649007, + "learning_rate": 9.575859167772568e-07, + "loss": 0.81735533, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 11381, + "time_per_iteration": 3.8441781997680664 + }, + { + "auxiliary_loss_clip": 0.01008078, + "auxiliary_loss_mlp": 0.01003237, + "balance_loss_clip": 1.00112712, + "balance_loss_mlp": 1.00106585, + "epoch": 0.684322861866827, + "flos": 62351699564160.0, + "grad_norm": 0.876433460935326, + "language_loss": 0.67290825, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69302136, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.0703125, + "step": 11382, + "time_per_iteration": 2.8873586654663086 + }, + { + "auxiliary_loss_clip": 0.01008802, + "auxiliary_loss_mlp": 0.01002122, + "balance_loss_clip": 0.99967843, + "balance_loss_mlp": 1.00172222, + "epoch": 0.6843829851194949, + "flos": 65801606117760.0, + "grad_norm": 0.8273041813436298, + "language_loss": 0.58214319, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60225242, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.07080078, + "step": 11383, + "time_per_iteration": 3.116757392883301 + }, + { + "auxiliary_loss_clip": 0.01052572, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.01565051, + "balance_loss_mlp": 1.01554716, + "epoch": 0.6844431083721629, + "flos": 27853916417280.0, + "grad_norm": 1.6594473975741142, + "language_loss": 0.80916452, + "learning_rate": 9.565889595521517e-07, + "loss": 0.83007866, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 11384, + "time_per_iteration": 2.4293770790100098 + }, + { + "auxiliary_loss_clip": 0.01055965, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.01289785, + "balance_loss_mlp": 1.01669025, + "epoch": 0.684503231624831, + "flos": 18254163139200.0, + "grad_norm": 1.9324826564400799, + "language_loss": 0.78837657, + "learning_rate": 9.562567195928187e-07, + "loss": 0.80931532, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.39257812, + "step": 11385, + "time_per_iteration": 2.375232696533203 + }, + { + "auxiliary_loss_clip": 0.01057887, + "auxiliary_loss_mlp": 0.01043492, + "balance_loss_clip": 1.01490569, + "balance_loss_mlp": 1.01784146, + "epoch": 0.6845633548774989, + "flos": 17638713959040.0, + "grad_norm": 1.8890213519401367, + "language_loss": 0.8533538, + "learning_rate": 9.55924519212335e-07, + "loss": 0.87436759, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40039062, + "step": 11386, + "time_per_iteration": 2.3406333923339844 + }, + { + "auxiliary_loss_clip": 0.01053173, + "auxiliary_loss_mlp": 0.01041077, + "balance_loss_clip": 1.01852298, + "balance_loss_mlp": 1.01709688, + "epoch": 0.6846234781301669, + "flos": 20806698216960.0, + "grad_norm": 1.9772526472984346, + "language_loss": 0.83759701, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85853946, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 11387, + "time_per_iteration": 2.3421337604522705 + }, + { + "auxiliary_loss_clip": 0.01053142, + "auxiliary_loss_mlp": 0.01038881, + "balance_loss_clip": 1.01488447, + "balance_loss_mlp": 1.01670218, + "epoch": 0.6846836013828348, + "flos": 36099703192320.0, + "grad_norm": 1.502170825808821, + "language_loss": 0.73184991, + "learning_rate": 9.552602372383047e-07, + "loss": 0.75277013, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 11388, + "time_per_iteration": 2.486311674118042 + }, + { + "auxiliary_loss_clip": 0.01052096, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.01068282, + "balance_loss_mlp": 1.01628017, + "epoch": 0.6847437246355028, + "flos": 43140811904640.0, + "grad_norm": 1.90970081294091, + "language_loss": 0.63388002, + "learning_rate": 9.549281556699469e-07, + "loss": 0.6547296, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35742188, + "step": 11389, + "time_per_iteration": 2.5340685844421387 + }, + { + "auxiliary_loss_clip": 0.01007786, + "auxiliary_loss_mlp": 0.0100547, + "balance_loss_clip": 1.00330007, + "balance_loss_mlp": 1.00081158, + "epoch": 0.6848038478881707, + "flos": 71660556460800.0, + "grad_norm": 0.7358965959007148, + "language_loss": 0.56055951, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58069205, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.06982422, + "step": 11390, + "time_per_iteration": 3.096839666366577 + }, + { + "auxiliary_loss_clip": 0.01053999, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.01600718, + "balance_loss_mlp": 1.01767051, + "epoch": 0.6848639711408387, + "flos": 19936801981440.0, + "grad_norm": 2.1161401688098045, + "language_loss": 0.88333428, + "learning_rate": 9.542641114335109e-07, + "loss": 0.90426862, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 11391, + "time_per_iteration": 2.3745217323303223 + }, + { + "auxiliary_loss_clip": 0.01056127, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.01640987, + "balance_loss_mlp": 1.01808608, + "epoch": 0.6849240943935067, + "flos": 26866363299840.0, + "grad_norm": 1.616583042120885, + "language_loss": 0.80129439, + "learning_rate": 9.539321487906117e-07, + "loss": 0.82227039, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 11392, + "time_per_iteration": 2.420243978500366 + }, + { + "auxiliary_loss_clip": 0.01051678, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.01350927, + "balance_loss_mlp": 1.01661479, + "epoch": 0.6849842176461747, + "flos": 13734516528000.0, + "grad_norm": 2.9713834552248644, + "language_loss": 0.72076523, + "learning_rate": 9.536002258147104e-07, + "loss": 0.7416622, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.34960938, + "step": 11393, + "time_per_iteration": 2.3382303714752197 + }, + { + "auxiliary_loss_clip": 0.01055633, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.01554668, + "balance_loss_mlp": 1.01761603, + "epoch": 0.6850443408988426, + "flos": 24971906039040.0, + "grad_norm": 1.5918988199169637, + "language_loss": 0.65063387, + "learning_rate": 9.532683425183936e-07, + "loss": 0.67160821, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38085938, + "step": 11394, + "time_per_iteration": 2.405317544937134 + }, + { + "auxiliary_loss_clip": 0.01055443, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.01998258, + "balance_loss_mlp": 1.01704121, + "epoch": 0.6851044641515106, + "flos": 27743032339200.0, + "grad_norm": 1.5229700433464899, + "language_loss": 0.81700999, + "learning_rate": 9.529364989142468e-07, + "loss": 0.83800143, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 11395, + "time_per_iteration": 2.4437007904052734 + }, + { + "auxiliary_loss_clip": 0.01053918, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_clip": 1.02261734, + "balance_loss_mlp": 1.01748514, + "epoch": 0.6851645874041785, + "flos": 24349963345920.0, + "grad_norm": 1.9005084333620816, + "language_loss": 0.73970962, + "learning_rate": 9.526046950148527e-07, + "loss": 0.76072741, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 11396, + "time_per_iteration": 2.4277310371398926 + }, + { + "auxiliary_loss_clip": 0.01054989, + "auxiliary_loss_mlp": 0.01043981, + "balance_loss_clip": 1.0170989, + "balance_loss_mlp": 1.01720333, + "epoch": 0.6852247106568465, + "flos": 15077171750400.0, + "grad_norm": 2.4587053463050297, + "language_loss": 0.79926455, + "learning_rate": 9.522729308327931e-07, + "loss": 0.82025433, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37695312, + "step": 11397, + "time_per_iteration": 2.3470871448516846 + }, + { + "auxiliary_loss_clip": 0.01054603, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.01470435, + "balance_loss_mlp": 1.01659846, + "epoch": 0.6852848339095146, + "flos": 18769027092480.0, + "grad_norm": 2.04043766801662, + "language_loss": 0.73115468, + "learning_rate": 9.519412063806493e-07, + "loss": 0.75209868, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 11398, + "time_per_iteration": 2.358701705932617 + }, + { + "auxiliary_loss_clip": 0.01051911, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.01564717, + "balance_loss_mlp": 1.01521575, + "epoch": 0.6853449571621825, + "flos": 27853148367360.0, + "grad_norm": 1.8171658081085251, + "language_loss": 0.72207129, + "learning_rate": 9.516095216709996e-07, + "loss": 0.74297965, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 11399, + "time_per_iteration": 2.4178152084350586 + }, + { + "auxiliary_loss_clip": 0.01052927, + "auxiliary_loss_mlp": 0.0104469, + "balance_loss_clip": 1.02008474, + "balance_loss_mlp": 1.01632285, + "epoch": 0.6854050804148505, + "flos": 18149528194560.0, + "grad_norm": 1.7504873925681992, + "language_loss": 0.71053958, + "learning_rate": 9.512778767164217e-07, + "loss": 0.73151577, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 11400, + "time_per_iteration": 2.3521294593811035 + }, + { + "auxiliary_loss_clip": 0.01058554, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.01502049, + "balance_loss_mlp": 1.01766253, + "epoch": 0.6854652036675184, + "flos": 16325281411200.0, + "grad_norm": 1.8245004352550898, + "language_loss": 0.79666638, + "learning_rate": 9.509462715294927e-07, + "loss": 0.8176918, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.40820312, + "step": 11401, + "time_per_iteration": 2.3620312213897705 + }, + { + "auxiliary_loss_clip": 0.01052535, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.01576924, + "balance_loss_mlp": 1.0166049, + "epoch": 0.6855253269201864, + "flos": 14939892817920.0, + "grad_norm": 1.8095769572688383, + "language_loss": 0.76579332, + "learning_rate": 9.50614706122786e-07, + "loss": 0.78669727, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 11402, + "time_per_iteration": 2.3734965324401855 + }, + { + "auxiliary_loss_clip": 0.01053568, + "auxiliary_loss_mlp": 0.01042131, + "balance_loss_clip": 1.01727557, + "balance_loss_mlp": 1.01692629, + "epoch": 0.6855854501728543, + "flos": 23036670443520.0, + "grad_norm": 2.049032637188324, + "language_loss": 0.73968315, + "learning_rate": 9.502831805088742e-07, + "loss": 0.76064014, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 11403, + "time_per_iteration": 2.3925039768218994 + }, + { + "auxiliary_loss_clip": 0.01052459, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.01045394, + "balance_loss_mlp": 1.0172044, + "epoch": 0.6856455734255223, + "flos": 13252994789760.0, + "grad_norm": 2.095374277071407, + "language_loss": 0.82569098, + "learning_rate": 9.499516947003294e-07, + "loss": 0.84654325, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 11404, + "time_per_iteration": 2.357356309890747 + }, + { + "auxiliary_loss_clip": 0.01052477, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.01256871, + "balance_loss_mlp": 1.01684678, + "epoch": 0.6857056966781903, + "flos": 23332279858560.0, + "grad_norm": 1.7751694383968486, + "language_loss": 0.7865603, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80744112, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 11405, + "time_per_iteration": 2.393982172012329 + }, + { + "auxiliary_loss_clip": 0.01007099, + "auxiliary_loss_mlp": 0.01003223, + "balance_loss_clip": 1.00071931, + "balance_loss_mlp": 1.00031745, + "epoch": 0.6857658199308583, + "flos": 61849648080000.0, + "grad_norm": 0.7983949674638053, + "language_loss": 0.61073905, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63084227, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.06787109, + "step": 11406, + "time_per_iteration": 3.0813186168670654 + }, + { + "auxiliary_loss_clip": 0.0105291, + "auxiliary_loss_mlp": 0.01040665, + "balance_loss_clip": 1.01647711, + "balance_loss_mlp": 1.01673388, + "epoch": 0.6858259431835262, + "flos": 16653604636800.0, + "grad_norm": 1.6710915142503449, + "language_loss": 0.771905, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79284072, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 11407, + "time_per_iteration": 3.5456738471984863 + }, + { + "auxiliary_loss_clip": 0.01054731, + "auxiliary_loss_mlp": 0.0104468, + "balance_loss_clip": 1.01895463, + "balance_loss_mlp": 1.01655579, + "epoch": 0.6858860664361942, + "flos": 21871863020160.0, + "grad_norm": 2.182440221219345, + "language_loss": 0.73620093, + "learning_rate": 9.486261497711991e-07, + "loss": 0.75719506, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 11408, + "time_per_iteration": 2.3730273246765137 + }, + { + "auxiliary_loss_clip": 0.01054421, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.01435852, + "balance_loss_mlp": 1.01691341, + "epoch": 0.6859461896888621, + "flos": 15266749766400.0, + "grad_norm": 1.8033499426626518, + "language_loss": 0.7091217, + "learning_rate": 9.482948631780087e-07, + "loss": 0.73005581, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 11409, + "time_per_iteration": 2.333545207977295 + }, + { + "auxiliary_loss_clip": 0.01049634, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.0125463, + "balance_loss_mlp": 1.01608276, + "epoch": 0.6860063129415301, + "flos": 18619424449920.0, + "grad_norm": 1.6981971754958989, + "language_loss": 0.78515393, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80598027, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3359375, + "step": 11410, + "time_per_iteration": 3.910554885864258 + }, + { + "auxiliary_loss_clip": 0.01054953, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.02548897, + "balance_loss_mlp": 1.0163064, + "epoch": 0.6860664361941982, + "flos": 23950242656640.0, + "grad_norm": 2.0066568529767452, + "language_loss": 0.73147869, + "learning_rate": 9.476324096464821e-07, + "loss": 0.75255167, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 11411, + "time_per_iteration": 3.818382501602173 + }, + { + "auxiliary_loss_clip": 0.01053397, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.01525283, + "balance_loss_mlp": 1.01621962, + "epoch": 0.6861265594468661, + "flos": 20406872793600.0, + "grad_norm": 2.0717552477297487, + "language_loss": 0.71665782, + "learning_rate": 9.473012427332654e-07, + "loss": 0.73759973, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37109375, + "step": 11412, + "time_per_iteration": 2.355769157409668 + }, + { + "auxiliary_loss_clip": 0.01053485, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.01395559, + "balance_loss_mlp": 1.01695812, + "epoch": 0.6861866826995341, + "flos": 11428014867840.0, + "grad_norm": 3.10557366282532, + "language_loss": 0.73879433, + "learning_rate": 9.469701157384919e-07, + "loss": 0.75970662, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 11413, + "time_per_iteration": 2.311180591583252 + }, + { + "auxiliary_loss_clip": 0.01052051, + "auxiliary_loss_mlp": 0.01041332, + "balance_loss_clip": 1.01808608, + "balance_loss_mlp": 1.01610494, + "epoch": 0.686246805952202, + "flos": 15996678894720.0, + "grad_norm": 1.878246766086936, + "language_loss": 0.7460165, + "learning_rate": 9.466390286747164e-07, + "loss": 0.76695025, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 11414, + "time_per_iteration": 2.34238862991333 + }, + { + "auxiliary_loss_clip": 0.01055012, + "auxiliary_loss_mlp": 0.01038136, + "balance_loss_clip": 1.01325703, + "balance_loss_mlp": 1.01747167, + "epoch": 0.68630692920487, + "flos": 19825743346560.0, + "grad_norm": 2.1046613464676973, + "language_loss": 0.89145231, + "learning_rate": 9.46307981554495e-07, + "loss": 0.91238379, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 11415, + "time_per_iteration": 2.343322515487671 + }, + { + "auxiliary_loss_clip": 0.0105543, + "auxiliary_loss_mlp": 0.01042469, + "balance_loss_clip": 1.0174228, + "balance_loss_mlp": 1.01711655, + "epoch": 0.6863670524575379, + "flos": 26285024384640.0, + "grad_norm": 1.6120753942973145, + "language_loss": 0.6777283, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69870722, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 11416, + "time_per_iteration": 2.405473232269287 + }, + { + "auxiliary_loss_clip": 0.01054142, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.01288629, + "balance_loss_mlp": 1.0168128, + "epoch": 0.686427175710206, + "flos": 19172099272320.0, + "grad_norm": 1.3240030716321713, + "language_loss": 0.77222443, + "learning_rate": 9.456460071949237e-07, + "loss": 0.79312694, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37304688, + "step": 11417, + "time_per_iteration": 2.3662796020507812 + }, + { + "auxiliary_loss_clip": 0.01053027, + "auxiliary_loss_mlp": 0.01039997, + "balance_loss_clip": 1.01712012, + "balance_loss_mlp": 1.01623583, + "epoch": 0.6864872989628739, + "flos": 18915627358080.0, + "grad_norm": 1.7723684080542992, + "language_loss": 0.78753871, + "learning_rate": 9.45315079980678e-07, + "loss": 0.80846894, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 11418, + "time_per_iteration": 2.338643789291382 + }, + { + "auxiliary_loss_clip": 0.01052672, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.0072794, + "balance_loss_mlp": 1.01661801, + "epoch": 0.6865474222155419, + "flos": 25955060325120.0, + "grad_norm": 1.7581327647645042, + "language_loss": 0.77606386, + "learning_rate": 9.449841927601887e-07, + "loss": 0.79689044, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 11419, + "time_per_iteration": 2.415435791015625 + }, + { + "auxiliary_loss_clip": 0.01052467, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.01661336, + "balance_loss_mlp": 1.01685405, + "epoch": 0.6866075454682098, + "flos": 18477118281600.0, + "grad_norm": 2.0046152572094798, + "language_loss": 0.72498465, + "learning_rate": 9.446533455460044e-07, + "loss": 0.74590158, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 11420, + "time_per_iteration": 2.32309889793396 + }, + { + "auxiliary_loss_clip": 0.01052122, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.01364958, + "balance_loss_mlp": 1.0162586, + "epoch": 0.6866676687208778, + "flos": 34238588146560.0, + "grad_norm": 1.391249037503435, + "language_loss": 0.75273812, + "learning_rate": 9.443225383506712e-07, + "loss": 0.77362895, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 11421, + "time_per_iteration": 3.901359796524048 + }, + { + "auxiliary_loss_clip": 0.01050059, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.01229, + "balance_loss_mlp": 1.01591301, + "epoch": 0.6867277919735457, + "flos": 21720794100480.0, + "grad_norm": 2.4750192652790175, + "language_loss": 0.77915734, + "learning_rate": 9.439917711867338e-07, + "loss": 0.80000448, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34179688, + "step": 11422, + "time_per_iteration": 2.3792598247528076 + }, + { + "auxiliary_loss_clip": 0.01053669, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.01520288, + "balance_loss_mlp": 1.01668179, + "epoch": 0.6867879152262137, + "flos": 24096842922240.0, + "grad_norm": 2.0320622025122552, + "language_loss": 0.77850974, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79944974, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 11423, + "time_per_iteration": 2.4101178646087646 + }, + { + "auxiliary_loss_clip": 0.01054316, + "auxiliary_loss_mlp": 0.01038901, + "balance_loss_clip": 1.01469004, + "balance_loss_mlp": 1.01740718, + "epoch": 0.6868480384788818, + "flos": 21614762701440.0, + "grad_norm": 1.417671774622705, + "language_loss": 0.73900366, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75993586, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 11424, + "time_per_iteration": 2.3900701999664307 + }, + { + "auxiliary_loss_clip": 0.01052121, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.01476836, + "balance_loss_mlp": 1.01612568, + "epoch": 0.6869081617315497, + "flos": 26284954561920.0, + "grad_norm": 1.7933362047044097, + "language_loss": 0.66261768, + "learning_rate": 9.429997100087112e-07, + "loss": 0.6835196, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 11425, + "time_per_iteration": 2.4188790321350098 + }, + { + "auxiliary_loss_clip": 0.01052606, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.01297975, + "balance_loss_mlp": 1.01696014, + "epoch": 0.6869682849842177, + "flos": 21104053200000.0, + "grad_norm": 1.7534829414093445, + "language_loss": 0.72768205, + "learning_rate": 9.426691030957657e-07, + "loss": 0.74857092, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 11426, + "time_per_iteration": 2.3830528259277344 + }, + { + "auxiliary_loss_clip": 0.01052863, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.01397467, + "balance_loss_mlp": 1.01647925, + "epoch": 0.6870284082368856, + "flos": 17091694776960.0, + "grad_norm": 2.184818056179269, + "language_loss": 0.85850322, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87939966, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 11427, + "time_per_iteration": 2.3857760429382324 + }, + { + "auxiliary_loss_clip": 0.01053074, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.01305449, + "balance_loss_mlp": 1.01697314, + "epoch": 0.6870885314895536, + "flos": 27306862323840.0, + "grad_norm": 1.4852306847482033, + "language_loss": 0.7707994, + "learning_rate": 9.420080095646909e-07, + "loss": 0.79169178, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 11428, + "time_per_iteration": 2.411097764968872 + }, + { + "auxiliary_loss_clip": 0.01055896, + "auxiliary_loss_mlp": 0.01045039, + "balance_loss_clip": 1.01865816, + "balance_loss_mlp": 1.01793098, + "epoch": 0.6871486547422215, + "flos": 20813471020800.0, + "grad_norm": 1.878803217663699, + "language_loss": 0.74445891, + "learning_rate": 9.4167752297163e-07, + "loss": 0.76546824, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 11429, + "time_per_iteration": 2.36568021774292 + }, + { + "auxiliary_loss_clip": 0.01054269, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.01138496, + "balance_loss_mlp": 1.01710343, + "epoch": 0.6872087779948896, + "flos": 30152807400960.0, + "grad_norm": 1.6865181414198038, + "language_loss": 0.84157979, + "learning_rate": 9.413470765102643e-07, + "loss": 0.86247599, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 11430, + "time_per_iteration": 2.4159929752349854 + }, + { + "auxiliary_loss_clip": 0.0105331, + "auxiliary_loss_mlp": 0.01042503, + "balance_loss_clip": 1.01822019, + "balance_loss_mlp": 1.01665962, + "epoch": 0.6872689012475575, + "flos": 20703529549440.0, + "grad_norm": 2.0313860215205923, + "language_loss": 0.71394795, + "learning_rate": 9.410166701931225e-07, + "loss": 0.73490608, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 11431, + "time_per_iteration": 2.363665819168091 + }, + { + "auxiliary_loss_clip": 0.0105311, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.00900424, + "balance_loss_mlp": 1.01600242, + "epoch": 0.6873290245002255, + "flos": 25519658359680.0, + "grad_norm": 1.7519579095684388, + "language_loss": 0.81252372, + "learning_rate": 9.406863040327355e-07, + "loss": 0.83338356, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 11432, + "time_per_iteration": 2.391531467437744 + }, + { + "auxiliary_loss_clip": 0.01050838, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.0136795, + "balance_loss_mlp": 1.01673579, + "epoch": 0.6873891477528934, + "flos": 25190322704640.0, + "grad_norm": 1.7252539829248246, + "language_loss": 0.68783939, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70871449, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.33984375, + "step": 11433, + "time_per_iteration": 2.3989617824554443 + }, + { + "auxiliary_loss_clip": 0.0105378, + "auxiliary_loss_mlp": 0.01048398, + "balance_loss_clip": 1.02469969, + "balance_loss_mlp": 1.01749587, + "epoch": 0.6874492710055614, + "flos": 35150938462080.0, + "grad_norm": 1.7886679608656766, + "language_loss": 0.73253757, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75355935, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36328125, + "step": 11434, + "time_per_iteration": 2.4657020568847656 + }, + { + "auxiliary_loss_clip": 0.01052602, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.01067972, + "balance_loss_mlp": 1.01653969, + "epoch": 0.6875093942582293, + "flos": 17821239880320.0, + "grad_norm": 1.8108592304889672, + "language_loss": 0.81900543, + "learning_rate": 9.396954466173657e-07, + "loss": 0.83986163, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36132812, + "step": 11435, + "time_per_iteration": 2.3510630130767822 + }, + { + "auxiliary_loss_clip": 0.01054081, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.01954675, + "balance_loss_mlp": 1.01652932, + "epoch": 0.6875695175108973, + "flos": 20703494638080.0, + "grad_norm": 2.13403768972144, + "language_loss": 0.81747961, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83845812, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 11436, + "time_per_iteration": 2.348726987838745 + }, + { + "auxiliary_loss_clip": 0.01046728, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.0119921, + "balance_loss_mlp": 1.01481235, + "epoch": 0.6876296407635654, + "flos": 25372848625920.0, + "grad_norm": 5.492950686616412, + "language_loss": 0.83031064, + "learning_rate": 9.390350760205183e-07, + "loss": 0.85109299, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.31835938, + "step": 11437, + "time_per_iteration": 2.437922954559326 + }, + { + "auxiliary_loss_clip": 0.01057944, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.01588154, + "balance_loss_mlp": 1.01822531, + "epoch": 0.6876897640162333, + "flos": 23221186312320.0, + "grad_norm": 3.2994476053622996, + "language_loss": 0.79323244, + "learning_rate": 9.387049510636793e-07, + "loss": 0.81423056, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39648438, + "step": 11438, + "time_per_iteration": 2.369091510772705 + }, + { + "auxiliary_loss_clip": 0.01049504, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.01344669, + "balance_loss_mlp": 1.0150044, + "epoch": 0.6877498872689013, + "flos": 27123149416320.0, + "grad_norm": 1.643890579892767, + "language_loss": 0.73123777, + "learning_rate": 9.383748663512554e-07, + "loss": 0.75209546, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34570312, + "step": 11439, + "time_per_iteration": 2.4404802322387695 + }, + { + "auxiliary_loss_clip": 0.01052096, + "auxiliary_loss_mlp": 0.01038652, + "balance_loss_clip": 1.01459575, + "balance_loss_mlp": 1.01565886, + "epoch": 0.6878100105215692, + "flos": 11580899178240.0, + "grad_norm": 1.7652085383842524, + "language_loss": 0.77155674, + "learning_rate": 9.380448218957623e-07, + "loss": 0.7924642, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 11440, + "time_per_iteration": 2.3258981704711914 + }, + { + "auxiliary_loss_clip": 0.01051063, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.01962066, + "balance_loss_mlp": 1.01581872, + "epoch": 0.6878701337742372, + "flos": 20302133114880.0, + "grad_norm": 1.8455345028598718, + "language_loss": 0.7286045, + "learning_rate": 9.377148177097167e-07, + "loss": 0.74954283, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 11441, + "time_per_iteration": 2.3673832416534424 + }, + { + "auxiliary_loss_clip": 0.01056743, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.01612735, + "balance_loss_mlp": 1.01766801, + "epoch": 0.6879302570269051, + "flos": 13839360940800.0, + "grad_norm": 1.7667219109599703, + "language_loss": 0.67867965, + "learning_rate": 9.373848538056317e-07, + "loss": 0.69968396, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.390625, + "step": 11442, + "time_per_iteration": 2.335947036743164 + }, + { + "auxiliary_loss_clip": 0.01052252, + "auxiliary_loss_mlp": 0.01047595, + "balance_loss_clip": 1.02501678, + "balance_loss_mlp": 1.01655579, + "epoch": 0.6879903802795732, + "flos": 21323587029120.0, + "grad_norm": 2.314234502644966, + "language_loss": 0.70618325, + "learning_rate": 9.370549301960189e-07, + "loss": 0.72718173, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 11443, + "time_per_iteration": 2.3682708740234375 + }, + { + "auxiliary_loss_clip": 0.01055037, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.01845264, + "balance_loss_mlp": 1.01770663, + "epoch": 0.6880505035322411, + "flos": 25150975735680.0, + "grad_norm": 1.518761797283864, + "language_loss": 0.76974887, + "learning_rate": 9.367250468933893e-07, + "loss": 0.7907536, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37304688, + "step": 11444, + "time_per_iteration": 2.404855728149414 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.01356232, + "balance_loss_mlp": 1.01500034, + "epoch": 0.6881106267849091, + "flos": 23214588065280.0, + "grad_norm": 2.7124635523575575, + "language_loss": 0.77575308, + "learning_rate": 9.363952039102536e-07, + "loss": 0.79663014, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35546875, + "step": 11445, + "time_per_iteration": 2.3630080223083496 + }, + { + "auxiliary_loss_clip": 0.0100844, + "auxiliary_loss_mlp": 0.01004678, + "balance_loss_clip": 1.0024842, + "balance_loss_mlp": 1.00134933, + "epoch": 0.688170750037577, + "flos": 48482173342080.0, + "grad_norm": 0.820205972513339, + "language_loss": 0.58464539, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60477656, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.07080078, + "step": 11446, + "time_per_iteration": 4.205999374389648 + }, + { + "auxiliary_loss_clip": 0.01054128, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.01524436, + "balance_loss_mlp": 1.01660872, + "epoch": 0.688230873290245, + "flos": 22782537590400.0, + "grad_norm": 1.5371582173993914, + "language_loss": 0.761603, + "learning_rate": 9.357356389524886e-07, + "loss": 0.78256536, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.375, + "step": 11447, + "time_per_iteration": 2.386584758758545 + }, + { + "auxiliary_loss_clip": 0.01054147, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.01617289, + "balance_loss_mlp": 1.0165211, + "epoch": 0.6882909965429129, + "flos": 22454563478400.0, + "grad_norm": 2.004466807386617, + "language_loss": 0.74444127, + "learning_rate": 9.354059170028705e-07, + "loss": 0.76539695, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 11448, + "time_per_iteration": 2.3885252475738525 + }, + { + "auxiliary_loss_clip": 0.01055107, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.01422083, + "balance_loss_mlp": 1.01626348, + "epoch": 0.688351119795581, + "flos": 26212928693760.0, + "grad_norm": 1.6024194882552285, + "language_loss": 0.76119238, + "learning_rate": 9.350762354227673e-07, + "loss": 0.78214997, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 11449, + "time_per_iteration": 2.4111266136169434 + }, + { + "auxiliary_loss_clip": 0.01050851, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.0173347, + "balance_loss_mlp": 1.01572585, + "epoch": 0.6884112430482489, + "flos": 22564155836160.0, + "grad_norm": 1.7285089577011743, + "language_loss": 0.70851457, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72942281, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 11450, + "time_per_iteration": 3.792769432067871 + }, + { + "auxiliary_loss_clip": 0.01055444, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.01548862, + "balance_loss_mlp": 1.01658726, + "epoch": 0.6884713663009169, + "flos": 17340276723840.0, + "grad_norm": 2.8015795758218713, + "language_loss": 0.78045934, + "learning_rate": 9.344169934211068e-07, + "loss": 0.80143225, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 11451, + "time_per_iteration": 2.341939687728882 + }, + { + "auxiliary_loss_clip": 0.0105338, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.0106343, + "balance_loss_mlp": 1.01653934, + "epoch": 0.6885314895535849, + "flos": 26469575164800.0, + "grad_norm": 1.840430705850967, + "language_loss": 0.69953501, + "learning_rate": 9.340874330245505e-07, + "loss": 0.72040445, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 11452, + "time_per_iteration": 2.420421838760376 + }, + { + "auxiliary_loss_clip": 0.01051723, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_clip": 1.01994133, + "balance_loss_mlp": 1.01578641, + "epoch": 0.6885916128062528, + "flos": 20520514869120.0, + "grad_norm": 1.8168821672280988, + "language_loss": 0.73187315, + "learning_rate": 9.337579130475042e-07, + "loss": 0.75283647, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 11453, + "time_per_iteration": 2.3810086250305176 + }, + { + "auxiliary_loss_clip": 0.01008182, + "auxiliary_loss_mlp": 0.01002868, + "balance_loss_clip": 1.00081778, + "balance_loss_mlp": 1.00134516, + "epoch": 0.6886517360589208, + "flos": 70712629603200.0, + "grad_norm": 0.7773188015408529, + "language_loss": 0.5068633, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52697384, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06835938, + "step": 11454, + "time_per_iteration": 2.8703362941741943 + }, + { + "auxiliary_loss_clip": 0.01051407, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.0128305, + "balance_loss_mlp": 1.01673853, + "epoch": 0.6887118593115887, + "flos": 17892602432640.0, + "grad_norm": 2.0796890364802523, + "language_loss": 0.76109898, + "learning_rate": 9.330989944019263e-07, + "loss": 0.78196841, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 11455, + "time_per_iteration": 2.341731309890747 + }, + { + "auxiliary_loss_clip": 0.01053776, + "auxiliary_loss_mlp": 0.01047496, + "balance_loss_clip": 1.02110291, + "balance_loss_mlp": 1.01558304, + "epoch": 0.6887719825642568, + "flos": 17452173231360.0, + "grad_norm": 2.5039186098344723, + "language_loss": 0.7492305, + "learning_rate": 9.327695957583803e-07, + "loss": 0.77024323, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 11456, + "time_per_iteration": 2.304070234298706 + }, + { + "auxiliary_loss_clip": 0.01050787, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.01462436, + "balance_loss_mlp": 1.01669312, + "epoch": 0.6888321058169247, + "flos": 23069244608640.0, + "grad_norm": 1.6372825780488538, + "language_loss": 0.81922424, + "learning_rate": 9.32440237584319e-07, + "loss": 0.84010845, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.33984375, + "step": 11457, + "time_per_iteration": 2.3790359497070312 + }, + { + "auxiliary_loss_clip": 0.01055367, + "auxiliary_loss_mlp": 0.01039505, + "balance_loss_clip": 1.01443529, + "balance_loss_mlp": 1.01750338, + "epoch": 0.6888922290695927, + "flos": 23367681843840.0, + "grad_norm": 1.7289092911399988, + "language_loss": 0.77299583, + "learning_rate": 9.321109198922301e-07, + "loss": 0.79394448, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 11458, + "time_per_iteration": 2.3676681518554688 + }, + { + "auxiliary_loss_clip": 0.01053397, + "auxiliary_loss_mlp": 0.01035676, + "balance_loss_clip": 1.01320481, + "balance_loss_mlp": 1.01720428, + "epoch": 0.6889523523222606, + "flos": 17630893814400.0, + "grad_norm": 2.2255707534935887, + "language_loss": 0.68766475, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70855552, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 11459, + "time_per_iteration": 2.3116209506988525 + }, + { + "auxiliary_loss_clip": 0.01053753, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.01409781, + "balance_loss_mlp": 1.01746273, + "epoch": 0.6890124755749286, + "flos": 25226981498880.0, + "grad_norm": 1.467589670020225, + "language_loss": 0.69702542, + "learning_rate": 9.314524060039221e-07, + "loss": 0.71793401, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 11460, + "time_per_iteration": 2.4163267612457275 + }, + { + "auxiliary_loss_clip": 0.01056675, + "auxiliary_loss_mlp": 0.0104201, + "balance_loss_clip": 1.0152837, + "balance_loss_mlp": 1.01764452, + "epoch": 0.6890725988275965, + "flos": 20229199551360.0, + "grad_norm": 1.6390567231115725, + "language_loss": 0.78558654, + "learning_rate": 9.311232098326731e-07, + "loss": 0.80657339, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 11461, + "time_per_iteration": 3.7757821083068848 + }, + { + "auxiliary_loss_clip": 0.01054376, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.01887083, + "balance_loss_mlp": 1.01778841, + "epoch": 0.6891327220802645, + "flos": 14534516488320.0, + "grad_norm": 1.6325304259324493, + "language_loss": 0.71034712, + "learning_rate": 9.307940541933401e-07, + "loss": 0.73131788, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 11462, + "time_per_iteration": 2.3606221675872803 + }, + { + "auxiliary_loss_clip": 0.01053214, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0100348, + "balance_loss_mlp": 1.01701248, + "epoch": 0.6891928453329325, + "flos": 21138163464960.0, + "grad_norm": 1.4076762683006872, + "language_loss": 0.8804251, + "learning_rate": 9.304649390984034e-07, + "loss": 0.90129024, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 11463, + "time_per_iteration": 2.3563036918640137 + }, + { + "auxiliary_loss_clip": 0.01051153, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.01388502, + "balance_loss_mlp": 1.0167191, + "epoch": 0.6892529685856005, + "flos": 17857549560960.0, + "grad_norm": 1.761349322399204, + "language_loss": 0.69400972, + "learning_rate": 9.301358645603428e-07, + "loss": 0.71487629, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 11464, + "time_per_iteration": 2.3600642681121826 + }, + { + "auxiliary_loss_clip": 0.01054238, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.016819, + "balance_loss_mlp": 1.01742911, + "epoch": 0.6893130918382685, + "flos": 29933517951360.0, + "grad_norm": 1.9555964864017952, + "language_loss": 0.6650064, + "learning_rate": 9.298068305916373e-07, + "loss": 0.68594325, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3671875, + "step": 11465, + "time_per_iteration": 2.4247515201568604 + }, + { + "auxiliary_loss_clip": 0.01054572, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.01432347, + "balance_loss_mlp": 1.0168848, + "epoch": 0.6893732150909364, + "flos": 24387390190080.0, + "grad_norm": 1.4080244407263103, + "language_loss": 0.74039596, + "learning_rate": 9.294778372047649e-07, + "loss": 0.76132834, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37695312, + "step": 11466, + "time_per_iteration": 2.421003580093384 + }, + { + "auxiliary_loss_clip": 0.0105427, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.01746035, + "balance_loss_mlp": 1.01721883, + "epoch": 0.6894333383436044, + "flos": 16981927862400.0, + "grad_norm": 1.663787210997787, + "language_loss": 0.734384, + "learning_rate": 9.291488844121995e-07, + "loss": 0.75535357, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 11467, + "time_per_iteration": 2.3372459411621094 + }, + { + "auxiliary_loss_clip": 0.01055973, + "auxiliary_loss_mlp": 0.0104292, + "balance_loss_clip": 1.01696825, + "balance_loss_mlp": 1.01697814, + "epoch": 0.6894934615962723, + "flos": 18984650849280.0, + "grad_norm": 2.016854929595268, + "language_loss": 0.82030994, + "learning_rate": 9.288199722264156e-07, + "loss": 0.84129888, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 11468, + "time_per_iteration": 2.3383443355560303 + }, + { + "auxiliary_loss_clip": 0.01054868, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_clip": 1.0192821, + "balance_loss_mlp": 1.0171746, + "epoch": 0.6895535848489404, + "flos": 34530252577920.0, + "grad_norm": 1.6608558158114934, + "language_loss": 0.66898555, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68998039, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 11469, + "time_per_iteration": 2.4896671772003174 + }, + { + "auxiliary_loss_clip": 0.01008282, + "auxiliary_loss_mlp": 0.01004005, + "balance_loss_clip": 1.0015012, + "balance_loss_mlp": 1.00137043, + "epoch": 0.6896137081016083, + "flos": 50072954797440.0, + "grad_norm": 0.804907892950301, + "language_loss": 0.55250263, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57262546, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.06933594, + "step": 11470, + "time_per_iteration": 2.9047701358795166 + }, + { + "auxiliary_loss_clip": 0.0104879, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.01603377, + "balance_loss_mlp": 1.01581764, + "epoch": 0.6896738313542763, + "flos": 19937186006400.0, + "grad_norm": 1.7090542292587518, + "language_loss": 0.78932106, + "learning_rate": 9.278334794344715e-07, + "loss": 0.81016904, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33007812, + "step": 11471, + "time_per_iteration": 2.349907875061035 + }, + { + "auxiliary_loss_clip": 0.01052643, + "auxiliary_loss_mlp": 0.01043136, + "balance_loss_clip": 1.01776838, + "balance_loss_mlp": 1.01594555, + "epoch": 0.6897339546069442, + "flos": 21724424881920.0, + "grad_norm": 2.2095753139025938, + "language_loss": 0.79525906, + "learning_rate": 9.275047298005232e-07, + "loss": 0.81621683, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3671875, + "step": 11472, + "time_per_iteration": 2.3949384689331055 + }, + { + "auxiliary_loss_clip": 0.01051558, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.0147357, + "balance_loss_mlp": 1.01636577, + "epoch": 0.6897940778596122, + "flos": 19825533878400.0, + "grad_norm": 1.5907336707154234, + "language_loss": 0.76839924, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78927445, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 11473, + "time_per_iteration": 2.3631575107574463 + }, + { + "auxiliary_loss_clip": 0.01054343, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01684141, + "epoch": 0.6898542011122801, + "flos": 17309133924480.0, + "grad_norm": 2.0122353641413384, + "language_loss": 0.76908076, + "learning_rate": 9.268473525524751e-07, + "loss": 0.79003334, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.375, + "step": 11474, + "time_per_iteration": 2.3419392108917236 + }, + { + "auxiliary_loss_clip": 0.01053683, + "auxiliary_loss_mlp": 0.01042074, + "balance_loss_clip": 1.01707625, + "balance_loss_mlp": 1.0182023, + "epoch": 0.6899143243649482, + "flos": 24752895880320.0, + "grad_norm": 1.56706670131835, + "language_loss": 0.75660127, + "learning_rate": 9.26518724963303e-07, + "loss": 0.77755886, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35546875, + "step": 11475, + "time_per_iteration": 2.4613237380981445 + }, + { + "auxiliary_loss_clip": 0.0105393, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.01340318, + "balance_loss_mlp": 1.01732278, + "epoch": 0.6899744476176161, + "flos": 17233686743040.0, + "grad_norm": 2.1384181042250443, + "language_loss": 0.89444441, + "learning_rate": 9.261901380806491e-07, + "loss": 0.91536313, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 11476, + "time_per_iteration": 2.347249984741211 + }, + { + "auxiliary_loss_clip": 0.01051671, + "auxiliary_loss_mlp": 0.01041225, + "balance_loss_clip": 1.01685846, + "balance_loss_mlp": 1.01619065, + "epoch": 0.6900345708702841, + "flos": 25409507420160.0, + "grad_norm": 1.3347988916410705, + "language_loss": 0.71568525, + "learning_rate": 9.258615919169724e-07, + "loss": 0.73661423, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 11477, + "time_per_iteration": 2.408958673477173 + }, + { + "auxiliary_loss_clip": 0.01054506, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.01884961, + "balance_loss_mlp": 1.01629567, + "epoch": 0.6900946941229521, + "flos": 23433249110400.0, + "grad_norm": 2.1747924529570977, + "language_loss": 0.68517017, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70615715, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 11478, + "time_per_iteration": 2.37276291847229 + }, + { + "auxiliary_loss_clip": 0.01053824, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_clip": 1.02094817, + "balance_loss_mlp": 1.01676917, + "epoch": 0.69015481737562, + "flos": 17819180110080.0, + "grad_norm": 2.046820502063544, + "language_loss": 0.77392203, + "learning_rate": 9.252046217963843e-07, + "loss": 0.79492247, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 11479, + "time_per_iteration": 2.375174045562744 + }, + { + "auxiliary_loss_clip": 0.01054657, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.0148015, + "balance_loss_mlp": 1.01764238, + "epoch": 0.690214940628288, + "flos": 17455559633280.0, + "grad_norm": 1.6176880431254026, + "language_loss": 0.80087149, + "learning_rate": 9.248761978643856e-07, + "loss": 0.82182491, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37109375, + "step": 11480, + "time_per_iteration": 2.3398263454437256 + }, + { + "auxiliary_loss_clip": 0.01054423, + "auxiliary_loss_mlp": 0.01040788, + "balance_loss_clip": 1.01692259, + "balance_loss_mlp": 1.01818442, + "epoch": 0.6902750638809559, + "flos": 29565498643200.0, + "grad_norm": 2.2548350015536274, + "language_loss": 0.76715171, + "learning_rate": 9.245478147011885e-07, + "loss": 0.78810382, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 11481, + "time_per_iteration": 2.4338207244873047 + }, + { + "auxiliary_loss_clip": 0.01052414, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.00931048, + "balance_loss_mlp": 1.01629472, + "epoch": 0.690335187133624, + "flos": 25555933128960.0, + "grad_norm": 1.7334447765601162, + "language_loss": 0.70254874, + "learning_rate": 9.24219472319246e-07, + "loss": 0.72341752, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36132812, + "step": 11482, + "time_per_iteration": 2.3930554389953613 + }, + { + "auxiliary_loss_clip": 0.01053187, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.01532722, + "balance_loss_mlp": 1.01698375, + "epoch": 0.6903953103862919, + "flos": 22487451845760.0, + "grad_norm": 2.039627890317923, + "language_loss": 0.8332603, + "learning_rate": 9.238911707310096e-07, + "loss": 0.85419172, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 11483, + "time_per_iteration": 2.391796112060547 + }, + { + "auxiliary_loss_clip": 0.01054048, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.01386344, + "balance_loss_mlp": 1.0174222, + "epoch": 0.6904554336389599, + "flos": 26099426263680.0, + "grad_norm": 1.7918064897953652, + "language_loss": 0.66441119, + "learning_rate": 9.235629099489273e-07, + "loss": 0.68531787, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 11484, + "time_per_iteration": 2.3765573501586914 + }, + { + "auxiliary_loss_clip": 0.01053184, + "auxiliary_loss_mlp": 0.01039775, + "balance_loss_clip": 1.01651716, + "balance_loss_mlp": 1.01690745, + "epoch": 0.6905155568916278, + "flos": 31170525799680.0, + "grad_norm": 1.5742287109196982, + "language_loss": 0.74475849, + "learning_rate": 9.232346899854479e-07, + "loss": 0.76568806, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 11485, + "time_per_iteration": 3.750086784362793 + }, + { + "auxiliary_loss_clip": 0.01055012, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_clip": 1.01823914, + "balance_loss_mlp": 1.01731288, + "epoch": 0.6905756801442958, + "flos": 17638713959040.0, + "grad_norm": 1.8893143199063736, + "language_loss": 0.86112505, + "learning_rate": 9.22906510853017e-07, + "loss": 0.88210338, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37695312, + "step": 11486, + "time_per_iteration": 2.3484222888946533 + }, + { + "auxiliary_loss_clip": 0.01051412, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.01328444, + "balance_loss_mlp": 1.01514494, + "epoch": 0.6906358033969637, + "flos": 22342666970880.0, + "grad_norm": 1.4254920715551358, + "language_loss": 0.73115838, + "learning_rate": 9.225783725640786e-07, + "loss": 0.75203449, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 11487, + "time_per_iteration": 2.4042537212371826 + }, + { + "auxiliary_loss_clip": 0.01009493, + "auxiliary_loss_mlp": 0.01006194, + "balance_loss_clip": 1.00385785, + "balance_loss_mlp": 1.00211453, + "epoch": 0.6906959266496318, + "flos": 69744172999680.0, + "grad_norm": 0.9018722559669676, + "language_loss": 0.66812789, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68828475, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.07421875, + "step": 11488, + "time_per_iteration": 3.038252353668213 + }, + { + "auxiliary_loss_clip": 0.01057443, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.01552713, + "balance_loss_mlp": 1.01791835, + "epoch": 0.6907560499022997, + "flos": 21433179386880.0, + "grad_norm": 1.653110569691665, + "language_loss": 0.75834525, + "learning_rate": 9.219222185664519e-07, + "loss": 0.77934134, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 11489, + "time_per_iteration": 3.7904717922210693 + }, + { + "auxiliary_loss_clip": 0.0105559, + "auxiliary_loss_mlp": 0.01045123, + "balance_loss_clip": 1.01825368, + "balance_loss_mlp": 1.0171963, + "epoch": 0.6908161731549677, + "flos": 14391337536000.0, + "grad_norm": 2.4907561228356747, + "language_loss": 0.63248217, + "learning_rate": 9.215942028826445e-07, + "loss": 0.65348923, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38476562, + "step": 11490, + "time_per_iteration": 3.7150027751922607 + }, + { + "auxiliary_loss_clip": 0.0105382, + "auxiliary_loss_mlp": 0.01042776, + "balance_loss_clip": 1.01665759, + "balance_loss_mlp": 1.0170778, + "epoch": 0.6908762964076357, + "flos": 20009945013120.0, + "grad_norm": 1.7516940389577251, + "language_loss": 0.73282564, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75379169, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 11491, + "time_per_iteration": 2.3741273880004883 + }, + { + "auxiliary_loss_clip": 0.01052346, + "auxiliary_loss_mlp": 0.01039008, + "balance_loss_clip": 1.01668048, + "balance_loss_mlp": 1.01609099, + "epoch": 0.6909364196603036, + "flos": 28767767921280.0, + "grad_norm": 1.3819539689568943, + "language_loss": 0.71237773, + "learning_rate": 9.20938294207235e-07, + "loss": 0.73329127, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 11492, + "time_per_iteration": 2.431334972381592 + }, + { + "auxiliary_loss_clip": 0.01055498, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_clip": 1.02370334, + "balance_loss_mlp": 1.01769495, + "epoch": 0.6909965429129716, + "flos": 22527043194240.0, + "grad_norm": 1.935535657016633, + "language_loss": 0.7581138, + "learning_rate": 9.206104012405049e-07, + "loss": 0.77916485, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 11493, + "time_per_iteration": 2.3616623878479004 + }, + { + "auxiliary_loss_clip": 0.0105228, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.01376247, + "balance_loss_mlp": 1.0167563, + "epoch": 0.6910566661656395, + "flos": 18404952768000.0, + "grad_norm": 1.9672020221519684, + "language_loss": 0.75939214, + "learning_rate": 9.20282549204336e-07, + "loss": 0.78029025, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 11494, + "time_per_iteration": 2.364143133163452 + }, + { + "auxiliary_loss_clip": 0.01053, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.01245224, + "balance_loss_mlp": 1.01648808, + "epoch": 0.6911167894183076, + "flos": 30772655412480.0, + "grad_norm": 1.7676081989034769, + "language_loss": 0.69207048, + "learning_rate": 9.19954738111161e-07, + "loss": 0.71296108, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 11495, + "time_per_iteration": 2.4957523345947266 + }, + { + "auxiliary_loss_clip": 0.01052623, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.01375484, + "balance_loss_mlp": 1.01528084, + "epoch": 0.6911769126709755, + "flos": 13734865641600.0, + "grad_norm": 1.7191030994691852, + "language_loss": 0.75231993, + "learning_rate": 9.196269679734119e-07, + "loss": 0.77323145, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 11496, + "time_per_iteration": 2.469289779663086 + }, + { + "auxiliary_loss_clip": 0.01051543, + "auxiliary_loss_mlp": 0.0104028, + "balance_loss_clip": 1.016891, + "balance_loss_mlp": 1.01595998, + "epoch": 0.6912370359236435, + "flos": 17565885129600.0, + "grad_norm": 1.6227945126849885, + "language_loss": 0.81131971, + "learning_rate": 9.19299238803515e-07, + "loss": 0.83223796, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 11497, + "time_per_iteration": 2.328131914138794 + }, + { + "auxiliary_loss_clip": 0.01054471, + "auxiliary_loss_mlp": 0.0104146, + "balance_loss_clip": 1.01542485, + "balance_loss_mlp": 1.01655626, + "epoch": 0.6912971591763114, + "flos": 22089686192640.0, + "grad_norm": 1.6770496565923623, + "language_loss": 0.81315827, + "learning_rate": 9.189715506138993e-07, + "loss": 0.83411759, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 11498, + "time_per_iteration": 2.428844451904297 + }, + { + "auxiliary_loss_clip": 0.01050609, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.0162437, + "balance_loss_mlp": 1.01497531, + "epoch": 0.6913572824289794, + "flos": 29970176745600.0, + "grad_norm": 1.483604674078266, + "language_loss": 0.86791927, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88881397, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 11499, + "time_per_iteration": 2.521315813064575 + }, + { + "auxiliary_loss_clip": 0.01051259, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.01458895, + "balance_loss_mlp": 1.01607227, + "epoch": 0.6914174056816473, + "flos": 20447895507840.0, + "grad_norm": 1.627785158530618, + "language_loss": 0.76495326, + "learning_rate": 9.183162972252145e-07, + "loss": 0.78583872, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 11500, + "time_per_iteration": 2.462388277053833 + }, + { + "auxiliary_loss_clip": 0.01054725, + "auxiliary_loss_mlp": 0.01048852, + "balance_loss_clip": 1.02331758, + "balance_loss_mlp": 1.01672268, + "epoch": 0.6914775289343154, + "flos": 21281621708160.0, + "grad_norm": 1.7821653587911734, + "language_loss": 0.78537363, + "learning_rate": 9.179887320509921e-07, + "loss": 0.80640936, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 11501, + "time_per_iteration": 3.768042802810669 + }, + { + "auxiliary_loss_clip": 0.01054586, + "auxiliary_loss_mlp": 0.01039923, + "balance_loss_clip": 1.01500821, + "balance_loss_mlp": 1.01694179, + "epoch": 0.6915376521869833, + "flos": 23876994890880.0, + "grad_norm": 1.77767064320417, + "language_loss": 0.74992776, + "learning_rate": 9.176612079067458e-07, + "loss": 0.77087283, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 11502, + "time_per_iteration": 2.37093186378479 + }, + { + "auxiliary_loss_clip": 0.01056306, + "auxiliary_loss_mlp": 0.01039321, + "balance_loss_clip": 1.01303554, + "balance_loss_mlp": 1.01754487, + "epoch": 0.6915977754396513, + "flos": 11509466803200.0, + "grad_norm": 1.8528140598975427, + "language_loss": 0.75759959, + "learning_rate": 9.173337248048953e-07, + "loss": 0.77855581, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 11503, + "time_per_iteration": 2.3967771530151367 + }, + { + "auxiliary_loss_clip": 0.01053452, + "auxiliary_loss_mlp": 0.0104107, + "balance_loss_clip": 1.01636994, + "balance_loss_mlp": 1.01668644, + "epoch": 0.6916578986923193, + "flos": 22600186225920.0, + "grad_norm": 1.6678818941826938, + "language_loss": 0.78282362, + "learning_rate": 9.170062827578575e-07, + "loss": 0.80376887, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 11504, + "time_per_iteration": 2.428076982498169 + }, + { + "auxiliary_loss_clip": 0.01052478, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.01421356, + "balance_loss_mlp": 1.01629591, + "epoch": 0.6917180219449872, + "flos": 23476226860800.0, + "grad_norm": 1.6545849003663962, + "language_loss": 0.74881482, + "learning_rate": 9.166788817780499e-07, + "loss": 0.76971304, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 11505, + "time_per_iteration": 2.5259311199188232 + }, + { + "auxiliary_loss_clip": 0.01053395, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.0127511, + "balance_loss_mlp": 1.01667619, + "epoch": 0.6917781451976552, + "flos": 23731407054720.0, + "grad_norm": 1.8181830886906272, + "language_loss": 0.89358616, + "learning_rate": 9.163515218778886e-07, + "loss": 0.91449714, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 11506, + "time_per_iteration": 2.376441478729248 + }, + { + "auxiliary_loss_clip": 0.01052915, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.01042604, + "balance_loss_mlp": 1.01644421, + "epoch": 0.6918382684503231, + "flos": 31465436987520.0, + "grad_norm": 2.0782644769115373, + "language_loss": 0.7204963, + "learning_rate": 9.160242030697856e-07, + "loss": 0.7413711, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11507, + "time_per_iteration": 2.485802412033081 + }, + { + "auxiliary_loss_clip": 0.01054546, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.01342702, + "balance_loss_mlp": 1.0162077, + "epoch": 0.6918983917029912, + "flos": 21649466459520.0, + "grad_norm": 1.912275326820313, + "language_loss": 0.77865887, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79959059, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38476562, + "step": 11508, + "time_per_iteration": 2.375572681427002 + }, + { + "auxiliary_loss_clip": 0.01050488, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.01963544, + "balance_loss_mlp": 1.01649427, + "epoch": 0.6919585149556591, + "flos": 25549090502400.0, + "grad_norm": 1.6245268004366946, + "language_loss": 0.75629115, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77719599, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33984375, + "step": 11509, + "time_per_iteration": 2.4217801094055176 + }, + { + "auxiliary_loss_clip": 0.01054651, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.01745462, + "balance_loss_mlp": 1.01814604, + "epoch": 0.6920186382083271, + "flos": 23658648048000.0, + "grad_norm": 2.163997069355503, + "language_loss": 0.65747499, + "learning_rate": 9.150424933219425e-07, + "loss": 0.67843759, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11510, + "time_per_iteration": 2.387324810028076 + }, + { + "auxiliary_loss_clip": 0.01056841, + "auxiliary_loss_mlp": 0.01043709, + "balance_loss_clip": 1.01651788, + "balance_loss_mlp": 1.01812196, + "epoch": 0.692078761460995, + "flos": 19060970814720.0, + "grad_norm": 1.760363731643227, + "language_loss": 0.7659409, + "learning_rate": 9.147153390061788e-07, + "loss": 0.78694636, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38671875, + "step": 11511, + "time_per_iteration": 2.346987247467041 + }, + { + "auxiliary_loss_clip": 0.01051843, + "auxiliary_loss_mlp": 0.01041169, + "balance_loss_clip": 1.01898444, + "balance_loss_mlp": 1.0159061, + "epoch": 0.692138884713663, + "flos": 29022005508480.0, + "grad_norm": 1.523499138118957, + "language_loss": 0.63514161, + "learning_rate": 9.143882258445184e-07, + "loss": 0.65607172, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 11512, + "time_per_iteration": 2.4706127643585205 + }, + { + "auxiliary_loss_clip": 0.01054457, + "auxiliary_loss_mlp": 0.01045983, + "balance_loss_clip": 1.01976883, + "balance_loss_mlp": 1.01604617, + "epoch": 0.6921990079663309, + "flos": 14756947960320.0, + "grad_norm": 1.9557598606053521, + "language_loss": 0.84036356, + "learning_rate": 9.140611538493666e-07, + "loss": 0.86136794, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 11513, + "time_per_iteration": 2.344864845275879 + }, + { + "auxiliary_loss_clip": 0.01053053, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.01216507, + "balance_loss_mlp": 1.01698565, + "epoch": 0.692259131218999, + "flos": 23840720121600.0, + "grad_norm": 1.452809205272252, + "language_loss": 0.78952283, + "learning_rate": 9.137341230331233e-07, + "loss": 0.81040263, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 11514, + "time_per_iteration": 2.398527145385742 + }, + { + "auxiliary_loss_clip": 0.01055194, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.01545501, + "balance_loss_mlp": 1.01720011, + "epoch": 0.6923192544716669, + "flos": 19134078935040.0, + "grad_norm": 1.7809348444448574, + "language_loss": 0.75921881, + "learning_rate": 9.134071334081907e-07, + "loss": 0.78017253, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 11515, + "time_per_iteration": 2.3349075317382812 + }, + { + "auxiliary_loss_clip": 0.01051319, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.01941752, + "balance_loss_mlp": 1.0165689, + "epoch": 0.6923793777243349, + "flos": 28073380423680.0, + "grad_norm": 1.8845986350118744, + "language_loss": 0.54892719, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56986022, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 11516, + "time_per_iteration": 2.4056625366210938 + }, + { + "auxiliary_loss_clip": 0.01051409, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.01309347, + "balance_loss_mlp": 1.0159229, + "epoch": 0.6924395009770029, + "flos": 16580321959680.0, + "grad_norm": 1.9989449485315323, + "language_loss": 0.74210232, + "learning_rate": 9.127532777818557e-07, + "loss": 0.7629903, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35546875, + "step": 11517, + "time_per_iteration": 2.3381876945495605 + }, + { + "auxiliary_loss_clip": 0.01054126, + "auxiliary_loss_mlp": 0.01039653, + "balance_loss_clip": 1.01411867, + "balance_loss_mlp": 1.01634848, + "epoch": 0.6924996242296708, + "flos": 16654337775360.0, + "grad_norm": 1.8207956481489977, + "language_loss": 0.76904309, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78998089, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 11518, + "time_per_iteration": 2.346539258956909 + }, + { + "auxiliary_loss_clip": 0.0105545, + "auxiliary_loss_mlp": 0.01039507, + "balance_loss_clip": 1.01126647, + "balance_loss_mlp": 1.01686549, + "epoch": 0.6925597474823388, + "flos": 34752649138560.0, + "grad_norm": 1.3639793011519266, + "language_loss": 0.65606904, + "learning_rate": 9.120995870695376e-07, + "loss": 0.67701864, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.38671875, + "step": 11519, + "time_per_iteration": 2.4963490962982178 + }, + { + "auxiliary_loss_clip": 0.01052634, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.01323676, + "balance_loss_mlp": 1.01595378, + "epoch": 0.6926198707350067, + "flos": 21870641122560.0, + "grad_norm": 2.4054172443288366, + "language_loss": 0.6390357, + "learning_rate": 9.117728035871212e-07, + "loss": 0.65992945, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3671875, + "step": 11520, + "time_per_iteration": 2.3828723430633545 + }, + { + "auxiliary_loss_clip": 0.01057008, + "auxiliary_loss_mlp": 0.01042111, + "balance_loss_clip": 1.01429987, + "balance_loss_mlp": 1.01704192, + "epoch": 0.6926799939876748, + "flos": 13005425272320.0, + "grad_norm": 2.0515745868003243, + "language_loss": 0.79133517, + "learning_rate": 9.114460613703887e-07, + "loss": 0.81232637, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 11521, + "time_per_iteration": 2.3506743907928467 + }, + { + "auxiliary_loss_clip": 0.01055366, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.01386881, + "balance_loss_mlp": 1.01643813, + "epoch": 0.6927401172403427, + "flos": 16760369174400.0, + "grad_norm": 1.7679972627436769, + "language_loss": 0.83077717, + "learning_rate": 9.111193604317304e-07, + "loss": 0.85173678, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 11522, + "time_per_iteration": 2.3741588592529297 + }, + { + "auxiliary_loss_clip": 0.01053924, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.01708961, + "epoch": 0.6928002404930107, + "flos": 25704383696640.0, + "grad_norm": 1.3406284512889908, + "language_loss": 0.77326965, + "learning_rate": 9.107927007835361e-07, + "loss": 0.79420364, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 11523, + "time_per_iteration": 2.483132839202881 + }, + { + "auxiliary_loss_clip": 0.01052467, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.01829147, + "balance_loss_mlp": 1.01713181, + "epoch": 0.6928603637456786, + "flos": 18587269221120.0, + "grad_norm": 2.1678184695236538, + "language_loss": 0.69566643, + "learning_rate": 9.104660824381915e-07, + "loss": 0.71660143, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 11524, + "time_per_iteration": 2.474879503250122 + }, + { + "auxiliary_loss_clip": 0.01055548, + "auxiliary_loss_mlp": 0.01038192, + "balance_loss_clip": 1.01289606, + "balance_loss_mlp": 1.01779008, + "epoch": 0.6929204869983466, + "flos": 22199767309440.0, + "grad_norm": 1.7672332924600918, + "language_loss": 0.65489113, + "learning_rate": 9.101395054080815e-07, + "loss": 0.67582858, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 11525, + "time_per_iteration": 3.6500043869018555 + }, + { + "auxiliary_loss_clip": 0.0105408, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.01334, + "balance_loss_mlp": 1.01769793, + "epoch": 0.6929806102510145, + "flos": 17893789418880.0, + "grad_norm": 2.267936681730246, + "language_loss": 0.7126348, + "learning_rate": 9.098129697055907e-07, + "loss": 0.73355317, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 11526, + "time_per_iteration": 2.3763978481292725 + }, + { + "auxiliary_loss_clip": 0.01052861, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.01336336, + "balance_loss_mlp": 1.0169102, + "epoch": 0.6930407335036826, + "flos": 19754171326080.0, + "grad_norm": 1.6154588566355925, + "language_loss": 0.77333808, + "learning_rate": 9.094864753431022e-07, + "loss": 0.79423529, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 11527, + "time_per_iteration": 2.381542444229126 + }, + { + "auxiliary_loss_clip": 0.01050725, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.01161134, + "balance_loss_mlp": 1.01527357, + "epoch": 0.6931008567563505, + "flos": 21543155769600.0, + "grad_norm": 3.4644728567690333, + "language_loss": 0.80603522, + "learning_rate": 9.091600223329952e-07, + "loss": 0.82689095, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 11528, + "time_per_iteration": 2.3859262466430664 + }, + { + "auxiliary_loss_clip": 0.01049192, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.01237535, + "balance_loss_mlp": 1.01513124, + "epoch": 0.6931609800090185, + "flos": 26248819438080.0, + "grad_norm": 1.3403837894507182, + "language_loss": 0.76606548, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78689772, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 11529, + "time_per_iteration": 5.276731252670288 + }, + { + "auxiliary_loss_clip": 0.0105262, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.01794946, + "balance_loss_mlp": 1.01648974, + "epoch": 0.6932211032616865, + "flos": 32342001292800.0, + "grad_norm": 1.5733469137996845, + "language_loss": 0.73508871, + "learning_rate": 9.085072404194436e-07, + "loss": 0.75604093, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 11530, + "time_per_iteration": 2.446352243423462 + }, + { + "auxiliary_loss_clip": 0.01057251, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_clip": 1.01758933, + "balance_loss_mlp": 1.0170449, + "epoch": 0.6932812265143544, + "flos": 22048139808000.0, + "grad_norm": 1.863708378601082, + "language_loss": 0.79226214, + "learning_rate": 9.081809115407513e-07, + "loss": 0.81329823, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.40234375, + "step": 11531, + "time_per_iteration": 2.374729871749878 + }, + { + "auxiliary_loss_clip": 0.0105049, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.0128237, + "balance_loss_mlp": 1.01504636, + "epoch": 0.6933413497670224, + "flos": 26255243128320.0, + "grad_norm": 1.4978426653986063, + "language_loss": 0.70222282, + "learning_rate": 9.078546240639484e-07, + "loss": 0.7230891, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 11532, + "time_per_iteration": 2.444790840148926 + }, + { + "auxiliary_loss_clip": 0.01053336, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.01686406, + "balance_loss_mlp": 1.01600349, + "epoch": 0.6934014730196904, + "flos": 19571994518400.0, + "grad_norm": 1.447808405394924, + "language_loss": 0.67992598, + "learning_rate": 9.075283780014082e-07, + "loss": 0.70087522, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37304688, + "step": 11533, + "time_per_iteration": 2.3924949169158936 + }, + { + "auxiliary_loss_clip": 0.01054217, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.01622772, + "balance_loss_mlp": 1.01632917, + "epoch": 0.6934615962723584, + "flos": 22118385196800.0, + "grad_norm": 2.6109034263162325, + "language_loss": 0.59700918, + "learning_rate": 9.072021733655007e-07, + "loss": 0.61797804, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 11534, + "time_per_iteration": 2.3471391201019287 + }, + { + "auxiliary_loss_clip": 0.01052942, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.01397324, + "balance_loss_mlp": 1.01566362, + "epoch": 0.6935217195250263, + "flos": 21359757064320.0, + "grad_norm": 2.075226532966578, + "language_loss": 0.73003173, + "learning_rate": 9.068760101685971e-07, + "loss": 0.75093937, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37304688, + "step": 11535, + "time_per_iteration": 2.379699945449829 + }, + { + "auxiliary_loss_clip": 0.01008821, + "auxiliary_loss_mlp": 0.0100352, + "balance_loss_clip": 1.00102818, + "balance_loss_mlp": 1.00156891, + "epoch": 0.6935818427776943, + "flos": 64060137901440.0, + "grad_norm": 0.7154913137288637, + "language_loss": 0.59215784, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61228132, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.07226562, + "step": 11536, + "time_per_iteration": 3.109936237335205 + }, + { + "auxiliary_loss_clip": 0.01055531, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.01720643, + "balance_loss_mlp": 1.01692808, + "epoch": 0.6936419660303622, + "flos": 20301539621760.0, + "grad_norm": 2.1399357195898716, + "language_loss": 0.74490905, + "learning_rate": 9.062238081412692e-07, + "loss": 0.76588404, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 11537, + "time_per_iteration": 2.3765084743499756 + }, + { + "auxiliary_loss_clip": 0.01008346, + "auxiliary_loss_mlp": 0.01006957, + "balance_loss_clip": 1.00451326, + "balance_loss_mlp": 1.00139725, + "epoch": 0.6937020892830302, + "flos": 67179349123200.0, + "grad_norm": 0.7493355353136424, + "language_loss": 0.55623138, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57638437, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.06933594, + "step": 11538, + "time_per_iteration": 3.033482313156128 + }, + { + "auxiliary_loss_clip": 0.01050248, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.01324463, + "balance_loss_mlp": 1.0157305, + "epoch": 0.6937622125356981, + "flos": 23877064713600.0, + "grad_norm": 1.5646476447307522, + "language_loss": 0.78478098, + "learning_rate": 9.055717720183505e-07, + "loss": 0.80562365, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34570312, + "step": 11539, + "time_per_iteration": 2.3856000900268555 + }, + { + "auxiliary_loss_clip": 0.01053166, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.01309144, + "balance_loss_mlp": 1.01686406, + "epoch": 0.6938223357883662, + "flos": 28729363559040.0, + "grad_norm": 1.7575681836264927, + "language_loss": 0.65471381, + "learning_rate": 9.05245816201953e-07, + "loss": 0.67561066, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 11540, + "time_per_iteration": 3.8264381885528564 + }, + { + "auxiliary_loss_clip": 0.01051326, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.01989365, + "balance_loss_mlp": 1.01571155, + "epoch": 0.6938824590410341, + "flos": 28653846554880.0, + "grad_norm": 1.3814787782319071, + "language_loss": 0.87313873, + "learning_rate": 9.049199018987437e-07, + "loss": 0.89406538, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35546875, + "step": 11541, + "time_per_iteration": 2.417262077331543 + }, + { + "auxiliary_loss_clip": 0.01052681, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.01262128, + "balance_loss_mlp": 1.01689243, + "epoch": 0.6939425822937021, + "flos": 18982241965440.0, + "grad_norm": 4.103317544760109, + "language_loss": 0.84892511, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86981857, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 11542, + "time_per_iteration": 2.362509250640869 + }, + { + "auxiliary_loss_clip": 0.01053471, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.01004541, + "balance_loss_mlp": 1.01608729, + "epoch": 0.6940027055463701, + "flos": 23074725692160.0, + "grad_norm": 2.1140565481878033, + "language_loss": 0.76116383, + "learning_rate": 9.04268197881323e-07, + "loss": 0.78205138, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 11543, + "time_per_iteration": 2.365304470062256 + }, + { + "auxiliary_loss_clip": 0.01051256, + "auxiliary_loss_mlp": 0.01039654, + "balance_loss_clip": 1.01755285, + "balance_loss_mlp": 1.01562309, + "epoch": 0.694062828799038, + "flos": 18185593495680.0, + "grad_norm": 1.6184320028998964, + "language_loss": 0.7710138, + "learning_rate": 9.039424081918241e-07, + "loss": 0.79192287, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 11544, + "time_per_iteration": 2.384392499923706 + }, + { + "auxiliary_loss_clip": 0.01055462, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.01515675, + "balance_loss_mlp": 1.01751649, + "epoch": 0.694122952051706, + "flos": 17820576564480.0, + "grad_norm": 2.2194374601727658, + "language_loss": 0.73188519, + "learning_rate": 9.036166600649388e-07, + "loss": 0.75283492, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.38085938, + "step": 11545, + "time_per_iteration": 2.3280136585235596 + }, + { + "auxiliary_loss_clip": 0.01050396, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.00915742, + "balance_loss_mlp": 1.01589465, + "epoch": 0.694183075304374, + "flos": 21214239050880.0, + "grad_norm": 1.519953841412578, + "language_loss": 0.80632961, + "learning_rate": 9.0329095351302e-07, + "loss": 0.82712924, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.34570312, + "step": 11546, + "time_per_iteration": 2.3915302753448486 + }, + { + "auxiliary_loss_clip": 0.01052365, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.01622605, + "balance_loss_mlp": 1.01610088, + "epoch": 0.694243198557042, + "flos": 24059381166720.0, + "grad_norm": 1.4332717864269475, + "language_loss": 0.79557991, + "learning_rate": 9.029652885484194e-07, + "loss": 0.81649256, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 11547, + "time_per_iteration": 2.487957000732422 + }, + { + "auxiliary_loss_clip": 0.0105308, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.02033448, + "balance_loss_mlp": 1.01681316, + "epoch": 0.6943033218097099, + "flos": 21140816728320.0, + "grad_norm": 2.4564028622434995, + "language_loss": 0.82227969, + "learning_rate": 9.026396651834834e-07, + "loss": 0.84325898, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 11548, + "time_per_iteration": 2.39040470123291 + }, + { + "auxiliary_loss_clip": 0.01007973, + "auxiliary_loss_mlp": 0.01007907, + "balance_loss_clip": 1.00542748, + "balance_loss_mlp": 1.00102448, + "epoch": 0.6943634450623779, + "flos": 57808869943680.0, + "grad_norm": 0.6983198094806087, + "language_loss": 0.53850627, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55866516, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.06933594, + "step": 11549, + "time_per_iteration": 3.0069692134857178 + }, + { + "auxiliary_loss_clip": 0.01052708, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.01393104, + "balance_loss_mlp": 1.01535416, + "epoch": 0.6944235683150458, + "flos": 30589396352640.0, + "grad_norm": 1.412146031347397, + "language_loss": 0.74335611, + "learning_rate": 9.01988543302e-07, + "loss": 0.76427054, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 11550, + "time_per_iteration": 2.459702730178833 + }, + { + "auxiliary_loss_clip": 0.01054823, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.0152458, + "balance_loss_mlp": 1.01747942, + "epoch": 0.6944836915677138, + "flos": 19718420227200.0, + "grad_norm": 1.89412770928752, + "language_loss": 0.74995404, + "learning_rate": 9.016630448101425e-07, + "loss": 0.77090168, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 11551, + "time_per_iteration": 2.349748134613037 + }, + { + "auxiliary_loss_clip": 0.01052402, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.01909924, + "balance_loss_mlp": 1.01697409, + "epoch": 0.6945438148203817, + "flos": 24862418415360.0, + "grad_norm": 1.5626797211838361, + "language_loss": 0.85417622, + "learning_rate": 9.01337587967333e-07, + "loss": 0.875121, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 11552, + "time_per_iteration": 2.426356077194214 + }, + { + "auxiliary_loss_clip": 0.01052849, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.02093542, + "balance_loss_mlp": 1.01709199, + "epoch": 0.6946039380730498, + "flos": 33325295224320.0, + "grad_norm": 1.5285701443228237, + "language_loss": 0.68160129, + "learning_rate": 9.010121727859117e-07, + "loss": 0.70256501, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 11553, + "time_per_iteration": 2.459977388381958 + }, + { + "auxiliary_loss_clip": 0.0105463, + "auxiliary_loss_mlp": 0.01042969, + "balance_loss_clip": 1.01605153, + "balance_loss_mlp": 1.01690841, + "epoch": 0.6946640613257177, + "flos": 20849885435520.0, + "grad_norm": 1.857417230654762, + "language_loss": 0.80495417, + "learning_rate": 9.006867992782195e-07, + "loss": 0.82593012, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37695312, + "step": 11554, + "time_per_iteration": 2.4154860973358154 + }, + { + "auxiliary_loss_clip": 0.01054613, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.01535511, + "balance_loss_mlp": 1.01688063, + "epoch": 0.6947241845783857, + "flos": 19353822232320.0, + "grad_norm": 2.0491083285994507, + "language_loss": 0.73560435, + "learning_rate": 9.003614674565934e-07, + "loss": 0.75654876, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37695312, + "step": 11555, + "time_per_iteration": 2.3440942764282227 + }, + { + "auxiliary_loss_clip": 0.0105274, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.0122273, + "balance_loss_mlp": 1.01703429, + "epoch": 0.6947843078310536, + "flos": 27119169521280.0, + "grad_norm": 1.707371308402884, + "language_loss": 0.78913736, + "learning_rate": 9.000361773333705e-07, + "loss": 0.81002712, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 11556, + "time_per_iteration": 2.5135788917541504 + }, + { + "auxiliary_loss_clip": 0.01053462, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.01313818, + "balance_loss_mlp": 1.01634407, + "epoch": 0.6948444310837216, + "flos": 28583845545600.0, + "grad_norm": 3.6184923015996877, + "language_loss": 0.62668383, + "learning_rate": 8.997109289208869e-07, + "loss": 0.64758015, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37109375, + "step": 11557, + "time_per_iteration": 2.4196648597717285 + }, + { + "auxiliary_loss_clip": 0.0105175, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.01731515, + "balance_loss_mlp": 1.01612246, + "epoch": 0.6949045543363896, + "flos": 15668355669120.0, + "grad_norm": 2.8992106941674667, + "language_loss": 0.86499155, + "learning_rate": 8.993857222314752e-07, + "loss": 0.88591832, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 11558, + "time_per_iteration": 2.3283145427703857 + }, + { + "auxiliary_loss_clip": 0.01054136, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01217914, + "balance_loss_mlp": 1.01690412, + "epoch": 0.6949646775890576, + "flos": 23258264042880.0, + "grad_norm": 1.467116628542879, + "language_loss": 0.7149756, + "learning_rate": 8.990605572774664e-07, + "loss": 0.73587275, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 11559, + "time_per_iteration": 2.3957700729370117 + }, + { + "auxiliary_loss_clip": 0.0105213, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.01778865, + "balance_loss_mlp": 1.01646686, + "epoch": 0.6950248008417256, + "flos": 22381455358080.0, + "grad_norm": 2.015518713171025, + "language_loss": 0.80035937, + "learning_rate": 8.987354340711921e-07, + "loss": 0.82127678, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 11560, + "time_per_iteration": 2.3796913623809814 + }, + { + "auxiliary_loss_clip": 0.01051527, + "auxiliary_loss_mlp": 0.01040502, + "balance_loss_clip": 1.01849639, + "balance_loss_mlp": 1.01674032, + "epoch": 0.6950849240943935, + "flos": 23476226860800.0, + "grad_norm": 1.6898196034068835, + "language_loss": 0.77923167, + "learning_rate": 8.9841035262498e-07, + "loss": 0.80015194, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 11561, + "time_per_iteration": 2.397343158721924 + }, + { + "auxiliary_loss_clip": 0.01051078, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.01620603, + "balance_loss_mlp": 1.01472545, + "epoch": 0.6951450473470615, + "flos": 17419599066240.0, + "grad_norm": 2.3780493968595393, + "language_loss": 0.79800677, + "learning_rate": 8.980853129511577e-07, + "loss": 0.81895345, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.36328125, + "step": 11562, + "time_per_iteration": 2.3227412700653076 + }, + { + "auxiliary_loss_clip": 0.01053656, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.01506829, + "balance_loss_mlp": 1.01640046, + "epoch": 0.6952051705997294, + "flos": 20484693947520.0, + "grad_norm": 1.9896241968635169, + "language_loss": 0.70326614, + "learning_rate": 8.977603150620515e-07, + "loss": 0.72419339, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37304688, + "step": 11563, + "time_per_iteration": 2.3537721633911133 + }, + { + "auxiliary_loss_clip": 0.0104889, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.01318765, + "balance_loss_mlp": 1.01470327, + "epoch": 0.6952652938523974, + "flos": 13988719203840.0, + "grad_norm": 2.483716085772271, + "language_loss": 0.75223881, + "learning_rate": 8.974353589699846e-07, + "loss": 0.77307826, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 11564, + "time_per_iteration": 2.3247358798980713 + }, + { + "auxiliary_loss_clip": 0.01060664, + "auxiliary_loss_mlp": 0.01044232, + "balance_loss_clip": 1.01452541, + "balance_loss_mlp": 1.01928401, + "epoch": 0.6953254171050653, + "flos": 30952702627200.0, + "grad_norm": 2.1352724731006476, + "language_loss": 0.74116719, + "learning_rate": 8.971104446872785e-07, + "loss": 0.76221615, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.4140625, + "step": 11565, + "time_per_iteration": 3.739166736602783 + }, + { + "auxiliary_loss_clip": 0.01008751, + "auxiliary_loss_mlp": 0.01005758, + "balance_loss_clip": 1.00339794, + "balance_loss_mlp": 1.00162244, + "epoch": 0.6953855403577334, + "flos": 61667261804160.0, + "grad_norm": 0.8967487992596903, + "language_loss": 0.58546269, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60560775, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07128906, + "step": 11566, + "time_per_iteration": 2.8646254539489746 + }, + { + "auxiliary_loss_clip": 0.01054958, + "auxiliary_loss_mlp": 0.01043455, + "balance_loss_clip": 1.01700282, + "balance_loss_mlp": 1.01521158, + "epoch": 0.6954456636104013, + "flos": 23037927252480.0, + "grad_norm": 1.873272202675458, + "language_loss": 0.75109261, + "learning_rate": 8.964607415992338e-07, + "loss": 0.77207673, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3984375, + "step": 11567, + "time_per_iteration": 2.3713159561157227 + }, + { + "auxiliary_loss_clip": 0.01051096, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.01485598, + "balance_loss_mlp": 1.0155617, + "epoch": 0.6955057868630693, + "flos": 23917284466560.0, + "grad_norm": 1.2990935562318244, + "language_loss": 0.77224946, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79314697, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 11568, + "time_per_iteration": 3.7449910640716553 + }, + { + "auxiliary_loss_clip": 0.01054957, + "auxiliary_loss_mlp": 0.01037528, + "balance_loss_clip": 1.01490188, + "balance_loss_mlp": 1.01867414, + "epoch": 0.6955659101157372, + "flos": 22593727624320.0, + "grad_norm": 1.7162374412704953, + "language_loss": 0.73217404, + "learning_rate": 8.958112058964649e-07, + "loss": 0.75309891, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 11569, + "time_per_iteration": 3.808018445968628 + }, + { + "auxiliary_loss_clip": 0.01055036, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.01502991, + "balance_loss_mlp": 1.01773596, + "epoch": 0.6956260333684052, + "flos": 24571347477120.0, + "grad_norm": 1.5833263591246132, + "language_loss": 0.77963126, + "learning_rate": 8.954865008453471e-07, + "loss": 0.80057454, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 11570, + "time_per_iteration": 2.387924909591675 + }, + { + "auxiliary_loss_clip": 0.01052277, + "auxiliary_loss_mlp": 0.01038352, + "balance_loss_clip": 1.01366329, + "balance_loss_mlp": 1.01566505, + "epoch": 0.6956861566210732, + "flos": 25844944296960.0, + "grad_norm": 1.8163462212446806, + "language_loss": 0.75876367, + "learning_rate": 8.95161837677493e-07, + "loss": 0.77966994, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 11571, + "time_per_iteration": 2.4393911361694336 + }, + { + "auxiliary_loss_clip": 0.0104984, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.01356995, + "balance_loss_mlp": 1.01535988, + "epoch": 0.6957462798737412, + "flos": 15300580740480.0, + "grad_norm": 2.241990276879471, + "language_loss": 0.75529879, + "learning_rate": 8.948372164052118e-07, + "loss": 0.77614874, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 11572, + "time_per_iteration": 2.33640718460083 + }, + { + "auxiliary_loss_clip": 0.01052861, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.01002812, + "balance_loss_mlp": 1.0160948, + "epoch": 0.6958064031264092, + "flos": 36245360851200.0, + "grad_norm": 1.855673932407113, + "language_loss": 0.71470255, + "learning_rate": 8.94512637040814e-07, + "loss": 0.73555982, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 11573, + "time_per_iteration": 2.511404275894165 + }, + { + "auxiliary_loss_clip": 0.01055492, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.01580119, + "balance_loss_mlp": 1.0175581, + "epoch": 0.6958665263790771, + "flos": 19207710725760.0, + "grad_norm": 1.7300041225976708, + "language_loss": 0.7612859, + "learning_rate": 8.941880995966095e-07, + "loss": 0.78225505, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 11574, + "time_per_iteration": 2.366525888442993 + }, + { + "auxiliary_loss_clip": 0.01053295, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.01281953, + "balance_loss_mlp": 1.01668727, + "epoch": 0.6959266496317451, + "flos": 21794844827520.0, + "grad_norm": 1.6027373067451658, + "language_loss": 0.75645202, + "learning_rate": 8.938636040849014e-07, + "loss": 0.77733672, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3671875, + "step": 11575, + "time_per_iteration": 2.3972008228302 + }, + { + "auxiliary_loss_clip": 0.01052382, + "auxiliary_loss_mlp": 0.01037984, + "balance_loss_clip": 1.0131166, + "balance_loss_mlp": 1.01559424, + "epoch": 0.695986772884413, + "flos": 20557208574720.0, + "grad_norm": 1.91399863718846, + "language_loss": 0.80278832, + "learning_rate": 8.935391505179966e-07, + "loss": 0.82369196, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 11576, + "time_per_iteration": 2.3558080196380615 + }, + { + "auxiliary_loss_clip": 0.0105455, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.01297808, + "balance_loss_mlp": 1.01688349, + "epoch": 0.696046896137081, + "flos": 14935424163840.0, + "grad_norm": 2.4174822470824457, + "language_loss": 0.5787378, + "learning_rate": 8.932147389081985e-07, + "loss": 0.59966701, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 11577, + "time_per_iteration": 2.323350667953491 + }, + { + "auxiliary_loss_clip": 0.01050072, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.01324594, + "balance_loss_mlp": 1.01555908, + "epoch": 0.696107019389749, + "flos": 30738824438400.0, + "grad_norm": 1.5175528897674602, + "language_loss": 0.77425075, + "learning_rate": 8.928903692678081e-07, + "loss": 0.79508454, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.34570312, + "step": 11578, + "time_per_iteration": 2.464705467224121 + }, + { + "auxiliary_loss_clip": 0.01053925, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.01411891, + "balance_loss_mlp": 1.01746643, + "epoch": 0.696167142642417, + "flos": 20775695063040.0, + "grad_norm": 1.8427457013325552, + "language_loss": 0.80399108, + "learning_rate": 8.925660416091254e-07, + "loss": 0.82489836, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36523438, + "step": 11579, + "time_per_iteration": 2.3513641357421875 + }, + { + "auxiliary_loss_clip": 0.01051056, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.012568, + "balance_loss_mlp": 1.01513803, + "epoch": 0.6962272658950849, + "flos": 22564051102080.0, + "grad_norm": 1.8541760369824505, + "language_loss": 0.73697233, + "learning_rate": 8.922417559444502e-07, + "loss": 0.75784695, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 11580, + "time_per_iteration": 3.825021743774414 + }, + { + "auxiliary_loss_clip": 0.01052627, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.01090956, + "balance_loss_mlp": 1.01609635, + "epoch": 0.6962873891477529, + "flos": 22199069082240.0, + "grad_norm": 1.8628750519224015, + "language_loss": 0.6714623, + "learning_rate": 8.919175122860787e-07, + "loss": 0.69233412, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 11581, + "time_per_iteration": 2.3771555423736572 + }, + { + "auxiliary_loss_clip": 0.01053543, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.01174498, + "balance_loss_mlp": 1.01704204, + "epoch": 0.6963475124004208, + "flos": 12489025219200.0, + "grad_norm": 2.4860546287487995, + "language_loss": 0.77812243, + "learning_rate": 8.915933106463056e-07, + "loss": 0.79899639, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36523438, + "step": 11582, + "time_per_iteration": 2.3225293159484863 + }, + { + "auxiliary_loss_clip": 0.01051644, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.01276338, + "balance_loss_mlp": 1.01536644, + "epoch": 0.6964076356530888, + "flos": 17164139581440.0, + "grad_norm": 2.7401986135400924, + "language_loss": 0.70797729, + "learning_rate": 8.91269151037425e-07, + "loss": 0.7288475, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 11583, + "time_per_iteration": 2.3888485431671143 + }, + { + "auxiliary_loss_clip": 0.01054069, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.01488471, + "balance_loss_mlp": 1.01799941, + "epoch": 0.6964677589057569, + "flos": 19936313222400.0, + "grad_norm": 2.108932441149312, + "language_loss": 0.83433253, + "learning_rate": 8.909450334717301e-07, + "loss": 0.85525775, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 11584, + "time_per_iteration": 2.3491950035095215 + }, + { + "auxiliary_loss_clip": 0.0105487, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.01147723, + "balance_loss_mlp": 1.01739788, + "epoch": 0.6965278821584248, + "flos": 22782956526720.0, + "grad_norm": 2.323242599946581, + "language_loss": 0.81395125, + "learning_rate": 8.906209579615107e-07, + "loss": 0.83487189, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 11585, + "time_per_iteration": 2.38248872756958 + }, + { + "auxiliary_loss_clip": 0.01050525, + "auxiliary_loss_mlp": 0.01035191, + "balance_loss_clip": 1.01324487, + "balance_loss_mlp": 1.01602852, + "epoch": 0.6965880054110928, + "flos": 20046533984640.0, + "grad_norm": 1.6123779950085897, + "language_loss": 0.79137683, + "learning_rate": 8.90296924519055e-07, + "loss": 0.81223398, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34570312, + "step": 11586, + "time_per_iteration": 2.376453161239624 + }, + { + "auxiliary_loss_clip": 0.0104794, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.01440382, + "balance_loss_mlp": 1.01522136, + "epoch": 0.6966481286637607, + "flos": 21907160271360.0, + "grad_norm": 1.5638269576469919, + "language_loss": 0.79637337, + "learning_rate": 8.899729331566519e-07, + "loss": 0.8171922, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.328125, + "step": 11587, + "time_per_iteration": 2.3909571170806885 + }, + { + "auxiliary_loss_clip": 0.01050835, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.01405525, + "balance_loss_mlp": 1.01640391, + "epoch": 0.6967082519164287, + "flos": 15632255456640.0, + "grad_norm": 1.9309067872880417, + "language_loss": 0.74090803, + "learning_rate": 8.896489838865857e-07, + "loss": 0.76178229, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34375, + "step": 11588, + "time_per_iteration": 2.350559949874878 + }, + { + "auxiliary_loss_clip": 0.01050887, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.01173091, + "balance_loss_mlp": 1.01564848, + "epoch": 0.6967683751690966, + "flos": 24023455511040.0, + "grad_norm": 1.7213783218890435, + "language_loss": 0.76645297, + "learning_rate": 8.893250767211413e-07, + "loss": 0.78728515, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.3515625, + "step": 11589, + "time_per_iteration": 2.3964054584503174 + }, + { + "auxiliary_loss_clip": 0.01052819, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.01202524, + "balance_loss_mlp": 1.01623726, + "epoch": 0.6968284984217646, + "flos": 31023506597760.0, + "grad_norm": 2.0513123346592765, + "language_loss": 0.6500628, + "learning_rate": 8.890012116726012e-07, + "loss": 0.67094034, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 11590, + "time_per_iteration": 2.4435911178588867 + }, + { + "auxiliary_loss_clip": 0.01008463, + "auxiliary_loss_mlp": 0.01005014, + "balance_loss_clip": 1.00243902, + "balance_loss_mlp": 1.00149894, + "epoch": 0.6968886216744326, + "flos": 67619673590400.0, + "grad_norm": 0.7559601346058112, + "language_loss": 0.61363077, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63376546, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.06933594, + "step": 11591, + "time_per_iteration": 3.0924127101898193 + }, + { + "auxiliary_loss_clip": 0.01053749, + "auxiliary_loss_mlp": 0.0103689, + "balance_loss_clip": 1.01321507, + "balance_loss_mlp": 1.0168519, + "epoch": 0.6969487449271006, + "flos": 24862523149440.0, + "grad_norm": 1.6961223723905692, + "language_loss": 0.70548522, + "learning_rate": 8.883536079753582e-07, + "loss": 0.72639167, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 11592, + "time_per_iteration": 2.385193347930908 + }, + { + "auxiliary_loss_clip": 0.01050862, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.01166344, + "balance_loss_mlp": 1.01627302, + "epoch": 0.6970088681797685, + "flos": 28766580935040.0, + "grad_norm": 1.6868821010494794, + "language_loss": 0.63442284, + "learning_rate": 8.880298693512109e-07, + "loss": 0.65527093, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 11593, + "time_per_iteration": 2.459378719329834 + }, + { + "auxiliary_loss_clip": 0.01049411, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.00873137, + "balance_loss_mlp": 1.0161531, + "epoch": 0.6970689914324365, + "flos": 27307316171520.0, + "grad_norm": 1.4571851987143956, + "language_loss": 0.55676532, + "learning_rate": 8.877061728930832e-07, + "loss": 0.57755184, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33203125, + "step": 11594, + "time_per_iteration": 2.40832257270813 + }, + { + "auxiliary_loss_clip": 0.01051507, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.01248837, + "balance_loss_mlp": 1.01622808, + "epoch": 0.6971291146851044, + "flos": 19135231009920.0, + "grad_norm": 2.4614175461899492, + "language_loss": 0.78304631, + "learning_rate": 8.87382518613248e-07, + "loss": 0.80390704, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 11595, + "time_per_iteration": 2.358025312423706 + }, + { + "auxiliary_loss_clip": 0.0105458, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.01545811, + "balance_loss_mlp": 1.01763475, + "epoch": 0.6971892379377724, + "flos": 14609649467520.0, + "grad_norm": 2.2759913012179487, + "language_loss": 0.72565949, + "learning_rate": 8.870589065239793e-07, + "loss": 0.74659228, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 11596, + "time_per_iteration": 2.3428988456726074 + }, + { + "auxiliary_loss_clip": 0.01053255, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.02011335, + "balance_loss_mlp": 1.01709199, + "epoch": 0.6972493611904405, + "flos": 22306427112960.0, + "grad_norm": 1.969685580749523, + "language_loss": 0.76872182, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78970349, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 11597, + "time_per_iteration": 2.37727427482605 + }, + { + "auxiliary_loss_clip": 0.01050605, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.01481342, + "balance_loss_mlp": 1.0155859, + "epoch": 0.6973094844431084, + "flos": 17419424509440.0, + "grad_norm": 1.8621659776966977, + "language_loss": 0.76309967, + "learning_rate": 8.864118089662267e-07, + "loss": 0.78396147, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34960938, + "step": 11598, + "time_per_iteration": 2.3557000160217285 + }, + { + "auxiliary_loss_clip": 0.01053884, + "auxiliary_loss_mlp": 0.01040364, + "balance_loss_clip": 1.01581919, + "balance_loss_mlp": 1.01642501, + "epoch": 0.6973696076957764, + "flos": 27234138228480.0, + "grad_norm": 1.778545507796275, + "language_loss": 0.90925097, + "learning_rate": 8.860883235222791e-07, + "loss": 0.93019348, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 11599, + "time_per_iteration": 2.396793842315674 + }, + { + "auxiliary_loss_clip": 0.0105599, + "auxiliary_loss_mlp": 0.01046434, + "balance_loss_clip": 1.01979101, + "balance_loss_mlp": 1.01768637, + "epoch": 0.6974297309484443, + "flos": 22016997008640.0, + "grad_norm": 1.830212777454065, + "language_loss": 0.7096566, + "learning_rate": 8.85764880317974e-07, + "loss": 0.73068082, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 11600, + "time_per_iteration": 2.3984360694885254 + }, + { + "auxiliary_loss_clip": 0.01052319, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.01518297, + "balance_loss_mlp": 1.01559258, + "epoch": 0.6974898542011123, + "flos": 28365184500480.0, + "grad_norm": 1.68095654370787, + "language_loss": 0.77759576, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79849923, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 11601, + "time_per_iteration": 2.4187092781066895 + }, + { + "auxiliary_loss_clip": 0.01049317, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.01714492, + "balance_loss_mlp": 1.01526141, + "epoch": 0.6975499774537802, + "flos": 15231138312960.0, + "grad_norm": 1.695083918114348, + "language_loss": 0.73400986, + "learning_rate": 8.851181206773508e-07, + "loss": 0.75487775, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33984375, + "step": 11602, + "time_per_iteration": 2.3627638816833496 + }, + { + "auxiliary_loss_clip": 0.0105175, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.01271462, + "balance_loss_mlp": 1.0159936, + "epoch": 0.6976101007064482, + "flos": 22156510268160.0, + "grad_norm": 2.153134496457893, + "language_loss": 0.77250421, + "learning_rate": 8.847948042655567e-07, + "loss": 0.79337192, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 11603, + "time_per_iteration": 2.3522982597351074 + }, + { + "auxiliary_loss_clip": 0.01051509, + "auxiliary_loss_mlp": 0.01039436, + "balance_loss_clip": 1.01571321, + "balance_loss_mlp": 1.01517427, + "epoch": 0.6976702239591162, + "flos": 22272421582080.0, + "grad_norm": 1.7172762318889199, + "language_loss": 0.62932587, + "learning_rate": 8.844715301424557e-07, + "loss": 0.65023535, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36328125, + "step": 11604, + "time_per_iteration": 3.639749765396118 + }, + { + "auxiliary_loss_clip": 0.01052163, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.01583099, + "balance_loss_mlp": 1.01596582, + "epoch": 0.6977303472117842, + "flos": 25847423003520.0, + "grad_norm": 3.4041046785085847, + "language_loss": 0.81933475, + "learning_rate": 8.841482983203057e-07, + "loss": 0.8402608, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 11605, + "time_per_iteration": 2.420377016067505 + }, + { + "auxiliary_loss_clip": 0.01051642, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.01352024, + "balance_loss_mlp": 1.01583934, + "epoch": 0.6977904704644521, + "flos": 20958535186560.0, + "grad_norm": 1.6572227326874687, + "language_loss": 0.71576542, + "learning_rate": 8.838251088113638e-07, + "loss": 0.73664069, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 11606, + "time_per_iteration": 2.3563010692596436 + }, + { + "auxiliary_loss_clip": 0.01053801, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.01348245, + "balance_loss_mlp": 1.01682091, + "epoch": 0.6978505937171201, + "flos": 22053935093760.0, + "grad_norm": 1.9572113513859628, + "language_loss": 0.83534712, + "learning_rate": 8.835019616278856e-07, + "loss": 0.85625595, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 11607, + "time_per_iteration": 2.38275146484375 + }, + { + "auxiliary_loss_clip": 0.01054739, + "auxiliary_loss_mlp": 0.01042091, + "balance_loss_clip": 1.01821351, + "balance_loss_mlp": 1.01742339, + "epoch": 0.697910716969788, + "flos": 20042798469120.0, + "grad_norm": 1.8769507605945905, + "language_loss": 0.80212808, + "learning_rate": 8.831788567821265e-07, + "loss": 0.82309639, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 11608, + "time_per_iteration": 3.74326753616333 + }, + { + "auxiliary_loss_clip": 0.01051946, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.01359367, + "balance_loss_mlp": 1.01650035, + "epoch": 0.697970840222456, + "flos": 15887330916480.0, + "grad_norm": 1.966320485106676, + "language_loss": 0.91428769, + "learning_rate": 8.828557942863357e-07, + "loss": 0.93517113, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 11609, + "time_per_iteration": 3.709801435470581 + }, + { + "auxiliary_loss_clip": 0.01052633, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.01368809, + "balance_loss_mlp": 1.01542008, + "epoch": 0.698030963475124, + "flos": 21214553253120.0, + "grad_norm": 1.5866161236653757, + "language_loss": 0.65249598, + "learning_rate": 8.82532774152765e-07, + "loss": 0.67339194, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 11610, + "time_per_iteration": 2.360164165496826 + }, + { + "auxiliary_loss_clip": 0.01051783, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.01555538, + "balance_loss_mlp": 1.01644874, + "epoch": 0.698091086727792, + "flos": 33758497774080.0, + "grad_norm": 2.0159556774404424, + "language_loss": 0.85570145, + "learning_rate": 8.822097963936643e-07, + "loss": 0.87659323, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 11611, + "time_per_iteration": 2.4812636375427246 + }, + { + "auxiliary_loss_clip": 0.01052301, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.01514244, + "balance_loss_mlp": 1.01566696, + "epoch": 0.69815120998046, + "flos": 15886946891520.0, + "grad_norm": 1.940930807163004, + "language_loss": 0.72228658, + "learning_rate": 8.818868610212793e-07, + "loss": 0.74319077, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 11612, + "time_per_iteration": 2.3347463607788086 + }, + { + "auxiliary_loss_clip": 0.01050338, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.01783717, + "balance_loss_mlp": 1.01575708, + "epoch": 0.6982113332331279, + "flos": 18946211575680.0, + "grad_norm": 1.4985155717040275, + "language_loss": 0.82066548, + "learning_rate": 8.815639680478573e-07, + "loss": 0.84157133, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 11613, + "time_per_iteration": 2.4345881938934326 + }, + { + "auxiliary_loss_clip": 0.01050306, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.01006341, + "balance_loss_mlp": 1.01645041, + "epoch": 0.6982714564857959, + "flos": 24388437530880.0, + "grad_norm": 2.026493467018975, + "language_loss": 0.76821673, + "learning_rate": 8.812411174856411e-07, + "loss": 0.78903109, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33789062, + "step": 11614, + "time_per_iteration": 2.4006752967834473 + }, + { + "auxiliary_loss_clip": 0.01054306, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.01292253, + "balance_loss_mlp": 1.01798868, + "epoch": 0.6983315797384638, + "flos": 20082704019840.0, + "grad_norm": 2.7625709647120655, + "language_loss": 0.79290974, + "learning_rate": 8.809183093468746e-07, + "loss": 0.81379247, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.36328125, + "step": 11615, + "time_per_iteration": 2.419020891189575 + }, + { + "auxiliary_loss_clip": 0.01049267, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.01773679, + "balance_loss_mlp": 1.01533461, + "epoch": 0.6983917029911318, + "flos": 13511701031040.0, + "grad_norm": 2.4175594533015867, + "language_loss": 0.73687935, + "learning_rate": 8.80595543643797e-07, + "loss": 0.7577616, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33984375, + "step": 11616, + "time_per_iteration": 2.3636999130249023 + }, + { + "auxiliary_loss_clip": 0.01051111, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.01773047, + "balance_loss_mlp": 1.01655447, + "epoch": 0.6984518262437998, + "flos": 22017311210880.0, + "grad_norm": 1.7883712298753913, + "language_loss": 0.85105097, + "learning_rate": 8.802728203886487e-07, + "loss": 0.87195718, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34570312, + "step": 11617, + "time_per_iteration": 2.4609696865081787 + }, + { + "auxiliary_loss_clip": 0.0105371, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.01934075, + "balance_loss_mlp": 1.01708412, + "epoch": 0.6985119494964678, + "flos": 18769620585600.0, + "grad_norm": 2.213873187580799, + "language_loss": 0.60273886, + "learning_rate": 8.799501395936682e-07, + "loss": 0.62369597, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 11618, + "time_per_iteration": 2.4006450176239014 + }, + { + "auxiliary_loss_clip": 0.0105148, + "auxiliary_loss_mlp": 0.01039177, + "balance_loss_clip": 1.01761174, + "balance_loss_mlp": 1.01616359, + "epoch": 0.6985720727491357, + "flos": 22381734648960.0, + "grad_norm": 1.7803714561394302, + "language_loss": 0.83944619, + "learning_rate": 8.796275012710903e-07, + "loss": 0.86035275, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35351562, + "step": 11619, + "time_per_iteration": 3.872917890548706 + }, + { + "auxiliary_loss_clip": 0.01047816, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.01534605, + "balance_loss_mlp": 1.01447678, + "epoch": 0.6986321960018037, + "flos": 39566299242240.0, + "grad_norm": 1.937177192152036, + "language_loss": 0.68072248, + "learning_rate": 8.793049054331494e-07, + "loss": 0.7015484, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.33398438, + "step": 11620, + "time_per_iteration": 2.56880784034729 + }, + { + "auxiliary_loss_clip": 0.01053126, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.01343417, + "balance_loss_mlp": 1.01608884, + "epoch": 0.6986923192544716, + "flos": 17966757893760.0, + "grad_norm": 2.080679860087928, + "language_loss": 0.73882598, + "learning_rate": 8.789823520920794e-07, + "loss": 0.75973392, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 11621, + "time_per_iteration": 2.364098310470581 + }, + { + "auxiliary_loss_clip": 0.01053412, + "auxiliary_loss_mlp": 0.01041538, + "balance_loss_clip": 1.01905489, + "balance_loss_mlp": 1.01641786, + "epoch": 0.6987524425071396, + "flos": 25593115593600.0, + "grad_norm": 1.6028099462176315, + "language_loss": 0.69929129, + "learning_rate": 8.7865984126011e-07, + "loss": 0.72024071, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37109375, + "step": 11622, + "time_per_iteration": 2.44592547416687 + }, + { + "auxiliary_loss_clip": 0.01048771, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.01373076, + "balance_loss_mlp": 1.01501608, + "epoch": 0.6988125657598077, + "flos": 17529121601280.0, + "grad_norm": 1.6909221715128162, + "language_loss": 0.63526857, + "learning_rate": 8.783373729494721e-07, + "loss": 0.65611136, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.33789062, + "step": 11623, + "time_per_iteration": 2.3564870357513428 + }, + { + "auxiliary_loss_clip": 0.01052982, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.0132854, + "balance_loss_mlp": 1.01539755, + "epoch": 0.6988726890124756, + "flos": 39164169669120.0, + "grad_norm": 1.7662552665025708, + "language_loss": 0.61916733, + "learning_rate": 8.780149471723932e-07, + "loss": 0.64005733, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.37695312, + "step": 11624, + "time_per_iteration": 2.5177547931671143 + }, + { + "auxiliary_loss_clip": 0.01053799, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_clip": 1.02116299, + "balance_loss_mlp": 1.0165751, + "epoch": 0.6989328122651436, + "flos": 20192436023040.0, + "grad_norm": 1.9220385006714453, + "language_loss": 0.7941035, + "learning_rate": 8.776925639411017e-07, + "loss": 0.81511748, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37109375, + "step": 11625, + "time_per_iteration": 2.354222059249878 + }, + { + "auxiliary_loss_clip": 0.01049788, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.01136589, + "balance_loss_mlp": 1.01543176, + "epoch": 0.6989929355178115, + "flos": 21833807771520.0, + "grad_norm": 1.8709478912433393, + "language_loss": 0.67617595, + "learning_rate": 8.773702232678188e-07, + "loss": 0.69698983, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34375, + "step": 11626, + "time_per_iteration": 2.378319501876831 + }, + { + "auxiliary_loss_clip": 0.01053093, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.01776767, + "balance_loss_mlp": 1.0168283, + "epoch": 0.6990530587704795, + "flos": 26321683178880.0, + "grad_norm": 2.168284421833644, + "language_loss": 0.71998459, + "learning_rate": 8.770479251647697e-07, + "loss": 0.74093521, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 11627, + "time_per_iteration": 2.4146807193756104 + }, + { + "auxiliary_loss_clip": 0.01051709, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.01330924, + "balance_loss_mlp": 1.01745558, + "epoch": 0.6991131820231474, + "flos": 19827942762240.0, + "grad_norm": 2.1061698962593645, + "language_loss": 0.6348002, + "learning_rate": 8.767256696441768e-07, + "loss": 0.6556499, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.34179688, + "step": 11628, + "time_per_iteration": 2.377227544784546 + }, + { + "auxiliary_loss_clip": 0.0105289, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.01672769, + "balance_loss_mlp": 1.01664519, + "epoch": 0.6991733052758154, + "flos": 33983407952640.0, + "grad_norm": 1.9618913309861241, + "language_loss": 0.69874573, + "learning_rate": 8.764034567182581e-07, + "loss": 0.71966815, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 11629, + "time_per_iteration": 2.5003819465637207 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.01522732, + "balance_loss_mlp": 1.01654291, + "epoch": 0.6992334285284834, + "flos": 15632220545280.0, + "grad_norm": 1.6736693820148398, + "language_loss": 0.73622537, + "learning_rate": 8.760812863992337e-07, + "loss": 0.75712347, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 11630, + "time_per_iteration": 2.351524591445923 + }, + { + "auxiliary_loss_clip": 0.01050674, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.01564717, + "balance_loss_mlp": 1.01609206, + "epoch": 0.6992935517811514, + "flos": 21725192931840.0, + "grad_norm": 1.9047625800670134, + "language_loss": 0.75181299, + "learning_rate": 8.757591586993196e-07, + "loss": 0.77269661, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34570312, + "step": 11631, + "time_per_iteration": 2.3895297050476074 + }, + { + "auxiliary_loss_clip": 0.01055257, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.01209331, + "balance_loss_mlp": 1.0181793, + "epoch": 0.6993536750338193, + "flos": 20114370489600.0, + "grad_norm": 2.0447692505600825, + "language_loss": 0.90306181, + "learning_rate": 8.7543707363073e-07, + "loss": 0.92398417, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 11632, + "time_per_iteration": 2.3458268642425537 + }, + { + "auxiliary_loss_clip": 0.01053149, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.01760924, + "balance_loss_mlp": 1.01671338, + "epoch": 0.6994137982864873, + "flos": 22009665623040.0, + "grad_norm": 1.605118778338367, + "language_loss": 0.80616403, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82709157, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 11633, + "time_per_iteration": 2.412066698074341 + }, + { + "auxiliary_loss_clip": 0.01054897, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.01486635, + "balance_loss_mlp": 1.01705873, + "epoch": 0.6994739215391552, + "flos": 25517877880320.0, + "grad_norm": 1.8589679345341321, + "language_loss": 0.68695641, + "learning_rate": 8.747930314363794e-07, + "loss": 0.70790726, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 11634, + "time_per_iteration": 2.4107253551483154 + }, + { + "auxiliary_loss_clip": 0.01009069, + "auxiliary_loss_mlp": 0.01002702, + "balance_loss_clip": 1.00018644, + "balance_loss_mlp": 1.00208354, + "epoch": 0.6995340447918232, + "flos": 59125095400320.0, + "grad_norm": 0.6864107652113981, + "language_loss": 0.531546, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55166364, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.06982422, + "step": 11635, + "time_per_iteration": 3.1277272701263428 + }, + { + "auxiliary_loss_clip": 0.01052414, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.01294255, + "balance_loss_mlp": 1.01615739, + "epoch": 0.6995941680444913, + "flos": 17966862627840.0, + "grad_norm": 1.8063572211320686, + "language_loss": 0.8227706, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84365839, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 11636, + "time_per_iteration": 2.3450889587402344 + }, + { + "auxiliary_loss_clip": 0.01053395, + "auxiliary_loss_mlp": 0.01041973, + "balance_loss_clip": 1.01772594, + "balance_loss_mlp": 1.01597524, + "epoch": 0.6996542912971592, + "flos": 21979046494080.0, + "grad_norm": 3.6704368814587904, + "language_loss": 0.84197581, + "learning_rate": 8.738272881850801e-07, + "loss": 0.86292946, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 11637, + "time_per_iteration": 2.3990094661712646 + }, + { + "auxiliary_loss_clip": 0.01052056, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.00945103, + "balance_loss_mlp": 1.01601028, + "epoch": 0.6997144145498272, + "flos": 11685534122880.0, + "grad_norm": 2.3280504823733676, + "language_loss": 0.68976474, + "learning_rate": 8.735054591608704e-07, + "loss": 0.71061003, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 11638, + "time_per_iteration": 2.365551471710205 + }, + { + "auxiliary_loss_clip": 0.01055083, + "auxiliary_loss_mlp": 0.01045323, + "balance_loss_clip": 1.0182029, + "balance_loss_mlp": 1.01699924, + "epoch": 0.6997745378024951, + "flos": 29605858041600.0, + "grad_norm": 11.855144041182964, + "language_loss": 0.79052615, + "learning_rate": 8.731836728534459e-07, + "loss": 0.81153023, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38085938, + "step": 11639, + "time_per_iteration": 2.4729456901550293 + }, + { + "auxiliary_loss_clip": 0.01053782, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_clip": 1.01886892, + "balance_loss_mlp": 1.01728201, + "epoch": 0.6998346610551631, + "flos": 20885566711680.0, + "grad_norm": 2.214195161931057, + "language_loss": 0.84653401, + "learning_rate": 8.728619292750093e-07, + "loss": 0.86749828, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 11640, + "time_per_iteration": 2.381373882293701 + }, + { + "auxiliary_loss_clip": 0.01052229, + "auxiliary_loss_mlp": 0.01035805, + "balance_loss_clip": 1.01391816, + "balance_loss_mlp": 1.01579547, + "epoch": 0.699894784307831, + "flos": 27161798158080.0, + "grad_norm": 2.303276877340575, + "language_loss": 0.76479125, + "learning_rate": 8.725402284377619e-07, + "loss": 0.78567159, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 11641, + "time_per_iteration": 2.4226763248443604 + }, + { + "auxiliary_loss_clip": 0.01052765, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.00957274, + "balance_loss_mlp": 1.01650465, + "epoch": 0.699954907560499, + "flos": 20922574619520.0, + "grad_norm": 1.899966531906291, + "language_loss": 0.78706491, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80793476, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 11642, + "time_per_iteration": 2.3795344829559326 + }, + { + "auxiliary_loss_clip": 0.01056758, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.01508868, + "balance_loss_mlp": 1.01702726, + "epoch": 0.700015030813167, + "flos": 28656534729600.0, + "grad_norm": 4.47725169375736, + "language_loss": 0.7628879, + "learning_rate": 8.718969550356266e-07, + "loss": 0.78388709, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.3984375, + "step": 11643, + "time_per_iteration": 2.4311883449554443 + }, + { + "auxiliary_loss_clip": 0.01053808, + "auxiliary_loss_mlp": 0.01036132, + "balance_loss_clip": 1.01172972, + "balance_loss_mlp": 1.01658177, + "epoch": 0.700075154065835, + "flos": 29204007759360.0, + "grad_norm": 2.9454503166943606, + "language_loss": 0.61389166, + "learning_rate": 8.715753824951315e-07, + "loss": 0.63479114, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 11644, + "time_per_iteration": 3.7346742153167725 + }, + { + "auxiliary_loss_clip": 0.01052072, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.01154137, + "balance_loss_mlp": 1.01673627, + "epoch": 0.7001352773185029, + "flos": 23111314663680.0, + "grad_norm": 1.670992255123713, + "language_loss": 0.82716888, + "learning_rate": 8.712538527446119e-07, + "loss": 0.84804082, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 11645, + "time_per_iteration": 2.3787055015563965 + }, + { + "auxiliary_loss_clip": 0.01052196, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.01755857, + "balance_loss_mlp": 1.01556015, + "epoch": 0.7001954005711709, + "flos": 21321841461120.0, + "grad_norm": 1.8596787889723019, + "language_loss": 0.68985921, + "learning_rate": 8.709323657962584e-07, + "loss": 0.71080291, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 11646, + "time_per_iteration": 2.3904736042022705 + }, + { + "auxiliary_loss_clip": 0.01052906, + "auxiliary_loss_mlp": 0.01036257, + "balance_loss_clip": 1.01404881, + "balance_loss_mlp": 1.01672792, + "epoch": 0.7002555238238388, + "flos": 24534653771520.0, + "grad_norm": 1.529638644803851, + "language_loss": 0.72159827, + "learning_rate": 8.706109216622635e-07, + "loss": 0.74248993, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36328125, + "step": 11647, + "time_per_iteration": 3.8580915927886963 + }, + { + "auxiliary_loss_clip": 0.01055729, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.01420641, + "balance_loss_mlp": 1.01738393, + "epoch": 0.7003156470765068, + "flos": 39054996247680.0, + "grad_norm": 1.7755579417219047, + "language_loss": 0.72782332, + "learning_rate": 8.702895203548155e-07, + "loss": 0.74877453, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 11648, + "time_per_iteration": 2.540794849395752 + }, + { + "auxiliary_loss_clip": 0.01051891, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.01331961, + "balance_loss_mlp": 1.01567996, + "epoch": 0.7003757703291749, + "flos": 28802820792960.0, + "grad_norm": 1.515977300836846, + "language_loss": 0.7855401, + "learning_rate": 8.699681618861014e-07, + "loss": 0.80643922, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 11649, + "time_per_iteration": 3.8794047832489014 + }, + { + "auxiliary_loss_clip": 0.01053623, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_clip": 1.01799834, + "balance_loss_mlp": 1.01681304, + "epoch": 0.7004358935818428, + "flos": 15953142562560.0, + "grad_norm": 1.6478815874091366, + "language_loss": 0.79224283, + "learning_rate": 8.69646846268308e-07, + "loss": 0.81319684, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 11650, + "time_per_iteration": 2.3217427730560303 + }, + { + "auxiliary_loss_clip": 0.01052474, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.01315749, + "balance_loss_mlp": 1.01592875, + "epoch": 0.7004960168345108, + "flos": 20410957422720.0, + "grad_norm": 2.129889873976562, + "language_loss": 0.79411435, + "learning_rate": 8.693255735136194e-07, + "loss": 0.8150177, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36523438, + "step": 11651, + "time_per_iteration": 2.38931941986084 + }, + { + "auxiliary_loss_clip": 0.0105552, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.01307178, + "balance_loss_mlp": 1.01723444, + "epoch": 0.7005561400871787, + "flos": 17346595680000.0, + "grad_norm": 1.7432764385854966, + "language_loss": 0.7153815, + "learning_rate": 8.690043436342198e-07, + "loss": 0.73629969, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3828125, + "step": 11652, + "time_per_iteration": 2.379960775375366 + }, + { + "auxiliary_loss_clip": 0.01053514, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_clip": 1.01810563, + "balance_loss_mlp": 1.01696134, + "epoch": 0.7006162633398467, + "flos": 25300927491840.0, + "grad_norm": 1.6462672250919261, + "language_loss": 0.75166285, + "learning_rate": 8.686831566422874e-07, + "loss": 0.77262318, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 11653, + "time_per_iteration": 2.433234930038452 + }, + { + "auxiliary_loss_clip": 0.01053826, + "auxiliary_loss_mlp": 0.01039732, + "balance_loss_clip": 1.01325548, + "balance_loss_mlp": 1.01607072, + "epoch": 0.7006763865925146, + "flos": 20667918096000.0, + "grad_norm": 1.9462396809942573, + "language_loss": 0.72059357, + "learning_rate": 8.68362012550003e-07, + "loss": 0.74152923, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 11654, + "time_per_iteration": 2.3964052200317383 + }, + { + "auxiliary_loss_clip": 0.01054431, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.01204085, + "balance_loss_mlp": 1.01589751, + "epoch": 0.7007365098451827, + "flos": 20045451732480.0, + "grad_norm": 2.785867635805633, + "language_loss": 0.74970877, + "learning_rate": 8.680409113695453e-07, + "loss": 0.77064687, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 11655, + "time_per_iteration": 2.3460280895233154 + }, + { + "auxiliary_loss_clip": 0.01058287, + "auxiliary_loss_mlp": 0.01043544, + "balance_loss_clip": 1.01572084, + "balance_loss_mlp": 1.01820397, + "epoch": 0.7007966330978506, + "flos": 20776323467520.0, + "grad_norm": 2.0464353274645455, + "language_loss": 0.71654683, + "learning_rate": 8.677198531130889e-07, + "loss": 0.73756516, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.40039062, + "step": 11656, + "time_per_iteration": 2.392951488494873 + }, + { + "auxiliary_loss_clip": 0.01051346, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.01512074, + "balance_loss_mlp": 1.01537013, + "epoch": 0.7008567563505186, + "flos": 29637035752320.0, + "grad_norm": 1.5162099575520795, + "language_loss": 0.78773332, + "learning_rate": 8.673988377928092e-07, + "loss": 0.8086282, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 11657, + "time_per_iteration": 2.4203271865844727 + }, + { + "auxiliary_loss_clip": 0.01056203, + "auxiliary_loss_mlp": 0.01044444, + "balance_loss_clip": 1.01794362, + "balance_loss_mlp": 1.01703501, + "epoch": 0.7009168796031865, + "flos": 17091066372480.0, + "grad_norm": 2.1362766900840127, + "language_loss": 0.79599524, + "learning_rate": 8.670778654208797e-07, + "loss": 0.8170017, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 11658, + "time_per_iteration": 2.403447389602661 + }, + { + "auxiliary_loss_clip": 0.01051411, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.0128032, + "balance_loss_mlp": 1.01608396, + "epoch": 0.7009770028558545, + "flos": 20447930419200.0, + "grad_norm": 1.7403631691447954, + "language_loss": 0.83572906, + "learning_rate": 8.667569360094713e-07, + "loss": 0.85661256, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35351562, + "step": 11659, + "time_per_iteration": 3.785346031188965 + }, + { + "auxiliary_loss_clip": 0.0105034, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01059663, + "balance_loss_mlp": 1.01486742, + "epoch": 0.7010371261085224, + "flos": 19244125140480.0, + "grad_norm": 2.1222440042605752, + "language_loss": 0.71123558, + "learning_rate": 8.664360495707526e-07, + "loss": 0.73206902, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 11660, + "time_per_iteration": 2.3738138675689697 + }, + { + "auxiliary_loss_clip": 0.01054342, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.0176729, + "balance_loss_mlp": 1.01666152, + "epoch": 0.7010972493611904, + "flos": 22126484632320.0, + "grad_norm": 2.1979808825325815, + "language_loss": 0.82587749, + "learning_rate": 8.661152061168924e-07, + "loss": 0.84685272, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 11661, + "time_per_iteration": 2.39043927192688 + }, + { + "auxiliary_loss_clip": 0.01052014, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.01677847, + "balance_loss_mlp": 1.01533985, + "epoch": 0.7011573726138585, + "flos": 31389885072000.0, + "grad_norm": 2.38469615613404, + "language_loss": 0.80677056, + "learning_rate": 8.657944056600579e-07, + "loss": 0.82768595, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 11662, + "time_per_iteration": 2.429936408996582 + }, + { + "auxiliary_loss_clip": 0.01054702, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.01272738, + "balance_loss_mlp": 1.01644778, + "epoch": 0.7012174958665264, + "flos": 18149598017280.0, + "grad_norm": 1.7279675924616176, + "language_loss": 0.83996689, + "learning_rate": 8.654736482124134e-07, + "loss": 0.86087859, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 11663, + "time_per_iteration": 2.3359127044677734 + }, + { + "auxiliary_loss_clip": 0.01008196, + "auxiliary_loss_mlp": 0.0100814, + "balance_loss_clip": 1.00575578, + "balance_loss_mlp": 1.00145268, + "epoch": 0.7012776191191944, + "flos": 60648216773760.0, + "grad_norm": 0.8219505640328004, + "language_loss": 0.53739923, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55756259, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06738281, + "step": 11664, + "time_per_iteration": 2.995934009552002 + }, + { + "auxiliary_loss_clip": 0.01054092, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.01193166, + "balance_loss_mlp": 1.01674676, + "epoch": 0.7013377423718623, + "flos": 27197374700160.0, + "grad_norm": 2.049169491218709, + "language_loss": 0.81315339, + "learning_rate": 8.64832262393344e-07, + "loss": 0.83408409, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.375, + "step": 11665, + "time_per_iteration": 2.454685688018799 + }, + { + "auxiliary_loss_clip": 0.01052978, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.01156211, + "balance_loss_mlp": 1.01652241, + "epoch": 0.7013978656245303, + "flos": 16542650736000.0, + "grad_norm": 2.1394588576838083, + "language_loss": 0.78863728, + "learning_rate": 8.645116340462404e-07, + "loss": 0.80951476, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 11666, + "time_per_iteration": 2.306657552719116 + }, + { + "auxiliary_loss_clip": 0.01052887, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.01674211, + "balance_loss_mlp": 1.01689947, + "epoch": 0.7014579888771982, + "flos": 23142806576640.0, + "grad_norm": 1.736986141174474, + "language_loss": 0.82378578, + "learning_rate": 8.641910487569695e-07, + "loss": 0.84472346, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 11667, + "time_per_iteration": 2.377025842666626 + }, + { + "auxiliary_loss_clip": 0.01052251, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.01481926, + "balance_loss_mlp": 1.01597154, + "epoch": 0.7015181121298663, + "flos": 25080939815040.0, + "grad_norm": 2.7597718497773536, + "language_loss": 0.65940952, + "learning_rate": 8.638705065376879e-07, + "loss": 0.68032372, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 11668, + "time_per_iteration": 2.3921072483062744 + }, + { + "auxiliary_loss_clip": 0.01054823, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.01071048, + "balance_loss_mlp": 1.01657605, + "epoch": 0.7015782353825342, + "flos": 23326868597760.0, + "grad_norm": 1.735423967205533, + "language_loss": 0.7767638, + "learning_rate": 8.635500074005519e-07, + "loss": 0.79766941, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 11669, + "time_per_iteration": 2.4696245193481445 + }, + { + "auxiliary_loss_clip": 0.01008064, + "auxiliary_loss_mlp": 0.01005728, + "balance_loss_clip": 1.00302231, + "balance_loss_mlp": 1.0010916, + "epoch": 0.7016383586352022, + "flos": 70393732444800.0, + "grad_norm": 0.7004618485629884, + "language_loss": 0.54537761, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56551552, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.02709961, + "router_z_loss_mlp": 0.06982422, + "step": 11670, + "time_per_iteration": 3.129777431488037 + }, + { + "auxiliary_loss_clip": 0.01053548, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.01339841, + "balance_loss_mlp": 1.0168643, + "epoch": 0.7016984818878701, + "flos": 19791249056640.0, + "grad_norm": 1.807700361279175, + "language_loss": 0.82868499, + "learning_rate": 8.629091384213218e-07, + "loss": 0.84959489, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 11671, + "time_per_iteration": 2.370938301086426 + }, + { + "auxiliary_loss_clip": 0.01054296, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.01210117, + "balance_loss_mlp": 1.01749933, + "epoch": 0.7017586051405381, + "flos": 12896077294080.0, + "grad_norm": 2.0225843641789503, + "language_loss": 0.76885897, + "learning_rate": 8.625887686035313e-07, + "loss": 0.78975987, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 11672, + "time_per_iteration": 2.3380470275878906 + }, + { + "auxiliary_loss_clip": 0.0105154, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.01387024, + "balance_loss_mlp": 1.01566935, + "epoch": 0.701818728393206, + "flos": 18331844647680.0, + "grad_norm": 1.7926303568800686, + "language_loss": 0.87995046, + "learning_rate": 8.622684419164883e-07, + "loss": 0.90083909, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35742188, + "step": 11673, + "time_per_iteration": 2.3659539222717285 + }, + { + "auxiliary_loss_clip": 0.01051718, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.01521432, + "balance_loss_mlp": 1.01654625, + "epoch": 0.701878851645874, + "flos": 17383254474240.0, + "grad_norm": 1.7687659679367151, + "language_loss": 0.74608481, + "learning_rate": 8.619481583723399e-07, + "loss": 0.7669853, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 11674, + "time_per_iteration": 2.3332138061523438 + }, + { + "auxiliary_loss_clip": 0.01051125, + "auxiliary_loss_mlp": 0.01036653, + "balance_loss_clip": 1.0145998, + "balance_loss_mlp": 1.01689661, + "epoch": 0.701938974898542, + "flos": 23914351912320.0, + "grad_norm": 1.9146178962144484, + "language_loss": 0.7323283, + "learning_rate": 8.616279179832329e-07, + "loss": 0.75320613, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34179688, + "step": 11675, + "time_per_iteration": 2.397435426712036 + }, + { + "auxiliary_loss_clip": 0.01052378, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.01572466, + "balance_loss_mlp": 1.01555574, + "epoch": 0.70199909815121, + "flos": 21794600448000.0, + "grad_norm": 2.0233199127695785, + "language_loss": 0.51929373, + "learning_rate": 8.613077207613078e-07, + "loss": 0.54021221, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 11676, + "time_per_iteration": 2.364105701446533 + }, + { + "auxiliary_loss_clip": 0.01008362, + "auxiliary_loss_mlp": 0.01004279, + "balance_loss_clip": 1.00158513, + "balance_loss_mlp": 1.00137579, + "epoch": 0.702059221403878, + "flos": 71711459089920.0, + "grad_norm": 0.7287796981515064, + "language_loss": 0.59266067, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61278701, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.02697754, + "router_z_loss_mlp": 0.06982422, + "step": 11677, + "time_per_iteration": 3.07122540473938 + }, + { + "auxiliary_loss_clip": 0.01053441, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.0091449, + "balance_loss_mlp": 1.0159297, + "epoch": 0.7021193446565459, + "flos": 28109794838400.0, + "grad_norm": 2.224457071584434, + "language_loss": 0.63638902, + "learning_rate": 8.606674558675737e-07, + "loss": 0.65725887, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 11678, + "time_per_iteration": 2.4134421348571777 + }, + { + "auxiliary_loss_clip": 0.01051261, + "auxiliary_loss_mlp": 0.01040422, + "balance_loss_clip": 1.01648486, + "balance_loss_mlp": 1.01589, + "epoch": 0.7021794679092139, + "flos": 22923936063360.0, + "grad_norm": 1.6151568489866754, + "language_loss": 0.80101788, + "learning_rate": 8.603473882200444e-07, + "loss": 0.82193476, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35351562, + "step": 11679, + "time_per_iteration": 2.447798252105713 + }, + { + "auxiliary_loss_clip": 0.01052409, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.01613879, + "balance_loss_mlp": 1.01690245, + "epoch": 0.7022395911618818, + "flos": 18076839010560.0, + "grad_norm": 2.2825911743132714, + "language_loss": 0.71877038, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73967826, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 11680, + "time_per_iteration": 2.404818058013916 + }, + { + "auxiliary_loss_clip": 0.01054808, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.01668549, + "balance_loss_mlp": 1.0167073, + "epoch": 0.7022997144145499, + "flos": 16033372600320.0, + "grad_norm": 1.754227999388068, + "language_loss": 0.75999808, + "learning_rate": 8.597073825843446e-07, + "loss": 0.7809571, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 11681, + "time_per_iteration": 2.35341215133667 + }, + { + "auxiliary_loss_clip": 0.01054322, + "auxiliary_loss_mlp": 0.01036418, + "balance_loss_clip": 1.01347041, + "balance_loss_mlp": 1.01676917, + "epoch": 0.7023598376672178, + "flos": 26467480483200.0, + "grad_norm": 1.944087849389389, + "language_loss": 0.77723658, + "learning_rate": 8.593874446204434e-07, + "loss": 0.79814392, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.375, + "step": 11682, + "time_per_iteration": 2.4471960067749023 + }, + { + "auxiliary_loss_clip": 0.01053471, + "auxiliary_loss_mlp": 0.01041374, + "balance_loss_clip": 1.01619744, + "balance_loss_mlp": 1.01583982, + "epoch": 0.7024199609198858, + "flos": 17054966160000.0, + "grad_norm": 2.0439080109216925, + "language_loss": 0.75194108, + "learning_rate": 8.590675499086841e-07, + "loss": 0.77288949, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 11683, + "time_per_iteration": 3.5945653915405273 + }, + { + "auxiliary_loss_clip": 0.01052328, + "auxiliary_loss_mlp": 0.01040177, + "balance_loss_clip": 1.01366484, + "balance_loss_mlp": 1.01618123, + "epoch": 0.7024800841725537, + "flos": 25847841939840.0, + "grad_norm": 1.8006752924284735, + "language_loss": 0.72663105, + "learning_rate": 8.587476984611976e-07, + "loss": 0.74755609, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36328125, + "step": 11684, + "time_per_iteration": 2.394998073577881 + }, + { + "auxiliary_loss_clip": 0.01052189, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.0139823, + "balance_loss_mlp": 1.01596713, + "epoch": 0.7025402074252217, + "flos": 23511908136960.0, + "grad_norm": 1.8863236225247524, + "language_loss": 0.72987747, + "learning_rate": 8.584278902901128e-07, + "loss": 0.75076497, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 11685, + "time_per_iteration": 2.4023122787475586 + }, + { + "auxiliary_loss_clip": 0.01052517, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.01258719, + "balance_loss_mlp": 1.01580393, + "epoch": 0.7026003306778896, + "flos": 20150121588480.0, + "grad_norm": 1.5880218901273346, + "language_loss": 0.85316288, + "learning_rate": 8.581081254075582e-07, + "loss": 0.87403464, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3671875, + "step": 11686, + "time_per_iteration": 2.3437767028808594 + }, + { + "auxiliary_loss_clip": 0.01008047, + "auxiliary_loss_mlp": 0.0100451, + "balance_loss_clip": 1.00197053, + "balance_loss_mlp": 1.00098813, + "epoch": 0.7026604539305576, + "flos": 64769294770560.0, + "grad_norm": 0.987636016672983, + "language_loss": 0.70034885, + "learning_rate": 8.577884038256566e-07, + "loss": 0.72047448, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.0703125, + "step": 11687, + "time_per_iteration": 4.430670261383057 + }, + { + "auxiliary_loss_clip": 0.01052834, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.01781988, + "balance_loss_mlp": 1.01641047, + "epoch": 0.7027205771832256, + "flos": 21870396743040.0, + "grad_norm": 3.7394662544084385, + "language_loss": 0.78750795, + "learning_rate": 8.574687255565329e-07, + "loss": 0.80843455, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 11688, + "time_per_iteration": 2.3935906887054443 + }, + { + "auxiliary_loss_clip": 0.01053648, + "auxiliary_loss_mlp": 0.01041446, + "balance_loss_clip": 1.01694894, + "balance_loss_mlp": 1.01700306, + "epoch": 0.7027807004358936, + "flos": 23366669414400.0, + "grad_norm": 2.0287205181243086, + "language_loss": 0.69971073, + "learning_rate": 8.571490906123107e-07, + "loss": 0.7206617, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 11689, + "time_per_iteration": 3.7493700981140137 + }, + { + "auxiliary_loss_clip": 0.01054959, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.01443005, + "balance_loss_mlp": 1.01686561, + "epoch": 0.7028408236885616, + "flos": 15303373649280.0, + "grad_norm": 1.994691472760705, + "language_loss": 0.80797231, + "learning_rate": 8.568294990051086e-07, + "loss": 0.82893801, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38085938, + "step": 11690, + "time_per_iteration": 2.401571750640869 + }, + { + "auxiliary_loss_clip": 0.01055415, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.01717281, + "balance_loss_mlp": 1.01825333, + "epoch": 0.7029009469412295, + "flos": 22017101742720.0, + "grad_norm": 1.6180703425643823, + "language_loss": 0.7659539, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78691566, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 11691, + "time_per_iteration": 2.3610615730285645 + }, + { + "auxiliary_loss_clip": 0.01053923, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.01545, + "balance_loss_mlp": 1.01790309, + "epoch": 0.7029610701938975, + "flos": 21834436176000.0, + "grad_norm": 1.949335260845897, + "language_loss": 0.82377321, + "learning_rate": 8.561904458502429e-07, + "loss": 0.84468877, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 11692, + "time_per_iteration": 2.384661912918091 + }, + { + "auxiliary_loss_clip": 0.01052965, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.01123548, + "balance_loss_mlp": 1.01648045, + "epoch": 0.7030211934465654, + "flos": 19134637516800.0, + "grad_norm": 1.586962564984575, + "language_loss": 0.77468365, + "learning_rate": 8.558709843268111e-07, + "loss": 0.79556674, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 11693, + "time_per_iteration": 2.3609442710876465 + }, + { + "auxiliary_loss_clip": 0.01052874, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.01498389, + "balance_loss_mlp": 1.01769912, + "epoch": 0.7030813166992335, + "flos": 38544461303040.0, + "grad_norm": 1.636306874966264, + "language_loss": 0.69897205, + "learning_rate": 8.55551566188866e-07, + "loss": 0.719877, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 11694, + "time_per_iteration": 2.5595955848693848 + }, + { + "auxiliary_loss_clip": 0.01052828, + "auxiliary_loss_mlp": 0.01039446, + "balance_loss_clip": 1.01515126, + "balance_loss_mlp": 1.0158031, + "epoch": 0.7031414399519014, + "flos": 14720009875200.0, + "grad_norm": 2.2611275335301477, + "language_loss": 0.77200365, + "learning_rate": 8.552321914485203e-07, + "loss": 0.79292643, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 11695, + "time_per_iteration": 2.3596301078796387 + }, + { + "auxiliary_loss_clip": 0.01054529, + "auxiliary_loss_mlp": 0.01039853, + "balance_loss_clip": 1.01613009, + "balance_loss_mlp": 1.01727057, + "epoch": 0.7032015632045694, + "flos": 14026390427520.0, + "grad_norm": 1.9152164946404682, + "language_loss": 0.75332648, + "learning_rate": 8.549128601178852e-07, + "loss": 0.7742703, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37304688, + "step": 11696, + "time_per_iteration": 2.424807071685791 + }, + { + "auxiliary_loss_clip": 0.01054149, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.01328444, + "balance_loss_mlp": 1.01713729, + "epoch": 0.7032616864572373, + "flos": 27635918688000.0, + "grad_norm": 1.4727917707338387, + "language_loss": 0.76258242, + "learning_rate": 8.545935722090693e-07, + "loss": 0.78351432, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 11697, + "time_per_iteration": 2.4145584106445312 + }, + { + "auxiliary_loss_clip": 0.01054696, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.01678073, + "balance_loss_mlp": 1.01738429, + "epoch": 0.7033218097099053, + "flos": 17966338957440.0, + "grad_norm": 3.0251722270196444, + "language_loss": 0.81481159, + "learning_rate": 8.542743277341793e-07, + "loss": 0.83578587, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37304688, + "step": 11698, + "time_per_iteration": 2.3612053394317627 + }, + { + "auxiliary_loss_clip": 0.01053592, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_clip": 1.01749492, + "balance_loss_mlp": 1.01649189, + "epoch": 0.7033819329625732, + "flos": 19500666877440.0, + "grad_norm": 1.386048467884942, + "language_loss": 0.85611272, + "learning_rate": 8.539551267053222e-07, + "loss": 0.87706566, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 11699, + "time_per_iteration": 3.759413242340088 + }, + { + "auxiliary_loss_clip": 0.01052437, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.01167893, + "balance_loss_mlp": 1.01626837, + "epoch": 0.7034420562152413, + "flos": 23986517425920.0, + "grad_norm": 1.9493389858323087, + "language_loss": 0.80508983, + "learning_rate": 8.53635969134601e-07, + "loss": 0.82598472, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36132812, + "step": 11700, + "time_per_iteration": 2.4684903621673584 + }, + { + "auxiliary_loss_clip": 0.0105222, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.00953603, + "balance_loss_mlp": 1.01539016, + "epoch": 0.7035021794679092, + "flos": 35041974508800.0, + "grad_norm": 1.7442158572046753, + "language_loss": 0.7584734, + "learning_rate": 8.533168550341186e-07, + "loss": 0.77933651, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 11701, + "time_per_iteration": 2.45867919921875 + }, + { + "auxiliary_loss_clip": 0.01054391, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.01585197, + "balance_loss_mlp": 1.01705348, + "epoch": 0.7035623027205772, + "flos": 10996697531520.0, + "grad_norm": 2.501398189330933, + "language_loss": 0.85194129, + "learning_rate": 8.529977844159769e-07, + "loss": 0.87289178, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 11702, + "time_per_iteration": 2.343057870864868 + }, + { + "auxiliary_loss_clip": 0.01053578, + "auxiliary_loss_mlp": 0.01034694, + "balance_loss_clip": 1.01177001, + "balance_loss_mlp": 1.01724744, + "epoch": 0.7036224259732452, + "flos": 23622582746880.0, + "grad_norm": 1.9476649228554805, + "language_loss": 0.61636961, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63725233, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 11703, + "time_per_iteration": 2.431807518005371 + }, + { + "auxiliary_loss_clip": 0.01053349, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.01198816, + "balance_loss_mlp": 1.01628935, + "epoch": 0.7036825492259131, + "flos": 31684831171200.0, + "grad_norm": 2.7293669882190517, + "language_loss": 0.63105142, + "learning_rate": 8.523597736751067e-07, + "loss": 0.65193957, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 11704, + "time_per_iteration": 2.4580395221710205 + }, + { + "auxiliary_loss_clip": 0.01050063, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.01616096, + "balance_loss_mlp": 1.0152998, + "epoch": 0.7037426724785811, + "flos": 30191491054080.0, + "grad_norm": 1.5777958823256464, + "language_loss": 0.72068524, + "learning_rate": 8.520408335765719e-07, + "loss": 0.74155945, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34765625, + "step": 11705, + "time_per_iteration": 2.441856622695923 + }, + { + "auxiliary_loss_clip": 0.01051592, + "auxiliary_loss_mlp": 0.01038633, + "balance_loss_clip": 1.01786685, + "balance_loss_mlp": 1.0161581, + "epoch": 0.703802795731249, + "flos": 24310511642880.0, + "grad_norm": 1.945010893284573, + "language_loss": 0.63391471, + "learning_rate": 8.517219370087645e-07, + "loss": 0.65481699, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.35351562, + "step": 11706, + "time_per_iteration": 2.412151575088501 + }, + { + "auxiliary_loss_clip": 0.01053769, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.01677084, + "balance_loss_mlp": 1.01631236, + "epoch": 0.7038629189839171, + "flos": 22527846155520.0, + "grad_norm": 1.8185552558901321, + "language_loss": 0.69524527, + "learning_rate": 8.514030839837756e-07, + "loss": 0.71620262, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 11707, + "time_per_iteration": 2.3653564453125 + }, + { + "auxiliary_loss_clip": 0.01049823, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.01039028, + "balance_loss_mlp": 1.01565313, + "epoch": 0.703923042236585, + "flos": 26249273285760.0, + "grad_norm": 1.8867188749519992, + "language_loss": 0.77393579, + "learning_rate": 8.510842745136974e-07, + "loss": 0.79474962, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34179688, + "step": 11708, + "time_per_iteration": 2.418410062789917 + }, + { + "auxiliary_loss_clip": 0.01050953, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.01252079, + "balance_loss_mlp": 1.01660752, + "epoch": 0.703983165489253, + "flos": 19389259128960.0, + "grad_norm": 1.8118395101186702, + "language_loss": 0.73314255, + "learning_rate": 8.50765508610619e-07, + "loss": 0.75398248, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 11709, + "time_per_iteration": 2.3824455738067627 + }, + { + "auxiliary_loss_clip": 0.01050228, + "auxiliary_loss_mlp": 0.01037012, + "balance_loss_clip": 1.01463652, + "balance_loss_mlp": 1.01484287, + "epoch": 0.7040432887419209, + "flos": 16682897134080.0, + "grad_norm": 2.2652831965756985, + "language_loss": 0.80498421, + "learning_rate": 8.504467862866267e-07, + "loss": 0.82585669, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 11710, + "time_per_iteration": 2.3531723022460938 + }, + { + "auxiliary_loss_clip": 0.01053613, + "auxiliary_loss_mlp": 0.01045001, + "balance_loss_clip": 1.02157629, + "balance_loss_mlp": 1.01698327, + "epoch": 0.7041034119945889, + "flos": 21140362880640.0, + "grad_norm": 1.6177818638504742, + "language_loss": 0.78719282, + "learning_rate": 8.501281075538076e-07, + "loss": 0.80817896, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 11711, + "time_per_iteration": 2.3527982234954834 + }, + { + "auxiliary_loss_clip": 0.01051071, + "auxiliary_loss_mlp": 0.01038144, + "balance_loss_clip": 1.01523209, + "balance_loss_mlp": 1.01531303, + "epoch": 0.7041635352472568, + "flos": 16909343412480.0, + "grad_norm": 2.3815037394255025, + "language_loss": 0.75866103, + "learning_rate": 8.498094724242457e-07, + "loss": 0.77955317, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 11712, + "time_per_iteration": 2.35577392578125 + }, + { + "auxiliary_loss_clip": 0.01007762, + "auxiliary_loss_mlp": 0.0100522, + "balance_loss_clip": 1.00282359, + "balance_loss_mlp": 1.00075316, + "epoch": 0.7042236584999249, + "flos": 71677558293120.0, + "grad_norm": 0.900332642961795, + "language_loss": 0.64789879, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66802859, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.0703125, + "step": 11713, + "time_per_iteration": 3.052063465118408 + }, + { + "auxiliary_loss_clip": 0.01050945, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.01537251, + "balance_loss_mlp": 1.01594591, + "epoch": 0.7042837817525928, + "flos": 28656918754560.0, + "grad_norm": 1.8685914882095165, + "language_loss": 0.73534989, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75622362, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34960938, + "step": 11714, + "time_per_iteration": 2.474581003189087 + }, + { + "auxiliary_loss_clip": 0.01052544, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.02273273, + "balance_loss_mlp": 1.01628745, + "epoch": 0.7043439050052608, + "flos": 19752600314880.0, + "grad_norm": 1.9388971082739306, + "language_loss": 0.81345153, + "learning_rate": 8.488538287759248e-07, + "loss": 0.83445096, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 11715, + "time_per_iteration": 2.3424272537231445 + }, + { + "auxiliary_loss_clip": 0.01053779, + "auxiliary_loss_mlp": 0.01042158, + "balance_loss_clip": 1.01856649, + "balance_loss_mlp": 1.01634264, + "epoch": 0.7044040282579288, + "flos": 11537956339200.0, + "grad_norm": 2.2194174379105327, + "language_loss": 0.72750384, + "learning_rate": 8.485353681802037e-07, + "loss": 0.74846315, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 11716, + "time_per_iteration": 2.3767900466918945 + }, + { + "auxiliary_loss_clip": 0.01054908, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.01611435, + "balance_loss_mlp": 1.01731181, + "epoch": 0.7044641515105967, + "flos": 33654735613440.0, + "grad_norm": 2.189494639803364, + "language_loss": 0.68131471, + "learning_rate": 8.482169512481358e-07, + "loss": 0.70225751, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 11717, + "time_per_iteration": 2.4795823097229004 + }, + { + "auxiliary_loss_clip": 0.01051115, + "auxiliary_loss_mlp": 0.01041673, + "balance_loss_clip": 1.01938033, + "balance_loss_mlp": 1.014925, + "epoch": 0.7045242747632647, + "flos": 26722660677120.0, + "grad_norm": 1.5905799496760784, + "language_loss": 0.74836373, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76929164, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36132812, + "step": 11718, + "time_per_iteration": 2.4388298988342285 + }, + { + "auxiliary_loss_clip": 0.01052838, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.01393294, + "balance_loss_mlp": 1.01652408, + "epoch": 0.7045843980159326, + "flos": 26796432113280.0, + "grad_norm": 1.6767661602901758, + "language_loss": 0.81281412, + "learning_rate": 8.475802484232606e-07, + "loss": 0.83368945, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.36328125, + "step": 11719, + "time_per_iteration": 2.439176321029663 + }, + { + "auxiliary_loss_clip": 0.01051972, + "auxiliary_loss_mlp": 0.01041331, + "balance_loss_clip": 1.01736999, + "balance_loss_mlp": 1.01675725, + "epoch": 0.7046445212686007, + "flos": 41573176680960.0, + "grad_norm": 1.820200825915218, + "language_loss": 0.66773838, + "learning_rate": 8.472619625545951e-07, + "loss": 0.68867135, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 11720, + "time_per_iteration": 2.5654046535491943 + }, + { + "auxiliary_loss_clip": 0.01055906, + "auxiliary_loss_mlp": 0.01036369, + "balance_loss_clip": 1.01207399, + "balance_loss_mlp": 1.01862812, + "epoch": 0.7047046445212686, + "flos": 15559252070400.0, + "grad_norm": 2.3312572898047557, + "language_loss": 0.81849861, + "learning_rate": 8.46943720397872e-07, + "loss": 0.83942133, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 11721, + "time_per_iteration": 2.338749647140503 + }, + { + "auxiliary_loss_clip": 0.01008393, + "auxiliary_loss_mlp": 0.0100394, + "balance_loss_clip": 1.00140119, + "balance_loss_mlp": 1.00143445, + "epoch": 0.7047647677739366, + "flos": 70406475091200.0, + "grad_norm": 0.7623445852027596, + "language_loss": 0.64851409, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66863739, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.06933594, + "step": 11722, + "time_per_iteration": 3.140843152999878 + }, + { + "auxiliary_loss_clip": 0.01052142, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.01834774, + "balance_loss_mlp": 1.01657653, + "epoch": 0.7048248910266045, + "flos": 23658892427520.0, + "grad_norm": 1.5342205969690794, + "language_loss": 0.6644429, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68537772, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 11723, + "time_per_iteration": 3.6164069175720215 + }, + { + "auxiliary_loss_clip": 0.01053383, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.01541078, + "balance_loss_mlp": 1.01631069, + "epoch": 0.7048850142792725, + "flos": 21396101656320.0, + "grad_norm": 1.7276378329871664, + "language_loss": 0.8153336, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83626252, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 11724, + "time_per_iteration": 2.402494192123413 + }, + { + "auxiliary_loss_clip": 0.01053534, + "auxiliary_loss_mlp": 0.01043565, + "balance_loss_clip": 1.01885319, + "balance_loss_mlp": 1.01630878, + "epoch": 0.7049451375319404, + "flos": 21647162309760.0, + "grad_norm": 1.7689261293706857, + "language_loss": 0.73741221, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75838321, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 11725, + "time_per_iteration": 2.3724658489227295 + }, + { + "auxiliary_loss_clip": 0.01055136, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_clip": 1.01837814, + "balance_loss_mlp": 1.01763439, + "epoch": 0.7050052607846085, + "flos": 14865911913600.0, + "grad_norm": 1.8982171600694007, + "language_loss": 0.79078114, + "learning_rate": 8.453531657156998e-07, + "loss": 0.811773, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 11726, + "time_per_iteration": 3.711406707763672 + }, + { + "auxiliary_loss_clip": 0.01052597, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.014256, + "balance_loss_mlp": 1.01560664, + "epoch": 0.7050653840372764, + "flos": 19240843472640.0, + "grad_norm": 2.024646677267195, + "language_loss": 0.71123385, + "learning_rate": 8.450351860839931e-07, + "loss": 0.7321341, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 11727, + "time_per_iteration": 2.3739194869995117 + }, + { + "auxiliary_loss_clip": 0.01047846, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.01255071, + "balance_loss_mlp": 1.01521909, + "epoch": 0.7051255072899444, + "flos": 27779237285760.0, + "grad_norm": 1.5562806674674543, + "language_loss": 0.69711185, + "learning_rate": 8.44717250248668e-07, + "loss": 0.71792531, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.32617188, + "step": 11728, + "time_per_iteration": 3.79042387008667 + }, + { + "auxiliary_loss_clip": 0.01052147, + "auxiliary_loss_mlp": 0.01039216, + "balance_loss_clip": 1.01417017, + "balance_loss_mlp": 1.01653671, + "epoch": 0.7051856305426124, + "flos": 27890784679680.0, + "grad_norm": 1.8526011904576865, + "language_loss": 0.74001926, + "learning_rate": 8.443993582217803e-07, + "loss": 0.76093292, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35546875, + "step": 11729, + "time_per_iteration": 2.4454312324523926 + }, + { + "auxiliary_loss_clip": 0.0105642, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.01866901, + "balance_loss_mlp": 1.01716316, + "epoch": 0.7052457537952803, + "flos": 25042465630080.0, + "grad_norm": 1.5844329301097693, + "language_loss": 0.79485786, + "learning_rate": 8.440815100153862e-07, + "loss": 0.81586671, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 11730, + "time_per_iteration": 2.386445999145508 + }, + { + "auxiliary_loss_clip": 0.01054023, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.01682353, + "balance_loss_mlp": 1.01672113, + "epoch": 0.7053058770479483, + "flos": 21870641122560.0, + "grad_norm": 2.2551294367164108, + "language_loss": 0.64129955, + "learning_rate": 8.437637056415359e-07, + "loss": 0.66224778, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37304688, + "step": 11731, + "time_per_iteration": 2.378030300140381 + }, + { + "auxiliary_loss_clip": 0.01054382, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.01517689, + "balance_loss_mlp": 1.01650429, + "epoch": 0.7053660003006162, + "flos": 16397796038400.0, + "grad_norm": 2.0361613327326036, + "language_loss": 0.75795561, + "learning_rate": 8.434459451122815e-07, + "loss": 0.77889037, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37890625, + "step": 11732, + "time_per_iteration": 2.354156970977783 + }, + { + "auxiliary_loss_clip": 0.01051857, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01166248, + "balance_loss_mlp": 1.01606059, + "epoch": 0.7054261235532843, + "flos": 22710441899520.0, + "grad_norm": 1.6107121533854414, + "language_loss": 0.71990073, + "learning_rate": 8.431282284396735e-07, + "loss": 0.74075627, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 11733, + "time_per_iteration": 2.384000062942505 + }, + { + "auxiliary_loss_clip": 0.01050801, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.01288748, + "balance_loss_mlp": 1.01529193, + "epoch": 0.7054862468059522, + "flos": 13588858869120.0, + "grad_norm": 2.038418524907823, + "language_loss": 0.7430315, + "learning_rate": 8.428105556357583e-07, + "loss": 0.76389605, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 11734, + "time_per_iteration": 2.348458766937256 + }, + { + "auxiliary_loss_clip": 0.01054689, + "auxiliary_loss_mlp": 0.01043123, + "balance_loss_clip": 1.01669455, + "balance_loss_mlp": 1.01620948, + "epoch": 0.7055463700586202, + "flos": 15879999530880.0, + "grad_norm": 2.709815294222579, + "language_loss": 0.7173273, + "learning_rate": 8.424929267125829e-07, + "loss": 0.73830545, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38476562, + "step": 11735, + "time_per_iteration": 2.3873109817504883 + }, + { + "auxiliary_loss_clip": 0.01053144, + "auxiliary_loss_mlp": 0.01044363, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01614249, + "epoch": 0.7056064933112881, + "flos": 23075039894400.0, + "grad_norm": 1.9699608725937527, + "language_loss": 0.72873485, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74970996, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37109375, + "step": 11736, + "time_per_iteration": 2.3784830570220947 + }, + { + "auxiliary_loss_clip": 0.01052579, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.01373649, + "balance_loss_mlp": 1.01689613, + "epoch": 0.7056666165639561, + "flos": 24056134410240.0, + "grad_norm": 1.8111662053652924, + "language_loss": 0.69764507, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71853518, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 11737, + "time_per_iteration": 2.4847075939178467 + }, + { + "auxiliary_loss_clip": 0.01054412, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.01750433, + "balance_loss_mlp": 1.01649904, + "epoch": 0.705726739816624, + "flos": 17492288250240.0, + "grad_norm": 2.205681679520115, + "language_loss": 0.69035101, + "learning_rate": 8.415403033479332e-07, + "loss": 0.71133411, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 11738, + "time_per_iteration": 2.3752493858337402 + }, + { + "auxiliary_loss_clip": 0.01052854, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.01159298, + "balance_loss_mlp": 1.01627469, + "epoch": 0.7057868630692921, + "flos": 51348578342400.0, + "grad_norm": 3.6350415176944737, + "language_loss": 0.76195836, + "learning_rate": 8.41222850068145e-07, + "loss": 0.7828564, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 11739, + "time_per_iteration": 3.9973809719085693 + }, + { + "auxiliary_loss_clip": 0.01050631, + "auxiliary_loss_mlp": 0.01036213, + "balance_loss_clip": 1.01289582, + "balance_loss_mlp": 1.01590383, + "epoch": 0.70584698632196, + "flos": 26101800236160.0, + "grad_norm": 1.9220311291067262, + "language_loss": 0.72662526, + "learning_rate": 8.409054407293032e-07, + "loss": 0.74749368, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 11740, + "time_per_iteration": 2.417091131210327 + }, + { + "auxiliary_loss_clip": 0.01051442, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.01483369, + "balance_loss_mlp": 1.01599026, + "epoch": 0.705907109574628, + "flos": 21542073517440.0, + "grad_norm": 1.668167657436761, + "language_loss": 0.83325875, + "learning_rate": 8.405880753434434e-07, + "loss": 0.85413849, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 11741, + "time_per_iteration": 2.375587224960327 + }, + { + "auxiliary_loss_clip": 0.01053107, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.01509202, + "balance_loss_mlp": 1.01590705, + "epoch": 0.705967232827296, + "flos": 22709743672320.0, + "grad_norm": 3.262959062923107, + "language_loss": 0.79346871, + "learning_rate": 8.402707539225993e-07, + "loss": 0.81438863, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37304688, + "step": 11742, + "time_per_iteration": 2.3679652214050293 + }, + { + "auxiliary_loss_clip": 0.01053485, + "auxiliary_loss_mlp": 0.01035601, + "balance_loss_clip": 1.01223624, + "balance_loss_mlp": 1.01629066, + "epoch": 0.7060273560799639, + "flos": 28690051501440.0, + "grad_norm": 2.1797390349779344, + "language_loss": 0.65530407, + "learning_rate": 8.39953476478805e-07, + "loss": 0.67619491, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37304688, + "step": 11743, + "time_per_iteration": 2.4386730194091797 + }, + { + "auxiliary_loss_clip": 0.01054095, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.01425016, + "balance_loss_mlp": 1.01635742, + "epoch": 0.7060874793326319, + "flos": 15705258842880.0, + "grad_norm": 3.853423696676147, + "language_loss": 0.66619825, + "learning_rate": 8.396362430240902e-07, + "loss": 0.68712664, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37695312, + "step": 11744, + "time_per_iteration": 2.335339307785034 + }, + { + "auxiliary_loss_clip": 0.01051742, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.01073623, + "balance_loss_mlp": 1.01684022, + "epoch": 0.7061476025852998, + "flos": 21505694014080.0, + "grad_norm": 1.738053208148986, + "language_loss": 0.64446819, + "learning_rate": 8.393190535704857e-07, + "loss": 0.66532397, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 11745, + "time_per_iteration": 2.377833366394043 + }, + { + "auxiliary_loss_clip": 0.01052531, + "auxiliary_loss_mlp": 0.01039941, + "balance_loss_clip": 1.01661205, + "balance_loss_mlp": 1.01657355, + "epoch": 0.7062077258379679, + "flos": 28180633720320.0, + "grad_norm": 1.747770203780208, + "language_loss": 0.72033614, + "learning_rate": 8.390019081300188e-07, + "loss": 0.74126089, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 11746, + "time_per_iteration": 2.4467108249664307 + }, + { + "auxiliary_loss_clip": 0.01052989, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.01034045, + "balance_loss_mlp": 1.01695347, + "epoch": 0.7062678490906358, + "flos": 27852485051520.0, + "grad_norm": 1.4539536746232318, + "language_loss": 0.79746258, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81834209, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 11747, + "time_per_iteration": 2.4083681106567383 + }, + { + "auxiliary_loss_clip": 0.01052747, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.0139147, + "balance_loss_mlp": 1.01746702, + "epoch": 0.7063279723433038, + "flos": 23183759468160.0, + "grad_norm": 1.6176620314447931, + "language_loss": 0.65990281, + "learning_rate": 8.383677493366031e-07, + "loss": 0.68078679, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35351562, + "step": 11748, + "time_per_iteration": 2.362006902694702 + }, + { + "auxiliary_loss_clip": 0.01053215, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.01761305, + "balance_loss_mlp": 1.01581407, + "epoch": 0.7063880955959717, + "flos": 20187757900800.0, + "grad_norm": 2.4043278808613553, + "language_loss": 0.80669975, + "learning_rate": 8.380507360077003e-07, + "loss": 0.82765186, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 11749, + "time_per_iteration": 2.3967604637145996 + }, + { + "auxiliary_loss_clip": 0.01008845, + "auxiliary_loss_mlp": 0.01004941, + "balance_loss_clip": 1.00204432, + "balance_loss_mlp": 1.00194597, + "epoch": 0.7064482188486397, + "flos": 63665376491520.0, + "grad_norm": 0.7886036334288581, + "language_loss": 0.54043287, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56057072, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.06884766, + "step": 11750, + "time_per_iteration": 2.965235948562622 + }, + { + "auxiliary_loss_clip": 0.01052379, + "auxiliary_loss_mlp": 0.01040272, + "balance_loss_clip": 1.01588225, + "balance_loss_mlp": 1.0163213, + "epoch": 0.7065083421013076, + "flos": 25190078325120.0, + "grad_norm": 1.8539040465360817, + "language_loss": 0.80089486, + "learning_rate": 8.37416841545612e-07, + "loss": 0.82182133, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 11751, + "time_per_iteration": 2.423936367034912 + }, + { + "auxiliary_loss_clip": 0.01049263, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.00951493, + "balance_loss_mlp": 1.01501894, + "epoch": 0.7065684653539757, + "flos": 22892583795840.0, + "grad_norm": 1.582121074810804, + "language_loss": 0.69077885, + "learning_rate": 8.370999604364634e-07, + "loss": 0.71157813, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 11752, + "time_per_iteration": 2.3553566932678223 + }, + { + "auxiliary_loss_clip": 0.0105162, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.01565886, + "balance_loss_mlp": 1.01676202, + "epoch": 0.7066285886066436, + "flos": 23549125512960.0, + "grad_norm": 1.9697778450507746, + "language_loss": 0.7821213, + "learning_rate": 8.367831234246025e-07, + "loss": 0.80301142, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 11753, + "time_per_iteration": 2.392807722091675 + }, + { + "auxiliary_loss_clip": 0.01050907, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.01110315, + "balance_loss_mlp": 1.01634669, + "epoch": 0.7066887118593116, + "flos": 21068232278400.0, + "grad_norm": 1.4706552864781466, + "language_loss": 0.71584558, + "learning_rate": 8.364663305220405e-07, + "loss": 0.73668742, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 11754, + "time_per_iteration": 2.3911681175231934 + }, + { + "auxiliary_loss_clip": 0.01050634, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.01595783, + "balance_loss_mlp": 1.01517665, + "epoch": 0.7067488351119796, + "flos": 21175311018240.0, + "grad_norm": 2.098645692349254, + "language_loss": 0.90119553, + "learning_rate": 8.361495817407919e-07, + "loss": 0.92207229, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 11755, + "time_per_iteration": 2.4026427268981934 + }, + { + "auxiliary_loss_clip": 0.0105131, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.01457214, + "balance_loss_mlp": 1.01614821, + "epoch": 0.7068089583646475, + "flos": 20448174798720.0, + "grad_norm": 2.0813737317661802, + "language_loss": 0.80679893, + "learning_rate": 8.358328770928678e-07, + "loss": 0.82767838, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 11756, + "time_per_iteration": 2.360037088394165 + }, + { + "auxiliary_loss_clip": 0.01008247, + "auxiliary_loss_mlp": 0.01002655, + "balance_loss_clip": 1.00030637, + "balance_loss_mlp": 1.00123167, + "epoch": 0.7068690816173155, + "flos": 59106452734080.0, + "grad_norm": 0.8259914294892376, + "language_loss": 0.60522997, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62533903, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.0703125, + "step": 11757, + "time_per_iteration": 2.7885687351226807 + }, + { + "auxiliary_loss_clip": 0.01052011, + "auxiliary_loss_mlp": 0.01039451, + "balance_loss_clip": 1.01549029, + "balance_loss_mlp": 1.01629376, + "epoch": 0.7069292048699835, + "flos": 16250672102400.0, + "grad_norm": 1.976031762051657, + "language_loss": 0.80938447, + "learning_rate": 8.351996002450307e-07, + "loss": 0.83029914, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35742188, + "step": 11758, + "time_per_iteration": 2.3350675106048584 + }, + { + "auxiliary_loss_clip": 0.01050061, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.01618457, + "balance_loss_mlp": 1.01496708, + "epoch": 0.7069893281226515, + "flos": 41171151841920.0, + "grad_norm": 1.6859831124471654, + "language_loss": 0.78883684, + "learning_rate": 8.348830280691304e-07, + "loss": 0.80973858, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 11759, + "time_per_iteration": 2.5645201206207275 + }, + { + "auxiliary_loss_clip": 0.01051618, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.01351607, + "balance_loss_mlp": 1.01585054, + "epoch": 0.7070494513753194, + "flos": 24206121077760.0, + "grad_norm": 1.8064623637159547, + "language_loss": 0.6893096, + "learning_rate": 8.34566500074583e-07, + "loss": 0.71019948, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 11760, + "time_per_iteration": 2.4318594932556152 + }, + { + "auxiliary_loss_clip": 0.01053711, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.01637745, + "balance_loss_mlp": 1.01692188, + "epoch": 0.7071095746279874, + "flos": 20184860257920.0, + "grad_norm": 1.9839216648768392, + "language_loss": 0.81314677, + "learning_rate": 8.342500162733899e-07, + "loss": 0.83407545, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 11761, + "time_per_iteration": 2.4425573348999023 + }, + { + "auxiliary_loss_clip": 0.01052066, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.01321936, + "balance_loss_mlp": 1.01666284, + "epoch": 0.7071696978806553, + "flos": 18182172182400.0, + "grad_norm": 2.45721572435099, + "language_loss": 0.76029181, + "learning_rate": 8.33933576677553e-07, + "loss": 0.7811662, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35351562, + "step": 11762, + "time_per_iteration": 3.6084506511688232 + }, + { + "auxiliary_loss_clip": 0.01051105, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.01710486, + "balance_loss_mlp": 1.0160023, + "epoch": 0.7072298211333233, + "flos": 24130638984960.0, + "grad_norm": 1.746028787070688, + "language_loss": 0.77796984, + "learning_rate": 8.336171812990724e-07, + "loss": 0.79886281, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.3515625, + "step": 11763, + "time_per_iteration": 2.4882588386535645 + }, + { + "auxiliary_loss_clip": 0.01051798, + "auxiliary_loss_mlp": 0.01042247, + "balance_loss_clip": 1.02015722, + "balance_loss_mlp": 1.01663888, + "epoch": 0.7072899443859912, + "flos": 27197200143360.0, + "grad_norm": 2.1905391442756117, + "language_loss": 0.7966907, + "learning_rate": 8.333008301499453e-07, + "loss": 0.81763119, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 11764, + "time_per_iteration": 2.4255123138427734 + }, + { + "auxiliary_loss_clip": 0.01053808, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.0202775, + "balance_loss_mlp": 1.01691926, + "epoch": 0.7073500676386593, + "flos": 16434664300800.0, + "grad_norm": 1.6063257086964478, + "language_loss": 0.80631995, + "learning_rate": 8.32984523242167e-07, + "loss": 0.82730663, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 11765, + "time_per_iteration": 2.3693227767944336 + }, + { + "auxiliary_loss_clip": 0.01049861, + "auxiliary_loss_mlp": 0.01036342, + "balance_loss_clip": 1.01481247, + "balance_loss_mlp": 1.01533604, + "epoch": 0.7074101908913272, + "flos": 27672472748160.0, + "grad_norm": 1.7228984531456335, + "language_loss": 0.69259846, + "learning_rate": 8.326682605877324e-07, + "loss": 0.71346045, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 11766, + "time_per_iteration": 3.9094440937042236 + }, + { + "auxiliary_loss_clip": 0.01053034, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.01714826, + "balance_loss_mlp": 1.01634836, + "epoch": 0.7074703141439952, + "flos": 22236949774080.0, + "grad_norm": 2.631140019346152, + "language_loss": 0.65305328, + "learning_rate": 8.323520421986352e-07, + "loss": 0.67400372, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3671875, + "step": 11767, + "time_per_iteration": 3.8022382259368896 + }, + { + "auxiliary_loss_clip": 0.01051145, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.01016986, + "balance_loss_mlp": 1.01504266, + "epoch": 0.7075304373966632, + "flos": 29641923342720.0, + "grad_norm": 1.5061615662929713, + "language_loss": 0.53865635, + "learning_rate": 8.320358680868646e-07, + "loss": 0.55951232, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 11768, + "time_per_iteration": 2.4309890270233154 + }, + { + "auxiliary_loss_clip": 0.01049394, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.01510882, + "balance_loss_mlp": 1.01523137, + "epoch": 0.7075905606493311, + "flos": 19754206237440.0, + "grad_norm": 1.691726033111157, + "language_loss": 0.76908928, + "learning_rate": 8.317197382644119e-07, + "loss": 0.78994322, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34179688, + "step": 11769, + "time_per_iteration": 2.3710732460021973 + }, + { + "auxiliary_loss_clip": 0.01008358, + "auxiliary_loss_mlp": 0.01004843, + "balance_loss_clip": 1.00236332, + "balance_loss_mlp": 1.0011642, + "epoch": 0.7076506839019991, + "flos": 65713136999040.0, + "grad_norm": 0.8690675832735557, + "language_loss": 0.62068832, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64082032, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.07226562, + "step": 11770, + "time_per_iteration": 2.970346212387085 + }, + { + "auxiliary_loss_clip": 0.01053385, + "auxiliary_loss_mlp": 0.01041015, + "balance_loss_clip": 1.01730466, + "balance_loss_mlp": 1.01536906, + "epoch": 0.707710807154667, + "flos": 23764260510720.0, + "grad_norm": 1.8978259127695623, + "language_loss": 0.76787513, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78881913, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.38085938, + "step": 11771, + "time_per_iteration": 2.394069194793701 + }, + { + "auxiliary_loss_clip": 0.01050127, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.01482129, + "balance_loss_mlp": 1.01568627, + "epoch": 0.7077709304073351, + "flos": 21250304352000.0, + "grad_norm": 1.5085800385246073, + "language_loss": 0.72749782, + "learning_rate": 8.307716146528221e-07, + "loss": 0.74834913, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34375, + "step": 11772, + "time_per_iteration": 2.3766093254089355 + }, + { + "auxiliary_loss_clip": 0.01054562, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.01383352, + "balance_loss_mlp": 1.01648521, + "epoch": 0.707831053660003, + "flos": 20739699584640.0, + "grad_norm": 1.940522791989844, + "language_loss": 0.71248955, + "learning_rate": 8.30455662107496e-07, + "loss": 0.73341966, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38085938, + "step": 11773, + "time_per_iteration": 2.3562734127044678 + }, + { + "auxiliary_loss_clip": 0.01051896, + "auxiliary_loss_mlp": 0.01041658, + "balance_loss_clip": 1.0180192, + "balance_loss_mlp": 1.01579118, + "epoch": 0.707891176912671, + "flos": 21979919278080.0, + "grad_norm": 1.6051061858747486, + "language_loss": 0.71711624, + "learning_rate": 8.301397539114095e-07, + "loss": 0.73805177, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 11774, + "time_per_iteration": 2.3715498447418213 + }, + { + "auxiliary_loss_clip": 0.01051162, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.01634955, + "balance_loss_mlp": 1.01692224, + "epoch": 0.7079513001653389, + "flos": 21067918076160.0, + "grad_norm": 1.5465960276307475, + "language_loss": 0.75334442, + "learning_rate": 8.298238900765407e-07, + "loss": 0.77422953, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34179688, + "step": 11775, + "time_per_iteration": 2.3623576164245605 + }, + { + "auxiliary_loss_clip": 0.01053321, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.0109539, + "balance_loss_mlp": 1.01782632, + "epoch": 0.7080114234180069, + "flos": 18039691457280.0, + "grad_norm": 1.6996297622343441, + "language_loss": 0.87977207, + "learning_rate": 8.295080706148665e-07, + "loss": 0.90062833, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35546875, + "step": 11776, + "time_per_iteration": 2.3654417991638184 + }, + { + "auxiliary_loss_clip": 0.01051945, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.01176238, + "balance_loss_mlp": 1.01681757, + "epoch": 0.7080715466706748, + "flos": 15121371398400.0, + "grad_norm": 1.4799550903346335, + "language_loss": 0.75699854, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77785742, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 11777, + "time_per_iteration": 2.3275177478790283 + }, + { + "auxiliary_loss_clip": 0.01056125, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.01169276, + "balance_loss_mlp": 1.01724148, + "epoch": 0.7081316699233429, + "flos": 14422096310400.0, + "grad_norm": 2.5104688766882073, + "language_loss": 0.8321209, + "learning_rate": 8.288765648590066e-07, + "loss": 0.85305643, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 11778, + "time_per_iteration": 3.7400238513946533 + }, + { + "auxiliary_loss_clip": 0.01048316, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.01629162, + "balance_loss_mlp": 1.01531208, + "epoch": 0.7081917931760108, + "flos": 23221256135040.0, + "grad_norm": 1.494611289325078, + "language_loss": 0.8579818, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87882274, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.33007812, + "step": 11779, + "time_per_iteration": 2.3920862674713135 + }, + { + "auxiliary_loss_clip": 0.01053283, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.01661456, + "balance_loss_mlp": 1.01723146, + "epoch": 0.7082519164286788, + "flos": 39306964596480.0, + "grad_norm": 8.041743902085788, + "language_loss": 0.72827667, + "learning_rate": 8.28245236739618e-07, + "loss": 0.74920738, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 11780, + "time_per_iteration": 2.507024049758911 + }, + { + "auxiliary_loss_clip": 0.01051099, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.01294827, + "balance_loss_mlp": 1.01650262, + "epoch": 0.7083120396813467, + "flos": 21650129775360.0, + "grad_norm": 1.38527338383707, + "language_loss": 0.74354845, + "learning_rate": 8.279296393235256e-07, + "loss": 0.76440394, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 11781, + "time_per_iteration": 2.395230293273926 + }, + { + "auxiliary_loss_clip": 0.01050886, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.01509142, + "balance_loss_mlp": 1.01641214, + "epoch": 0.7083721629340147, + "flos": 17566059686400.0, + "grad_norm": 1.6623694958798765, + "language_loss": 0.79056793, + "learning_rate": 8.276140863524585e-07, + "loss": 0.81144631, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 11782, + "time_per_iteration": 2.3439900875091553 + }, + { + "auxiliary_loss_clip": 0.01051043, + "auxiliary_loss_mlp": 0.01035088, + "balance_loss_clip": 1.0135355, + "balance_loss_mlp": 1.01593542, + "epoch": 0.7084322861866827, + "flos": 29349246481920.0, + "grad_norm": 1.5111525782890733, + "language_loss": 0.70654982, + "learning_rate": 8.272985778383828e-07, + "loss": 0.72741115, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 11783, + "time_per_iteration": 2.443906307220459 + }, + { + "auxiliary_loss_clip": 0.01053323, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.01492167, + "balance_loss_mlp": 1.01726246, + "epoch": 0.7084924094393507, + "flos": 20193238984320.0, + "grad_norm": 1.7318878831963107, + "language_loss": 0.79798138, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81888819, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 11784, + "time_per_iteration": 2.365013360977173 + }, + { + "auxiliary_loss_clip": 0.01052042, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.01538968, + "balance_loss_mlp": 1.01665521, + "epoch": 0.7085525326920187, + "flos": 23476087215360.0, + "grad_norm": 2.0061065629957584, + "language_loss": 0.78901613, + "learning_rate": 8.266676942290609e-07, + "loss": 0.8099122, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35351562, + "step": 11785, + "time_per_iteration": 2.3966333866119385 + }, + { + "auxiliary_loss_clip": 0.01052309, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.01732361, + "balance_loss_mlp": 1.01656318, + "epoch": 0.7086126559446866, + "flos": 25957608854400.0, + "grad_norm": 1.6205595262910608, + "language_loss": 0.78471369, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80565047, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35742188, + "step": 11786, + "time_per_iteration": 2.423788547515869 + }, + { + "auxiliary_loss_clip": 0.01052325, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.01278472, + "balance_loss_mlp": 1.01566899, + "epoch": 0.7086727791973546, + "flos": 26723568372480.0, + "grad_norm": 3.1214970019678816, + "language_loss": 0.80192709, + "learning_rate": 8.260369885912526e-07, + "loss": 0.82281554, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 11787, + "time_per_iteration": 2.3953213691711426 + }, + { + "auxiliary_loss_clip": 0.01052912, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.01539314, + "balance_loss_mlp": 1.01632452, + "epoch": 0.7087329024500225, + "flos": 21682459560960.0, + "grad_norm": 1.786788798996011, + "language_loss": 0.78178191, + "learning_rate": 8.257217025415615e-07, + "loss": 0.80269033, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36523438, + "step": 11788, + "time_per_iteration": 2.423516273498535 + }, + { + "auxiliary_loss_clip": 0.01053832, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.01993477, + "balance_loss_mlp": 1.01601648, + "epoch": 0.7087930257026905, + "flos": 17930099099520.0, + "grad_norm": 1.82743873836133, + "language_loss": 0.69151783, + "learning_rate": 8.254064610206212e-07, + "loss": 0.71252972, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37890625, + "step": 11789, + "time_per_iteration": 2.339289903640747 + }, + { + "auxiliary_loss_clip": 0.01053532, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.01326275, + "balance_loss_mlp": 1.01513386, + "epoch": 0.7088531489553584, + "flos": 18910669944960.0, + "grad_norm": 1.7166653975179405, + "language_loss": 0.78701413, + "learning_rate": 8.250912640403858e-07, + "loss": 0.80791819, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38476562, + "step": 11790, + "time_per_iteration": 2.394822597503662 + }, + { + "auxiliary_loss_clip": 0.01054387, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.01023102, + "balance_loss_mlp": 1.01584721, + "epoch": 0.7089132722080265, + "flos": 27379656241920.0, + "grad_norm": 1.7299221792417945, + "language_loss": 0.72542918, + "learning_rate": 8.247761116128085e-07, + "loss": 0.74632019, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38476562, + "step": 11791, + "time_per_iteration": 2.3981773853302 + }, + { + "auxiliary_loss_clip": 0.01053717, + "auxiliary_loss_mlp": 0.01040536, + "balance_loss_clip": 1.01628923, + "balance_loss_mlp": 1.01715708, + "epoch": 0.7089733954606944, + "flos": 22161851706240.0, + "grad_norm": 1.8265431646639285, + "language_loss": 0.82909667, + "learning_rate": 8.244610037498376e-07, + "loss": 0.85003918, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11792, + "time_per_iteration": 2.3898210525512695 + }, + { + "auxiliary_loss_clip": 0.01053667, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.01040578, + "balance_loss_mlp": 1.0156852, + "epoch": 0.7090335187133624, + "flos": 24424677388800.0, + "grad_norm": 2.21715149643487, + "language_loss": 0.6540556, + "learning_rate": 8.241459404634232e-07, + "loss": 0.67494166, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37890625, + "step": 11793, + "time_per_iteration": 2.3949155807495117 + }, + { + "auxiliary_loss_clip": 0.01052179, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.01571417, + "balance_loss_mlp": 1.01592767, + "epoch": 0.7090936419660303, + "flos": 21834156885120.0, + "grad_norm": 2.054363048594997, + "language_loss": 0.71501166, + "learning_rate": 8.238309217655133e-07, + "loss": 0.7359097, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 11794, + "time_per_iteration": 2.404747486114502 + }, + { + "auxiliary_loss_clip": 0.01053113, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.01796556, + "balance_loss_mlp": 1.01699734, + "epoch": 0.7091537652186983, + "flos": 20081377388160.0, + "grad_norm": 1.8665998911082642, + "language_loss": 0.77487469, + "learning_rate": 8.23515947668052e-07, + "loss": 0.79581618, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 11795, + "time_per_iteration": 2.3456039428710938 + }, + { + "auxiliary_loss_clip": 0.01053207, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.01522338, + "balance_loss_mlp": 1.0167439, + "epoch": 0.7092138884713663, + "flos": 13150733817600.0, + "grad_norm": 2.5206315900615097, + "language_loss": 0.7558378, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77675921, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 11796, + "time_per_iteration": 2.3349499702453613 + }, + { + "auxiliary_loss_clip": 0.01057412, + "auxiliary_loss_mlp": 0.01044344, + "balance_loss_clip": 1.01686668, + "balance_loss_mlp": 1.01859426, + "epoch": 0.7092740117240343, + "flos": 21645102539520.0, + "grad_norm": 1.9897182748920894, + "language_loss": 0.74625701, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76727456, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.38867188, + "step": 11797, + "time_per_iteration": 2.3709568977355957 + }, + { + "auxiliary_loss_clip": 0.01053191, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.01591921, + "balance_loss_mlp": 1.0169518, + "epoch": 0.7093341349767023, + "flos": 21031468750080.0, + "grad_norm": 1.5117565906790191, + "language_loss": 0.80353796, + "learning_rate": 8.225712930977953e-07, + "loss": 0.82446486, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 11798, + "time_per_iteration": 2.409475564956665 + }, + { + "auxiliary_loss_clip": 0.01052081, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.0130235, + "balance_loss_mlp": 1.01579773, + "epoch": 0.7093942582293702, + "flos": 22016578072320.0, + "grad_norm": 2.6272341974738906, + "language_loss": 0.67372739, + "learning_rate": 8.222564975215529e-07, + "loss": 0.69462454, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 11799, + "time_per_iteration": 2.3579423427581787 + }, + { + "auxiliary_loss_clip": 0.01052532, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.0120374, + "balance_loss_mlp": 1.01610088, + "epoch": 0.7094543814820382, + "flos": 27234347696640.0, + "grad_norm": 1.6558766189975322, + "language_loss": 0.82733184, + "learning_rate": 8.219417466054622e-07, + "loss": 0.84822816, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 11800, + "time_per_iteration": 2.439335823059082 + }, + { + "auxiliary_loss_clip": 0.0105094, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.01349831, + "balance_loss_mlp": 1.01536012, + "epoch": 0.7095145047347061, + "flos": 12088466657280.0, + "grad_norm": 1.7515215071970471, + "language_loss": 0.87231195, + "learning_rate": 8.21627040361459e-07, + "loss": 0.89317429, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 11801, + "time_per_iteration": 2.3191843032836914 + }, + { + "auxiliary_loss_clip": 0.01053753, + "auxiliary_loss_mlp": 0.0104051, + "balance_loss_clip": 1.01701379, + "balance_loss_mlp": 1.0163312, + "epoch": 0.7095746279873741, + "flos": 19382975084160.0, + "grad_norm": 1.6886098558063236, + "language_loss": 0.77545333, + "learning_rate": 8.213123788014758e-07, + "loss": 0.7963959, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 11802, + "time_per_iteration": 3.6549456119537354 + }, + { + "auxiliary_loss_clip": 0.01055262, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.0185101, + "balance_loss_mlp": 1.01696086, + "epoch": 0.709634751240042, + "flos": 21359547596160.0, + "grad_norm": 1.9402741640486607, + "language_loss": 0.8291418, + "learning_rate": 8.209977619374462e-07, + "loss": 0.85013461, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 11803, + "time_per_iteration": 2.362514019012451 + }, + { + "auxiliary_loss_clip": 0.01054289, + "auxiliary_loss_mlp": 0.01038677, + "balance_loss_clip": 1.01227212, + "balance_loss_mlp": 1.01658463, + "epoch": 0.7096948744927101, + "flos": 13916204576640.0, + "grad_norm": 2.2450633639231086, + "language_loss": 0.69171333, + "learning_rate": 8.206831897812995e-07, + "loss": 0.71264303, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37695312, + "step": 11804, + "time_per_iteration": 2.367338180541992 + }, + { + "auxiliary_loss_clip": 0.01049132, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.01441813, + "balance_loss_mlp": 1.01534033, + "epoch": 0.709754997745378, + "flos": 30297068605440.0, + "grad_norm": 1.7216809123045562, + "language_loss": 0.79318404, + "learning_rate": 8.203686623449637e-07, + "loss": 0.81403106, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33789062, + "step": 11805, + "time_per_iteration": 2.4112584590911865 + }, + { + "auxiliary_loss_clip": 0.01053836, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.01264858, + "balance_loss_mlp": 1.01699126, + "epoch": 0.709815120998046, + "flos": 18514161100800.0, + "grad_norm": 2.6009074625166853, + "language_loss": 0.80357552, + "learning_rate": 8.200541796403667e-07, + "loss": 0.82447845, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 11806, + "time_per_iteration": 3.7489686012268066 + }, + { + "auxiliary_loss_clip": 0.01053371, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.01306653, + "balance_loss_mlp": 1.0171715, + "epoch": 0.7098752442507139, + "flos": 22271513886720.0, + "grad_norm": 2.2622721222705207, + "language_loss": 0.57757169, + "learning_rate": 8.197397416794332e-07, + "loss": 0.59847915, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 11807, + "time_per_iteration": 3.7828528881073 + }, + { + "auxiliary_loss_clip": 0.01055884, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_clip": 1.02021313, + "balance_loss_mlp": 1.0167191, + "epoch": 0.7099353675033819, + "flos": 19274604624000.0, + "grad_norm": 1.9683750271505465, + "language_loss": 0.69746155, + "learning_rate": 8.194253484740882e-07, + "loss": 0.71848226, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 11808, + "time_per_iteration": 2.3475189208984375 + }, + { + "auxiliary_loss_clip": 0.01055168, + "auxiliary_loss_mlp": 0.01040575, + "balance_loss_clip": 1.01685286, + "balance_loss_mlp": 1.01685953, + "epoch": 0.70999549075605, + "flos": 21907439562240.0, + "grad_norm": 1.8606070957251337, + "language_loss": 0.72059739, + "learning_rate": 8.191110000362513e-07, + "loss": 0.74155474, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3828125, + "step": 11809, + "time_per_iteration": 2.4748032093048096 + }, + { + "auxiliary_loss_clip": 0.01009313, + "auxiliary_loss_mlp": 0.01004393, + "balance_loss_clip": 1.00190198, + "balance_loss_mlp": 1.00222421, + "epoch": 0.7100556140087179, + "flos": 70453015666560.0, + "grad_norm": 0.7529155436134074, + "language_loss": 0.59499109, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61512816, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.07080078, + "step": 11810, + "time_per_iteration": 3.116567850112915 + }, + { + "auxiliary_loss_clip": 0.01054068, + "auxiliary_loss_mlp": 0.01040299, + "balance_loss_clip": 1.016958, + "balance_loss_mlp": 1.01729238, + "epoch": 0.7101157372613859, + "flos": 23038450922880.0, + "grad_norm": 1.95434771323913, + "language_loss": 0.75013268, + "learning_rate": 8.18482437510784e-07, + "loss": 0.77107632, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 11811, + "time_per_iteration": 2.415149450302124 + }, + { + "auxiliary_loss_clip": 0.01050933, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.01416802, + "balance_loss_mlp": 1.01570809, + "epoch": 0.7101758605140538, + "flos": 23184213315840.0, + "grad_norm": 2.137783006700291, + "language_loss": 0.84548557, + "learning_rate": 8.181682234469882e-07, + "loss": 0.8663522, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 11812, + "time_per_iteration": 2.3637962341308594 + }, + { + "auxiliary_loss_clip": 0.01054831, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.01495242, + "balance_loss_mlp": 1.01720953, + "epoch": 0.7102359837667218, + "flos": 23694992640000.0, + "grad_norm": 1.5145816481083905, + "language_loss": 0.71389449, + "learning_rate": 8.178540541983716e-07, + "loss": 0.73483455, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 11813, + "time_per_iteration": 2.392571210861206 + }, + { + "auxiliary_loss_clip": 0.01051996, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01008272, + "balance_loss_mlp": 1.01617861, + "epoch": 0.7102961070193897, + "flos": 19390097001600.0, + "grad_norm": 1.9385817572530637, + "language_loss": 0.82557893, + "learning_rate": 8.175399297768495e-07, + "loss": 0.84643191, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 11814, + "time_per_iteration": 2.345689058303833 + }, + { + "auxiliary_loss_clip": 0.01052996, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.01499736, + "balance_loss_mlp": 1.01671255, + "epoch": 0.7103562302720577, + "flos": 21506427152640.0, + "grad_norm": 1.6960117772966579, + "language_loss": 0.77349728, + "learning_rate": 8.172258501943301e-07, + "loss": 0.79442644, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36328125, + "step": 11815, + "time_per_iteration": 2.360841989517212 + }, + { + "auxiliary_loss_clip": 0.01051579, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.01419282, + "balance_loss_mlp": 1.01562798, + "epoch": 0.7104163535247257, + "flos": 14534272108800.0, + "grad_norm": 1.977096361405698, + "language_loss": 0.79466844, + "learning_rate": 8.16911815462725e-07, + "loss": 0.81556523, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 11816, + "time_per_iteration": 2.332798719406128 + }, + { + "auxiliary_loss_clip": 0.01055517, + "auxiliary_loss_mlp": 0.01038888, + "balance_loss_clip": 1.01361585, + "balance_loss_mlp": 1.01791859, + "epoch": 0.7104764767773937, + "flos": 11399525331840.0, + "grad_norm": 1.7757270705156833, + "language_loss": 0.87784827, + "learning_rate": 8.165978255939426e-07, + "loss": 0.89879227, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 11817, + "time_per_iteration": 3.758486270904541 + }, + { + "auxiliary_loss_clip": 0.01050688, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.01619279, + "balance_loss_mlp": 1.01511598, + "epoch": 0.7105366000300616, + "flos": 11689688574720.0, + "grad_norm": 2.29149237709545, + "language_loss": 0.86196685, + "learning_rate": 8.162838805998897e-07, + "loss": 0.88287276, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 11818, + "time_per_iteration": 2.3107738494873047 + }, + { + "auxiliary_loss_clip": 0.01053735, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.01367879, + "balance_loss_mlp": 1.01559103, + "epoch": 0.7105967232827296, + "flos": 19353019271040.0, + "grad_norm": 3.291294109363843, + "language_loss": 0.76012814, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78105891, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 11819, + "time_per_iteration": 2.3576385974884033 + }, + { + "auxiliary_loss_clip": 0.01053785, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.01635206, + "balance_loss_mlp": 1.01534438, + "epoch": 0.7106568465353975, + "flos": 22929277501440.0, + "grad_norm": 1.6598459892288746, + "language_loss": 0.71730822, + "learning_rate": 8.156561252835883e-07, + "loss": 0.73828828, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38476562, + "step": 11820, + "time_per_iteration": 2.370387554168701 + }, + { + "auxiliary_loss_clip": 0.01053516, + "auxiliary_loss_mlp": 0.01042043, + "balance_loss_clip": 1.0165205, + "balance_loss_mlp": 1.01567912, + "epoch": 0.7107169697880655, + "flos": 19098642038400.0, + "grad_norm": 1.8006683421361198, + "language_loss": 0.77270901, + "learning_rate": 8.153423149851449e-07, + "loss": 0.79366463, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 11821, + "time_per_iteration": 2.4090652465820312 + }, + { + "auxiliary_loss_clip": 0.01008544, + "auxiliary_loss_mlp": 0.01003215, + "balance_loss_clip": 1.00090265, + "balance_loss_mlp": 1.00133824, + "epoch": 0.7107770930407336, + "flos": 63635071564800.0, + "grad_norm": 0.7815052358258732, + "language_loss": 0.55160952, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57172716, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.07226562, + "step": 11822, + "time_per_iteration": 3.0461642742156982 + }, + { + "auxiliary_loss_clip": 0.01050005, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.01292074, + "balance_loss_mlp": 1.01526189, + "epoch": 0.7108372162934015, + "flos": 22053376512000.0, + "grad_norm": 2.257354919920837, + "language_loss": 0.61468285, + "learning_rate": 8.147148291671688e-07, + "loss": 0.63554442, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 11823, + "time_per_iteration": 2.4178292751312256 + }, + { + "auxiliary_loss_clip": 0.01052709, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.01444101, + "balance_loss_mlp": 1.01586986, + "epoch": 0.7108973395460695, + "flos": 19134148757760.0, + "grad_norm": 2.1816076088165626, + "language_loss": 0.72369707, + "learning_rate": 8.144011536714322e-07, + "loss": 0.74460644, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 11824, + "time_per_iteration": 2.3468844890594482 + }, + { + "auxiliary_loss_clip": 0.01050445, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.01550424, + "balance_loss_mlp": 1.01565206, + "epoch": 0.7109574627987374, + "flos": 17893475216640.0, + "grad_norm": 1.5961935145497086, + "language_loss": 0.73953509, + "learning_rate": 8.140875231337223e-07, + "loss": 0.76042056, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 11825, + "time_per_iteration": 2.3686375617980957 + }, + { + "auxiliary_loss_clip": 0.01054911, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.01718497, + "balance_loss_mlp": 1.01659989, + "epoch": 0.7110175860514054, + "flos": 28978538999040.0, + "grad_norm": 4.235957620496351, + "language_loss": 0.80123591, + "learning_rate": 8.137739375659321e-07, + "loss": 0.82220745, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 11826, + "time_per_iteration": 2.427446126937866 + }, + { + "auxiliary_loss_clip": 0.01050444, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.01446748, + "balance_loss_mlp": 1.01499915, + "epoch": 0.7110777093040733, + "flos": 26172220181760.0, + "grad_norm": 1.443999521377218, + "language_loss": 0.83608449, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85696405, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 11827, + "time_per_iteration": 2.433148145675659 + }, + { + "auxiliary_loss_clip": 0.01054491, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.01137424, + "balance_loss_mlp": 1.01682425, + "epoch": 0.7111378325567413, + "flos": 26868737272320.0, + "grad_norm": 1.322285469478923, + "language_loss": 0.63244617, + "learning_rate": 8.131469013876748e-07, + "loss": 0.6533553, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 11828, + "time_per_iteration": 2.4376096725463867 + }, + { + "auxiliary_loss_clip": 0.01053182, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.01459253, + "balance_loss_mlp": 1.01665401, + "epoch": 0.7111979558094093, + "flos": 27270587554560.0, + "grad_norm": 1.5288110270526933, + "language_loss": 0.72952187, + "learning_rate": 8.128334508009846e-07, + "loss": 0.75043654, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 11829, + "time_per_iteration": 2.4639363288879395 + }, + { + "auxiliary_loss_clip": 0.01053313, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.01562154, + "balance_loss_mlp": 1.01687789, + "epoch": 0.7112580790620773, + "flos": 25045747297920.0, + "grad_norm": 1.8648012586162592, + "language_loss": 0.81462806, + "learning_rate": 8.125200452317697e-07, + "loss": 0.83554107, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36523438, + "step": 11830, + "time_per_iteration": 2.4137136936187744 + }, + { + "auxiliary_loss_clip": 0.01053187, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.01528299, + "balance_loss_mlp": 1.01671004, + "epoch": 0.7113182023147452, + "flos": 21645730944000.0, + "grad_norm": 1.6939791598698022, + "language_loss": 0.84663773, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86757171, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36328125, + "step": 11831, + "time_per_iteration": 2.4023618698120117 + }, + { + "auxiliary_loss_clip": 0.01054045, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.01324487, + "balance_loss_mlp": 1.01682043, + "epoch": 0.7113783255674132, + "flos": 20995228892160.0, + "grad_norm": 1.8245952064802147, + "language_loss": 0.7858299, + "learning_rate": 8.118933691932985e-07, + "loss": 0.80673981, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 11832, + "time_per_iteration": 2.3468964099884033 + }, + { + "auxiliary_loss_clip": 0.01008344, + "auxiliary_loss_mlp": 0.01010358, + "balance_loss_clip": 1.00799811, + "balance_loss_mlp": 1.00119781, + "epoch": 0.7114384488200811, + "flos": 66768142596480.0, + "grad_norm": 0.7533420308465638, + "language_loss": 0.56598341, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58617043, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07128906, + "step": 11833, + "time_per_iteration": 2.929084062576294 + }, + { + "auxiliary_loss_clip": 0.01052332, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.0159533, + "balance_loss_mlp": 1.01622248, + "epoch": 0.7114985720727491, + "flos": 25008879035520.0, + "grad_norm": 1.7082744670809078, + "language_loss": 0.72317809, + "learning_rate": 8.11266873367315e-07, + "loss": 0.74408221, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 11834, + "time_per_iteration": 2.393937826156616 + }, + { + "auxiliary_loss_clip": 0.01055083, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.01638961, + "balance_loss_mlp": 1.01713729, + "epoch": 0.7115586953254172, + "flos": 21469070131200.0, + "grad_norm": 1.8943181923480448, + "language_loss": 0.80764985, + "learning_rate": 8.10953693063704e-07, + "loss": 0.82862353, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 11835, + "time_per_iteration": 2.370798110961914 + }, + { + "auxiliary_loss_clip": 0.01052218, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.01238084, + "balance_loss_mlp": 1.01577806, + "epoch": 0.7116188185780851, + "flos": 28621307301120.0, + "grad_norm": 1.8168836050959192, + "language_loss": 0.76630044, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78718412, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 11836, + "time_per_iteration": 2.4083449840545654 + }, + { + "auxiliary_loss_clip": 0.01052992, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.01319122, + "balance_loss_mlp": 1.01620841, + "epoch": 0.7116789418307531, + "flos": 25292653499520.0, + "grad_norm": 1.6659384490398124, + "language_loss": 0.70413941, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72503883, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 11837, + "time_per_iteration": 2.4145584106445312 + }, + { + "auxiliary_loss_clip": 0.01056852, + "auxiliary_loss_mlp": 0.01046284, + "balance_loss_clip": 1.01564777, + "balance_loss_mlp": 1.01788867, + "epoch": 0.711739065083421, + "flos": 25556107685760.0, + "grad_norm": 1.8966925709202866, + "language_loss": 0.62819695, + "learning_rate": 8.100144227328958e-07, + "loss": 0.64922833, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.390625, + "step": 11838, + "time_per_iteration": 2.3927271366119385 + }, + { + "auxiliary_loss_clip": 0.01053791, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.01419806, + "balance_loss_mlp": 1.01751518, + "epoch": 0.711799188336089, + "flos": 26139785662080.0, + "grad_norm": 2.564655143347502, + "language_loss": 0.68120611, + "learning_rate": 8.097014228555426e-07, + "loss": 0.70211309, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 11839, + "time_per_iteration": 2.4034202098846436 + }, + { + "auxiliary_loss_clip": 0.01054622, + "auxiliary_loss_mlp": 0.01048527, + "balance_loss_clip": 1.02271783, + "balance_loss_mlp": 1.01737976, + "epoch": 0.7118593115887569, + "flos": 21139629742080.0, + "grad_norm": 1.9855986135745387, + "language_loss": 0.85100806, + "learning_rate": 8.093884681144305e-07, + "loss": 0.87203956, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 11840, + "time_per_iteration": 2.346921920776367 + }, + { + "auxiliary_loss_clip": 0.01054952, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.01530027, + "balance_loss_mlp": 1.01698589, + "epoch": 0.711919434841425, + "flos": 14974806044160.0, + "grad_norm": 2.6567593877409386, + "language_loss": 0.78235215, + "learning_rate": 8.090755585214277e-07, + "loss": 0.80329913, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 11841, + "time_per_iteration": 2.3458197116851807 + }, + { + "auxiliary_loss_clip": 0.01054365, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_clip": 1.0229497, + "balance_loss_mlp": 1.01627362, + "epoch": 0.7119795580940929, + "flos": 16508051712000.0, + "grad_norm": 1.8669589235139754, + "language_loss": 0.76292896, + "learning_rate": 8.087626940883994e-07, + "loss": 0.78396523, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 11842, + "time_per_iteration": 3.5786001682281494 + }, + { + "auxiliary_loss_clip": 0.01008148, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 0.99986464, + "balance_loss_mlp": 1.00091767, + "epoch": 0.7120396813467609, + "flos": 66567286344960.0, + "grad_norm": 0.8035538260306488, + "language_loss": 0.6179502, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63805795, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.02770996, + "router_z_loss_mlp": 0.07226562, + "step": 11843, + "time_per_iteration": 3.000540256500244 + }, + { + "auxiliary_loss_clip": 0.01052503, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.0104655, + "balance_loss_mlp": 1.016011, + "epoch": 0.7120998045994288, + "flos": 26431519916160.0, + "grad_norm": 1.6271905717956923, + "language_loss": 0.81356245, + "learning_rate": 8.081371007497171e-07, + "loss": 0.83442479, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 11844, + "time_per_iteration": 2.4270031452178955 + }, + { + "auxiliary_loss_clip": 0.01052381, + "auxiliary_loss_mlp": 0.01039593, + "balance_loss_clip": 1.01498866, + "balance_loss_mlp": 1.01593709, + "epoch": 0.7121599278520968, + "flos": 16427263092480.0, + "grad_norm": 2.0881061576832587, + "language_loss": 0.80066031, + "learning_rate": 8.078243718677873e-07, + "loss": 0.82158005, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 11845, + "time_per_iteration": 3.6863064765930176 + }, + { + "auxiliary_loss_clip": 0.01053117, + "auxiliary_loss_mlp": 0.01041387, + "balance_loss_clip": 1.01523256, + "balance_loss_mlp": 1.01765549, + "epoch": 0.7122200511047647, + "flos": 28948618097280.0, + "grad_norm": 3.567338195116469, + "language_loss": 0.78347707, + "learning_rate": 8.075116881932762e-07, + "loss": 0.80442214, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.35351562, + "step": 11846, + "time_per_iteration": 2.481381893157959 + }, + { + "auxiliary_loss_clip": 0.01053778, + "auxiliary_loss_mlp": 0.01041668, + "balance_loss_clip": 1.01761174, + "balance_loss_mlp": 1.01634169, + "epoch": 0.7122801743574327, + "flos": 16470939070080.0, + "grad_norm": 2.1749225225629645, + "language_loss": 0.60300916, + "learning_rate": 8.071990497380421e-07, + "loss": 0.62396359, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 11847, + "time_per_iteration": 3.7581920623779297 + }, + { + "auxiliary_loss_clip": 0.01050593, + "auxiliary_loss_mlp": 0.01041558, + "balance_loss_clip": 1.01696563, + "balance_loss_mlp": 1.01574326, + "epoch": 0.7123402976101008, + "flos": 20630002492800.0, + "grad_norm": 1.3812932199983736, + "language_loss": 0.72235864, + "learning_rate": 8.068864565139395e-07, + "loss": 0.74328017, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.34765625, + "step": 11848, + "time_per_iteration": 2.3804235458374023 + }, + { + "auxiliary_loss_clip": 0.01008184, + "auxiliary_loss_mlp": 0.01002534, + "balance_loss_clip": 1.00013804, + "balance_loss_mlp": 1.00100279, + "epoch": 0.7124004208627687, + "flos": 62322756180480.0, + "grad_norm": 0.85055997830156, + "language_loss": 0.63179433, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65190148, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.07177734, + "step": 11849, + "time_per_iteration": 2.9836130142211914 + }, + { + "auxiliary_loss_clip": 0.01052939, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.02019119, + "balance_loss_mlp": 1.01571965, + "epoch": 0.7124605441154367, + "flos": 39674425322880.0, + "grad_norm": 1.4432348969734283, + "language_loss": 0.6483168, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66928804, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 11850, + "time_per_iteration": 2.5119211673736572 + }, + { + "auxiliary_loss_clip": 0.01053655, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.01409602, + "balance_loss_mlp": 1.01687169, + "epoch": 0.7125206673681046, + "flos": 28180668631680.0, + "grad_norm": 1.5439779473449253, + "language_loss": 0.70404005, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72495151, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 11851, + "time_per_iteration": 2.4293599128723145 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.01779985, + "balance_loss_mlp": 1.01735699, + "epoch": 0.7125807906207726, + "flos": 26175746229120.0, + "grad_norm": 1.6463536105689336, + "language_loss": 0.84043032, + "learning_rate": 8.056365361658882e-07, + "loss": 0.86137873, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36523438, + "step": 11852, + "time_per_iteration": 2.4106132984161377 + }, + { + "auxiliary_loss_clip": 0.01055127, + "auxiliary_loss_mlp": 0.01041665, + "balance_loss_clip": 1.0154624, + "balance_loss_mlp": 1.01741195, + "epoch": 0.7126409138734405, + "flos": 17156598727680.0, + "grad_norm": 2.3572061130468103, + "language_loss": 0.73679173, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75775969, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37695312, + "step": 11853, + "time_per_iteration": 2.3418514728546143 + }, + { + "auxiliary_loss_clip": 0.01050994, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.01497746, + "balance_loss_mlp": 1.01573384, + "epoch": 0.7127010371261085, + "flos": 18768957269760.0, + "grad_norm": 1.9306988248057908, + "language_loss": 0.94091797, + "learning_rate": 8.050118476867635e-07, + "loss": 0.96179813, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 11854, + "time_per_iteration": 2.3264312744140625 + }, + { + "auxiliary_loss_clip": 0.01052008, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.01651049, + "balance_loss_mlp": 1.01595116, + "epoch": 0.7127611603787765, + "flos": 20375380880640.0, + "grad_norm": 1.7407743762622219, + "language_loss": 0.8052572, + "learning_rate": 8.046995714123856e-07, + "loss": 0.82617092, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 11855, + "time_per_iteration": 2.3533198833465576 + }, + { + "auxiliary_loss_clip": 0.01053834, + "auxiliary_loss_mlp": 0.01040523, + "balance_loss_clip": 1.01485682, + "balance_loss_mlp": 1.01653075, + "epoch": 0.7128212836314445, + "flos": 20447965330560.0, + "grad_norm": 1.6212280999031499, + "language_loss": 0.7453531, + "learning_rate": 8.043873404639192e-07, + "loss": 0.76629674, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 11856, + "time_per_iteration": 2.3821818828582764 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.01377821, + "balance_loss_mlp": 1.01766098, + "epoch": 0.7128814068841124, + "flos": 23439707712000.0, + "grad_norm": 1.5928830073211622, + "language_loss": 0.70896649, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72991264, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 11857, + "time_per_iteration": 3.850458860397339 + }, + { + "auxiliary_loss_clip": 0.01051436, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.01167703, + "balance_loss_mlp": 1.01575899, + "epoch": 0.7129415301367804, + "flos": 18221972999040.0, + "grad_norm": 2.004843749115044, + "language_loss": 0.86186087, + "learning_rate": 8.03763014592081e-07, + "loss": 0.8827374, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 11858, + "time_per_iteration": 2.3199496269226074 + }, + { + "auxiliary_loss_clip": 0.01057214, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.0173552, + "balance_loss_mlp": 1.01818168, + "epoch": 0.7130016533894483, + "flos": 15522977301120.0, + "grad_norm": 1.6195539983187577, + "language_loss": 0.81727087, + "learning_rate": 8.034509196923829e-07, + "loss": 0.83828723, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.390625, + "step": 11859, + "time_per_iteration": 2.359506845474243 + }, + { + "auxiliary_loss_clip": 0.01051925, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.01182175, + "balance_loss_mlp": 1.01651144, + "epoch": 0.7130617766421163, + "flos": 57113646439680.0, + "grad_norm": 1.2341453534003826, + "language_loss": 0.69521618, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71608323, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 11860, + "time_per_iteration": 2.6778433322906494 + }, + { + "auxiliary_loss_clip": 0.01054958, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.01481557, + "balance_loss_mlp": 1.01746118, + "epoch": 0.7131218998947844, + "flos": 19787338984320.0, + "grad_norm": 2.220392626822036, + "language_loss": 0.65475869, + "learning_rate": 8.028268660246023e-07, + "loss": 0.67572051, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.375, + "step": 11861, + "time_per_iteration": 2.4142262935638428 + }, + { + "auxiliary_loss_clip": 0.01057121, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.01596808, + "balance_loss_mlp": 1.01792288, + "epoch": 0.7131820231474523, + "flos": 26650669720320.0, + "grad_norm": 2.8425097367141667, + "language_loss": 0.68557107, + "learning_rate": 8.025149072801849e-07, + "loss": 0.7065655, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 11862, + "time_per_iteration": 2.4201173782348633 + }, + { + "auxiliary_loss_clip": 0.01053455, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.02181983, + "balance_loss_mlp": 1.01761949, + "epoch": 0.7132421464001203, + "flos": 29204321961600.0, + "grad_norm": 2.535254190885049, + "language_loss": 0.68714559, + "learning_rate": 8.022029939445214e-07, + "loss": 0.70812619, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 11863, + "time_per_iteration": 2.4689650535583496 + }, + { + "auxiliary_loss_clip": 0.01057686, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.0182941, + "balance_loss_mlp": 1.01851773, + "epoch": 0.7133022696527882, + "flos": 23072561187840.0, + "grad_norm": 3.3224899482546686, + "language_loss": 0.66867781, + "learning_rate": 8.018911260294414e-07, + "loss": 0.68969685, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 11864, + "time_per_iteration": 2.373098850250244 + }, + { + "auxiliary_loss_clip": 0.01055095, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.01553869, + "balance_loss_mlp": 1.01721251, + "epoch": 0.7133623929054562, + "flos": 17456153126400.0, + "grad_norm": 2.7339158769046707, + "language_loss": 0.87589896, + "learning_rate": 8.015793035467697e-07, + "loss": 0.8968699, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 11865, + "time_per_iteration": 2.3892478942871094 + }, + { + "auxiliary_loss_clip": 0.01053232, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.01442361, + "balance_loss_mlp": 1.01695395, + "epoch": 0.7134225161581241, + "flos": 19535545192320.0, + "grad_norm": 2.0149381006534854, + "language_loss": 0.76357067, + "learning_rate": 8.012675265083304e-07, + "loss": 0.78450882, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 11866, + "time_per_iteration": 2.3340768814086914 + }, + { + "auxiliary_loss_clip": 0.01055922, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.01802063, + "balance_loss_mlp": 1.0186547, + "epoch": 0.7134826394107922, + "flos": 26248889260800.0, + "grad_norm": 2.0135751985455643, + "language_loss": 0.7205888, + "learning_rate": 8.009557949259464e-07, + "loss": 0.74159086, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 11867, + "time_per_iteration": 2.426377296447754 + }, + { + "auxiliary_loss_clip": 0.01051156, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.01279867, + "balance_loss_mlp": 1.01564527, + "epoch": 0.7135427626634601, + "flos": 15814397352960.0, + "grad_norm": 1.89859548975805, + "language_loss": 0.7274636, + "learning_rate": 8.006441088114397e-07, + "loss": 0.74833286, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 11868, + "time_per_iteration": 2.350062608718872 + }, + { + "auxiliary_loss_clip": 0.0105688, + "auxiliary_loss_mlp": 0.01037577, + "balance_loss_clip": 1.01037383, + "balance_loss_mlp": 1.01773524, + "epoch": 0.7136028859161281, + "flos": 18222426846720.0, + "grad_norm": 1.9996559523091737, + "language_loss": 0.67905056, + "learning_rate": 8.003324681766286e-07, + "loss": 0.69999516, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 11869, + "time_per_iteration": 2.35369873046875 + }, + { + "auxiliary_loss_clip": 0.0105252, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.01204324, + "balance_loss_mlp": 1.01548159, + "epoch": 0.713663009168796, + "flos": 24313723488000.0, + "grad_norm": 1.7560283737634654, + "language_loss": 0.78858411, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80946803, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 11870, + "time_per_iteration": 2.4271585941314697 + }, + { + "auxiliary_loss_clip": 0.01054042, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.01510179, + "balance_loss_mlp": 1.0171591, + "epoch": 0.713723132421464, + "flos": 26537376758400.0, + "grad_norm": 2.3680105818073844, + "language_loss": 0.82067823, + "learning_rate": 7.997093233933597e-07, + "loss": 0.84162796, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36914062, + "step": 11871, + "time_per_iteration": 2.3964641094207764 + }, + { + "auxiliary_loss_clip": 0.01054788, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.01599908, + "balance_loss_mlp": 1.01713395, + "epoch": 0.7137832556741319, + "flos": 19864636467840.0, + "grad_norm": 1.6274968594032253, + "language_loss": 0.79802841, + "learning_rate": 7.993978192685331e-07, + "loss": 0.81898773, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 11872, + "time_per_iteration": 2.4025540351867676 + }, + { + "auxiliary_loss_clip": 0.01054303, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.01492476, + "balance_loss_mlp": 1.01622987, + "epoch": 0.7138433789267999, + "flos": 21687870821760.0, + "grad_norm": 2.397034497709944, + "language_loss": 0.85303712, + "learning_rate": 7.990863606706606e-07, + "loss": 0.87398273, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 11873, + "time_per_iteration": 2.420586585998535 + }, + { + "auxiliary_loss_clip": 0.01050872, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.0137943, + "balance_loss_mlp": 1.01516581, + "epoch": 0.713903502179468, + "flos": 17601775873920.0, + "grad_norm": 1.9009093574987883, + "language_loss": 0.87453163, + "learning_rate": 7.987749476115539e-07, + "loss": 0.89541149, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 11874, + "time_per_iteration": 2.3548779487609863 + }, + { + "auxiliary_loss_clip": 0.01054062, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.00806046, + "balance_loss_mlp": 1.01657057, + "epoch": 0.7139636254321359, + "flos": 18039377255040.0, + "grad_norm": 1.8422142836138475, + "language_loss": 0.84181136, + "learning_rate": 7.984635801030228e-07, + "loss": 0.86268079, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 11875, + "time_per_iteration": 2.3589565753936768 + }, + { + "auxiliary_loss_clip": 0.01057862, + "auxiliary_loss_mlp": 0.01043337, + "balance_loss_clip": 1.01533461, + "balance_loss_mlp": 1.01742339, + "epoch": 0.7140237486848039, + "flos": 23330010620160.0, + "grad_norm": 1.749646364249995, + "language_loss": 0.71005362, + "learning_rate": 7.981522581568721e-07, + "loss": 0.73106569, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.40429688, + "step": 11876, + "time_per_iteration": 2.3878326416015625 + }, + { + "auxiliary_loss_clip": 0.01055176, + "auxiliary_loss_mlp": 0.01039271, + "balance_loss_clip": 1.0147258, + "balance_loss_mlp": 1.01733553, + "epoch": 0.7140838719374718, + "flos": 16836130558080.0, + "grad_norm": 2.0812151517657425, + "language_loss": 0.79385018, + "learning_rate": 7.978409817849079e-07, + "loss": 0.81479466, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37890625, + "step": 11877, + "time_per_iteration": 2.331735610961914 + }, + { + "auxiliary_loss_clip": 0.01054196, + "auxiliary_loss_mlp": 0.01037573, + "balance_loss_clip": 1.01473236, + "balance_loss_mlp": 1.01743329, + "epoch": 0.7141439951901398, + "flos": 21140956373760.0, + "grad_norm": 1.9055146279761588, + "language_loss": 0.70231712, + "learning_rate": 7.97529750998934e-07, + "loss": 0.72323483, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 11878, + "time_per_iteration": 2.382715940475464 + }, + { + "auxiliary_loss_clip": 0.01050689, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.01636291, + "balance_loss_mlp": 1.01555109, + "epoch": 0.7142041184428077, + "flos": 24716551288320.0, + "grad_norm": 1.991704391828737, + "language_loss": 0.69166028, + "learning_rate": 7.972185658107535e-07, + "loss": 0.71256089, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 11879, + "time_per_iteration": 2.376295804977417 + }, + { + "auxiliary_loss_clip": 0.01053071, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.01432991, + "balance_loss_mlp": 1.01701498, + "epoch": 0.7142642416954758, + "flos": 21907125360000.0, + "grad_norm": 1.580571788914813, + "language_loss": 0.70452547, + "learning_rate": 7.969074262321646e-07, + "loss": 0.7254436, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 11880, + "time_per_iteration": 2.39935040473938 + }, + { + "auxiliary_loss_clip": 0.01054987, + "auxiliary_loss_mlp": 0.01037615, + "balance_loss_clip": 1.01330853, + "balance_loss_mlp": 1.01684308, + "epoch": 0.7143243649481437, + "flos": 20804813003520.0, + "grad_norm": 2.2635017809985123, + "language_loss": 0.82569766, + "learning_rate": 7.965963322749674e-07, + "loss": 0.84662372, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3828125, + "step": 11881, + "time_per_iteration": 3.631772518157959 + }, + { + "auxiliary_loss_clip": 0.01052701, + "auxiliary_loss_mlp": 0.01035211, + "balance_loss_clip": 1.01235855, + "balance_loss_mlp": 1.01578641, + "epoch": 0.7143844882008117, + "flos": 27233789114880.0, + "grad_norm": 1.4770511163556561, + "language_loss": 0.64622909, + "learning_rate": 7.962852839509579e-07, + "loss": 0.66710818, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36914062, + "step": 11882, + "time_per_iteration": 2.449892282485962 + }, + { + "auxiliary_loss_clip": 0.01054231, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.0145818, + "balance_loss_mlp": 1.01635957, + "epoch": 0.7144446114534796, + "flos": 17928702645120.0, + "grad_norm": 2.405004496271898, + "language_loss": 0.70740336, + "learning_rate": 7.959742812719304e-07, + "loss": 0.72833633, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37890625, + "step": 11883, + "time_per_iteration": 2.3299291133880615 + }, + { + "auxiliary_loss_clip": 0.01051752, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_clip": 1.01735449, + "balance_loss_mlp": 1.01566041, + "epoch": 0.7145047347061476, + "flos": 20739909052800.0, + "grad_norm": 1.90368465480192, + "language_loss": 0.79038656, + "learning_rate": 7.956633242496788e-07, + "loss": 0.81133044, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36132812, + "step": 11884, + "time_per_iteration": 2.3630733489990234 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.01294541, + "balance_loss_mlp": 1.01642895, + "epoch": 0.7145648579588155, + "flos": 21177545345280.0, + "grad_norm": 1.8955374970064391, + "language_loss": 0.75451511, + "learning_rate": 7.953524128959954e-07, + "loss": 0.77548528, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.40039062, + "step": 11885, + "time_per_iteration": 3.8235628604888916 + }, + { + "auxiliary_loss_clip": 0.01008648, + "auxiliary_loss_mlp": 0.01007268, + "balance_loss_clip": 1.0047766, + "balance_loss_mlp": 1.00135326, + "epoch": 0.7146249812114835, + "flos": 64781094810240.0, + "grad_norm": 0.9208247648721084, + "language_loss": 0.6637603, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68391949, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.07324219, + "step": 11886, + "time_per_iteration": 2.976095199584961 + }, + { + "auxiliary_loss_clip": 0.01052977, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.01423192, + "balance_loss_mlp": 1.01679134, + "epoch": 0.7146851044641516, + "flos": 18112904311680.0, + "grad_norm": 2.205232448517343, + "language_loss": 0.76203895, + "learning_rate": 7.947307272414874e-07, + "loss": 0.78295982, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36132812, + "step": 11887, + "time_per_iteration": 3.762922525405884 + }, + { + "auxiliary_loss_clip": 0.01052624, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.01180196, + "balance_loss_mlp": 1.01639402, + "epoch": 0.7147452277168195, + "flos": 19242868331520.0, + "grad_norm": 1.486527951297893, + "language_loss": 0.72448158, + "learning_rate": 7.944199529642372e-07, + "loss": 0.74535728, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 11888, + "time_per_iteration": 2.3544697761535645 + }, + { + "auxiliary_loss_clip": 0.01055366, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_clip": 1.01836157, + "balance_loss_mlp": 1.01727557, + "epoch": 0.7148053509694875, + "flos": 23763701928960.0, + "grad_norm": 1.971117309098969, + "language_loss": 0.84921646, + "learning_rate": 7.941092244027041e-07, + "loss": 0.87019765, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 11889, + "time_per_iteration": 2.4031383991241455 + }, + { + "auxiliary_loss_clip": 0.01054035, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.01233268, + "balance_loss_mlp": 1.01707983, + "epoch": 0.7148654742221554, + "flos": 22484414557440.0, + "grad_norm": 1.7508442990826478, + "language_loss": 0.7658236, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78672707, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 11890, + "time_per_iteration": 2.3596489429473877 + }, + { + "auxiliary_loss_clip": 0.01052305, + "auxiliary_loss_mlp": 0.01039036, + "balance_loss_clip": 1.01658869, + "balance_loss_mlp": 1.0163393, + "epoch": 0.7149255974748234, + "flos": 24678112014720.0, + "grad_norm": 1.554043841399718, + "language_loss": 0.74796075, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76887405, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 11891, + "time_per_iteration": 2.410285472869873 + }, + { + "auxiliary_loss_clip": 0.0105547, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.01651049, + "balance_loss_mlp": 1.01740289, + "epoch": 0.7149857207274913, + "flos": 18404603654400.0, + "grad_norm": 1.8249831424908263, + "language_loss": 0.69784009, + "learning_rate": 7.931773131302211e-07, + "loss": 0.71879923, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38085938, + "step": 11892, + "time_per_iteration": 2.3318088054656982 + }, + { + "auxiliary_loss_clip": 0.01054749, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.01286292, + "balance_loss_mlp": 1.01584935, + "epoch": 0.7150458439801594, + "flos": 24968449814400.0, + "grad_norm": 1.8279378043048726, + "language_loss": 0.74157131, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76252103, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 11893, + "time_per_iteration": 2.3881959915161133 + }, + { + "auxiliary_loss_clip": 0.01055453, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.00933266, + "balance_loss_mlp": 1.01708114, + "epoch": 0.7151059672328273, + "flos": 16689844494720.0, + "grad_norm": 2.157579857911161, + "language_loss": 0.67599124, + "learning_rate": 7.925562677431185e-07, + "loss": 0.69691277, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38476562, + "step": 11894, + "time_per_iteration": 2.3656461238861084 + }, + { + "auxiliary_loss_clip": 0.01056196, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.01094055, + "balance_loss_mlp": 1.01767898, + "epoch": 0.7151660904854953, + "flos": 27270587554560.0, + "grad_norm": 1.6384705456494812, + "language_loss": 0.78773969, + "learning_rate": 7.922458137232613e-07, + "loss": 0.80865765, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38476562, + "step": 11895, + "time_per_iteration": 2.4224467277526855 + }, + { + "auxiliary_loss_clip": 0.01053809, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.01353264, + "balance_loss_mlp": 1.01630926, + "epoch": 0.7152262137381632, + "flos": 18331286065920.0, + "grad_norm": 2.881337486785914, + "language_loss": 0.71118844, + "learning_rate": 7.919354055015643e-07, + "loss": 0.73212385, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 11896, + "time_per_iteration": 2.319547653198242 + }, + { + "auxiliary_loss_clip": 0.01054722, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_clip": 1.01819134, + "balance_loss_mlp": 1.01625323, + "epoch": 0.7152863369908312, + "flos": 21798196318080.0, + "grad_norm": 1.753127327340692, + "language_loss": 0.8713094, + "learning_rate": 7.91625043089798e-07, + "loss": 0.89230746, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38476562, + "step": 11897, + "time_per_iteration": 3.81473970413208 + }, + { + "auxiliary_loss_clip": 0.01052064, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.01125455, + "balance_loss_mlp": 1.01584387, + "epoch": 0.7153464602434991, + "flos": 22157487786240.0, + "grad_norm": 1.8635272476912215, + "language_loss": 0.79111207, + "learning_rate": 7.913147264997304e-07, + "loss": 0.811988, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 11898, + "time_per_iteration": 2.455075979232788 + }, + { + "auxiliary_loss_clip": 0.0105586, + "auxiliary_loss_mlp": 0.01042595, + "balance_loss_clip": 1.01472378, + "balance_loss_mlp": 1.01676691, + "epoch": 0.7154065834961671, + "flos": 24714945365760.0, + "grad_norm": 1.8021853187560672, + "language_loss": 0.74120688, + "learning_rate": 7.910044557431302e-07, + "loss": 0.76219141, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.390625, + "step": 11899, + "time_per_iteration": 2.4003450870513916 + }, + { + "auxiliary_loss_clip": 0.01052575, + "auxiliary_loss_mlp": 0.01037031, + "balance_loss_clip": 1.01254511, + "balance_loss_mlp": 1.0158813, + "epoch": 0.7154667067488351, + "flos": 22600395694080.0, + "grad_norm": 1.9244268577211279, + "language_loss": 0.77209234, + "learning_rate": 7.906942308317614e-07, + "loss": 0.79298842, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 11900, + "time_per_iteration": 2.35492205619812 + }, + { + "auxiliary_loss_clip": 0.01054219, + "auxiliary_loss_mlp": 0.0103866, + "balance_loss_clip": 1.01261258, + "balance_loss_mlp": 1.01667213, + "epoch": 0.7155268300015031, + "flos": 18770144256000.0, + "grad_norm": 2.260139258466625, + "language_loss": 0.82078892, + "learning_rate": 7.903840517773886e-07, + "loss": 0.84171766, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 11901, + "time_per_iteration": 2.3544371128082275 + }, + { + "auxiliary_loss_clip": 0.01056849, + "auxiliary_loss_mlp": 0.01042106, + "balance_loss_clip": 1.01621389, + "balance_loss_mlp": 1.0173732, + "epoch": 0.7155869532541711, + "flos": 18295360410240.0, + "grad_norm": 1.9134314305306346, + "language_loss": 0.82263792, + "learning_rate": 7.900739185917744e-07, + "loss": 0.84362751, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39453125, + "step": 11902, + "time_per_iteration": 2.3189008235931396 + }, + { + "auxiliary_loss_clip": 0.01053962, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.01427925, + "balance_loss_mlp": 1.01647186, + "epoch": 0.715647076506839, + "flos": 11980096197120.0, + "grad_norm": 1.7928297718535435, + "language_loss": 0.68597579, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70690691, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 11903, + "time_per_iteration": 2.3301796913146973 + }, + { + "auxiliary_loss_clip": 0.01050917, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.01515603, + "balance_loss_mlp": 1.01526523, + "epoch": 0.715707199759507, + "flos": 18950680229760.0, + "grad_norm": 1.611236137726872, + "language_loss": 0.76968664, + "learning_rate": 7.894537898738589e-07, + "loss": 0.79058909, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 11904, + "time_per_iteration": 2.3475146293640137 + }, + { + "auxiliary_loss_clip": 0.01053757, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.0167048, + "balance_loss_mlp": 1.01623142, + "epoch": 0.7157673230121749, + "flos": 15303513294720.0, + "grad_norm": 1.877231757526791, + "language_loss": 0.73510969, + "learning_rate": 7.891437943650727e-07, + "loss": 0.75606936, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 11905, + "time_per_iteration": 2.354663848876953 + }, + { + "auxiliary_loss_clip": 0.01052344, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01424408, + "balance_loss_mlp": 1.0157274, + "epoch": 0.715827446264843, + "flos": 23220732464640.0, + "grad_norm": 1.475831858447686, + "language_loss": 0.78819311, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80910087, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 11906, + "time_per_iteration": 2.373241901397705 + }, + { + "auxiliary_loss_clip": 0.01008611, + "auxiliary_loss_mlp": 0.0100873, + "balance_loss_clip": 1.00635755, + "balance_loss_mlp": 1.0014751, + "epoch": 0.7158875695175109, + "flos": 60972490281600.0, + "grad_norm": 0.7367735577011942, + "language_loss": 0.55333072, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57350415, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07128906, + "step": 11907, + "time_per_iteration": 2.931745767593384 + }, + { + "auxiliary_loss_clip": 0.0105468, + "auxiliary_loss_mlp": 0.01039662, + "balance_loss_clip": 1.01410413, + "balance_loss_mlp": 1.01667857, + "epoch": 0.7159476927701789, + "flos": 17127829900800.0, + "grad_norm": 1.7958833879673786, + "language_loss": 0.7044282, + "learning_rate": 7.882140833804593e-07, + "loss": 0.7253716, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 11908, + "time_per_iteration": 2.3553526401519775 + }, + { + "auxiliary_loss_clip": 0.0105469, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.01465034, + "balance_loss_mlp": 1.0172075, + "epoch": 0.7160078160228468, + "flos": 22489546527360.0, + "grad_norm": 1.630228873617626, + "language_loss": 0.72466838, + "learning_rate": 7.879042716053415e-07, + "loss": 0.74562192, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 11909, + "time_per_iteration": 2.396656036376953 + }, + { + "auxiliary_loss_clip": 0.01054176, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.01077688, + "balance_loss_mlp": 1.01685905, + "epoch": 0.7160679392755148, + "flos": 30589640732160.0, + "grad_norm": 1.6979647105857998, + "language_loss": 0.75840217, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77930629, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 11910, + "time_per_iteration": 2.4298155307769775 + }, + { + "auxiliary_loss_clip": 0.01053231, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.0170157, + "balance_loss_mlp": 1.01644349, + "epoch": 0.7161280625281827, + "flos": 21322609511040.0, + "grad_norm": 1.4809652261212423, + "language_loss": 0.76911259, + "learning_rate": 7.872847859552251e-07, + "loss": 0.7900387, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36914062, + "step": 11911, + "time_per_iteration": 2.402038335800171 + }, + { + "auxiliary_loss_clip": 0.01054518, + "auxiliary_loss_mlp": 0.01040844, + "balance_loss_clip": 1.01595354, + "balance_loss_mlp": 1.01679683, + "epoch": 0.7161881857808508, + "flos": 61857889027200.0, + "grad_norm": 1.6877721068727722, + "language_loss": 0.59714204, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61809576, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37695312, + "step": 11912, + "time_per_iteration": 2.740537643432617 + }, + { + "auxiliary_loss_clip": 0.01053135, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.01270914, + "balance_loss_mlp": 1.01583362, + "epoch": 0.7162483090335187, + "flos": 20811097048320.0, + "grad_norm": 1.8251757135616946, + "language_loss": 0.79312789, + "learning_rate": 7.866654842502376e-07, + "loss": 0.81404674, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 11913, + "time_per_iteration": 2.3910491466522217 + }, + { + "auxiliary_loss_clip": 0.01050904, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.0122354, + "balance_loss_mlp": 1.01564074, + "epoch": 0.7163084322861867, + "flos": 24096389074560.0, + "grad_norm": 1.6175526203266204, + "language_loss": 0.75415677, + "learning_rate": 7.863559024065234e-07, + "loss": 0.77500778, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 11914, + "time_per_iteration": 2.4009103775024414 + }, + { + "auxiliary_loss_clip": 0.01051631, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.0154953, + "balance_loss_mlp": 1.01587009, + "epoch": 0.7163685555388547, + "flos": 20079946022400.0, + "grad_norm": 1.667927293032064, + "language_loss": 0.74499023, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76589191, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 11915, + "time_per_iteration": 2.35589599609375 + }, + { + "auxiliary_loss_clip": 0.01055517, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.01628423, + "balance_loss_mlp": 1.01706171, + "epoch": 0.7164286787915226, + "flos": 17456013480960.0, + "grad_norm": 1.8038938455563338, + "language_loss": 0.81393492, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83489454, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3828125, + "step": 11916, + "time_per_iteration": 2.3357057571411133 + }, + { + "auxiliary_loss_clip": 0.01053833, + "auxiliary_loss_mlp": 0.01038651, + "balance_loss_clip": 1.01577437, + "balance_loss_mlp": 1.01678741, + "epoch": 0.7164888020441906, + "flos": 19717896556800.0, + "grad_norm": 2.7767659762949024, + "language_loss": 0.69258165, + "learning_rate": 7.854274330513626e-07, + "loss": 0.71350646, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.37109375, + "step": 11917, + "time_per_iteration": 2.420375347137451 + }, + { + "auxiliary_loss_clip": 0.01053695, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.01158595, + "balance_loss_mlp": 1.01726949, + "epoch": 0.7165489252968585, + "flos": 21469454156160.0, + "grad_norm": 1.6268505297474343, + "language_loss": 0.7693702, + "learning_rate": 7.851180353640896e-07, + "loss": 0.79027957, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 11918, + "time_per_iteration": 2.354374408721924 + }, + { + "auxiliary_loss_clip": 0.01008884, + "auxiliary_loss_mlp": 0.01003052, + "balance_loss_clip": 1.00081134, + "balance_loss_mlp": 1.00178492, + "epoch": 0.7166090485495266, + "flos": 69924499505280.0, + "grad_norm": 0.6356037168803202, + "language_loss": 0.53992254, + "learning_rate": 7.848086837452639e-07, + "loss": 0.5600419, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.07128906, + "step": 11919, + "time_per_iteration": 3.053708553314209 + }, + { + "auxiliary_loss_clip": 0.01054008, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.00974894, + "balance_loss_mlp": 1.01619995, + "epoch": 0.7166691718021945, + "flos": 27342683245440.0, + "grad_norm": 1.892105717126141, + "language_loss": 0.69411701, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71500194, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37890625, + "step": 11920, + "time_per_iteration": 3.6935031414031982 + }, + { + "auxiliary_loss_clip": 0.01053514, + "auxiliary_loss_mlp": 0.01041424, + "balance_loss_clip": 1.0171169, + "balance_loss_mlp": 1.01617503, + "epoch": 0.7167292950548625, + "flos": 30407568658560.0, + "grad_norm": 2.06067406660684, + "language_loss": 0.75605363, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77700299, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37304688, + "step": 11921, + "time_per_iteration": 2.4161055088043213 + }, + { + "auxiliary_loss_clip": 0.0105709, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.01543891, + "balance_loss_mlp": 1.01681757, + "epoch": 0.7167894183075304, + "flos": 14570477055360.0, + "grad_norm": 3.738916611887295, + "language_loss": 0.77171904, + "learning_rate": 7.83880905416755e-07, + "loss": 0.79274213, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.40234375, + "step": 11922, + "time_per_iteration": 2.346613883972168 + }, + { + "auxiliary_loss_clip": 0.01008172, + "auxiliary_loss_mlp": 0.01002411, + "balance_loss_clip": 0.99997926, + "balance_loss_mlp": 1.00103688, + "epoch": 0.7168495415601984, + "flos": 64107725817600.0, + "grad_norm": 0.7549146162207981, + "language_loss": 0.55155635, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57166219, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.07128906, + "step": 11923, + "time_per_iteration": 2.8438587188720703 + }, + { + "auxiliary_loss_clip": 0.01054039, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.01415288, + "balance_loss_mlp": 1.01625443, + "epoch": 0.7169096648128663, + "flos": 24680276519040.0, + "grad_norm": 1.493813367560006, + "language_loss": 0.77968585, + "learning_rate": 7.832626170883279e-07, + "loss": 0.80061865, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 11924, + "time_per_iteration": 3.7888686656951904 + }, + { + "auxiliary_loss_clip": 0.010532, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.01193285, + "balance_loss_mlp": 1.01594067, + "epoch": 0.7169697880655344, + "flos": 20666486730240.0, + "grad_norm": 1.5962721560709763, + "language_loss": 0.69385773, + "learning_rate": 7.829535421264588e-07, + "loss": 0.71475226, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37304688, + "step": 11925, + "time_per_iteration": 2.354365825653076 + }, + { + "auxiliary_loss_clip": 0.01050718, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.01261759, + "balance_loss_mlp": 1.01578498, + "epoch": 0.7170299113182023, + "flos": 21031643306880.0, + "grad_norm": 1.6429501986462847, + "language_loss": 0.77969003, + "learning_rate": 7.826445133151133e-07, + "loss": 0.80054867, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 11926, + "time_per_iteration": 3.855084180831909 + }, + { + "auxiliary_loss_clip": 0.01056242, + "auxiliary_loss_mlp": 0.01040525, + "balance_loss_clip": 1.01347613, + "balance_loss_mlp": 1.01652765, + "epoch": 0.7170900345708703, + "flos": 22892199770880.0, + "grad_norm": 2.3581731993644097, + "language_loss": 0.78086019, + "learning_rate": 7.823355306660093e-07, + "loss": 0.80182791, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39648438, + "step": 11927, + "time_per_iteration": 2.399858236312866 + }, + { + "auxiliary_loss_clip": 0.01053903, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.01627326, + "balance_loss_mlp": 1.01735067, + "epoch": 0.7171501578235383, + "flos": 15517915153920.0, + "grad_norm": 2.5595873646871707, + "language_loss": 0.70616472, + "learning_rate": 7.820265941908642e-07, + "loss": 0.7271015, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36523438, + "step": 11928, + "time_per_iteration": 2.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.01051313, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.01090729, + "balance_loss_mlp": 1.01662171, + "epoch": 0.7172102810762062, + "flos": 26103091956480.0, + "grad_norm": 1.930114897444092, + "language_loss": 0.65919942, + "learning_rate": 7.817177039013931e-07, + "loss": 0.68003786, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 11929, + "time_per_iteration": 2.4334864616394043 + }, + { + "auxiliary_loss_clip": 0.01053627, + "auxiliary_loss_mlp": 0.01039366, + "balance_loss_clip": 1.01397407, + "balance_loss_mlp": 1.01594615, + "epoch": 0.7172704043288742, + "flos": 21505589280000.0, + "grad_norm": 2.094070257135292, + "language_loss": 0.72018504, + "learning_rate": 7.81408859809308e-07, + "loss": 0.74111497, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 11930, + "time_per_iteration": 2.395768165588379 + }, + { + "auxiliary_loss_clip": 0.01054263, + "auxiliary_loss_mlp": 0.01040738, + "balance_loss_clip": 1.01509631, + "balance_loss_mlp": 1.01642513, + "epoch": 0.7173305275815421, + "flos": 18769934787840.0, + "grad_norm": 2.2075402918474754, + "language_loss": 0.8300283, + "learning_rate": 7.811000619263219e-07, + "loss": 0.85097826, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 11931, + "time_per_iteration": 2.3533082008361816 + }, + { + "auxiliary_loss_clip": 0.01052964, + "auxiliary_loss_mlp": 0.01037583, + "balance_loss_clip": 1.01424193, + "balance_loss_mlp": 1.01698101, + "epoch": 0.7173906508342102, + "flos": 16178960436480.0, + "grad_norm": 2.0338899567733093, + "language_loss": 0.7989018, + "learning_rate": 7.80791310264143e-07, + "loss": 0.81980729, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 11932, + "time_per_iteration": 2.3625948429107666 + }, + { + "auxiliary_loss_clip": 0.0105231, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_clip": 1.01866126, + "balance_loss_mlp": 1.01545274, + "epoch": 0.7174507740868781, + "flos": 26612684294400.0, + "grad_norm": 1.4527995951453703, + "language_loss": 0.75857818, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77953112, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36914062, + "step": 11933, + "time_per_iteration": 2.4043662548065186 + }, + { + "auxiliary_loss_clip": 0.01059722, + "auxiliary_loss_mlp": 0.01051767, + "balance_loss_clip": 1.02231097, + "balance_loss_mlp": 1.01928353, + "epoch": 0.7175108973395461, + "flos": 18432185495040.0, + "grad_norm": 3.1991848194251133, + "language_loss": 0.71437359, + "learning_rate": 7.801739456490388e-07, + "loss": 0.73548847, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.40429688, + "step": 11934, + "time_per_iteration": 2.3425910472869873 + }, + { + "auxiliary_loss_clip": 0.01053109, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.0130105, + "balance_loss_mlp": 1.01659107, + "epoch": 0.717571020592214, + "flos": 23913828241920.0, + "grad_norm": 1.8933338131269595, + "language_loss": 0.86776423, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88867867, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 11935, + "time_per_iteration": 2.3742575645446777 + }, + { + "auxiliary_loss_clip": 0.0105368, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.01468027, + "balance_loss_mlp": 1.01570523, + "epoch": 0.717631143844882, + "flos": 38255310489600.0, + "grad_norm": 1.522305562615613, + "language_loss": 0.74724817, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76817399, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 11936, + "time_per_iteration": 3.9423036575317383 + }, + { + "auxiliary_loss_clip": 0.01008651, + "auxiliary_loss_mlp": 0.01005169, + "balance_loss_clip": 1.00286806, + "balance_loss_mlp": 1.00181448, + "epoch": 0.7176912670975499, + "flos": 65512385481600.0, + "grad_norm": 0.7631400997413118, + "language_loss": 0.55964816, + "learning_rate": 7.79248245675082e-07, + "loss": 0.5797863, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.06835938, + "step": 11937, + "time_per_iteration": 3.0053932666778564 + }, + { + "auxiliary_loss_clip": 0.01055644, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.01528335, + "balance_loss_mlp": 1.01726091, + "epoch": 0.717751390350218, + "flos": 31279838866560.0, + "grad_norm": 1.9372424019651522, + "language_loss": 0.56104833, + "learning_rate": 7.789397715835542e-07, + "loss": 0.58201617, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 11938, + "time_per_iteration": 2.430328607559204 + }, + { + "auxiliary_loss_clip": 0.01050119, + "auxiliary_loss_mlp": 0.01035489, + "balance_loss_clip": 1.01125336, + "balance_loss_mlp": 1.01508152, + "epoch": 0.7178115136028859, + "flos": 19858177866240.0, + "grad_norm": 1.7071248202052391, + "language_loss": 0.77493095, + "learning_rate": 7.786313437947527e-07, + "loss": 0.79578704, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3515625, + "step": 11939, + "time_per_iteration": 2.4006354808807373 + }, + { + "auxiliary_loss_clip": 0.01008042, + "auxiliary_loss_mlp": 0.01002698, + "balance_loss_clip": 0.99995577, + "balance_loss_mlp": 1.00101674, + "epoch": 0.7178716368555539, + "flos": 64345169174400.0, + "grad_norm": 0.7609847851267416, + "language_loss": 0.61511916, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63522655, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.0703125, + "step": 11940, + "time_per_iteration": 2.9766199588775635 + }, + { + "auxiliary_loss_clip": 0.01052848, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.01154697, + "balance_loss_mlp": 1.01676607, + "epoch": 0.7179317601082219, + "flos": 26761344330240.0, + "grad_norm": 1.6843430054340716, + "language_loss": 0.59733325, + "learning_rate": 7.780146271721097e-07, + "loss": 0.6182071, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 11941, + "time_per_iteration": 2.4296348094940186 + }, + { + "auxiliary_loss_clip": 0.0105297, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.01510167, + "balance_loss_mlp": 1.01689816, + "epoch": 0.7179918833608898, + "flos": 23512676186880.0, + "grad_norm": 1.960921552566401, + "language_loss": 0.80910772, + "learning_rate": 7.777063383616543e-07, + "loss": 0.83000767, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 11942, + "time_per_iteration": 2.367962598800659 + }, + { + "auxiliary_loss_clip": 0.0105315, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_clip": 1.01653123, + "balance_loss_mlp": 1.01578879, + "epoch": 0.7180520066135578, + "flos": 17164628340480.0, + "grad_norm": 2.0463187170356676, + "language_loss": 0.67257977, + "learning_rate": 7.773980959006968e-07, + "loss": 0.69351941, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 11943, + "time_per_iteration": 2.3607850074768066 + }, + { + "auxiliary_loss_clip": 0.01051207, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.01262546, + "balance_loss_mlp": 1.01574993, + "epoch": 0.7181121298662257, + "flos": 17565675661440.0, + "grad_norm": 1.8801878690085914, + "language_loss": 0.79583013, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81669718, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 11944, + "time_per_iteration": 2.3384201526641846 + }, + { + "auxiliary_loss_clip": 0.01054514, + "auxiliary_loss_mlp": 0.01042517, + "balance_loss_clip": 1.01303625, + "balance_loss_mlp": 1.01615167, + "epoch": 0.7181722531188938, + "flos": 11946858716160.0, + "grad_norm": 2.279536159072007, + "language_loss": 0.64352202, + "learning_rate": 7.767817500740277e-07, + "loss": 0.66449237, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.38476562, + "step": 11945, + "time_per_iteration": 2.3585660457611084 + }, + { + "auxiliary_loss_clip": 0.01008851, + "auxiliary_loss_mlp": 0.01002866, + "balance_loss_clip": 1.00037479, + "balance_loss_mlp": 1.00162268, + "epoch": 0.7182323763715617, + "flos": 65500480707840.0, + "grad_norm": 0.7008793365898902, + "language_loss": 0.51095617, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53107333, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.07226562, + "step": 11946, + "time_per_iteration": 2.9282047748565674 + }, + { + "auxiliary_loss_clip": 0.01055756, + "auxiliary_loss_mlp": 0.01043653, + "balance_loss_clip": 1.01549625, + "balance_loss_mlp": 1.01719546, + "epoch": 0.7182924996242297, + "flos": 20629897758720.0, + "grad_norm": 1.8962867946838917, + "language_loss": 0.76085496, + "learning_rate": 7.761655897855925e-07, + "loss": 0.78184909, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.38476562, + "step": 11947, + "time_per_iteration": 2.3767342567443848 + }, + { + "auxiliary_loss_clip": 0.01051932, + "auxiliary_loss_mlp": 0.01037549, + "balance_loss_clip": 1.01323032, + "balance_loss_mlp": 1.0155338, + "epoch": 0.7183526228768976, + "flos": 16215514496640.0, + "grad_norm": 1.5446156454343203, + "language_loss": 0.73799312, + "learning_rate": 7.758575792474187e-07, + "loss": 0.75888795, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 11948, + "time_per_iteration": 2.3336637020111084 + }, + { + "auxiliary_loss_clip": 0.01055742, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.01320577, + "balance_loss_mlp": 1.01735556, + "epoch": 0.7184127461295656, + "flos": 22231678158720.0, + "grad_norm": 1.678960312271913, + "language_loss": 0.72420967, + "learning_rate": 7.755496151288483e-07, + "loss": 0.7451663, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3828125, + "step": 11949, + "time_per_iteration": 2.3958375453948975 + }, + { + "auxiliary_loss_clip": 0.01052726, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.01843429, + "balance_loss_mlp": 1.01621652, + "epoch": 0.7184728693822335, + "flos": 27343276738560.0, + "grad_norm": 2.1414179747306488, + "language_loss": 0.77705991, + "learning_rate": 7.752416974415598e-07, + "loss": 0.79801267, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 11950, + "time_per_iteration": 2.4081759452819824 + }, + { + "auxiliary_loss_clip": 0.01057036, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.01516485, + "balance_loss_mlp": 1.01839602, + "epoch": 0.7185329926349016, + "flos": 16507597864320.0, + "grad_norm": 2.343878592511926, + "language_loss": 0.68770343, + "learning_rate": 7.749338261972282e-07, + "loss": 0.70868909, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38671875, + "step": 11951, + "time_per_iteration": 2.492427349090576 + }, + { + "auxiliary_loss_clip": 0.01056194, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.01003528, + "balance_loss_mlp": 1.01699662, + "epoch": 0.7185931158875695, + "flos": 23949928454400.0, + "grad_norm": 1.9195771849374768, + "language_loss": 0.79453957, + "learning_rate": 7.746260014075286e-07, + "loss": 0.81547183, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 11952, + "time_per_iteration": 2.375363826751709 + }, + { + "auxiliary_loss_clip": 0.01053966, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.01340997, + "balance_loss_mlp": 1.0162493, + "epoch": 0.7186532391402375, + "flos": 26540798071680.0, + "grad_norm": 3.626053853604953, + "language_loss": 0.752666, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77358234, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37890625, + "step": 11953, + "time_per_iteration": 2.46132755279541 + }, + { + "auxiliary_loss_clip": 0.01054177, + "auxiliary_loss_mlp": 0.01041729, + "balance_loss_clip": 1.01605129, + "balance_loss_mlp": 1.01633668, + "epoch": 0.7187133623929055, + "flos": 22381944117120.0, + "grad_norm": 1.8775416390822386, + "language_loss": 0.74702042, + "learning_rate": 7.740104912387164e-07, + "loss": 0.7679795, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 11954, + "time_per_iteration": 2.3528268337249756 + }, + { + "auxiliary_loss_clip": 0.01055006, + "auxiliary_loss_mlp": 0.01040261, + "balance_loss_clip": 1.01331949, + "balance_loss_mlp": 1.0178113, + "epoch": 0.7187734856455734, + "flos": 15778646254080.0, + "grad_norm": 1.7618427741766383, + "language_loss": 0.75681221, + "learning_rate": 7.737028058829425e-07, + "loss": 0.77776492, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37109375, + "step": 11955, + "time_per_iteration": 2.3788928985595703 + }, + { + "auxiliary_loss_clip": 0.01053755, + "auxiliary_loss_mlp": 0.01041215, + "balance_loss_clip": 1.01796973, + "balance_loss_mlp": 1.01755023, + "epoch": 0.7188336088982414, + "flos": 31758253493760.0, + "grad_norm": 1.726781872740489, + "language_loss": 0.74300635, + "learning_rate": 7.733951670284817e-07, + "loss": 0.76395607, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 11956, + "time_per_iteration": 2.4412827491760254 + }, + { + "auxiliary_loss_clip": 0.01054026, + "auxiliary_loss_mlp": 0.01038707, + "balance_loss_clip": 1.01349461, + "balance_loss_mlp": 1.01632929, + "epoch": 0.7188937321509093, + "flos": 21464287274880.0, + "grad_norm": 2.2849262260537833, + "language_loss": 0.72866446, + "learning_rate": 7.730875746869987e-07, + "loss": 0.74959177, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 11957, + "time_per_iteration": 2.4339489936828613 + }, + { + "auxiliary_loss_clip": 0.01055075, + "auxiliary_loss_mlp": 0.01047287, + "balance_loss_clip": 1.0209533, + "balance_loss_mlp": 1.01715493, + "epoch": 0.7189538554035774, + "flos": 27270273352320.0, + "grad_norm": 1.7924875861589642, + "language_loss": 0.74425012, + "learning_rate": 7.727800288701582e-07, + "loss": 0.76527369, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 11958, + "time_per_iteration": 2.409494161605835 + }, + { + "auxiliary_loss_clip": 0.01052235, + "auxiliary_loss_mlp": 0.01037316, + "balance_loss_clip": 1.01391542, + "balance_loss_mlp": 1.01659739, + "epoch": 0.7190139786562453, + "flos": 21579535272960.0, + "grad_norm": 1.5841216251836978, + "language_loss": 0.8494823, + "learning_rate": 7.724725295896215e-07, + "loss": 0.87037778, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 11959, + "time_per_iteration": 2.4106082916259766 + }, + { + "auxiliary_loss_clip": 0.01056798, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.01838756, + "balance_loss_mlp": 1.01894557, + "epoch": 0.7190741019089133, + "flos": 26720112147840.0, + "grad_norm": 1.69250507523006, + "language_loss": 0.82856858, + "learning_rate": 7.7216507685705e-07, + "loss": 0.8495813, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 11960, + "time_per_iteration": 3.747187852859497 + }, + { + "auxiliary_loss_clip": 0.01051647, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.02047944, + "balance_loss_mlp": 1.01622295, + "epoch": 0.7191342251615812, + "flos": 26103545804160.0, + "grad_norm": 1.5593598496783287, + "language_loss": 0.78778815, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80874729, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 11961, + "time_per_iteration": 2.440626859664917 + }, + { + "auxiliary_loss_clip": 0.0105026, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.01467264, + "balance_loss_mlp": 1.01596355, + "epoch": 0.7191943484142492, + "flos": 22965901384320.0, + "grad_norm": 1.382497935960697, + "language_loss": 0.75895411, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77982074, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 11962, + "time_per_iteration": 2.371434211730957 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.01343608, + "balance_loss_mlp": 1.01693869, + "epoch": 0.7192544716669171, + "flos": 22564225658880.0, + "grad_norm": 1.764319293597679, + "language_loss": 0.76312578, + "learning_rate": 7.712429980637001e-07, + "loss": 0.78405821, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 11963, + "time_per_iteration": 3.90022611618042 + }, + { + "auxiliary_loss_clip": 0.01056197, + "auxiliary_loss_mlp": 0.01040355, + "balance_loss_clip": 1.0138427, + "balance_loss_mlp": 1.01673746, + "epoch": 0.7193145949195852, + "flos": 18981404092800.0, + "grad_norm": 2.3611239439004774, + "language_loss": 0.81766492, + "learning_rate": 7.709357316395564e-07, + "loss": 0.83863044, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39453125, + "step": 11964, + "time_per_iteration": 2.4728188514709473 + }, + { + "auxiliary_loss_clip": 0.01052446, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.01552105, + "balance_loss_mlp": 1.01565218, + "epoch": 0.7193747181722531, + "flos": 18003277042560.0, + "grad_norm": 1.6677009931322888, + "language_loss": 0.75482452, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77574694, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 11965, + "time_per_iteration": 3.7361624240875244 + }, + { + "auxiliary_loss_clip": 0.01055186, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.01443172, + "balance_loss_mlp": 1.01737046, + "epoch": 0.7194348414249211, + "flos": 24388262974080.0, + "grad_norm": 1.5252076593587447, + "language_loss": 0.78589404, + "learning_rate": 7.703213386216377e-07, + "loss": 0.80685568, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 11966, + "time_per_iteration": 2.397975444793701 + }, + { + "auxiliary_loss_clip": 0.01053776, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.01360869, + "balance_loss_mlp": 1.016222, + "epoch": 0.7194949646775891, + "flos": 22162375376640.0, + "grad_norm": 1.8119571516834474, + "language_loss": 0.74561691, + "learning_rate": 7.700142120511619e-07, + "loss": 0.76652265, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.375, + "step": 11967, + "time_per_iteration": 2.3485090732574463 + }, + { + "auxiliary_loss_clip": 0.01052082, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.01447225, + "balance_loss_mlp": 1.01785517, + "epoch": 0.719555087930257, + "flos": 20265334675200.0, + "grad_norm": 1.6531668939611108, + "language_loss": 0.82585615, + "learning_rate": 7.6970713212187e-07, + "loss": 0.8467291, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 11968, + "time_per_iteration": 2.387608766555786 + }, + { + "auxiliary_loss_clip": 0.01053262, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.01697338, + "balance_loss_mlp": 1.01646733, + "epoch": 0.719615211182925, + "flos": 24715189745280.0, + "grad_norm": 2.155043888478388, + "language_loss": 0.77264768, + "learning_rate": 7.69400098845407e-07, + "loss": 0.79358643, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 11969, + "time_per_iteration": 2.3975765705108643 + }, + { + "auxiliary_loss_clip": 0.01052175, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.01430798, + "balance_loss_mlp": 1.01447511, + "epoch": 0.719675334435593, + "flos": 20008653292800.0, + "grad_norm": 1.6230802807348554, + "language_loss": 0.71818924, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73910666, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 11970, + "time_per_iteration": 2.4073703289031982 + }, + { + "auxiliary_loss_clip": 0.01007844, + "auxiliary_loss_mlp": 0.01003451, + "balance_loss_clip": 1.00092411, + "balance_loss_mlp": 1.00079215, + "epoch": 0.719735457688261, + "flos": 44197177178880.0, + "grad_norm": 0.9100888829564044, + "language_loss": 0.60965145, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62976444, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.0703125, + "step": 11971, + "time_per_iteration": 2.8934988975524902 + }, + { + "auxiliary_loss_clip": 0.01057041, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.01297295, + "balance_loss_mlp": 1.01699507, + "epoch": 0.7197955809409289, + "flos": 16801880647680.0, + "grad_norm": 2.0756426613618113, + "language_loss": 0.81305552, + "learning_rate": 7.684792790494105e-07, + "loss": 0.83400792, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.40039062, + "step": 11972, + "time_per_iteration": 2.3897409439086914 + }, + { + "auxiliary_loss_clip": 0.01055336, + "auxiliary_loss_mlp": 0.01046162, + "balance_loss_clip": 1.01912487, + "balance_loss_mlp": 1.01685607, + "epoch": 0.7198557041935969, + "flos": 24534234835200.0, + "grad_norm": 1.5150031038827514, + "language_loss": 0.76514536, + "learning_rate": 7.681724325006733e-07, + "loss": 0.78616035, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38476562, + "step": 11973, + "time_per_iteration": 2.3833138942718506 + }, + { + "auxiliary_loss_clip": 0.01008025, + "auxiliary_loss_mlp": 0.01003921, + "balance_loss_clip": 1.00150084, + "balance_loss_mlp": 1.001302, + "epoch": 0.7199158274462648, + "flos": 70707811835520.0, + "grad_norm": 0.8540206319147171, + "language_loss": 0.57356817, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59368765, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.06738281, + "step": 11974, + "time_per_iteration": 2.9001505374908447 + }, + { + "auxiliary_loss_clip": 0.01054308, + "auxiliary_loss_mlp": 0.01042922, + "balance_loss_clip": 1.01654077, + "balance_loss_mlp": 1.01555538, + "epoch": 0.7199759506989328, + "flos": 29346802686720.0, + "grad_norm": 2.125579031994053, + "language_loss": 0.62005568, + "learning_rate": 7.675588795479062e-07, + "loss": 0.64102793, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38671875, + "step": 11975, + "time_per_iteration": 2.421797513961792 + }, + { + "auxiliary_loss_clip": 0.01052022, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.01262236, + "balance_loss_mlp": 1.01585555, + "epoch": 0.7200360739516007, + "flos": 24639428361600.0, + "grad_norm": 2.4174172413754693, + "language_loss": 0.6866554, + "learning_rate": 7.672521731671425e-07, + "loss": 0.70753539, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 11976, + "time_per_iteration": 3.8162434101104736 + }, + { + "auxiliary_loss_clip": 0.01053742, + "auxiliary_loss_mlp": 0.01040541, + "balance_loss_clip": 1.01758146, + "balance_loss_mlp": 1.0160892, + "epoch": 0.7200961972042688, + "flos": 20811830186880.0, + "grad_norm": 2.210778157598444, + "language_loss": 0.68678719, + "learning_rate": 7.669455135323004e-07, + "loss": 0.70773005, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.375, + "step": 11977, + "time_per_iteration": 2.3939740657806396 + }, + { + "auxiliary_loss_clip": 0.01053193, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.01636636, + "balance_loss_mlp": 1.01617217, + "epoch": 0.7201563204569367, + "flos": 31244646349440.0, + "grad_norm": 1.8588845561870584, + "language_loss": 0.75969887, + "learning_rate": 7.666389006550074e-07, + "loss": 0.7806645, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37109375, + "step": 11978, + "time_per_iteration": 2.477902412414551 + }, + { + "auxiliary_loss_clip": 0.01051056, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.01299846, + "balance_loss_mlp": 1.01435697, + "epoch": 0.7202164437096047, + "flos": 26650180961280.0, + "grad_norm": 1.9113385895577413, + "language_loss": 0.79744542, + "learning_rate": 7.663323345468908e-07, + "loss": 0.8183217, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 11979, + "time_per_iteration": 2.4896886348724365 + }, + { + "auxiliary_loss_clip": 0.01054275, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.01587892, + "balance_loss_mlp": 1.01671791, + "epoch": 0.7202765669622727, + "flos": 25958376904320.0, + "grad_norm": 1.6634359015684512, + "language_loss": 0.65837288, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67934155, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.375, + "step": 11980, + "time_per_iteration": 2.4429006576538086 + }, + { + "auxiliary_loss_clip": 0.01054771, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.01595712, + "balance_loss_mlp": 1.01721275, + "epoch": 0.7203366902149406, + "flos": 28511086538880.0, + "grad_norm": 1.9854119615832166, + "language_loss": 0.68820059, + "learning_rate": 7.657193426846871e-07, + "loss": 0.7091651, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 11981, + "time_per_iteration": 2.41070556640625 + }, + { + "auxiliary_loss_clip": 0.01053209, + "auxiliary_loss_mlp": 0.01047123, + "balance_loss_clip": 1.02114677, + "balance_loss_mlp": 1.01596904, + "epoch": 0.7203968134676086, + "flos": 21104960895360.0, + "grad_norm": 2.4722264333092867, + "language_loss": 0.74566853, + "learning_rate": 7.65412916953843e-07, + "loss": 0.76667184, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37304688, + "step": 11982, + "time_per_iteration": 2.381371021270752 + }, + { + "auxiliary_loss_clip": 0.01052812, + "auxiliary_loss_mlp": 0.01046305, + "balance_loss_clip": 1.02165282, + "balance_loss_mlp": 1.01553047, + "epoch": 0.7204569367202766, + "flos": 18331181331840.0, + "grad_norm": 1.9963513575188736, + "language_loss": 0.67086816, + "learning_rate": 7.65106538038665e-07, + "loss": 0.69185936, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 11983, + "time_per_iteration": 2.33769154548645 + }, + { + "auxiliary_loss_clip": 0.01055446, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.01467073, + "balance_loss_mlp": 1.01750576, + "epoch": 0.7205170599729446, + "flos": 23254074679680.0, + "grad_norm": 1.4850932009149553, + "language_loss": 0.67922169, + "learning_rate": 7.648002059507715e-07, + "loss": 0.70019346, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 11984, + "time_per_iteration": 2.391268730163574 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.01040217, + "balance_loss_clip": 1.01370478, + "balance_loss_mlp": 1.01747608, + "epoch": 0.7205771832256125, + "flos": 20119851573120.0, + "grad_norm": 1.7783176491211077, + "language_loss": 0.75603855, + "learning_rate": 7.644939207017771e-07, + "loss": 0.77700585, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 11985, + "time_per_iteration": 2.350484848022461 + }, + { + "auxiliary_loss_clip": 0.01053412, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.01619554, + "balance_loss_mlp": 1.01700389, + "epoch": 0.7206373064782805, + "flos": 27702184181760.0, + "grad_norm": 1.9314544199988615, + "language_loss": 0.64037144, + "learning_rate": 7.641876823032977e-07, + "loss": 0.66130078, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 11986, + "time_per_iteration": 2.415400505065918 + }, + { + "auxiliary_loss_clip": 0.01053944, + "auxiliary_loss_mlp": 0.01043518, + "balance_loss_clip": 1.01583731, + "balance_loss_mlp": 1.01654482, + "epoch": 0.7206974297309484, + "flos": 17967176830080.0, + "grad_norm": 2.7867265608282703, + "language_loss": 0.73893905, + "learning_rate": 7.638814907669455e-07, + "loss": 0.75991368, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.37304688, + "step": 11987, + "time_per_iteration": 2.346979856491089 + }, + { + "auxiliary_loss_clip": 0.01055945, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.01463795, + "balance_loss_mlp": 1.01687241, + "epoch": 0.7207575529836164, + "flos": 16982207153280.0, + "grad_norm": 1.7892883883651076, + "language_loss": 0.78826511, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80923855, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.390625, + "step": 11988, + "time_per_iteration": 2.345564126968384 + }, + { + "auxiliary_loss_clip": 0.01053237, + "auxiliary_loss_mlp": 0.01042282, + "balance_loss_clip": 1.01807094, + "balance_loss_mlp": 1.01670885, + "epoch": 0.7208176762362843, + "flos": 18726782480640.0, + "grad_norm": 1.7601927296909525, + "language_loss": 0.79185599, + "learning_rate": 7.632692483270618e-07, + "loss": 0.8128112, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11989, + "time_per_iteration": 2.3650481700897217 + }, + { + "auxiliary_loss_clip": 0.01053034, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.01414084, + "balance_loss_mlp": 1.01606929, + "epoch": 0.7208777994889524, + "flos": 18733485461760.0, + "grad_norm": 2.3412785312890843, + "language_loss": 0.84015334, + "learning_rate": 7.629631974467481e-07, + "loss": 0.86105251, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36914062, + "step": 11990, + "time_per_iteration": 2.375626802444458 + }, + { + "auxiliary_loss_clip": 0.01053748, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.01437366, + "balance_loss_mlp": 1.0160197, + "epoch": 0.7209379227416203, + "flos": 14792559413760.0, + "grad_norm": 2.0479073334589772, + "language_loss": 0.77249652, + "learning_rate": 7.626571934749931e-07, + "loss": 0.79342937, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 11991, + "time_per_iteration": 2.3367466926574707 + }, + { + "auxiliary_loss_clip": 0.01052155, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.01067567, + "balance_loss_mlp": 1.01648283, + "epoch": 0.7209980459942883, + "flos": 29635744032000.0, + "grad_norm": 1.5168038881090715, + "language_loss": 0.73637944, + "learning_rate": 7.623512364234022e-07, + "loss": 0.75724429, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 11992, + "time_per_iteration": 2.4698843955993652 + }, + { + "auxiliary_loss_clip": 0.01054092, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.00907516, + "balance_loss_mlp": 1.01606166, + "epoch": 0.7210581692469563, + "flos": 23476052304000.0, + "grad_norm": 1.7453684045481757, + "language_loss": 0.67305815, + "learning_rate": 7.620453263035755e-07, + "loss": 0.69394898, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 11993, + "time_per_iteration": 2.3863577842712402 + }, + { + "auxiliary_loss_clip": 0.01052155, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.01355302, + "balance_loss_mlp": 1.01565933, + "epoch": 0.7211182924996242, + "flos": 26098762947840.0, + "grad_norm": 2.1289505227203724, + "language_loss": 0.66740471, + "learning_rate": 7.61739463127115e-07, + "loss": 0.68830401, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 11994, + "time_per_iteration": 2.4278411865234375 + }, + { + "auxiliary_loss_clip": 0.01053927, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.01388288, + "balance_loss_mlp": 1.01636243, + "epoch": 0.7211784157522922, + "flos": 17711123852160.0, + "grad_norm": 1.6785060013619115, + "language_loss": 0.6801253, + "learning_rate": 7.614336469056172e-07, + "loss": 0.70106852, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.375, + "step": 11995, + "time_per_iteration": 2.3310282230377197 + }, + { + "auxiliary_loss_clip": 0.0105214, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.00977635, + "balance_loss_mlp": 1.0164367, + "epoch": 0.7212385390049602, + "flos": 24422547795840.0, + "grad_norm": 1.849895504389214, + "language_loss": 0.80675745, + "learning_rate": 7.6112787765068e-07, + "loss": 0.82762218, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 11996, + "time_per_iteration": 2.4340925216674805 + }, + { + "auxiliary_loss_clip": 0.01053251, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.01421046, + "balance_loss_mlp": 1.01620626, + "epoch": 0.7212986622576282, + "flos": 28145999784960.0, + "grad_norm": 2.0188149231410497, + "language_loss": 0.82528478, + "learning_rate": 7.60822155373899e-07, + "loss": 0.84619218, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 11997, + "time_per_iteration": 2.435441493988037 + }, + { + "auxiliary_loss_clip": 0.01054402, + "auxiliary_loss_mlp": 0.01038476, + "balance_loss_clip": 1.01390719, + "balance_loss_mlp": 1.01642203, + "epoch": 0.7213587855102961, + "flos": 21834680555520.0, + "grad_norm": 1.8995033626514313, + "language_loss": 0.67921126, + "learning_rate": 7.605164800868646e-07, + "loss": 0.70014, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38085938, + "step": 11998, + "time_per_iteration": 2.4133143424987793 + }, + { + "auxiliary_loss_clip": 0.0105368, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.0162369, + "balance_loss_mlp": 1.01738584, + "epoch": 0.7214189087629641, + "flos": 14610661896960.0, + "grad_norm": 1.842671374753031, + "language_loss": 0.73497361, + "learning_rate": 7.602108518011696e-07, + "loss": 0.75590634, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 11999, + "time_per_iteration": 3.6233370304107666 + }, + { + "auxiliary_loss_clip": 0.0105311, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.0126884, + "balance_loss_mlp": 1.01602614, + "epoch": 0.721479032015632, + "flos": 19389852622080.0, + "grad_norm": 1.974739078758876, + "language_loss": 0.83903158, + "learning_rate": 7.599052705284039e-07, + "loss": 0.8599298, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 12000, + "time_per_iteration": 2.3890182971954346 + }, + { + "auxiliary_loss_clip": 0.01055537, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.01370955, + "balance_loss_mlp": 1.01693618, + "epoch": 0.7215391552683, + "flos": 18511961685120.0, + "grad_norm": 1.8303983279488911, + "language_loss": 0.78496301, + "learning_rate": 7.59599736280154e-07, + "loss": 0.80590403, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38671875, + "step": 12001, + "time_per_iteration": 2.357947826385498 + }, + { + "auxiliary_loss_clip": 0.01055135, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.02021599, + "balance_loss_mlp": 1.01872945, + "epoch": 0.721599278520968, + "flos": 23257600727040.0, + "grad_norm": 1.636984280462141, + "language_loss": 0.82514274, + "learning_rate": 7.592942490680066e-07, + "loss": 0.84613597, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 12002, + "time_per_iteration": 2.410571575164795 + }, + { + "auxiliary_loss_clip": 0.01054656, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.01484299, + "balance_loss_mlp": 1.0169512, + "epoch": 0.721659401773636, + "flos": 39197581706880.0, + "grad_norm": 1.9145231271958676, + "language_loss": 0.63921863, + "learning_rate": 7.589888089035462e-07, + "loss": 0.6601544, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37695312, + "step": 12003, + "time_per_iteration": 4.020500659942627 + }, + { + "auxiliary_loss_clip": 0.0105317, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.0104847, + "balance_loss_mlp": 1.01572394, + "epoch": 0.7217195250263039, + "flos": 14939020033920.0, + "grad_norm": 2.666621813777241, + "language_loss": 0.70296913, + "learning_rate": 7.586834157983544e-07, + "loss": 0.72386205, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12004, + "time_per_iteration": 3.7463862895965576 + }, + { + "auxiliary_loss_clip": 0.01008355, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.00000763, + "balance_loss_mlp": 1.00151205, + "epoch": 0.7217796482789719, + "flos": 70865828115840.0, + "grad_norm": 0.8626593854858787, + "language_loss": 0.54214859, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56225848, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.06835938, + "step": 12005, + "time_per_iteration": 2.945441961288452 + }, + { + "auxiliary_loss_clip": 0.01055724, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.01109862, + "balance_loss_mlp": 1.01791537, + "epoch": 0.7218397715316398, + "flos": 37450004002560.0, + "grad_norm": 1.7419866791108258, + "language_loss": 0.64012468, + "learning_rate": 7.580727708120962e-07, + "loss": 0.66103697, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 12006, + "time_per_iteration": 2.4981517791748047 + }, + { + "auxiliary_loss_clip": 0.01053209, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.01623249, + "balance_loss_mlp": 1.01688647, + "epoch": 0.7218998947843078, + "flos": 22709569115520.0, + "grad_norm": 1.8355458316295166, + "language_loss": 0.92420161, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94512343, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 12007, + "time_per_iteration": 2.377774238586426 + }, + { + "auxiliary_loss_clip": 0.01055166, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.00931168, + "balance_loss_mlp": 1.0169332, + "epoch": 0.7219600180369758, + "flos": 12166357633920.0, + "grad_norm": 2.2889206398610122, + "language_loss": 0.65166932, + "learning_rate": 7.574623142018568e-07, + "loss": 0.67256749, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 12008, + "time_per_iteration": 2.3323090076446533 + }, + { + "auxiliary_loss_clip": 0.01055178, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.01423049, + "balance_loss_mlp": 1.01698947, + "epoch": 0.7220201412896438, + "flos": 22595612837760.0, + "grad_norm": 1.9651084271322778, + "language_loss": 0.79567116, + "learning_rate": 7.57157156566681e-07, + "loss": 0.81661701, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 12009, + "time_per_iteration": 2.388071060180664 + }, + { + "auxiliary_loss_clip": 0.0105455, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.0162673, + "balance_loss_mlp": 1.01692963, + "epoch": 0.7220802645423118, + "flos": 26717598529920.0, + "grad_norm": 1.7895329032554002, + "language_loss": 0.64878559, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66976702, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37695312, + "step": 12010, + "time_per_iteration": 2.4065637588500977 + }, + { + "auxiliary_loss_clip": 0.01052972, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_clip": 1.02036858, + "balance_loss_mlp": 1.01630187, + "epoch": 0.7221403877949797, + "flos": 24419545418880.0, + "grad_norm": 1.7121941797403988, + "language_loss": 0.78301489, + "learning_rate": 7.565469826940742e-07, + "loss": 0.80398697, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 12011, + "time_per_iteration": 2.411363363265991 + }, + { + "auxiliary_loss_clip": 0.01052393, + "auxiliary_loss_mlp": 0.01040889, + "balance_loss_clip": 1.01758349, + "balance_loss_mlp": 1.01586914, + "epoch": 0.7222005110476477, + "flos": 23513234768640.0, + "grad_norm": 1.6280293488370976, + "language_loss": 0.80165696, + "learning_rate": 7.56241966479781e-07, + "loss": 0.82258976, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 12012, + "time_per_iteration": 2.3806228637695312 + }, + { + "auxiliary_loss_clip": 0.01054144, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.01008153, + "balance_loss_mlp": 1.01778412, + "epoch": 0.7222606343003156, + "flos": 23111419397760.0, + "grad_norm": 2.322727075288452, + "language_loss": 0.77259326, + "learning_rate": 7.559369974289171e-07, + "loss": 0.79346609, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 12013, + "time_per_iteration": 2.431779623031616 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.01677752, + "balance_loss_mlp": 1.01620531, + "epoch": 0.7223207575529836, + "flos": 24350068080000.0, + "grad_norm": 1.6243945235067492, + "language_loss": 0.76412785, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78506446, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 12014, + "time_per_iteration": 2.4370534420013428 + }, + { + "auxiliary_loss_clip": 0.01053639, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.01395631, + "balance_loss_mlp": 1.01603842, + "epoch": 0.7223808808056515, + "flos": 28328909731200.0, + "grad_norm": 2.932188511865818, + "language_loss": 0.87701035, + "learning_rate": 7.553272008637346e-07, + "loss": 0.89793146, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 12015, + "time_per_iteration": 4.001673221588135 + }, + { + "auxiliary_loss_clip": 0.0105229, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.01151919, + "balance_loss_mlp": 1.01650369, + "epoch": 0.7224410040583196, + "flos": 21068371923840.0, + "grad_norm": 2.0983426712511055, + "language_loss": 0.79057992, + "learning_rate": 7.55022373372538e-07, + "loss": 0.81144547, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35742188, + "step": 12016, + "time_per_iteration": 2.39625883102417 + }, + { + "auxiliary_loss_clip": 0.01052752, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.01800108, + "balance_loss_mlp": 1.01690972, + "epoch": 0.7225011273109875, + "flos": 26794267608960.0, + "grad_norm": 3.848054357596503, + "language_loss": 0.78271389, + "learning_rate": 7.547175930910186e-07, + "loss": 0.80365467, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 12017, + "time_per_iteration": 2.4486989974975586 + }, + { + "auxiliary_loss_clip": 0.0105155, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.01429105, + "balance_loss_mlp": 1.01651216, + "epoch": 0.7225612505636555, + "flos": 23582537550720.0, + "grad_norm": 1.9479048549375, + "language_loss": 0.75946629, + "learning_rate": 7.54412860030732e-07, + "loss": 0.78033769, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 12018, + "time_per_iteration": 2.410247325897217 + }, + { + "auxiliary_loss_clip": 0.01053261, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.01514673, + "balance_loss_mlp": 1.0186224, + "epoch": 0.7226213738163234, + "flos": 20776567847040.0, + "grad_norm": 1.5867880252190933, + "language_loss": 0.78386313, + "learning_rate": 7.541081742032347e-07, + "loss": 0.80475771, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 12019, + "time_per_iteration": 2.3982675075531006 + }, + { + "auxiliary_loss_clip": 0.010524, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.01035392, + "balance_loss_mlp": 1.01588202, + "epoch": 0.7226814970689914, + "flos": 32634433774080.0, + "grad_norm": 1.7050633194903118, + "language_loss": 0.74920118, + "learning_rate": 7.53803535620081e-07, + "loss": 0.77007341, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 12020, + "time_per_iteration": 2.4524645805358887 + }, + { + "auxiliary_loss_clip": 0.01051661, + "auxiliary_loss_mlp": 0.01041722, + "balance_loss_clip": 1.01901269, + "balance_loss_mlp": 1.01506066, + "epoch": 0.7227416203216595, + "flos": 22453306669440.0, + "grad_norm": 1.5401606246048691, + "language_loss": 0.78320009, + "learning_rate": 7.534989442928219e-07, + "loss": 0.80413401, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36523438, + "step": 12021, + "time_per_iteration": 2.4439949989318848 + }, + { + "auxiliary_loss_clip": 0.01052973, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.01061916, + "balance_loss_mlp": 1.01673567, + "epoch": 0.7228017435743274, + "flos": 21651246938880.0, + "grad_norm": 1.5673201571299449, + "language_loss": 0.69007051, + "learning_rate": 7.531944002330073e-07, + "loss": 0.71093392, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 12022, + "time_per_iteration": 2.492393970489502 + }, + { + "auxiliary_loss_clip": 0.01052849, + "auxiliary_loss_mlp": 0.01038912, + "balance_loss_clip": 1.01448631, + "balance_loss_mlp": 1.01615286, + "epoch": 0.7228618668269954, + "flos": 29532191339520.0, + "grad_norm": 2.3299153418465335, + "language_loss": 0.70493263, + "learning_rate": 7.528899034521858e-07, + "loss": 0.72585022, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 12023, + "time_per_iteration": 2.446632146835327 + }, + { + "auxiliary_loss_clip": 0.01051175, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.01351511, + "balance_loss_mlp": 1.01496732, + "epoch": 0.7229219900796633, + "flos": 27452589805440.0, + "grad_norm": 1.690268852215826, + "language_loss": 0.71730083, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73816669, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 12024, + "time_per_iteration": 2.4376330375671387 + }, + { + "auxiliary_loss_clip": 0.01052177, + "auxiliary_loss_mlp": 0.01039692, + "balance_loss_clip": 1.01624322, + "balance_loss_mlp": 1.01618385, + "epoch": 0.7229821133323313, + "flos": 16288448060160.0, + "grad_norm": 2.3587374880666383, + "language_loss": 0.76484615, + "learning_rate": 7.522810517737089e-07, + "loss": 0.78576481, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 12025, + "time_per_iteration": 2.4284820556640625 + }, + { + "auxiliary_loss_clip": 0.01052077, + "auxiliary_loss_mlp": 0.01039752, + "balance_loss_clip": 1.01802063, + "balance_loss_mlp": 1.0168221, + "epoch": 0.7230422365849992, + "flos": 20411306536320.0, + "grad_norm": 2.01142172097412, + "language_loss": 0.77849817, + "learning_rate": 7.519766968991395e-07, + "loss": 0.79941648, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 12026, + "time_per_iteration": 2.400914430618286 + }, + { + "auxiliary_loss_clip": 0.01054422, + "auxiliary_loss_mlp": 0.01042357, + "balance_loss_clip": 1.01970685, + "balance_loss_mlp": 1.01720726, + "epoch": 0.7231023598376672, + "flos": 25592312632320.0, + "grad_norm": 1.8861964181346198, + "language_loss": 0.68669629, + "learning_rate": 7.516723893497388e-07, + "loss": 0.70766413, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 12027, + "time_per_iteration": 2.407539129257202 + }, + { + "auxiliary_loss_clip": 0.01055101, + "auxiliary_loss_mlp": 0.0104059, + "balance_loss_clip": 1.01615202, + "balance_loss_mlp": 1.0178839, + "epoch": 0.7231624830903352, + "flos": 25148601763200.0, + "grad_norm": 1.9354894703746335, + "language_loss": 0.7991432, + "learning_rate": 7.513681291370469e-07, + "loss": 0.82010007, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 12028, + "time_per_iteration": 2.389491558074951 + }, + { + "auxiliary_loss_clip": 0.01051141, + "auxiliary_loss_mlp": 0.01041536, + "balance_loss_clip": 1.01739585, + "balance_loss_mlp": 1.01520848, + "epoch": 0.7232226063430032, + "flos": 21724669261440.0, + "grad_norm": 1.6429415965799696, + "language_loss": 0.83192849, + "learning_rate": 7.510639162726e-07, + "loss": 0.85285527, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 12029, + "time_per_iteration": 2.381150245666504 + }, + { + "auxiliary_loss_clip": 0.01008374, + "auxiliary_loss_mlp": 0.01003163, + "balance_loss_clip": 1.00082672, + "balance_loss_mlp": 1.00148034, + "epoch": 0.7232827295956711, + "flos": 68433149335680.0, + "grad_norm": 0.8144714647787783, + "language_loss": 0.61797929, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63809466, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.06884766, + "step": 12030, + "time_per_iteration": 3.064298629760742 + }, + { + "auxiliary_loss_clip": 0.01051171, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.01093221, + "balance_loss_mlp": 1.016168, + "epoch": 0.7233428528483391, + "flos": 20191633061760.0, + "grad_norm": 1.6544791415979463, + "language_loss": 0.78783238, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80868942, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34960938, + "step": 12031, + "time_per_iteration": 2.3678088188171387 + }, + { + "auxiliary_loss_clip": 0.01054638, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.0129596, + "balance_loss_mlp": 1.01653934, + "epoch": 0.723402976101007, + "flos": 23948392354560.0, + "grad_norm": 4.407850369051631, + "language_loss": 0.82023871, + "learning_rate": 7.501515618840834e-07, + "loss": 0.84116113, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38085938, + "step": 12032, + "time_per_iteration": 2.374979257583618 + }, + { + "auxiliary_loss_clip": 0.01055255, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.01528454, + "balance_loss_mlp": 1.01707792, + "epoch": 0.723463099353675, + "flos": 20812353857280.0, + "grad_norm": 1.7516410225949417, + "language_loss": 0.76793724, + "learning_rate": 7.498475385279592e-07, + "loss": 0.78889215, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 12033, + "time_per_iteration": 2.4330575466156006 + }, + { + "auxiliary_loss_clip": 0.01049842, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.0140183, + "balance_loss_mlp": 1.01566434, + "epoch": 0.723523222606343, + "flos": 19097036115840.0, + "grad_norm": 1.6619912827899956, + "language_loss": 0.75903165, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77987909, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34179688, + "step": 12034, + "time_per_iteration": 2.353130578994751 + }, + { + "auxiliary_loss_clip": 0.01052011, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.01239514, + "balance_loss_mlp": 1.01620114, + "epoch": 0.723583345859011, + "flos": 26505745200000.0, + "grad_norm": 1.699265621845893, + "language_loss": 0.81980938, + "learning_rate": 7.492396340449578e-07, + "loss": 0.84067863, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 12035, + "time_per_iteration": 2.408832550048828 + }, + { + "auxiliary_loss_clip": 0.01054008, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.01291966, + "balance_loss_mlp": 1.01725364, + "epoch": 0.723643469111679, + "flos": 16032953664000.0, + "grad_norm": 1.9073542576646292, + "language_loss": 0.61547095, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63639033, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 12036, + "time_per_iteration": 2.36421799659729 + }, + { + "auxiliary_loss_clip": 0.01051088, + "auxiliary_loss_mlp": 0.01037081, + "balance_loss_clip": 1.01544464, + "balance_loss_mlp": 1.01565909, + "epoch": 0.7237035923643469, + "flos": 21944447470080.0, + "grad_norm": 1.8098894200382531, + "language_loss": 0.68601871, + "learning_rate": 7.486319192777883e-07, + "loss": 0.70690042, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35351562, + "step": 12037, + "time_per_iteration": 2.404141902923584 + }, + { + "auxiliary_loss_clip": 0.01051461, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.01511037, + "balance_loss_mlp": 1.01609731, + "epoch": 0.7237637156170149, + "flos": 23582083703040.0, + "grad_norm": 1.890443736986783, + "language_loss": 0.73114383, + "learning_rate": 7.483281330664479e-07, + "loss": 0.75204527, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 12038, + "time_per_iteration": 3.8177239894866943 + }, + { + "auxiliary_loss_clip": 0.01050505, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.01039207, + "balance_loss_mlp": 1.01510191, + "epoch": 0.7238238388696828, + "flos": 20593657900800.0, + "grad_norm": 2.4477891122618156, + "language_loss": 0.73285878, + "learning_rate": 7.480243943186293e-07, + "loss": 0.75372505, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.35351562, + "step": 12039, + "time_per_iteration": 2.351316213607788 + }, + { + "auxiliary_loss_clip": 0.01052435, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.01400399, + "balance_loss_mlp": 1.01587248, + "epoch": 0.7238839621223508, + "flos": 24205876698240.0, + "grad_norm": 1.8859031143739322, + "language_loss": 0.77911317, + "learning_rate": 7.477207030458513e-07, + "loss": 0.80000377, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 12040, + "time_per_iteration": 2.3983163833618164 + }, + { + "auxiliary_loss_clip": 0.01052242, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.01123357, + "balance_loss_mlp": 1.01634622, + "epoch": 0.7239440853750188, + "flos": 14208881437440.0, + "grad_norm": 1.558034112884099, + "language_loss": 0.77983773, + "learning_rate": 7.474170592596301e-07, + "loss": 0.80070698, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 12041, + "time_per_iteration": 2.3562402725219727 + }, + { + "auxiliary_loss_clip": 0.01053976, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.00881577, + "balance_loss_mlp": 1.0166285, + "epoch": 0.7240042086276868, + "flos": 21613785183360.0, + "grad_norm": 2.0168015194994986, + "language_loss": 0.65290904, + "learning_rate": 7.471134629714797e-07, + "loss": 0.67377234, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37304688, + "step": 12042, + "time_per_iteration": 2.3466567993164062 + }, + { + "auxiliary_loss_clip": 0.01053721, + "auxiliary_loss_mlp": 0.01035872, + "balance_loss_clip": 1.01131463, + "balance_loss_mlp": 1.01716328, + "epoch": 0.7240643318803547, + "flos": 23330324822400.0, + "grad_norm": 1.8497188374293847, + "language_loss": 0.85401845, + "learning_rate": 7.468099141929116e-07, + "loss": 0.87491441, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 12043, + "time_per_iteration": 3.7557036876678467 + }, + { + "auxiliary_loss_clip": 0.01053009, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.0160625, + "balance_loss_mlp": 1.01603484, + "epoch": 0.7241244551330227, + "flos": 24023699890560.0, + "grad_norm": 2.177223986283052, + "language_loss": 0.65252042, + "learning_rate": 7.465064129354379e-07, + "loss": 0.67346692, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36914062, + "step": 12044, + "time_per_iteration": 3.7909982204437256 + }, + { + "auxiliary_loss_clip": 0.0105349, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.01060438, + "balance_loss_mlp": 1.01671529, + "epoch": 0.7241845783856906, + "flos": 18729435744000.0, + "grad_norm": 1.462338527519926, + "language_loss": 0.82610637, + "learning_rate": 7.462029592105658e-07, + "loss": 0.84699732, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 12045, + "time_per_iteration": 2.353032350540161 + }, + { + "auxiliary_loss_clip": 0.01051003, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.01323605, + "balance_loss_mlp": 1.01565921, + "epoch": 0.7242447016383586, + "flos": 19497699411840.0, + "grad_norm": 1.6742718187972638, + "language_loss": 0.73049754, + "learning_rate": 7.458995530298034e-07, + "loss": 0.75136679, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 12046, + "time_per_iteration": 2.35311222076416 + }, + { + "auxiliary_loss_clip": 0.01051884, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.01372766, + "balance_loss_mlp": 1.01571834, + "epoch": 0.7243048248910267, + "flos": 22162410288000.0, + "grad_norm": 1.974345585291145, + "language_loss": 0.7199558, + "learning_rate": 7.455961944046553e-07, + "loss": 0.74084866, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 12047, + "time_per_iteration": 2.3511464595794678 + }, + { + "auxiliary_loss_clip": 0.0105474, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.01408625, + "balance_loss_mlp": 1.01648855, + "epoch": 0.7243649481436946, + "flos": 27671530141440.0, + "grad_norm": 1.846978142498591, + "language_loss": 0.71133107, + "learning_rate": 7.45292883346627e-07, + "loss": 0.73226953, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 12048, + "time_per_iteration": 2.400956153869629 + }, + { + "auxiliary_loss_clip": 0.01007113, + "auxiliary_loss_mlp": 0.01002323, + "balance_loss_clip": 0.99990326, + "balance_loss_mlp": 1.00076365, + "epoch": 0.7244250713963626, + "flos": 63241355629440.0, + "grad_norm": 0.8212147456630677, + "language_loss": 0.53803277, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55812716, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.06347656, + "step": 12049, + "time_per_iteration": 3.0113911628723145 + }, + { + "auxiliary_loss_clip": 0.01057285, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.01353455, + "balance_loss_mlp": 1.0181309, + "epoch": 0.7244851946490305, + "flos": 17966164400640.0, + "grad_norm": 2.911385660831529, + "language_loss": 0.61522996, + "learning_rate": 7.446864039779258e-07, + "loss": 0.6362111, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 12050, + "time_per_iteration": 2.3526384830474854 + }, + { + "auxiliary_loss_clip": 0.01007534, + "auxiliary_loss_mlp": 0.01004833, + "balance_loss_clip": 1.00246084, + "balance_loss_mlp": 1.00104117, + "epoch": 0.7245453179016985, + "flos": 70940227956480.0, + "grad_norm": 0.7191020026741499, + "language_loss": 0.53373086, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55385458, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.06494141, + "step": 12051, + "time_per_iteration": 3.0231308937072754 + }, + { + "auxiliary_loss_clip": 0.01051253, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.01866078, + "balance_loss_mlp": 1.01620173, + "epoch": 0.7246054411543664, + "flos": 24567402493440.0, + "grad_norm": 1.5003777698814518, + "language_loss": 0.73075581, + "learning_rate": 7.440801150156927e-07, + "loss": 0.75167143, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34960938, + "step": 12052, + "time_per_iteration": 2.3925609588623047 + }, + { + "auxiliary_loss_clip": 0.01054335, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_clip": 1.01529896, + "balance_loss_mlp": 1.01698434, + "epoch": 0.7246655644070344, + "flos": 32337078791040.0, + "grad_norm": 1.6780682577225594, + "language_loss": 0.75192642, + "learning_rate": 7.437770419657415e-07, + "loss": 0.77289736, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.37304688, + "step": 12053, + "time_per_iteration": 2.436213493347168 + }, + { + "auxiliary_loss_clip": 0.01052927, + "auxiliary_loss_mlp": 0.01039567, + "balance_loss_clip": 1.01440167, + "balance_loss_mlp": 1.01590657, + "epoch": 0.7247256876597024, + "flos": 21871374261120.0, + "grad_norm": 1.755486221385532, + "language_loss": 0.79320729, + "learning_rate": 7.434740165518898e-07, + "loss": 0.81413215, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 12054, + "time_per_iteration": 2.3751819133758545 + }, + { + "auxiliary_loss_clip": 0.0105379, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.0150032, + "balance_loss_mlp": 1.01722789, + "epoch": 0.7247858109123704, + "flos": 16212267740160.0, + "grad_norm": 2.4282352239537675, + "language_loss": 0.69258434, + "learning_rate": 7.431710387856301e-07, + "loss": 0.71350807, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 12055, + "time_per_iteration": 3.735994577407837 + }, + { + "auxiliary_loss_clip": 0.01050421, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.0189321, + "balance_loss_mlp": 1.01511645, + "epoch": 0.7248459341650383, + "flos": 20849641056000.0, + "grad_norm": 1.6951718835499987, + "language_loss": 0.74938786, + "learning_rate": 7.428681086784496e-07, + "loss": 0.7702924, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35351562, + "step": 12056, + "time_per_iteration": 2.35951828956604 + }, + { + "auxiliary_loss_clip": 0.01050195, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.00975847, + "balance_loss_mlp": 1.01590943, + "epoch": 0.7249060574177063, + "flos": 25920600946560.0, + "grad_norm": 1.525419390507607, + "language_loss": 0.71692538, + "learning_rate": 7.425652262418368e-07, + "loss": 0.7377485, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 12057, + "time_per_iteration": 2.4205920696258545 + }, + { + "auxiliary_loss_clip": 0.01055102, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.01794195, + "balance_loss_mlp": 1.01772594, + "epoch": 0.7249661806703742, + "flos": 17344640643840.0, + "grad_norm": 3.4693858979660726, + "language_loss": 0.63440645, + "learning_rate": 7.42262391487277e-07, + "loss": 0.65537804, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37304688, + "step": 12058, + "time_per_iteration": 2.339470624923706 + }, + { + "auxiliary_loss_clip": 0.01053164, + "auxiliary_loss_mlp": 0.01038953, + "balance_loss_clip": 1.01441979, + "balance_loss_mlp": 1.01689327, + "epoch": 0.7250263039230422, + "flos": 19573111681920.0, + "grad_norm": 2.0216155927510164, + "language_loss": 0.75426292, + "learning_rate": 7.419596044262535e-07, + "loss": 0.77518409, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36328125, + "step": 12059, + "time_per_iteration": 2.3768556118011475 + }, + { + "auxiliary_loss_clip": 0.0105234, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.01770377, + "balance_loss_mlp": 1.01603913, + "epoch": 0.7250864271757103, + "flos": 21975695003520.0, + "grad_norm": 1.7090598811318478, + "language_loss": 0.80122244, + "learning_rate": 7.416568650702472e-07, + "loss": 0.82214737, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36328125, + "step": 12060, + "time_per_iteration": 2.3914859294891357 + }, + { + "auxiliary_loss_clip": 0.01052405, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.00937223, + "balance_loss_mlp": 1.01659799, + "epoch": 0.7251465504283782, + "flos": 25011357742080.0, + "grad_norm": 4.592777554610926, + "language_loss": 0.77673817, + "learning_rate": 7.413541734307393e-07, + "loss": 0.797607, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35742188, + "step": 12061, + "time_per_iteration": 2.460577964782715 + }, + { + "auxiliary_loss_clip": 0.01050061, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.01181948, + "balance_loss_mlp": 1.01613593, + "epoch": 0.7252066736810462, + "flos": 16689216090240.0, + "grad_norm": 1.6195488097620234, + "language_loss": 0.81911254, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83993995, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 12062, + "time_per_iteration": 2.330665349960327 + }, + { + "auxiliary_loss_clip": 0.01056575, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.01455832, + "balance_loss_mlp": 1.01818371, + "epoch": 0.7252667969337141, + "flos": 25701835167360.0, + "grad_norm": 2.481886655950343, + "language_loss": 0.71220726, + "learning_rate": 7.407489333471262e-07, + "loss": 0.73318404, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38476562, + "step": 12063, + "time_per_iteration": 2.473924398422241 + }, + { + "auxiliary_loss_clip": 0.0105087, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.01116586, + "balance_loss_mlp": 1.01570535, + "epoch": 0.7253269201863821, + "flos": 18258945995520.0, + "grad_norm": 1.514710653909478, + "language_loss": 0.71150589, + "learning_rate": 7.40446384925973e-07, + "loss": 0.73235166, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 12064, + "time_per_iteration": 2.366633653640747 + }, + { + "auxiliary_loss_clip": 0.0105461, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.01112604, + "balance_loss_mlp": 1.01791215, + "epoch": 0.72538704343905, + "flos": 20410782865920.0, + "grad_norm": 1.904574771201914, + "language_loss": 0.91760802, + "learning_rate": 7.401438842672192e-07, + "loss": 0.93849772, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 12065, + "time_per_iteration": 2.3930938243865967 + }, + { + "auxiliary_loss_clip": 0.01007736, + "auxiliary_loss_mlp": 0.01003843, + "balance_loss_clip": 1.00126839, + "balance_loss_mlp": 1.00110173, + "epoch": 0.725447166691718, + "flos": 70147524470400.0, + "grad_norm": 0.7266824611170254, + "language_loss": 0.56078953, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58090538, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.06640625, + "step": 12066, + "time_per_iteration": 3.1739885807037354 + }, + { + "auxiliary_loss_clip": 0.01051299, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.01619959, + "balance_loss_mlp": 1.01545858, + "epoch": 0.725507289944386, + "flos": 27051123548160.0, + "grad_norm": 1.9210499970111274, + "language_loss": 0.77645183, + "learning_rate": 7.395390262827897e-07, + "loss": 0.79735267, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 12067, + "time_per_iteration": 2.402179002761841 + }, + { + "auxiliary_loss_clip": 0.01007953, + "auxiliary_loss_mlp": 0.01004614, + "balance_loss_clip": 1.00199163, + "balance_loss_mlp": 1.00123119, + "epoch": 0.725567413197054, + "flos": 62918583310080.0, + "grad_norm": 0.7293650450051737, + "language_loss": 0.5714488, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59157449, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.06738281, + "step": 12068, + "time_per_iteration": 2.931572437286377 + }, + { + "auxiliary_loss_clip": 0.01007817, + "auxiliary_loss_mlp": 0.01003178, + "balance_loss_clip": 1.0008055, + "balance_loss_mlp": 1.0011251, + "epoch": 0.7256275364497219, + "flos": 60292660821120.0, + "grad_norm": 0.6627269736067521, + "language_loss": 0.55505764, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57516754, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.06689453, + "step": 12069, + "time_per_iteration": 3.0457684993743896 + }, + { + "auxiliary_loss_clip": 0.010493, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.01711392, + "balance_loss_mlp": 1.01508129, + "epoch": 0.7256876597023899, + "flos": 24497366572800.0, + "grad_norm": 1.817564220487277, + "language_loss": 0.80875921, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82964021, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34179688, + "step": 12070, + "time_per_iteration": 2.3818652629852295 + }, + { + "auxiliary_loss_clip": 0.01049581, + "auxiliary_loss_mlp": 0.01039205, + "balance_loss_clip": 1.01667452, + "balance_loss_mlp": 1.01595068, + "epoch": 0.7257477829550578, + "flos": 24351604179840.0, + "grad_norm": 1.7187918768560777, + "language_loss": 0.73026597, + "learning_rate": 7.383298839673197e-07, + "loss": 0.75115383, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3359375, + "step": 12071, + "time_per_iteration": 2.3952651023864746 + }, + { + "auxiliary_loss_clip": 0.0105201, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.01849508, + "balance_loss_mlp": 1.01651657, + "epoch": 0.7258079062077258, + "flos": 17201252223360.0, + "grad_norm": 1.7701672165431859, + "language_loss": 0.70376706, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72470248, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 12072, + "time_per_iteration": 2.3528645038604736 + }, + { + "auxiliary_loss_clip": 0.01053678, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.01336682, + "balance_loss_mlp": 1.01612926, + "epoch": 0.7258680294603939, + "flos": 21579255982080.0, + "grad_norm": 1.7219521399119946, + "language_loss": 0.79114377, + "learning_rate": 7.377255998196821e-07, + "loss": 0.8120541, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 12073, + "time_per_iteration": 2.4200217723846436 + }, + { + "auxiliary_loss_clip": 0.01051202, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.01141298, + "balance_loss_mlp": 1.01641202, + "epoch": 0.7259281527130618, + "flos": 34854107149440.0, + "grad_norm": 1.3919328880711501, + "language_loss": 0.71109104, + "learning_rate": 7.374235295384923e-07, + "loss": 0.7319541, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34765625, + "step": 12074, + "time_per_iteration": 2.4574480056762695 + }, + { + "auxiliary_loss_clip": 0.01053047, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.01370382, + "balance_loss_mlp": 1.01622427, + "epoch": 0.7259882759657298, + "flos": 25403642311680.0, + "grad_norm": 1.6314678999765204, + "language_loss": 0.75358188, + "learning_rate": 7.371215071343302e-07, + "loss": 0.77448404, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 12075, + "time_per_iteration": 2.400881052017212 + }, + { + "auxiliary_loss_clip": 0.01052944, + "auxiliary_loss_mlp": 0.01045353, + "balance_loss_clip": 1.0205338, + "balance_loss_mlp": 1.0162394, + "epoch": 0.7260483992183977, + "flos": 62951438632320.0, + "grad_norm": 1.5546676816335279, + "language_loss": 0.64523792, + "learning_rate": 7.368195326186458e-07, + "loss": 0.6662209, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 12076, + "time_per_iteration": 2.7409183979034424 + }, + { + "auxiliary_loss_clip": 0.01053123, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.01402187, + "balance_loss_mlp": 1.0163281, + "epoch": 0.7261085224710657, + "flos": 26466363319680.0, + "grad_norm": 2.043483374514626, + "language_loss": 0.79148972, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81240201, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 12077, + "time_per_iteration": 2.42322039604187 + }, + { + "auxiliary_loss_clip": 0.01008115, + "auxiliary_loss_mlp": 0.01003701, + "balance_loss_clip": 1.00088811, + "balance_loss_mlp": 1.00126696, + "epoch": 0.7261686457237336, + "flos": 66769748519040.0, + "grad_norm": 0.8889327412650297, + "language_loss": 0.65127778, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67139602, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.02807617, + "router_z_loss_mlp": 0.06835938, + "step": 12078, + "time_per_iteration": 4.3908514976501465 + }, + { + "auxiliary_loss_clip": 0.01007821, + "auxiliary_loss_mlp": 0.0100404, + "balance_loss_clip": 1.00129783, + "balance_loss_mlp": 1.00107729, + "epoch": 0.7262287689764017, + "flos": 69996071525760.0, + "grad_norm": 0.7182445247988756, + "language_loss": 0.59316051, + "learning_rate": 7.359138965169671e-07, + "loss": 0.6132791, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.06738281, + "step": 12079, + "time_per_iteration": 3.1319475173950195 + }, + { + "auxiliary_loss_clip": 0.01051966, + "auxiliary_loss_mlp": 0.01037702, + "balance_loss_clip": 1.01341915, + "balance_loss_mlp": 1.01601601, + "epoch": 0.7262888922290696, + "flos": 23804305706880.0, + "grad_norm": 2.340765176572212, + "language_loss": 0.65823865, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67913532, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 12080, + "time_per_iteration": 2.3932976722717285 + }, + { + "auxiliary_loss_clip": 0.01052141, + "auxiliary_loss_mlp": 0.01036134, + "balance_loss_clip": 1.01206565, + "balance_loss_mlp": 1.01536369, + "epoch": 0.7263490154817376, + "flos": 19499305334400.0, + "grad_norm": 2.285142198759051, + "language_loss": 0.71551055, + "learning_rate": 7.35310378768128e-07, + "loss": 0.73639327, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 12081, + "time_per_iteration": 2.345550537109375 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_clip": 1.01696777, + "balance_loss_mlp": 1.01741064, + "epoch": 0.7264091387344055, + "flos": 16285410771840.0, + "grad_norm": 1.7399318877580308, + "language_loss": 0.8245008, + "learning_rate": 7.350086918237237e-07, + "loss": 0.84548032, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38671875, + "step": 12082, + "time_per_iteration": 3.651632070541382 + }, + { + "auxiliary_loss_clip": 0.01055842, + "auxiliary_loss_mlp": 0.01039328, + "balance_loss_clip": 1.01318514, + "balance_loss_mlp": 1.01702797, + "epoch": 0.7264692619870735, + "flos": 24350905952640.0, + "grad_norm": 1.6563487724561143, + "language_loss": 0.78192961, + "learning_rate": 7.347070528479158e-07, + "loss": 0.80288136, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 12083, + "time_per_iteration": 2.3894126415252686 + }, + { + "auxiliary_loss_clip": 0.01054568, + "auxiliary_loss_mlp": 0.01039268, + "balance_loss_clip": 1.01440144, + "balance_loss_mlp": 1.01743078, + "epoch": 0.7265293852397414, + "flos": 25118296836480.0, + "grad_norm": 1.954459923552141, + "language_loss": 0.73861039, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75954878, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 12084, + "time_per_iteration": 3.880523681640625 + }, + { + "auxiliary_loss_clip": 0.01055904, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.0160464, + "balance_loss_mlp": 1.01734459, + "epoch": 0.7265895084924094, + "flos": 22637124311040.0, + "grad_norm": 1.7719063824584447, + "language_loss": 0.78790271, + "learning_rate": 7.34103918847843e-07, + "loss": 0.80886626, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38476562, + "step": 12085, + "time_per_iteration": 2.4177029132843018 + }, + { + "auxiliary_loss_clip": 0.010542, + "auxiliary_loss_mlp": 0.01044265, + "balance_loss_clip": 1.02042305, + "balance_loss_mlp": 1.01681602, + "epoch": 0.7266496317450775, + "flos": 23367088350720.0, + "grad_norm": 2.054156698492528, + "language_loss": 0.73498178, + "learning_rate": 7.338024238464493e-07, + "loss": 0.75596642, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 12086, + "time_per_iteration": 2.3588883876800537 + }, + { + "auxiliary_loss_clip": 0.01053392, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.01624393, + "balance_loss_mlp": 1.0169189, + "epoch": 0.7267097549977454, + "flos": 28073345512320.0, + "grad_norm": 1.6480431378526412, + "language_loss": 0.70691156, + "learning_rate": 7.335009768593938e-07, + "loss": 0.72784424, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 12087, + "time_per_iteration": 2.4229543209075928 + }, + { + "auxiliary_loss_clip": 0.01053736, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_clip": 1.01879501, + "balance_loss_mlp": 1.01680052, + "epoch": 0.7267698782504134, + "flos": 22194565516800.0, + "grad_norm": 1.7200362704572183, + "language_loss": 0.79806316, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81904256, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 12088, + "time_per_iteration": 2.3559067249298096 + }, + { + "auxiliary_loss_clip": 0.01052677, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.01391888, + "balance_loss_mlp": 1.01575303, + "epoch": 0.7268300015030813, + "flos": 18513881809920.0, + "grad_norm": 1.7455296845044221, + "language_loss": 0.74667805, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76757371, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36914062, + "step": 12089, + "time_per_iteration": 2.3432788848876953 + }, + { + "auxiliary_loss_clip": 0.01053388, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.01401341, + "balance_loss_mlp": 1.01655746, + "epoch": 0.7268901247557493, + "flos": 23984946414720.0, + "grad_norm": 2.199131952528735, + "language_loss": 0.71970463, + "learning_rate": 7.325969240985616e-07, + "loss": 0.74059868, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3671875, + "step": 12090, + "time_per_iteration": 2.369866371154785 + }, + { + "auxiliary_loss_clip": 0.01054037, + "auxiliary_loss_mlp": 0.01046391, + "balance_loss_clip": 1.02065349, + "balance_loss_mlp": 1.01670098, + "epoch": 0.7269502480084172, + "flos": 32086716364800.0, + "grad_norm": 1.5934558642885301, + "language_loss": 0.77605617, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79706043, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 12091, + "time_per_iteration": 2.456702709197998 + }, + { + "auxiliary_loss_clip": 0.01051087, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.01353765, + "balance_loss_mlp": 1.01516771, + "epoch": 0.7270103712610853, + "flos": 19061773776000.0, + "grad_norm": 1.916543567140568, + "language_loss": 0.72589922, + "learning_rate": 7.319944625392205e-07, + "loss": 0.74678379, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 12092, + "time_per_iteration": 2.358867883682251 + }, + { + "auxiliary_loss_clip": 0.01051876, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.01147568, + "balance_loss_mlp": 1.01614594, + "epoch": 0.7270704945137532, + "flos": 34531474475520.0, + "grad_norm": 2.114893176169776, + "language_loss": 0.62292612, + "learning_rate": 7.31693303878184e-07, + "loss": 0.64379472, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35742188, + "step": 12093, + "time_per_iteration": 2.4600799083709717 + }, + { + "auxiliary_loss_clip": 0.0105222, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.01037073, + "balance_loss_mlp": 1.01629817, + "epoch": 0.7271306177664212, + "flos": 21506496975360.0, + "grad_norm": 1.6679603936347902, + "language_loss": 0.76016557, + "learning_rate": 7.313921933114644e-07, + "loss": 0.78102672, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 12094, + "time_per_iteration": 2.3829593658447266 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.01560187, + "balance_loss_mlp": 1.01675797, + "epoch": 0.7271907410190891, + "flos": 22271374241280.0, + "grad_norm": 1.8263038390688884, + "language_loss": 0.86081052, + "learning_rate": 7.310911308504808e-07, + "loss": 0.88170719, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 12095, + "time_per_iteration": 3.8250982761383057 + }, + { + "auxiliary_loss_clip": 0.01051753, + "auxiliary_loss_mlp": 0.01038468, + "balance_loss_clip": 1.01443529, + "balance_loss_mlp": 1.01521516, + "epoch": 0.7272508642717571, + "flos": 22892025214080.0, + "grad_norm": 1.7387656873931483, + "language_loss": 0.79225683, + "learning_rate": 7.307901165066479e-07, + "loss": 0.81315911, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 12096, + "time_per_iteration": 2.3697991371154785 + }, + { + "auxiliary_loss_clip": 0.01052696, + "auxiliary_loss_mlp": 0.01044006, + "balance_loss_clip": 1.02046239, + "balance_loss_mlp": 1.01661634, + "epoch": 0.727310987524425, + "flos": 11655089550720.0, + "grad_norm": 2.1922625805428666, + "language_loss": 0.73473608, + "learning_rate": 7.30489150291381e-07, + "loss": 0.75570309, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 12097, + "time_per_iteration": 2.3568108081817627 + }, + { + "auxiliary_loss_clip": 0.01053691, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.01848865, + "balance_loss_mlp": 1.0163337, + "epoch": 0.727371110777093, + "flos": 24534165012480.0, + "grad_norm": 1.8088019000227107, + "language_loss": 0.78396338, + "learning_rate": 7.301882322160935e-07, + "loss": 0.80493081, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 12098, + "time_per_iteration": 2.373223066329956 + }, + { + "auxiliary_loss_clip": 0.01053542, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01313174, + "balance_loss_mlp": 1.01583362, + "epoch": 0.7274312340297611, + "flos": 74737278691200.0, + "grad_norm": 1.6455616993379154, + "language_loss": 0.68152779, + "learning_rate": 7.298873622921952e-07, + "loss": 0.70243353, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37695312, + "step": 12099, + "time_per_iteration": 2.778770923614502 + }, + { + "auxiliary_loss_clip": 0.01054778, + "auxiliary_loss_mlp": 0.01039392, + "balance_loss_clip": 1.01360714, + "balance_loss_mlp": 1.01603878, + "epoch": 0.727491357282429, + "flos": 22341864009600.0, + "grad_norm": 1.9425455507501495, + "language_loss": 0.73617136, + "learning_rate": 7.29586540531095e-07, + "loss": 0.7571131, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 12100, + "time_per_iteration": 2.357714891433716 + }, + { + "auxiliary_loss_clip": 0.01051692, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.01657867, + "balance_loss_mlp": 1.01634181, + "epoch": 0.727551480535097, + "flos": 23296354202880.0, + "grad_norm": 1.4150905993705087, + "language_loss": 0.7553128, + "learning_rate": 7.292857669442005e-07, + "loss": 0.77622122, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 12101, + "time_per_iteration": 2.4215524196624756 + }, + { + "auxiliary_loss_clip": 0.01053869, + "auxiliary_loss_mlp": 0.01038284, + "balance_loss_clip": 1.01673138, + "balance_loss_mlp": 1.01759315, + "epoch": 0.7276116037877649, + "flos": 21469489067520.0, + "grad_norm": 1.7845676767026417, + "language_loss": 0.83884335, + "learning_rate": 7.289850415429177e-07, + "loss": 0.85976487, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36328125, + "step": 12102, + "time_per_iteration": 2.352703332901001 + }, + { + "auxiliary_loss_clip": 0.01050704, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.01143265, + "balance_loss_mlp": 1.01546502, + "epoch": 0.7276717270404329, + "flos": 21463170111360.0, + "grad_norm": 1.9998261212245683, + "language_loss": 0.83337152, + "learning_rate": 7.286843643386495e-07, + "loss": 0.8542133, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 12103, + "time_per_iteration": 2.3677995204925537 + }, + { + "auxiliary_loss_clip": 0.01053213, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.01118183, + "balance_loss_mlp": 1.01757193, + "epoch": 0.7277318502931008, + "flos": 16836270203520.0, + "grad_norm": 1.689926741909336, + "language_loss": 0.6802603, + "learning_rate": 7.283837353427968e-07, + "loss": 0.70113349, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 12104, + "time_per_iteration": 2.34486722946167 + }, + { + "auxiliary_loss_clip": 0.01049634, + "auxiliary_loss_mlp": 0.01038003, + "balance_loss_clip": 1.01528192, + "balance_loss_mlp": 1.0155282, + "epoch": 0.7277919735457689, + "flos": 33399171394560.0, + "grad_norm": 2.119222022032666, + "language_loss": 0.66784686, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68872321, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.33984375, + "step": 12105, + "time_per_iteration": 2.4764440059661865 + }, + { + "auxiliary_loss_clip": 0.01052466, + "auxiliary_loss_mlp": 0.01040376, + "balance_loss_clip": 1.0171901, + "balance_loss_mlp": 1.01674438, + "epoch": 0.7278520967984368, + "flos": 19205546221440.0, + "grad_norm": 2.9474830135266354, + "language_loss": 0.76902974, + "learning_rate": 7.27782622021939e-07, + "loss": 0.78995812, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35742188, + "step": 12106, + "time_per_iteration": 2.3332674503326416 + }, + { + "auxiliary_loss_clip": 0.0105446, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.01763034, + "balance_loss_mlp": 1.01655746, + "epoch": 0.7279122200511048, + "flos": 34093244689920.0, + "grad_norm": 2.062094764470932, + "language_loss": 0.71528685, + "learning_rate": 7.274821377197273e-07, + "loss": 0.73624647, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37890625, + "step": 12107, + "time_per_iteration": 2.4844741821289062 + }, + { + "auxiliary_loss_clip": 0.01051513, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.01487517, + "balance_loss_mlp": 1.01613927, + "epoch": 0.7279723433037727, + "flos": 54597071928960.0, + "grad_norm": 1.4390570258688409, + "language_loss": 0.76178432, + "learning_rate": 7.271817016715205e-07, + "loss": 0.78267956, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 12108, + "time_per_iteration": 2.69358229637146 + }, + { + "auxiliary_loss_clip": 0.01052055, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01117516, + "balance_loss_mlp": 1.0157783, + "epoch": 0.7280324665564407, + "flos": 36136012872960.0, + "grad_norm": 1.4242662346110704, + "language_loss": 0.67831951, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69918203, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 12109, + "time_per_iteration": 2.4926018714904785 + }, + { + "auxiliary_loss_clip": 0.01053314, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.0167942, + "balance_loss_mlp": 1.01640105, + "epoch": 0.7280925898091086, + "flos": 11617767440640.0, + "grad_norm": 1.9398687614112124, + "language_loss": 0.64514256, + "learning_rate": 7.265809743826912e-07, + "loss": 0.66608787, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 12110, + "time_per_iteration": 2.319761276245117 + }, + { + "auxiliary_loss_clip": 0.01052573, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.01092005, + "balance_loss_mlp": 1.01533711, + "epoch": 0.7281527130617766, + "flos": 34275665877120.0, + "grad_norm": 2.4885693530315787, + "language_loss": 0.59938371, + "learning_rate": 7.26280683164847e-07, + "loss": 0.62027824, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37304688, + "step": 12111, + "time_per_iteration": 2.4759747982025146 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01037767, + "balance_loss_clip": 1.01287627, + "balance_loss_mlp": 1.01648378, + "epoch": 0.7282128363144446, + "flos": 13917182094720.0, + "grad_norm": 2.238178897883566, + "language_loss": 0.75717986, + "learning_rate": 7.259804402465677e-07, + "loss": 0.77809262, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 12112, + "time_per_iteration": 2.3309481143951416 + }, + { + "auxiliary_loss_clip": 0.01050775, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.01197886, + "balance_loss_mlp": 1.01585388, + "epoch": 0.7282729595671126, + "flos": 20776567847040.0, + "grad_norm": 2.5295613431338535, + "language_loss": 0.67944014, + "learning_rate": 7.25680245639237e-07, + "loss": 0.70028841, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 12113, + "time_per_iteration": 2.349595546722412 + }, + { + "auxiliary_loss_clip": 0.01053263, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.0121727, + "balance_loss_mlp": 1.01676393, + "epoch": 0.7283330828197806, + "flos": 16324513361280.0, + "grad_norm": 1.6347277970916918, + "language_loss": 0.74227595, + "learning_rate": 7.253800993542399e-07, + "loss": 0.76316071, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 12114, + "time_per_iteration": 2.3481242656707764 + }, + { + "auxiliary_loss_clip": 0.01052308, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.01389742, + "balance_loss_mlp": 1.01607859, + "epoch": 0.7283932060724485, + "flos": 27488969308800.0, + "grad_norm": 1.9288072852027034, + "language_loss": 0.69325888, + "learning_rate": 7.250800014029564e-07, + "loss": 0.71415597, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 12115, + "time_per_iteration": 2.404755115509033 + }, + { + "auxiliary_loss_clip": 0.01055823, + "auxiliary_loss_mlp": 0.01038915, + "balance_loss_clip": 1.01431, + "balance_loss_mlp": 1.01658392, + "epoch": 0.7284533293251165, + "flos": 18366932430720.0, + "grad_norm": 1.8805989086384283, + "language_loss": 0.61362433, + "learning_rate": 7.247799517967674e-07, + "loss": 0.63457167, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.39257812, + "step": 12116, + "time_per_iteration": 2.379642963409424 + }, + { + "auxiliary_loss_clip": 0.01051501, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.01283252, + "balance_loss_mlp": 1.01586294, + "epoch": 0.7285134525777844, + "flos": 21724459793280.0, + "grad_norm": 1.8120744669971138, + "language_loss": 0.74126792, + "learning_rate": 7.2447995054705e-07, + "loss": 0.76215863, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35546875, + "step": 12117, + "time_per_iteration": 2.3625385761260986 + }, + { + "auxiliary_loss_clip": 0.01053218, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.01359642, + "balance_loss_mlp": 1.01666307, + "epoch": 0.7285735758304525, + "flos": 20740293077760.0, + "grad_norm": 1.8225282082865497, + "language_loss": 0.70657814, + "learning_rate": 7.241799976651807e-07, + "loss": 0.72746992, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36523438, + "step": 12118, + "time_per_iteration": 3.6344995498657227 + }, + { + "auxiliary_loss_clip": 0.01048907, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.01381195, + "balance_loss_mlp": 1.01460993, + "epoch": 0.7286336990831204, + "flos": 17310006708480.0, + "grad_norm": 1.641483226712756, + "language_loss": 0.85310364, + "learning_rate": 7.238800931625346e-07, + "loss": 0.87393719, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 12119, + "time_per_iteration": 2.376063108444214 + }, + { + "auxiliary_loss_clip": 0.01053845, + "auxiliary_loss_mlp": 0.01036121, + "balance_loss_clip": 1.01233888, + "balance_loss_mlp": 1.01712704, + "epoch": 0.7286938223357884, + "flos": 19786501111680.0, + "grad_norm": 2.134266128104077, + "language_loss": 0.82924283, + "learning_rate": 7.235802370504831e-07, + "loss": 0.85014248, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 12120, + "time_per_iteration": 2.326993465423584 + }, + { + "auxiliary_loss_clip": 0.01052828, + "auxiliary_loss_mlp": 0.01041863, + "balance_loss_clip": 1.01872396, + "balance_loss_mlp": 1.01661038, + "epoch": 0.7287539455884563, + "flos": 15339962620800.0, + "grad_norm": 1.8083110069762862, + "language_loss": 0.80019456, + "learning_rate": 7.232804293403963e-07, + "loss": 0.82114142, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 12121, + "time_per_iteration": 2.3832247257232666 + }, + { + "auxiliary_loss_clip": 0.01054174, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.01167178, + "balance_loss_mlp": 1.01671278, + "epoch": 0.7288140688411243, + "flos": 25191300222720.0, + "grad_norm": 1.6364896179781765, + "language_loss": 0.70514709, + "learning_rate": 7.229806700436441e-07, + "loss": 0.72605503, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 12122, + "time_per_iteration": 3.801983594894409 + }, + { + "auxiliary_loss_clip": 0.01049839, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.01383722, + "balance_loss_mlp": 1.01483274, + "epoch": 0.7288741920937922, + "flos": 23983131024000.0, + "grad_norm": 1.67795094646173, + "language_loss": 0.87962389, + "learning_rate": 7.226809591715923e-07, + "loss": 0.90049297, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34960938, + "step": 12123, + "time_per_iteration": 2.38325572013855 + }, + { + "auxiliary_loss_clip": 0.01052874, + "auxiliary_loss_mlp": 0.01038665, + "balance_loss_clip": 1.01639676, + "balance_loss_mlp": 1.01651716, + "epoch": 0.7289343153464602, + "flos": 22743888848640.0, + "grad_norm": 1.8092577592495658, + "language_loss": 0.83109975, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85201514, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 12124, + "time_per_iteration": 3.8626708984375 + }, + { + "auxiliary_loss_clip": 0.01051706, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.01256824, + "balance_loss_mlp": 1.01554108, + "epoch": 0.7289944385991282, + "flos": 24898867741440.0, + "grad_norm": 1.9118809039784581, + "language_loss": 0.68659747, + "learning_rate": 7.220816827470499e-07, + "loss": 0.70747441, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36132812, + "step": 12125, + "time_per_iteration": 2.4090561866760254 + }, + { + "auxiliary_loss_clip": 0.01053152, + "auxiliary_loss_mlp": 0.01040972, + "balance_loss_clip": 1.01596165, + "balance_loss_mlp": 1.01505375, + "epoch": 0.7290545618517962, + "flos": 22965936295680.0, + "grad_norm": 1.7384787789636396, + "language_loss": 0.7634294, + "learning_rate": 7.217821172172855e-07, + "loss": 0.78437066, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 12126, + "time_per_iteration": 2.3931398391723633 + }, + { + "auxiliary_loss_clip": 0.01008365, + "auxiliary_loss_mlp": 0.01003958, + "balance_loss_clip": 1.00120389, + "balance_loss_mlp": 1.00120711, + "epoch": 0.7291146851044642, + "flos": 61898176736640.0, + "grad_norm": 0.8190739063758912, + "language_loss": 0.58714592, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60726917, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.02758789, + "router_z_loss_mlp": 0.07128906, + "step": 12127, + "time_per_iteration": 2.946786642074585 + }, + { + "auxiliary_loss_clip": 0.0105279, + "auxiliary_loss_mlp": 0.01039738, + "balance_loss_clip": 1.01720715, + "balance_loss_mlp": 1.01755691, + "epoch": 0.7291748083571321, + "flos": 23329836063360.0, + "grad_norm": 2.2902889624258544, + "language_loss": 0.70507169, + "learning_rate": 7.21183131579562e-07, + "loss": 0.72599697, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 12128, + "time_per_iteration": 2.417213201522827 + }, + { + "auxiliary_loss_clip": 0.01053958, + "auxiliary_loss_mlp": 0.01048737, + "balance_loss_clip": 1.02419186, + "balance_loss_mlp": 1.01682711, + "epoch": 0.7292349316098001, + "flos": 28328735174400.0, + "grad_norm": 2.2452095585546683, + "language_loss": 0.66475266, + "learning_rate": 7.20883711494319e-07, + "loss": 0.68577957, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 12129, + "time_per_iteration": 2.406158685684204 + }, + { + "auxiliary_loss_clip": 0.01049401, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.01110816, + "balance_loss_mlp": 1.01516449, + "epoch": 0.729295054862468, + "flos": 24131127744000.0, + "grad_norm": 2.009497000525627, + "language_loss": 0.74955618, + "learning_rate": 7.205843399132927e-07, + "loss": 0.77039063, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34179688, + "step": 12130, + "time_per_iteration": 2.4020779132843018 + }, + { + "auxiliary_loss_clip": 0.01052175, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.01538396, + "balance_loss_mlp": 1.01576877, + "epoch": 0.7293551781151361, + "flos": 22815251400960.0, + "grad_norm": 1.7034359172970466, + "language_loss": 0.70810544, + "learning_rate": 7.202850168478374e-07, + "loss": 0.72902942, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 12131, + "time_per_iteration": 2.365910291671753 + }, + { + "auxiliary_loss_clip": 0.01052142, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.01163483, + "balance_loss_mlp": 1.01686263, + "epoch": 0.729415301367804, + "flos": 22125611848320.0, + "grad_norm": 1.5720379445330195, + "language_loss": 0.7788192, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79968214, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 12132, + "time_per_iteration": 2.36641263961792 + }, + { + "auxiliary_loss_clip": 0.01054054, + "auxiliary_loss_mlp": 0.01045134, + "balance_loss_clip": 1.02119684, + "balance_loss_mlp": 1.01795161, + "epoch": 0.729475424620472, + "flos": 12348778821120.0, + "grad_norm": 1.9856790212136735, + "language_loss": 0.80133665, + "learning_rate": 7.196865163090358e-07, + "loss": 0.82232845, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 12133, + "time_per_iteration": 2.343222141265869 + }, + { + "auxiliary_loss_clip": 0.01053462, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.01573968, + "balance_loss_mlp": 1.01601577, + "epoch": 0.7295355478731399, + "flos": 22194356048640.0, + "grad_norm": 1.887189697844182, + "language_loss": 0.72904593, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74996877, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.375, + "step": 12134, + "time_per_iteration": 3.7896244525909424 + }, + { + "auxiliary_loss_clip": 0.01053597, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.01651144, + "balance_loss_mlp": 1.01684678, + "epoch": 0.7295956711258079, + "flos": 23220907021440.0, + "grad_norm": 1.610580743693987, + "language_loss": 0.72308946, + "learning_rate": 7.190882099686939e-07, + "loss": 0.74403191, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 12135, + "time_per_iteration": 2.366248846054077 + }, + { + "auxiliary_loss_clip": 0.0105384, + "auxiliary_loss_mlp": 0.01038385, + "balance_loss_clip": 1.01403058, + "balance_loss_mlp": 1.01635671, + "epoch": 0.7296557943784758, + "flos": 31867741117440.0, + "grad_norm": 2.1379055162319354, + "language_loss": 0.64860904, + "learning_rate": 7.187891296513075e-07, + "loss": 0.66953129, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.375, + "step": 12136, + "time_per_iteration": 2.4587676525115967 + }, + { + "auxiliary_loss_clip": 0.01051475, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.01468539, + "balance_loss_mlp": 1.01530385, + "epoch": 0.7297159176311439, + "flos": 26650495163520.0, + "grad_norm": 2.096864151026659, + "language_loss": 0.7623964, + "learning_rate": 7.184900979175654e-07, + "loss": 0.78328335, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36132812, + "step": 12137, + "time_per_iteration": 2.3877265453338623 + }, + { + "auxiliary_loss_clip": 0.01053438, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02169323, + "balance_loss_mlp": 1.01667011, + "epoch": 0.7297760408838118, + "flos": 24748531960320.0, + "grad_norm": 1.5823016494549402, + "language_loss": 0.74860394, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76958323, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 12138, + "time_per_iteration": 2.427959680557251 + }, + { + "auxiliary_loss_clip": 0.01051552, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.0127331, + "balance_loss_mlp": 1.01578474, + "epoch": 0.7298361641364798, + "flos": 18072894026880.0, + "grad_norm": 2.0727470501264698, + "language_loss": 0.73585564, + "learning_rate": 7.178921802463702e-07, + "loss": 0.75672895, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 12139, + "time_per_iteration": 2.33278226852417 + }, + { + "auxiliary_loss_clip": 0.01049833, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.01146221, + "balance_loss_mlp": 1.01550817, + "epoch": 0.7298962873891478, + "flos": 29894380450560.0, + "grad_norm": 1.5262762734107822, + "language_loss": 0.74271703, + "learning_rate": 7.175932943315898e-07, + "loss": 0.76354223, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 12140, + "time_per_iteration": 2.48923397064209 + }, + { + "auxiliary_loss_clip": 0.01053382, + "auxiliary_loss_mlp": 0.01040271, + "balance_loss_clip": 1.01466501, + "balance_loss_mlp": 1.01621878, + "epoch": 0.7299564106418157, + "flos": 32264843454720.0, + "grad_norm": 1.6579350473179726, + "language_loss": 0.56565136, + "learning_rate": 7.172944570458003e-07, + "loss": 0.58658791, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37109375, + "step": 12141, + "time_per_iteration": 2.4417004585266113 + }, + { + "auxiliary_loss_clip": 0.0105066, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.0114665, + "balance_loss_mlp": 1.01617229, + "epoch": 0.7300165338944837, + "flos": 22929172767360.0, + "grad_norm": 1.4471005937910635, + "language_loss": 0.73447037, + "learning_rate": 7.169956684003342e-07, + "loss": 0.75529861, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 12142, + "time_per_iteration": 2.4076027870178223 + }, + { + "auxiliary_loss_clip": 0.01051556, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.01340103, + "balance_loss_mlp": 1.01654005, + "epoch": 0.7300766571471516, + "flos": 19827768205440.0, + "grad_norm": 1.830109783907101, + "language_loss": 0.74811351, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76897478, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.3515625, + "step": 12143, + "time_per_iteration": 2.337013006210327 + }, + { + "auxiliary_loss_clip": 0.01052994, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.01144803, + "balance_loss_mlp": 1.01625526, + "epoch": 0.7301367803998197, + "flos": 24346821323520.0, + "grad_norm": 2.0409701863013963, + "language_loss": 0.67999107, + "learning_rate": 7.163982370756882e-07, + "loss": 0.70086634, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 12144, + "time_per_iteration": 2.431495428085327 + }, + { + "auxiliary_loss_clip": 0.01051722, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.01532245, + "balance_loss_mlp": 1.01588094, + "epoch": 0.7301969036524876, + "flos": 15303618028800.0, + "grad_norm": 1.7055539242422253, + "language_loss": 0.80285585, + "learning_rate": 7.160995944191627e-07, + "loss": 0.82376385, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.359375, + "step": 12145, + "time_per_iteration": 2.3372957706451416 + }, + { + "auxiliary_loss_clip": 0.01051711, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.01433945, + "balance_loss_mlp": 1.01706028, + "epoch": 0.7302570269051556, + "flos": 23506322319360.0, + "grad_norm": 2.37061920027568, + "language_loss": 0.92440832, + "learning_rate": 7.158010004482702e-07, + "loss": 0.94529277, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 12146, + "time_per_iteration": 2.374751567840576 + }, + { + "auxiliary_loss_clip": 0.01050162, + "auxiliary_loss_mlp": 0.01037707, + "balance_loss_clip": 1.01506889, + "balance_loss_mlp": 1.01630998, + "epoch": 0.7303171501578235, + "flos": 20521981146240.0, + "grad_norm": 1.7212784606519054, + "language_loss": 0.62862587, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64950448, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.33984375, + "step": 12147, + "time_per_iteration": 2.439755916595459 + }, + { + "auxiliary_loss_clip": 0.01054066, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.01841795, + "balance_loss_mlp": 1.01722121, + "epoch": 0.7303772734104915, + "flos": 18331635179520.0, + "grad_norm": 1.8801477896779148, + "language_loss": 0.76363701, + "learning_rate": 7.152039586086693e-07, + "loss": 0.78460741, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36914062, + "step": 12148, + "time_per_iteration": 2.328608512878418 + }, + { + "auxiliary_loss_clip": 0.01008656, + "auxiliary_loss_mlp": 0.01002708, + "balance_loss_clip": 1.00016856, + "balance_loss_mlp": 1.00163913, + "epoch": 0.7304373966631594, + "flos": 60651638087040.0, + "grad_norm": 0.6927880124219913, + "language_loss": 0.56760401, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58771759, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.0703125, + "step": 12149, + "time_per_iteration": 2.997512102127075 + }, + { + "auxiliary_loss_clip": 0.01054409, + "auxiliary_loss_mlp": 0.01038339, + "balance_loss_clip": 1.01346016, + "balance_loss_mlp": 1.01665008, + "epoch": 0.7304975199158275, + "flos": 19827069978240.0, + "grad_norm": 1.781961016535131, + "language_loss": 0.75387698, + "learning_rate": 7.146071116474451e-07, + "loss": 0.77480447, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37890625, + "step": 12150, + "time_per_iteration": 2.3715248107910156 + }, + { + "auxiliary_loss_clip": 0.010539, + "auxiliary_loss_mlp": 0.0104396, + "balance_loss_clip": 1.01879454, + "balance_loss_mlp": 1.01647902, + "epoch": 0.7305576431684954, + "flos": 13223178622080.0, + "grad_norm": 2.132401682765272, + "language_loss": 0.84743559, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86841422, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 12151, + "time_per_iteration": 2.3284645080566406 + }, + { + "auxiliary_loss_clip": 0.01053229, + "auxiliary_loss_mlp": 0.01042766, + "balance_loss_clip": 1.01805425, + "balance_loss_mlp": 1.01622748, + "epoch": 0.7306177664211634, + "flos": 24059346255360.0, + "grad_norm": 2.1976542905522916, + "language_loss": 0.789639, + "learning_rate": 7.14010459655127e-07, + "loss": 0.81059897, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 12152, + "time_per_iteration": 2.3799543380737305 + }, + { + "auxiliary_loss_clip": 0.01052877, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.01418614, + "balance_loss_mlp": 1.01633668, + "epoch": 0.7306778896738314, + "flos": 27087887076480.0, + "grad_norm": 1.5619615004039853, + "language_loss": 0.80621451, + "learning_rate": 7.137122068005919e-07, + "loss": 0.82712841, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 12153, + "time_per_iteration": 2.414236068725586 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.01527059, + "balance_loss_mlp": 1.01694918, + "epoch": 0.7307380129264993, + "flos": 16689739760640.0, + "grad_norm": 1.7592985797593879, + "language_loss": 0.68409568, + "learning_rate": 7.134140027222173e-07, + "loss": 0.705037, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37890625, + "step": 12154, + "time_per_iteration": 2.3269240856170654 + }, + { + "auxiliary_loss_clip": 0.01052343, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.01355827, + "balance_loss_mlp": 1.01588726, + "epoch": 0.7307981361791673, + "flos": 21724669261440.0, + "grad_norm": 1.8236270557253473, + "language_loss": 0.66827881, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68918359, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 12155, + "time_per_iteration": 2.3728766441345215 + }, + { + "auxiliary_loss_clip": 0.01050443, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.01243377, + "balance_loss_mlp": 1.01541591, + "epoch": 0.7308582594318352, + "flos": 18039691457280.0, + "grad_norm": 2.025576139984311, + "language_loss": 0.82765412, + "learning_rate": 7.128177409391851e-07, + "loss": 0.84850186, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 12156, + "time_per_iteration": 2.3345067501068115 + }, + { + "auxiliary_loss_clip": 0.01050864, + "auxiliary_loss_mlp": 0.01039737, + "balance_loss_clip": 1.01789749, + "balance_loss_mlp": 1.01603198, + "epoch": 0.7309183826845033, + "flos": 13844108885760.0, + "grad_norm": 4.6114895222421755, + "language_loss": 0.76603365, + "learning_rate": 7.125196832571367e-07, + "loss": 0.78693962, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 12157, + "time_per_iteration": 2.3798465728759766 + }, + { + "auxiliary_loss_clip": 0.01049259, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.01164842, + "balance_loss_mlp": 1.01504803, + "epoch": 0.7309785059371712, + "flos": 17018272454400.0, + "grad_norm": 2.6726121283910746, + "language_loss": 0.75021327, + "learning_rate": 7.122216743964713e-07, + "loss": 0.77103049, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34179688, + "step": 12158, + "time_per_iteration": 3.7539024353027344 + }, + { + "auxiliary_loss_clip": 0.01054235, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.01535416, + "balance_loss_mlp": 1.01752424, + "epoch": 0.7310386291898392, + "flos": 26501276545920.0, + "grad_norm": 1.7725098474960317, + "language_loss": 0.86525285, + "learning_rate": 7.119237143684896e-07, + "loss": 0.8861891, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 12159, + "time_per_iteration": 2.398587703704834 + }, + { + "auxiliary_loss_clip": 0.01055276, + "auxiliary_loss_mlp": 0.01040947, + "balance_loss_clip": 1.01599669, + "balance_loss_mlp": 1.01707339, + "epoch": 0.7310987524425071, + "flos": 16944989777280.0, + "grad_norm": 2.2488148973857847, + "language_loss": 0.74616665, + "learning_rate": 7.116258031844895e-07, + "loss": 0.76712883, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3828125, + "step": 12160, + "time_per_iteration": 2.3653366565704346 + }, + { + "auxiliary_loss_clip": 0.0105551, + "auxiliary_loss_mlp": 0.01039913, + "balance_loss_clip": 1.01551104, + "balance_loss_mlp": 1.01731038, + "epoch": 0.7311588756951751, + "flos": 13844423088000.0, + "grad_norm": 2.159069857584061, + "language_loss": 0.74897248, + "learning_rate": 7.113279408557675e-07, + "loss": 0.76992667, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3828125, + "step": 12161, + "time_per_iteration": 3.7169902324676514 + }, + { + "auxiliary_loss_clip": 0.01056626, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.01223791, + "balance_loss_mlp": 1.01731789, + "epoch": 0.731218998947843, + "flos": 28766615846400.0, + "grad_norm": 1.6924777205830486, + "language_loss": 0.71533179, + "learning_rate": 7.110301273936192e-07, + "loss": 0.73629767, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.39257812, + "step": 12162, + "time_per_iteration": 2.471043825149536 + }, + { + "auxiliary_loss_clip": 0.01055843, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.01307273, + "balance_loss_mlp": 1.01798892, + "epoch": 0.7312791222005111, + "flos": 27087572874240.0, + "grad_norm": 1.8808691744394173, + "language_loss": 0.67896938, + "learning_rate": 7.107323628093382e-07, + "loss": 0.69990003, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 12163, + "time_per_iteration": 3.9231553077697754 + }, + { + "auxiliary_loss_clip": 0.01053946, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.01118767, + "balance_loss_mlp": 1.01678014, + "epoch": 0.731339245453179, + "flos": 20922958644480.0, + "grad_norm": 1.927385033528841, + "language_loss": 0.69746381, + "learning_rate": 7.104346471142153e-07, + "loss": 0.71833408, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.37109375, + "step": 12164, + "time_per_iteration": 2.3704469203948975 + }, + { + "auxiliary_loss_clip": 0.01051574, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.01435947, + "balance_loss_mlp": 1.01680171, + "epoch": 0.731399368705847, + "flos": 23074586046720.0, + "grad_norm": 3.9296420524584903, + "language_loss": 0.74474329, + "learning_rate": 7.101369803195391e-07, + "loss": 0.76562589, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 12165, + "time_per_iteration": 2.4035120010375977 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.01409233, + "balance_loss_mlp": 1.01561618, + "epoch": 0.731459491958515, + "flos": 23581664766720.0, + "grad_norm": 2.027818882402817, + "language_loss": 0.77679157, + "learning_rate": 7.098393624365988e-07, + "loss": 0.79769003, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 12166, + "time_per_iteration": 2.438708782196045 + }, + { + "auxiliary_loss_clip": 0.01052963, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.01541257, + "balance_loss_mlp": 1.01703167, + "epoch": 0.7315196152111829, + "flos": 22378278424320.0, + "grad_norm": 1.7194521406162961, + "language_loss": 0.80383903, + "learning_rate": 7.095417934766781e-07, + "loss": 0.82475531, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 12167, + "time_per_iteration": 2.5277140140533447 + }, + { + "auxiliary_loss_clip": 0.01051892, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.0183301, + "balance_loss_mlp": 1.01679027, + "epoch": 0.7315797384638509, + "flos": 26175850963200.0, + "grad_norm": 3.4224931836249017, + "language_loss": 0.77819633, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79912269, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 12168, + "time_per_iteration": 2.429852247238159 + }, + { + "auxiliary_loss_clip": 0.01054514, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.01581371, + "balance_loss_mlp": 1.01758146, + "epoch": 0.7316398617165188, + "flos": 21505275077760.0, + "grad_norm": 1.8768507313890366, + "language_loss": 0.82825613, + "learning_rate": 7.089468023710326e-07, + "loss": 0.84920758, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 12169, + "time_per_iteration": 2.379197120666504 + }, + { + "auxiliary_loss_clip": 0.01053744, + "auxiliary_loss_mlp": 0.01046064, + "balance_loss_clip": 1.02247262, + "balance_loss_mlp": 1.01684403, + "epoch": 0.7316999849691869, + "flos": 30481235360640.0, + "grad_norm": 1.8613885281469587, + "language_loss": 0.71064311, + "learning_rate": 7.08649380247871e-07, + "loss": 0.73164117, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 12170, + "time_per_iteration": 2.4633593559265137 + }, + { + "auxiliary_loss_clip": 0.01051856, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.01481211, + "balance_loss_mlp": 1.0165441, + "epoch": 0.7317601082218548, + "flos": 21542701921920.0, + "grad_norm": 2.6192195724201386, + "language_loss": 0.71360052, + "learning_rate": 7.083520070928533e-07, + "loss": 0.73452079, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.35351562, + "step": 12171, + "time_per_iteration": 2.3543953895568848 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01041125, + "balance_loss_clip": 1.01886833, + "balance_loss_mlp": 1.01693487, + "epoch": 0.7318202314745228, + "flos": 33250301890560.0, + "grad_norm": 1.7258000291085507, + "language_loss": 0.67323607, + "learning_rate": 7.080546829172564e-07, + "loss": 0.69416463, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 12172, + "time_per_iteration": 2.456505298614502 + }, + { + "auxiliary_loss_clip": 0.01052659, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.00944674, + "balance_loss_mlp": 1.01609218, + "epoch": 0.7318803547271907, + "flos": 20156021608320.0, + "grad_norm": 2.334987018147699, + "language_loss": 0.64571899, + "learning_rate": 7.077574077323564e-07, + "loss": 0.66656715, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36523438, + "step": 12173, + "time_per_iteration": 2.3396220207214355 + }, + { + "auxiliary_loss_clip": 0.01053238, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.01395774, + "balance_loss_mlp": 1.01696622, + "epoch": 0.7319404779798587, + "flos": 20557487865600.0, + "grad_norm": 2.267903788203601, + "language_loss": 0.75873303, + "learning_rate": 7.074601815494243e-07, + "loss": 0.77962756, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36132812, + "step": 12174, + "time_per_iteration": 3.8076095581054688 + }, + { + "auxiliary_loss_clip": 0.01051358, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.01522815, + "balance_loss_mlp": 1.01625562, + "epoch": 0.7320006012325266, + "flos": 28694101219200.0, + "grad_norm": 1.547452310288421, + "language_loss": 0.819929, + "learning_rate": 7.071630043797317e-07, + "loss": 0.84080887, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.3515625, + "step": 12175, + "time_per_iteration": 2.461665630340576 + }, + { + "auxiliary_loss_clip": 0.01052446, + "auxiliary_loss_mlp": 0.01034034, + "balance_loss_clip": 1.01143169, + "balance_loss_mlp": 1.01634276, + "epoch": 0.7320607244851947, + "flos": 16361765648640.0, + "grad_norm": 2.111999908178241, + "language_loss": 0.776474, + "learning_rate": 7.068658762345488e-07, + "loss": 0.79733872, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 12176, + "time_per_iteration": 2.327890157699585 + }, + { + "auxiliary_loss_clip": 0.01053281, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.0119133, + "balance_loss_mlp": 1.01679754, + "epoch": 0.7321208477378626, + "flos": 20954171266560.0, + "grad_norm": 1.8435931236284973, + "language_loss": 0.77361143, + "learning_rate": 7.065687971251399e-07, + "loss": 0.79448509, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36523438, + "step": 12177, + "time_per_iteration": 2.3974416255950928 + }, + { + "auxiliary_loss_clip": 0.01051764, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.00880766, + "balance_loss_mlp": 1.01565683, + "epoch": 0.7321809709905306, + "flos": 13844213619840.0, + "grad_norm": 1.9741330530629164, + "language_loss": 0.75333691, + "learning_rate": 7.06271767062772e-07, + "loss": 0.77416825, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.359375, + "step": 12178, + "time_per_iteration": 2.322317123413086 + }, + { + "auxiliary_loss_clip": 0.01054516, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.0117135, + "balance_loss_mlp": 1.01707816, + "epoch": 0.7322410942431986, + "flos": 26978713655040.0, + "grad_norm": 1.947493245643442, + "language_loss": 0.83851564, + "learning_rate": 7.059747860587084e-07, + "loss": 0.85940719, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.375, + "step": 12179, + "time_per_iteration": 2.425567626953125 + }, + { + "auxiliary_loss_clip": 0.01050267, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.01633978, + "balance_loss_mlp": 1.01640153, + "epoch": 0.7323012174958665, + "flos": 17638748870400.0, + "grad_norm": 1.9329409004327058, + "language_loss": 0.76255906, + "learning_rate": 7.056778541242115e-07, + "loss": 0.78342783, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33789062, + "step": 12180, + "time_per_iteration": 2.348221778869629 + }, + { + "auxiliary_loss_clip": 0.01053748, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.0101577, + "balance_loss_mlp": 1.01618111, + "epoch": 0.7323613407485345, + "flos": 32341407799680.0, + "grad_norm": 1.8456225227488154, + "language_loss": 0.80346775, + "learning_rate": 7.053809712705396e-07, + "loss": 0.82434475, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.375, + "step": 12181, + "time_per_iteration": 2.462688446044922 + }, + { + "auxiliary_loss_clip": 0.0105467, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.0141573, + "balance_loss_mlp": 1.01746368, + "epoch": 0.7324214640012024, + "flos": 18361975017600.0, + "grad_norm": 1.7364640988605713, + "language_loss": 0.73097569, + "learning_rate": 7.050841375089506e-07, + "loss": 0.75189847, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37304688, + "step": 12182, + "time_per_iteration": 2.3621416091918945 + }, + { + "auxiliary_loss_clip": 0.0105507, + "auxiliary_loss_mlp": 0.0104058, + "balance_loss_clip": 1.01682162, + "balance_loss_mlp": 1.01757395, + "epoch": 0.7324815872538705, + "flos": 30810920129280.0, + "grad_norm": 1.4970771806136376, + "language_loss": 0.72297704, + "learning_rate": 7.047873528507015e-07, + "loss": 0.74393356, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.375, + "step": 12183, + "time_per_iteration": 2.4357213973999023 + }, + { + "auxiliary_loss_clip": 0.01054908, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.02100956, + "balance_loss_mlp": 1.01730359, + "epoch": 0.7325417105065384, + "flos": 21504053180160.0, + "grad_norm": 1.7723969659481718, + "language_loss": 0.74042225, + "learning_rate": 7.04490617307045e-07, + "loss": 0.76142246, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 12184, + "time_per_iteration": 2.3853797912597656 + }, + { + "auxiliary_loss_clip": 0.01008296, + "auxiliary_loss_mlp": 0.01003445, + "balance_loss_clip": 1.00095379, + "balance_loss_mlp": 1.00143504, + "epoch": 0.7326018337592064, + "flos": 67254447191040.0, + "grad_norm": 0.7605067815141181, + "language_loss": 0.65228069, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67239809, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.06835938, + "step": 12185, + "time_per_iteration": 3.014910936355591 + }, + { + "auxiliary_loss_clip": 0.01052596, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.01127863, + "balance_loss_mlp": 1.01544738, + "epoch": 0.7326619570118743, + "flos": 22855959912960.0, + "grad_norm": 1.9530815698409731, + "language_loss": 0.82324749, + "learning_rate": 7.038972936085197e-07, + "loss": 0.84412229, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 12186, + "time_per_iteration": 2.363873243331909 + }, + { + "auxiliary_loss_clip": 0.01051773, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.01674998, + "balance_loss_mlp": 1.01568437, + "epoch": 0.7327220802645423, + "flos": 23326484572800.0, + "grad_norm": 1.735707726524004, + "language_loss": 0.74601215, + "learning_rate": 7.036007054761508e-07, + "loss": 0.76693749, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 12187, + "time_per_iteration": 2.3823039531707764 + }, + { + "auxiliary_loss_clip": 0.01055259, + "auxiliary_loss_mlp": 0.01046803, + "balance_loss_clip": 1.02077949, + "balance_loss_mlp": 1.01758313, + "epoch": 0.7327822035172102, + "flos": 23179674839040.0, + "grad_norm": 1.8825897214537683, + "language_loss": 0.90903044, + "learning_rate": 7.033041665033716e-07, + "loss": 0.93005109, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37695312, + "step": 12188, + "time_per_iteration": 2.393327474594116 + }, + { + "auxiliary_loss_clip": 0.01054172, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.01075792, + "balance_loss_mlp": 1.01679325, + "epoch": 0.7328423267698783, + "flos": 21065613926400.0, + "grad_norm": 1.9523384078448847, + "language_loss": 0.75984287, + "learning_rate": 7.030076767014284e-07, + "loss": 0.78074253, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 12189, + "time_per_iteration": 2.364394187927246 + }, + { + "auxiliary_loss_clip": 0.01052379, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.01014936, + "balance_loss_mlp": 1.01580441, + "epoch": 0.7329024500225462, + "flos": 21688499226240.0, + "grad_norm": 5.0639985530296885, + "language_loss": 0.83145559, + "learning_rate": 7.027112360815648e-07, + "loss": 0.8523047, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36523438, + "step": 12190, + "time_per_iteration": 2.418689012527466 + }, + { + "auxiliary_loss_clip": 0.01053529, + "auxiliary_loss_mlp": 0.01041506, + "balance_loss_clip": 1.01746178, + "balance_loss_mlp": 1.01660228, + "epoch": 0.7329625732752142, + "flos": 24163073504640.0, + "grad_norm": 1.719604316873844, + "language_loss": 0.73007613, + "learning_rate": 7.024148446550204e-07, + "loss": 0.75102645, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 12191, + "time_per_iteration": 2.455463409423828 + }, + { + "auxiliary_loss_clip": 0.01052999, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.01166511, + "balance_loss_mlp": 1.01664186, + "epoch": 0.7330226965278822, + "flos": 30076696903680.0, + "grad_norm": 2.183197278769759, + "language_loss": 0.6996612, + "learning_rate": 7.021185024330361e-07, + "loss": 0.72053707, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 12192, + "time_per_iteration": 2.443463087081909 + }, + { + "auxiliary_loss_clip": 0.01050877, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.01320708, + "balance_loss_mlp": 1.01564825, + "epoch": 0.7330828197805501, + "flos": 23367158173440.0, + "grad_norm": 1.737783957420689, + "language_loss": 0.74518692, + "learning_rate": 7.01822209426848e-07, + "loss": 0.7660467, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 12193, + "time_per_iteration": 2.379871368408203 + }, + { + "auxiliary_loss_clip": 0.01052805, + "auxiliary_loss_mlp": 0.01040914, + "balance_loss_clip": 1.01530766, + "balance_loss_mlp": 1.01601148, + "epoch": 0.7331429430332181, + "flos": 21031748040960.0, + "grad_norm": 1.7159705974222188, + "language_loss": 0.78532863, + "learning_rate": 7.015259656476911e-07, + "loss": 0.80626583, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 12194, + "time_per_iteration": 2.3885979652404785 + }, + { + "auxiliary_loss_clip": 0.01051723, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.012362, + "balance_loss_mlp": 1.01627731, + "epoch": 0.733203066285886, + "flos": 14647006488960.0, + "grad_norm": 1.6587890429035141, + "language_loss": 0.71204573, + "learning_rate": 7.012297711067998e-07, + "loss": 0.7329157, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 12195, + "time_per_iteration": 2.3389620780944824 + }, + { + "auxiliary_loss_clip": 0.01054518, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.01989269, + "balance_loss_mlp": 1.0173378, + "epoch": 0.7332631895385541, + "flos": 17164349049600.0, + "grad_norm": 1.8641467648867205, + "language_loss": 0.73723984, + "learning_rate": 7.009336258154057e-07, + "loss": 0.75821114, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.37109375, + "step": 12196, + "time_per_iteration": 2.391292095184326 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.01519287, + "balance_loss_mlp": 1.01700914, + "epoch": 0.733323312791222, + "flos": 28656883843200.0, + "grad_norm": 2.3533086309267235, + "language_loss": 0.7271263, + "learning_rate": 7.006375297847394e-07, + "loss": 0.7480334, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 12197, + "time_per_iteration": 2.405515193939209 + }, + { + "auxiliary_loss_clip": 0.01055349, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_clip": 1.01840997, + "balance_loss_mlp": 1.01646924, + "epoch": 0.73338343604389, + "flos": 16617469512960.0, + "grad_norm": 2.037183122961024, + "language_loss": 0.79669297, + "learning_rate": 7.003414830260282e-07, + "loss": 0.81769371, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 12198, + "time_per_iteration": 3.607362747192383 + }, + { + "auxiliary_loss_clip": 0.01052574, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01674032, + "balance_loss_mlp": 1.01630616, + "epoch": 0.7334435592965579, + "flos": 21141026196480.0, + "grad_norm": 1.884267483486003, + "language_loss": 0.75528157, + "learning_rate": 7.000454855504974e-07, + "loss": 0.776196, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36328125, + "step": 12199, + "time_per_iteration": 2.3580985069274902 + }, + { + "auxiliary_loss_clip": 0.01054562, + "auxiliary_loss_mlp": 0.01044532, + "balance_loss_clip": 1.01823473, + "balance_loss_mlp": 1.01743269, + "epoch": 0.7335036825492259, + "flos": 17124478410240.0, + "grad_norm": 2.5164756158998767, + "language_loss": 0.78080463, + "learning_rate": 6.997495373693729e-07, + "loss": 0.80179554, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37109375, + "step": 12200, + "time_per_iteration": 2.3273849487304688 + }, + { + "auxiliary_loss_clip": 0.01051373, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.01032412, + "balance_loss_mlp": 1.01621163, + "epoch": 0.7335638058018938, + "flos": 23730708827520.0, + "grad_norm": 1.556493829591751, + "language_loss": 0.62739575, + "learning_rate": 6.994536384938754e-07, + "loss": 0.64822847, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 12201, + "time_per_iteration": 3.8360352516174316 + }, + { + "auxiliary_loss_clip": 0.01052502, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.01073182, + "balance_loss_mlp": 1.01740646, + "epoch": 0.7336239290545619, + "flos": 34931858480640.0, + "grad_norm": 2.595453945809166, + "language_loss": 0.53314841, + "learning_rate": 6.991577889352264e-07, + "loss": 0.55400145, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 12202, + "time_per_iteration": 2.47965145111084 + }, + { + "auxiliary_loss_clip": 0.01051467, + "auxiliary_loss_mlp": 0.01034122, + "balance_loss_clip": 1.01174641, + "balance_loss_mlp": 1.01652765, + "epoch": 0.7336840523072298, + "flos": 21102063252480.0, + "grad_norm": 1.9934573211191406, + "language_loss": 0.6980474, + "learning_rate": 6.98861988704645e-07, + "loss": 0.7189033, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34960938, + "step": 12203, + "time_per_iteration": 3.8799729347229004 + }, + { + "auxiliary_loss_clip": 0.01054454, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.01986265, + "balance_loss_mlp": 1.01666427, + "epoch": 0.7337441755598978, + "flos": 24023280954240.0, + "grad_norm": 1.9727383820314361, + "language_loss": 0.67507905, + "learning_rate": 6.985662378133474e-07, + "loss": 0.69605422, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 12204, + "time_per_iteration": 2.376086711883545 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.01921582, + "balance_loss_mlp": 1.01751113, + "epoch": 0.7338042988125658, + "flos": 22710197520000.0, + "grad_norm": 1.955522457370784, + "language_loss": 0.78340423, + "learning_rate": 6.982705362725479e-07, + "loss": 0.80434042, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35546875, + "step": 12205, + "time_per_iteration": 2.3886218070983887 + }, + { + "auxiliary_loss_clip": 0.01051952, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.00984645, + "balance_loss_mlp": 1.01759326, + "epoch": 0.7338644220652337, + "flos": 21359931621120.0, + "grad_norm": 1.8829385843235025, + "language_loss": 0.80917513, + "learning_rate": 6.979748840934601e-07, + "loss": 0.82999361, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.34375, + "step": 12206, + "time_per_iteration": 2.3651058673858643 + }, + { + "auxiliary_loss_clip": 0.01051852, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.01250601, + "balance_loss_mlp": 1.01602471, + "epoch": 0.7339245453179017, + "flos": 30918906564480.0, + "grad_norm": 1.8834612871506102, + "language_loss": 0.71910298, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73996896, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 12207, + "time_per_iteration": 2.443502426147461 + }, + { + "auxiliary_loss_clip": 0.01008279, + "auxiliary_loss_mlp": 0.01004129, + "balance_loss_clip": 1.00162613, + "balance_loss_mlp": 1.00139499, + "epoch": 0.7339846685705697, + "flos": 67896535605120.0, + "grad_norm": 0.7836927948084487, + "language_loss": 0.54856026, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56868434, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.06933594, + "step": 12208, + "time_per_iteration": 3.1158335208892822 + }, + { + "auxiliary_loss_clip": 0.01050794, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.01441455, + "balance_loss_mlp": 1.01599085, + "epoch": 0.7340447918232377, + "flos": 22235658053760.0, + "grad_norm": 1.6814089464009954, + "language_loss": 0.81096482, + "learning_rate": 6.970882238385703e-07, + "loss": 0.8318274, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34765625, + "step": 12209, + "time_per_iteration": 2.3908092975616455 + }, + { + "auxiliary_loss_clip": 0.01048936, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.01307082, + "balance_loss_mlp": 1.01491308, + "epoch": 0.7341049150759056, + "flos": 23763771751680.0, + "grad_norm": 1.3918954254862488, + "language_loss": 0.79723406, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81805432, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.33984375, + "step": 12210, + "time_per_iteration": 2.3856420516967773 + }, + { + "auxiliary_loss_clip": 0.01049058, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.0136416, + "balance_loss_mlp": 1.01515973, + "epoch": 0.7341650383285736, + "flos": 17235641779200.0, + "grad_norm": 1.8960322664534088, + "language_loss": 0.77883339, + "learning_rate": 6.964973640160236e-07, + "loss": 0.79968619, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.33789062, + "step": 12211, + "time_per_iteration": 2.3717644214630127 + }, + { + "auxiliary_loss_clip": 0.01053206, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.01474667, + "balance_loss_mlp": 1.01695919, + "epoch": 0.7342251615812415, + "flos": 23402839449600.0, + "grad_norm": 2.3598813747762333, + "language_loss": 0.73243237, + "learning_rate": 6.962020082425748e-07, + "loss": 0.75334102, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 12212, + "time_per_iteration": 2.3778247833251953 + }, + { + "auxiliary_loss_clip": 0.0105293, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.01365888, + "balance_loss_mlp": 1.01724255, + "epoch": 0.7342852848339095, + "flos": 22746088264320.0, + "grad_norm": 1.6016104630095909, + "language_loss": 0.70209485, + "learning_rate": 6.959067019092766e-07, + "loss": 0.72298473, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 12213, + "time_per_iteration": 3.874692678451538 + }, + { + "auxiliary_loss_clip": 0.01008047, + "auxiliary_loss_mlp": 0.01005986, + "balance_loss_clip": 1.00361383, + "balance_loss_mlp": 1.00124168, + "epoch": 0.7343454080865774, + "flos": 53939376270720.0, + "grad_norm": 0.7343095905964789, + "language_loss": 0.54396403, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56410432, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.06835938, + "step": 12214, + "time_per_iteration": 2.9529306888580322 + }, + { + "auxiliary_loss_clip": 0.01053602, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.01282489, + "balance_loss_mlp": 1.01646411, + "epoch": 0.7344055313392455, + "flos": 12166043431680.0, + "grad_norm": 1.9971553792133507, + "language_loss": 0.71083963, + "learning_rate": 6.953162376079233e-07, + "loss": 0.73173338, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.37109375, + "step": 12215, + "time_per_iteration": 2.3388352394104004 + }, + { + "auxiliary_loss_clip": 0.0105, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.01701164, + "balance_loss_mlp": 1.01632762, + "epoch": 0.7344656545919134, + "flos": 18549109238400.0, + "grad_norm": 2.9585043399433752, + "language_loss": 0.73858917, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75945497, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.3359375, + "step": 12216, + "time_per_iteration": 2.403745651245117 + }, + { + "auxiliary_loss_clip": 0.01056149, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.0153079, + "balance_loss_mlp": 1.01737404, + "epoch": 0.7345257778445814, + "flos": 23660463438720.0, + "grad_norm": 1.7618868782017676, + "language_loss": 0.78903764, + "learning_rate": 6.947259712015236e-07, + "loss": 0.81002986, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38671875, + "step": 12217, + "time_per_iteration": 2.4626104831695557 + }, + { + "auxiliary_loss_clip": 0.01049347, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.01360869, + "balance_loss_mlp": 1.01526749, + "epoch": 0.7345859010972494, + "flos": 13807799205120.0, + "grad_norm": 2.0962944343401486, + "language_loss": 0.79129952, + "learning_rate": 6.94430912236911e-07, + "loss": 0.81212538, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.34179688, + "step": 12218, + "time_per_iteration": 2.399146318435669 + }, + { + "auxiliary_loss_clip": 0.01050191, + "auxiliary_loss_mlp": 0.01039939, + "balance_loss_clip": 1.01690805, + "balance_loss_mlp": 1.01588631, + "epoch": 0.7346460243499173, + "flos": 22271653532160.0, + "grad_norm": 1.978918353028586, + "language_loss": 0.73620117, + "learning_rate": 6.941359027796092e-07, + "loss": 0.75710249, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34375, + "step": 12219, + "time_per_iteration": 2.3605458736419678 + }, + { + "auxiliary_loss_clip": 0.01049852, + "auxiliary_loss_mlp": 0.01037943, + "balance_loss_clip": 1.01743901, + "balance_loss_mlp": 1.01555943, + "epoch": 0.7347061476025853, + "flos": 23254214325120.0, + "grad_norm": 1.7317830754446983, + "language_loss": 0.76024616, + "learning_rate": 6.938409428408061e-07, + "loss": 0.78112411, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 12220, + "time_per_iteration": 2.407951593399048 + }, + { + "auxiliary_loss_clip": 0.01053983, + "auxiliary_loss_mlp": 0.01040517, + "balance_loss_clip": 1.01653183, + "balance_loss_mlp": 1.01720893, + "epoch": 0.7347662708552533, + "flos": 15266679943680.0, + "grad_norm": 1.6072022240743673, + "language_loss": 0.67095149, + "learning_rate": 6.93546032431684e-07, + "loss": 0.6918965, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 12221, + "time_per_iteration": 2.374692440032959 + }, + { + "auxiliary_loss_clip": 0.0105149, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.01007903, + "balance_loss_mlp": 1.01618361, + "epoch": 0.7348263941079213, + "flos": 24858927279360.0, + "grad_norm": 1.7441848018998098, + "language_loss": 0.70759952, + "learning_rate": 6.932511715634273e-07, + "loss": 0.72842669, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.35351562, + "step": 12222, + "time_per_iteration": 2.426853895187378 + }, + { + "auxiliary_loss_clip": 0.01050748, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.0148294, + "balance_loss_mlp": 1.01525319, + "epoch": 0.7348865173605892, + "flos": 24350975775360.0, + "grad_norm": 1.6314777475129116, + "language_loss": 0.67379713, + "learning_rate": 6.92956360247217e-07, + "loss": 0.69465697, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.35546875, + "step": 12223, + "time_per_iteration": 2.379265308380127 + }, + { + "auxiliary_loss_clip": 0.01052099, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.01171756, + "balance_loss_mlp": 1.0161351, + "epoch": 0.7349466406132572, + "flos": 20003765702400.0, + "grad_norm": 1.837513783058394, + "language_loss": 0.73646593, + "learning_rate": 6.926615984942332e-07, + "loss": 0.75732368, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.359375, + "step": 12224, + "time_per_iteration": 2.392704725265503 + }, + { + "auxiliary_loss_clip": 0.01053925, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.01722288, + "balance_loss_mlp": 1.01699567, + "epoch": 0.7350067638659251, + "flos": 29823785948160.0, + "grad_norm": 1.6863028748354423, + "language_loss": 0.73683965, + "learning_rate": 6.92366886315652e-07, + "loss": 0.75778681, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36914062, + "step": 12225, + "time_per_iteration": 2.436452627182007 + }, + { + "auxiliary_loss_clip": 0.01054899, + "auxiliary_loss_mlp": 0.01044066, + "balance_loss_clip": 1.01902032, + "balance_loss_mlp": 1.01638544, + "epoch": 0.7350668871185931, + "flos": 21865229861760.0, + "grad_norm": 1.7156839668408816, + "language_loss": 0.77315736, + "learning_rate": 6.920722237226501e-07, + "loss": 0.79414696, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38671875, + "step": 12226, + "time_per_iteration": 2.395659923553467 + }, + { + "auxiliary_loss_clip": 0.01051302, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.01331842, + "balance_loss_mlp": 1.01654935, + "epoch": 0.735127010371261, + "flos": 22564993708800.0, + "grad_norm": 5.93572330296042, + "language_loss": 0.67975837, + "learning_rate": 6.917776107264008e-07, + "loss": 0.70062894, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 12227, + "time_per_iteration": 2.3797426223754883 + }, + { + "auxiliary_loss_clip": 0.01051227, + "auxiliary_loss_mlp": 0.0104036, + "balance_loss_clip": 1.01684022, + "balance_loss_mlp": 1.0151732, + "epoch": 0.7351871336239291, + "flos": 25883174102400.0, + "grad_norm": 1.4133334840830094, + "language_loss": 0.64713699, + "learning_rate": 6.914830473380749e-07, + "loss": 0.66805279, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36132812, + "step": 12228, + "time_per_iteration": 2.4294989109039307 + }, + { + "auxiliary_loss_clip": 0.0105198, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.01610911, + "balance_loss_mlp": 1.01611853, + "epoch": 0.735247256876597, + "flos": 17931181351680.0, + "grad_norm": 2.0640722392032664, + "language_loss": 0.64144838, + "learning_rate": 6.911885335688427e-07, + "loss": 0.66233844, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.359375, + "step": 12229, + "time_per_iteration": 2.387708902359009 + }, + { + "auxiliary_loss_clip": 0.01053382, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.0171535, + "balance_loss_mlp": 1.01610565, + "epoch": 0.735307380129265, + "flos": 28873938965760.0, + "grad_norm": 3.9099454884027876, + "language_loss": 0.74756289, + "learning_rate": 6.908940694298726e-07, + "loss": 0.76849449, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.37109375, + "step": 12230, + "time_per_iteration": 2.401040554046631 + }, + { + "auxiliary_loss_clip": 0.01054022, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.01556957, + "balance_loss_mlp": 1.01680923, + "epoch": 0.7353675033819329, + "flos": 13624819436160.0, + "grad_norm": 1.9864673203866536, + "language_loss": 0.74145848, + "learning_rate": 6.90599654932332e-07, + "loss": 0.76240057, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 12231, + "time_per_iteration": 2.387418031692505 + }, + { + "auxiliary_loss_clip": 0.01054153, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.01382339, + "balance_loss_mlp": 1.01661325, + "epoch": 0.7354276266346009, + "flos": 19462087958400.0, + "grad_norm": 2.8744683338816115, + "language_loss": 0.64734149, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66828048, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 12232, + "time_per_iteration": 2.3440961837768555 + }, + { + "auxiliary_loss_clip": 0.01053717, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.01764131, + "balance_loss_mlp": 1.01691973, + "epoch": 0.735487749887269, + "flos": 15771140311680.0, + "grad_norm": 1.8134867098816378, + "language_loss": 0.76310432, + "learning_rate": 6.900109749061874e-07, + "loss": 0.78405035, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 12233, + "time_per_iteration": 2.3689301013946533 + }, + { + "auxiliary_loss_clip": 0.01052566, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.01519465, + "balance_loss_mlp": 1.01618838, + "epoch": 0.7355478731399369, + "flos": 18259644222720.0, + "grad_norm": 1.5434505874737896, + "language_loss": 0.74848628, + "learning_rate": 6.897167093999079e-07, + "loss": 0.7694034, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 12234, + "time_per_iteration": 2.367719888687134 + }, + { + "auxiliary_loss_clip": 0.01053315, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.01450157, + "balance_loss_mlp": 1.01668847, + "epoch": 0.7356079963926049, + "flos": 26540832983040.0, + "grad_norm": 2.0209935084659216, + "language_loss": 0.60912573, + "learning_rate": 6.894224935797017e-07, + "loss": 0.63002944, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3671875, + "step": 12235, + "time_per_iteration": 2.4186418056488037 + }, + { + "auxiliary_loss_clip": 0.01052532, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.01424336, + "balance_loss_mlp": 1.01699376, + "epoch": 0.7356681196452728, + "flos": 10777896840960.0, + "grad_norm": 2.1215473747761355, + "language_loss": 0.87502873, + "learning_rate": 6.891283274567259e-07, + "loss": 0.89592397, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 12236, + "time_per_iteration": 2.330629348754883 + }, + { + "auxiliary_loss_clip": 0.0105279, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.01107192, + "balance_loss_mlp": 1.01651514, + "epoch": 0.7357282428979408, + "flos": 19717687088640.0, + "grad_norm": 1.8230683079055678, + "language_loss": 0.70427561, + "learning_rate": 6.888342110421364e-07, + "loss": 0.72514045, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 12237, + "time_per_iteration": 2.3682234287261963 + }, + { + "auxiliary_loss_clip": 0.01051647, + "auxiliary_loss_mlp": 0.01034965, + "balance_loss_clip": 1.01370955, + "balance_loss_mlp": 1.01554847, + "epoch": 0.7357883661506087, + "flos": 19462995653760.0, + "grad_norm": 1.868610878331933, + "language_loss": 0.72937906, + "learning_rate": 6.885401443470839e-07, + "loss": 0.75024509, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.36132812, + "step": 12238, + "time_per_iteration": 3.5560007095336914 + }, + { + "auxiliary_loss_clip": 0.01053855, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.0111165, + "balance_loss_mlp": 1.01561105, + "epoch": 0.7358484894032767, + "flos": 27121857696000.0, + "grad_norm": 1.7954210816532172, + "language_loss": 0.73773438, + "learning_rate": 6.882461273827205e-07, + "loss": 0.758623, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3828125, + "step": 12239, + "time_per_iteration": 2.403902769088745 + }, + { + "auxiliary_loss_clip": 0.01050168, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.01551008, + "balance_loss_mlp": 1.01522398, + "epoch": 0.7359086126559446, + "flos": 24501032265600.0, + "grad_norm": 1.4285044965877596, + "language_loss": 0.79952258, + "learning_rate": 6.879521601601954e-07, + "loss": 0.82040334, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 12240, + "time_per_iteration": 3.881805419921875 + }, + { + "auxiliary_loss_clip": 0.01052274, + "auxiliary_loss_mlp": 0.01038891, + "balance_loss_clip": 1.01614594, + "balance_loss_mlp": 1.01683331, + "epoch": 0.7359687359086127, + "flos": 23330150265600.0, + "grad_norm": 1.7159160274811243, + "language_loss": 0.84396267, + "learning_rate": 6.876582426906565e-07, + "loss": 0.8648743, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 12241, + "time_per_iteration": 2.384986639022827 + }, + { + "auxiliary_loss_clip": 0.01051256, + "auxiliary_loss_mlp": 0.01037604, + "balance_loss_clip": 1.01527631, + "balance_loss_mlp": 1.01607335, + "epoch": 0.7360288591612806, + "flos": 20192366200320.0, + "grad_norm": 1.988105886287044, + "language_loss": 0.79796016, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81884873, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 12242, + "time_per_iteration": 3.777843952178955 + }, + { + "auxiliary_loss_clip": 0.01051931, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.01481152, + "balance_loss_mlp": 1.01605844, + "epoch": 0.7360889824139486, + "flos": 24971626748160.0, + "grad_norm": 1.7524527978224644, + "language_loss": 0.80462056, + "learning_rate": 6.870705570551145e-07, + "loss": 0.82550639, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 12243, + "time_per_iteration": 2.391782760620117 + }, + { + "auxiliary_loss_clip": 0.01053025, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.01653552, + "balance_loss_mlp": 1.01597738, + "epoch": 0.7361491056666165, + "flos": 15011429927040.0, + "grad_norm": 2.7446895340229305, + "language_loss": 0.76550829, + "learning_rate": 6.867767889113969e-07, + "loss": 0.78644669, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36914062, + "step": 12244, + "time_per_iteration": 2.3524110317230225 + }, + { + "auxiliary_loss_clip": 0.01052638, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.01623392, + "balance_loss_mlp": 1.0157311, + "epoch": 0.7362092289192845, + "flos": 22929277501440.0, + "grad_norm": 1.7705886594120857, + "language_loss": 0.70750082, + "learning_rate": 6.864830705652347e-07, + "loss": 0.72842741, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 12245, + "time_per_iteration": 2.376838445663452 + }, + { + "auxiliary_loss_clip": 0.01049976, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.01931334, + "balance_loss_mlp": 1.01540983, + "epoch": 0.7362693521719526, + "flos": 20701679247360.0, + "grad_norm": 1.8619028947142098, + "language_loss": 0.75112683, + "learning_rate": 6.861894020277658e-07, + "loss": 0.77205539, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34570312, + "step": 12246, + "time_per_iteration": 2.406189441680908 + }, + { + "auxiliary_loss_clip": 0.01049777, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.01269364, + "balance_loss_mlp": 1.01559448, + "epoch": 0.7363294754246205, + "flos": 13110653710080.0, + "grad_norm": 2.3335912960277003, + "language_loss": 0.74436235, + "learning_rate": 6.858957833101266e-07, + "loss": 0.76519716, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33984375, + "step": 12247, + "time_per_iteration": 2.4332563877105713 + }, + { + "auxiliary_loss_clip": 0.01050196, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.01190531, + "balance_loss_mlp": 1.01667261, + "epoch": 0.7363895986772885, + "flos": 14026564984320.0, + "grad_norm": 1.6815085120733877, + "language_loss": 0.75247186, + "learning_rate": 6.856022144234526e-07, + "loss": 0.77331269, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3359375, + "step": 12248, + "time_per_iteration": 2.3795406818389893 + }, + { + "auxiliary_loss_clip": 0.01051232, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.01335323, + "balance_loss_mlp": 1.01483142, + "epoch": 0.7364497219299564, + "flos": 19718943897600.0, + "grad_norm": 1.806204533516511, + "language_loss": 0.74700296, + "learning_rate": 6.853086953788727e-07, + "loss": 0.76789248, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 12249, + "time_per_iteration": 2.352381706237793 + }, + { + "auxiliary_loss_clip": 0.01053517, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.01016498, + "balance_loss_mlp": 1.01706958, + "epoch": 0.7365098451826244, + "flos": 21360315646080.0, + "grad_norm": 2.0574157710162515, + "language_loss": 0.78190631, + "learning_rate": 6.850152261875189e-07, + "loss": 0.80278784, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 12250, + "time_per_iteration": 2.3913090229034424 + }, + { + "auxiliary_loss_clip": 0.01055005, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.01478374, + "balance_loss_mlp": 1.01804972, + "epoch": 0.7365699684352923, + "flos": 23367088350720.0, + "grad_norm": 1.6139780580453422, + "language_loss": 0.72257513, + "learning_rate": 6.8472180686052e-07, + "loss": 0.74351215, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36914062, + "step": 12251, + "time_per_iteration": 2.381366729736328 + }, + { + "auxiliary_loss_clip": 0.0105063, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.01490259, + "balance_loss_mlp": 1.01600182, + "epoch": 0.7366300916879603, + "flos": 59522758185600.0, + "grad_norm": 1.4654645283759766, + "language_loss": 0.66728556, + "learning_rate": 6.844284374090015e-07, + "loss": 0.68815869, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34570312, + "step": 12252, + "time_per_iteration": 2.7115306854248047 + }, + { + "auxiliary_loss_clip": 0.01056293, + "auxiliary_loss_mlp": 0.01037484, + "balance_loss_clip": 1.01433349, + "balance_loss_mlp": 1.01843596, + "epoch": 0.7366902149406283, + "flos": 20922085860480.0, + "grad_norm": 1.5278282315981666, + "language_loss": 0.80125546, + "learning_rate": 6.841351178440884e-07, + "loss": 0.82219326, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37890625, + "step": 12253, + "time_per_iteration": 2.384110450744629 + }, + { + "auxiliary_loss_clip": 0.01049083, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.01014447, + "balance_loss_mlp": 1.01531208, + "epoch": 0.7367503381932963, + "flos": 17347189173120.0, + "grad_norm": 2.0454598331339553, + "language_loss": 0.77916348, + "learning_rate": 6.83841848176905e-07, + "loss": 0.79996514, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33789062, + "step": 12254, + "time_per_iteration": 3.831697702407837 + }, + { + "auxiliary_loss_clip": 0.01052966, + "auxiliary_loss_mlp": 0.01038148, + "balance_loss_clip": 1.01555777, + "balance_loss_mlp": 1.01651001, + "epoch": 0.7368104614459642, + "flos": 17820367096320.0, + "grad_norm": 2.4543238737404782, + "language_loss": 0.7025193, + "learning_rate": 6.835486284185692e-07, + "loss": 0.72343045, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 12255, + "time_per_iteration": 2.35434889793396 + }, + { + "auxiliary_loss_clip": 0.01051832, + "auxiliary_loss_mlp": 0.01036139, + "balance_loss_clip": 1.01298845, + "balance_loss_mlp": 1.01585078, + "epoch": 0.7368705846986322, + "flos": 24605003894400.0, + "grad_norm": 2.0955857525256967, + "language_loss": 0.76241112, + "learning_rate": 6.832554585802012e-07, + "loss": 0.78329074, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 12256, + "time_per_iteration": 2.3822531700134277 + }, + { + "auxiliary_loss_clip": 0.01052375, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.01258755, + "balance_loss_mlp": 1.01661706, + "epoch": 0.7369307079513001, + "flos": 34968726743040.0, + "grad_norm": 1.766669201740906, + "language_loss": 0.74661982, + "learning_rate": 6.829623386729182e-07, + "loss": 0.76748955, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 12257, + "time_per_iteration": 2.4921045303344727 + }, + { + "auxiliary_loss_clip": 0.01050517, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.01669943, + "balance_loss_mlp": 1.01544726, + "epoch": 0.7369908312039681, + "flos": 21213540823680.0, + "grad_norm": 1.4546555039469709, + "language_loss": 0.78687739, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80776489, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 12258, + "time_per_iteration": 2.3701868057250977 + }, + { + "auxiliary_loss_clip": 0.01053172, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.01633239, + "balance_loss_mlp": 1.0168792, + "epoch": 0.7370509544566362, + "flos": 23622512924160.0, + "grad_norm": 1.4677573794601317, + "language_loss": 0.66994655, + "learning_rate": 6.823762486960674e-07, + "loss": 0.6908704, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 12259, + "time_per_iteration": 2.3873655796051025 + }, + { + "auxiliary_loss_clip": 0.01053431, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.01388669, + "balance_loss_mlp": 1.01676416, + "epoch": 0.7371110777093041, + "flos": 24826527671040.0, + "grad_norm": 1.760069963994392, + "language_loss": 0.742423, + "learning_rate": 6.820832786487225e-07, + "loss": 0.76332098, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3671875, + "step": 12260, + "time_per_iteration": 2.4112956523895264 + }, + { + "auxiliary_loss_clip": 0.01053208, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.01399183, + "balance_loss_mlp": 1.01635218, + "epoch": 0.7371712009619721, + "flos": 23148357482880.0, + "grad_norm": 1.5989140688923762, + "language_loss": 0.74011093, + "learning_rate": 6.817903585769125e-07, + "loss": 0.76101422, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36914062, + "step": 12261, + "time_per_iteration": 2.3695507049560547 + }, + { + "auxiliary_loss_clip": 0.01054639, + "auxiliary_loss_mlp": 0.01041121, + "balance_loss_clip": 1.0143106, + "balance_loss_mlp": 1.01663256, + "epoch": 0.73723132421464, + "flos": 23111768511360.0, + "grad_norm": 2.2526523443432573, + "language_loss": 0.69152111, + "learning_rate": 6.814974884917438e-07, + "loss": 0.7124787, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38085938, + "step": 12262, + "time_per_iteration": 2.3942551612854004 + }, + { + "auxiliary_loss_clip": 0.01052737, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.00991678, + "balance_loss_mlp": 1.01557398, + "epoch": 0.737291447467308, + "flos": 19272544853760.0, + "grad_norm": 2.0624457893724824, + "language_loss": 0.9003275, + "learning_rate": 6.81204668404322e-07, + "loss": 0.92120147, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 12263, + "time_per_iteration": 2.3306210041046143 + }, + { + "auxiliary_loss_clip": 0.01047675, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.01203668, + "balance_loss_mlp": 1.01466227, + "epoch": 0.7373515707199759, + "flos": 25117109850240.0, + "grad_norm": 1.5315135299265008, + "language_loss": 0.68197447, + "learning_rate": 6.809118983257522e-07, + "loss": 0.70276982, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.33007812, + "step": 12264, + "time_per_iteration": 2.428191900253296 + }, + { + "auxiliary_loss_clip": 0.01050722, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.01296973, + "balance_loss_mlp": 1.01593041, + "epoch": 0.737411693972644, + "flos": 32407324179840.0, + "grad_norm": 1.729231402033943, + "language_loss": 0.81209403, + "learning_rate": 6.806191782671356e-07, + "loss": 0.83295065, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 12265, + "time_per_iteration": 2.460345983505249 + }, + { + "auxiliary_loss_clip": 0.01055367, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.0181179, + "balance_loss_mlp": 1.01718533, + "epoch": 0.7374718172253119, + "flos": 24314666094720.0, + "grad_norm": 1.5538462428975695, + "language_loss": 0.74857712, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76955634, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3828125, + "step": 12266, + "time_per_iteration": 2.3893086910247803 + }, + { + "auxiliary_loss_clip": 0.0105254, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_clip": 1.0200851, + "balance_loss_mlp": 1.01621497, + "epoch": 0.7375319404779799, + "flos": 27155060265600.0, + "grad_norm": 1.5790958869998546, + "language_loss": 0.74175572, + "learning_rate": 6.800338882541576e-07, + "loss": 0.76272571, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 12267, + "time_per_iteration": 2.3892602920532227 + }, + { + "auxiliary_loss_clip": 0.01050178, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.01549363, + "balance_loss_mlp": 1.01591337, + "epoch": 0.7375920637306478, + "flos": 18879003475200.0, + "grad_norm": 2.0013710798930773, + "language_loss": 0.84719586, + "learning_rate": 6.797413183219923e-07, + "loss": 0.86806607, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 12268, + "time_per_iteration": 2.363011598587036 + }, + { + "auxiliary_loss_clip": 0.01051609, + "auxiliary_loss_mlp": 0.01041273, + "balance_loss_clip": 1.01787257, + "balance_loss_mlp": 1.01604676, + "epoch": 0.7376521869833158, + "flos": 15668844428160.0, + "grad_norm": 1.7160116861224517, + "language_loss": 0.7399739, + "learning_rate": 6.794487984541677e-07, + "loss": 0.76090276, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 12269, + "time_per_iteration": 2.3553900718688965 + }, + { + "auxiliary_loss_clip": 0.01053231, + "auxiliary_loss_mlp": 0.01039189, + "balance_loss_clip": 1.01556206, + "balance_loss_mlp": 1.01619339, + "epoch": 0.7377123102359837, + "flos": 36970611857280.0, + "grad_norm": 2.042598676428294, + "language_loss": 0.71379024, + "learning_rate": 6.791563286617776e-07, + "loss": 0.73471445, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 12270, + "time_per_iteration": 2.556297540664673 + }, + { + "auxiliary_loss_clip": 0.01049927, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.01343799, + "balance_loss_mlp": 1.01527262, + "epoch": 0.7377724334886517, + "flos": 24495202068480.0, + "grad_norm": 1.5697328621948805, + "language_loss": 0.69989598, + "learning_rate": 6.788639089559119e-07, + "loss": 0.72073269, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34570312, + "step": 12271, + "time_per_iteration": 2.384168863296509 + }, + { + "auxiliary_loss_clip": 0.0105335, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.00871468, + "balance_loss_mlp": 1.01576352, + "epoch": 0.7378325567413198, + "flos": 24388856467200.0, + "grad_norm": 2.2496425068828554, + "language_loss": 0.69117451, + "learning_rate": 6.785715393476586e-07, + "loss": 0.71204025, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37695312, + "step": 12272, + "time_per_iteration": 2.4765048027038574 + }, + { + "auxiliary_loss_clip": 0.01051074, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.01309109, + "balance_loss_mlp": 1.01620948, + "epoch": 0.7378926799939877, + "flos": 17415549348480.0, + "grad_norm": 1.736047022973496, + "language_loss": 0.79134107, + "learning_rate": 6.782792198481049e-07, + "loss": 0.81219375, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34765625, + "step": 12273, + "time_per_iteration": 2.377392530441284 + }, + { + "auxiliary_loss_clip": 0.01050807, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.01111543, + "balance_loss_mlp": 1.01529789, + "epoch": 0.7379528032466557, + "flos": 18473347854720.0, + "grad_norm": 2.7021599139941097, + "language_loss": 0.83717191, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85801619, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 12274, + "time_per_iteration": 2.4021050930023193 + }, + { + "auxiliary_loss_clip": 0.01055698, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.01118815, + "balance_loss_mlp": 1.0176034, + "epoch": 0.7380129264993236, + "flos": 17821030412160.0, + "grad_norm": 1.841075170255253, + "language_loss": 0.7521897, + "learning_rate": 6.776947312194341e-07, + "loss": 0.77312446, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38085938, + "step": 12275, + "time_per_iteration": 2.3512771129608154 + }, + { + "auxiliary_loss_clip": 0.01054942, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.01511002, + "balance_loss_mlp": 1.01744485, + "epoch": 0.7380730497519916, + "flos": 22996415779200.0, + "grad_norm": 1.8224260122589107, + "language_loss": 0.74575394, + "learning_rate": 6.774025621124813e-07, + "loss": 0.7666986, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 12276, + "time_per_iteration": 2.411278486251831 + }, + { + "auxiliary_loss_clip": 0.01052797, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.01277852, + "balance_loss_mlp": 1.01534688, + "epoch": 0.7381331730046595, + "flos": 20265229941120.0, + "grad_norm": 1.9828965914458558, + "language_loss": 0.7902956, + "learning_rate": 6.771104431585551e-07, + "loss": 0.81120694, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12277, + "time_per_iteration": 3.594449043273926 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.01592648, + "balance_loss_mlp": 1.01627743, + "epoch": 0.7381932962573275, + "flos": 19753542921600.0, + "grad_norm": 1.8253454143029726, + "language_loss": 0.80283105, + "learning_rate": 6.768183743687338e-07, + "loss": 0.82374185, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.34179688, + "step": 12278, + "time_per_iteration": 2.368837833404541 + }, + { + "auxiliary_loss_clip": 0.01051177, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.01253223, + "balance_loss_mlp": 1.01540327, + "epoch": 0.7382534195099955, + "flos": 17304525624960.0, + "grad_norm": 2.3104646252812366, + "language_loss": 0.72945493, + "learning_rate": 6.765263557540921e-07, + "loss": 0.75032258, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 12279, + "time_per_iteration": 2.3451077938079834 + }, + { + "auxiliary_loss_clip": 0.01052557, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.00994825, + "balance_loss_mlp": 1.01505566, + "epoch": 0.7383135427626635, + "flos": 18696372819840.0, + "grad_norm": 2.0846026821050065, + "language_loss": 0.8711642, + "learning_rate": 6.762343873257034e-07, + "loss": 0.89204174, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 12280, + "time_per_iteration": 3.6808841228485107 + }, + { + "auxiliary_loss_clip": 0.01053169, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.00904989, + "balance_loss_mlp": 1.01703107, + "epoch": 0.7383736660153314, + "flos": 20880399830400.0, + "grad_norm": 1.841172642786682, + "language_loss": 0.73714989, + "learning_rate": 6.759424690946408e-07, + "loss": 0.75800288, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 12281, + "time_per_iteration": 3.769367218017578 + }, + { + "auxiliary_loss_clip": 0.01052602, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.0145911, + "balance_loss_mlp": 1.01557302, + "epoch": 0.7384337892679994, + "flos": 20662297367040.0, + "grad_norm": 2.307664295285107, + "language_loss": 0.61993289, + "learning_rate": 6.756506010719711e-07, + "loss": 0.64084238, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 12282, + "time_per_iteration": 2.3468971252441406 + }, + { + "auxiliary_loss_clip": 0.01052085, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.01375651, + "balance_loss_mlp": 1.01537371, + "epoch": 0.7384939125206673, + "flos": 29168326483200.0, + "grad_norm": 1.7642854426265835, + "language_loss": 0.68837154, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70927763, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 12283, + "time_per_iteration": 2.440141439437866 + }, + { + "auxiliary_loss_clip": 0.01052263, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.01198626, + "balance_loss_mlp": 1.0161078, + "epoch": 0.7385540357733353, + "flos": 36311556522240.0, + "grad_norm": 1.8412387816742841, + "language_loss": 0.77219319, + "learning_rate": 6.750670156960832e-07, + "loss": 0.79305673, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36132812, + "step": 12284, + "time_per_iteration": 2.476750612258911 + }, + { + "auxiliary_loss_clip": 0.01052135, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.01284885, + "balance_loss_mlp": 1.01494431, + "epoch": 0.7386141590260034, + "flos": 20301574533120.0, + "grad_norm": 1.789793724573719, + "language_loss": 0.70329249, + "learning_rate": 6.747752983649954e-07, + "loss": 0.72420299, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37109375, + "step": 12285, + "time_per_iteration": 2.3714308738708496 + }, + { + "auxiliary_loss_clip": 0.01055063, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.01436806, + "balance_loss_mlp": 1.01627183, + "epoch": 0.7386742822786713, + "flos": 25482615540480.0, + "grad_norm": 2.0140333893314932, + "language_loss": 0.81012303, + "learning_rate": 6.744836312865602e-07, + "loss": 0.83108509, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38867188, + "step": 12286, + "time_per_iteration": 2.388735771179199 + }, + { + "auxiliary_loss_clip": 0.01049212, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.01242828, + "balance_loss_mlp": 1.01443684, + "epoch": 0.7387344055313393, + "flos": 13771105499520.0, + "grad_norm": 15.410354649919997, + "language_loss": 0.66510284, + "learning_rate": 6.741920144718396e-07, + "loss": 0.68593979, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 12287, + "time_per_iteration": 2.3568530082702637 + }, + { + "auxiliary_loss_clip": 0.01050315, + "auxiliary_loss_mlp": 0.01037401, + "balance_loss_clip": 1.01549077, + "balance_loss_mlp": 1.0150317, + "epoch": 0.7387945287840072, + "flos": 27853951328640.0, + "grad_norm": 1.87496616039231, + "language_loss": 0.77523607, + "learning_rate": 6.739004479318903e-07, + "loss": 0.79611325, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 12288, + "time_per_iteration": 2.404808521270752 + }, + { + "auxiliary_loss_clip": 0.01054428, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.01373982, + "balance_loss_mlp": 1.01670194, + "epoch": 0.7388546520366752, + "flos": 44231463866880.0, + "grad_norm": 2.1319256775021898, + "language_loss": 0.59171951, + "learning_rate": 6.736089316777684e-07, + "loss": 0.61266512, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37695312, + "step": 12289, + "time_per_iteration": 2.6246416568756104 + }, + { + "auxiliary_loss_clip": 0.01008483, + "auxiliary_loss_mlp": 0.01006017, + "balance_loss_clip": 1.00348938, + "balance_loss_mlp": 1.00160456, + "epoch": 0.7389147752893431, + "flos": 70677681465600.0, + "grad_norm": 0.6540303527809055, + "language_loss": 0.49382591, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51397085, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.06884766, + "step": 12290, + "time_per_iteration": 3.1272263526916504 + }, + { + "auxiliary_loss_clip": 0.01055362, + "auxiliary_loss_mlp": 0.01043928, + "balance_loss_clip": 1.01800036, + "balance_loss_mlp": 1.01712, + "epoch": 0.7389748985420111, + "flos": 25993778889600.0, + "grad_norm": 1.99510775178788, + "language_loss": 0.69124138, + "learning_rate": 6.730260500712237e-07, + "loss": 0.71223438, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 12291, + "time_per_iteration": 2.4210011959075928 + }, + { + "auxiliary_loss_clip": 0.0100797, + "auxiliary_loss_mlp": 0.01002561, + "balance_loss_clip": 1.00012887, + "balance_loss_mlp": 1.00096941, + "epoch": 0.7390350217946791, + "flos": 54401033445120.0, + "grad_norm": 1.001495113853988, + "language_loss": 0.61013007, + "learning_rate": 6.727346847409052e-07, + "loss": 0.63023537, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.0703125, + "step": 12292, + "time_per_iteration": 2.676886558532715 + }, + { + "auxiliary_loss_clip": 0.01054409, + "auxiliary_loss_mlp": 0.01044939, + "balance_loss_clip": 1.02224183, + "balance_loss_mlp": 1.01779127, + "epoch": 0.7390951450473471, + "flos": 32195610495360.0, + "grad_norm": 1.7224096610705013, + "language_loss": 0.68018168, + "learning_rate": 6.724433697406191e-07, + "loss": 0.70117509, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 12293, + "time_per_iteration": 3.996119260787964 + }, + { + "auxiliary_loss_clip": 0.010519, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.01261401, + "balance_loss_mlp": 1.01551878, + "epoch": 0.739155268300015, + "flos": 16683490627200.0, + "grad_norm": 1.8591678506051779, + "language_loss": 0.83884001, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85971975, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 12294, + "time_per_iteration": 2.359832286834717 + }, + { + "auxiliary_loss_clip": 0.01050716, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.01296997, + "balance_loss_mlp": 1.01565206, + "epoch": 0.739215391552683, + "flos": 31648416756480.0, + "grad_norm": 1.5909578658695285, + "language_loss": 0.73809862, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75897318, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3515625, + "step": 12295, + "time_per_iteration": 2.4244799613952637 + }, + { + "auxiliary_loss_clip": 0.01049327, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.0200386, + "balance_loss_mlp": 1.01534653, + "epoch": 0.7392755148053509, + "flos": 29717161056000.0, + "grad_norm": 1.7256630041040149, + "language_loss": 0.79541993, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81634223, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.33984375, + "step": 12296, + "time_per_iteration": 2.4644765853881836 + }, + { + "auxiliary_loss_clip": 0.01052146, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.0137732, + "balance_loss_mlp": 1.0161767, + "epoch": 0.7393356380580189, + "flos": 37048956681600.0, + "grad_norm": 1.912791715558873, + "language_loss": 0.68081999, + "learning_rate": 6.712786132607182e-07, + "loss": 0.70172858, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.359375, + "step": 12297, + "time_per_iteration": 2.49284291267395 + }, + { + "auxiliary_loss_clip": 0.01053755, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.02062821, + "balance_loss_mlp": 1.01684117, + "epoch": 0.739395761310687, + "flos": 19718594784000.0, + "grad_norm": 2.3157523829520894, + "language_loss": 0.70384437, + "learning_rate": 6.709875500762645e-07, + "loss": 0.72483033, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 12298, + "time_per_iteration": 2.373307704925537 + }, + { + "auxiliary_loss_clip": 0.01053234, + "auxiliary_loss_mlp": 0.01044694, + "balance_loss_clip": 1.01967192, + "balance_loss_mlp": 1.01562095, + "epoch": 0.7394558845633549, + "flos": 11800712298240.0, + "grad_norm": 2.2957259574104576, + "language_loss": 0.75430846, + "learning_rate": 6.706965372880946e-07, + "loss": 0.77528769, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 12299, + "time_per_iteration": 2.3240997791290283 + }, + { + "auxiliary_loss_clip": 0.01008379, + "auxiliary_loss_mlp": 0.0100315, + "balance_loss_clip": 1.00063455, + "balance_loss_mlp": 1.00135958, + "epoch": 0.7395160078160229, + "flos": 66192668789760.0, + "grad_norm": 0.7317688276372185, + "language_loss": 0.60955781, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62967312, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.0703125, + "step": 12300, + "time_per_iteration": 3.0516090393066406 + }, + { + "auxiliary_loss_clip": 0.01051528, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.0132668, + "balance_loss_mlp": 1.01625812, + "epoch": 0.7395761310686908, + "flos": 21248698429440.0, + "grad_norm": 1.8174176058332792, + "language_loss": 0.81591374, + "learning_rate": 6.7011466294475e-07, + "loss": 0.83678448, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 12301, + "time_per_iteration": 2.3594186305999756 + }, + { + "auxiliary_loss_clip": 0.0105123, + "auxiliary_loss_mlp": 0.01038521, + "balance_loss_clip": 1.01568079, + "balance_loss_mlp": 1.0153507, + "epoch": 0.7396362543213588, + "flos": 25954187541120.0, + "grad_norm": 1.5192143132850227, + "language_loss": 0.73587525, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75677276, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 12302, + "time_per_iteration": 2.4284322261810303 + }, + { + "auxiliary_loss_clip": 0.01052929, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_clip": 1.01909947, + "balance_loss_mlp": 1.01598692, + "epoch": 0.7396963775740267, + "flos": 27376793510400.0, + "grad_norm": 2.044110895928742, + "language_loss": 0.750166, + "learning_rate": 6.695329903189451e-07, + "loss": 0.7711336, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 12303, + "time_per_iteration": 2.429917097091675 + }, + { + "auxiliary_loss_clip": 0.01049633, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.01661229, + "balance_loss_mlp": 1.0155108, + "epoch": 0.7397565008266948, + "flos": 25518960132480.0, + "grad_norm": 1.8295115371562056, + "language_loss": 0.55461991, + "learning_rate": 6.692422296776927e-07, + "loss": 0.57549727, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 12304, + "time_per_iteration": 2.418330192565918 + }, + { + "auxiliary_loss_clip": 0.01052803, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.01468277, + "balance_loss_mlp": 1.01708984, + "epoch": 0.7398166240793627, + "flos": 23726763843840.0, + "grad_norm": 2.0069684368794642, + "language_loss": 0.8541643, + "learning_rate": 6.689515194989084e-07, + "loss": 0.87506986, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 12305, + "time_per_iteration": 2.403287887573242 + }, + { + "auxiliary_loss_clip": 0.01007899, + "auxiliary_loss_mlp": 0.01003392, + "balance_loss_clip": 1.00099635, + "balance_loss_mlp": 1.00080538, + "epoch": 0.7398767473320307, + "flos": 67264012903680.0, + "grad_norm": 0.874389388802186, + "language_loss": 0.57693034, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59704322, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.07128906, + "step": 12306, + "time_per_iteration": 3.011460304260254 + }, + { + "auxiliary_loss_clip": 0.01054364, + "auxiliary_loss_mlp": 0.01040944, + "balance_loss_clip": 1.01525426, + "balance_loss_mlp": 1.01724637, + "epoch": 0.7399368705846986, + "flos": 22017590501760.0, + "grad_norm": 2.9309154700949698, + "language_loss": 0.83004665, + "learning_rate": 6.683702505728355e-07, + "loss": 0.85099977, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 12307, + "time_per_iteration": 2.38409423828125 + }, + { + "auxiliary_loss_clip": 0.01051864, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.01397038, + "balance_loss_mlp": 1.01662016, + "epoch": 0.7399969938373666, + "flos": 14172990693120.0, + "grad_norm": 1.7732504270805958, + "language_loss": 0.70882916, + "learning_rate": 6.680796918475893e-07, + "loss": 0.72970891, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 12308, + "time_per_iteration": 2.3597514629364014 + }, + { + "auxiliary_loss_clip": 0.01050544, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.01283658, + "balance_loss_mlp": 1.01556814, + "epoch": 0.7400571170900345, + "flos": 25300299087360.0, + "grad_norm": 1.864283248202594, + "language_loss": 0.83615649, + "learning_rate": 6.67789183628896e-07, + "loss": 0.85700703, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34960938, + "step": 12309, + "time_per_iteration": 2.407780170440674 + }, + { + "auxiliary_loss_clip": 0.01054041, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.01547384, + "balance_loss_mlp": 1.01669812, + "epoch": 0.7401172403427025, + "flos": 22710232431360.0, + "grad_norm": 1.746097291855091, + "language_loss": 0.74041682, + "learning_rate": 6.674987259277692e-07, + "loss": 0.76136482, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 12310, + "time_per_iteration": 2.3972346782684326 + }, + { + "auxiliary_loss_clip": 0.01053, + "auxiliary_loss_mlp": 0.01043692, + "balance_loss_clip": 1.01926589, + "balance_loss_mlp": 1.01613128, + "epoch": 0.7401773635953706, + "flos": 18066749627520.0, + "grad_norm": 2.6048512246936264, + "language_loss": 0.89639866, + "learning_rate": 6.672083187552239e-07, + "loss": 0.91736567, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 12311, + "time_per_iteration": 2.3199305534362793 + }, + { + "auxiliary_loss_clip": 0.01051793, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.01293778, + "balance_loss_mlp": 1.0154115, + "epoch": 0.7402374868480385, + "flos": 22711000481280.0, + "grad_norm": 1.5695396751609656, + "language_loss": 0.80713809, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82801449, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 12312, + "time_per_iteration": 2.4128103256225586 + }, + { + "auxiliary_loss_clip": 0.01052261, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.01243556, + "balance_loss_mlp": 1.01665664, + "epoch": 0.7402976101007065, + "flos": 22855575888000.0, + "grad_norm": 1.7555384579414623, + "language_loss": 0.79099536, + "learning_rate": 6.666276560399273e-07, + "loss": 0.81186879, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 12313, + "time_per_iteration": 2.484036445617676 + }, + { + "auxiliary_loss_clip": 0.01053749, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.01365685, + "balance_loss_mlp": 1.01592231, + "epoch": 0.7403577333533744, + "flos": 12345078216960.0, + "grad_norm": 1.9867782372546265, + "language_loss": 0.79809225, + "learning_rate": 6.663374005191937e-07, + "loss": 0.81902117, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 12314, + "time_per_iteration": 2.394526720046997 + }, + { + "auxiliary_loss_clip": 0.01007726, + "auxiliary_loss_mlp": 0.01005248, + "balance_loss_clip": 1.00266135, + "balance_loss_mlp": 1.00075984, + "epoch": 0.7404178566060424, + "flos": 60324117822720.0, + "grad_norm": 0.8391677859923466, + "language_loss": 0.55187953, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57200927, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.06982422, + "step": 12315, + "time_per_iteration": 2.9966282844543457 + }, + { + "auxiliary_loss_clip": 0.01050212, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.01479506, + "balance_loss_mlp": 1.01582122, + "epoch": 0.7404779798587103, + "flos": 32013294042240.0, + "grad_norm": 2.0759418043614306, + "language_loss": 0.80621094, + "learning_rate": 6.65757041206591e-07, + "loss": 0.82707524, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 12316, + "time_per_iteration": 3.7717604637145996 + }, + { + "auxiliary_loss_clip": 0.0105265, + "auxiliary_loss_mlp": 0.01040142, + "balance_loss_clip": 1.01564503, + "balance_loss_mlp": 1.0160023, + "epoch": 0.7405381031113784, + "flos": 12889060110720.0, + "grad_norm": 2.244258140439781, + "language_loss": 0.76507872, + "learning_rate": 6.654669374367275e-07, + "loss": 0.78600669, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 12317, + "time_per_iteration": 2.409691095352173 + }, + { + "auxiliary_loss_clip": 0.01049623, + "auxiliary_loss_mlp": 0.0103611, + "balance_loss_clip": 1.01383007, + "balance_loss_mlp": 1.01574945, + "epoch": 0.7405982263640463, + "flos": 20228117299200.0, + "grad_norm": 1.6193641654479036, + "language_loss": 0.82642639, + "learning_rate": 6.651768842724917e-07, + "loss": 0.84728372, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33789062, + "step": 12318, + "time_per_iteration": 2.3854904174804688 + }, + { + "auxiliary_loss_clip": 0.01052947, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.0136863, + "balance_loss_mlp": 1.0161891, + "epoch": 0.7406583496167143, + "flos": 17566234243200.0, + "grad_norm": 1.8807667297716892, + "language_loss": 0.78144628, + "learning_rate": 6.648868817248827e-07, + "loss": 0.80235434, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 12319, + "time_per_iteration": 2.362156629562378 + }, + { + "auxiliary_loss_clip": 0.01051614, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.01404572, + "balance_loss_mlp": 1.01597714, + "epoch": 0.7407184728693822, + "flos": 18294766917120.0, + "grad_norm": 1.9659102506674795, + "language_loss": 0.64723104, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66810346, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35546875, + "step": 12320, + "time_per_iteration": 3.839484691619873 + }, + { + "auxiliary_loss_clip": 0.01054513, + "auxiliary_loss_mlp": 0.01045054, + "balance_loss_clip": 1.0197463, + "balance_loss_mlp": 1.01675534, + "epoch": 0.7407785961220502, + "flos": 16689635026560.0, + "grad_norm": 2.7061522583031845, + "language_loss": 0.84959066, + "learning_rate": 6.643070285235288e-07, + "loss": 0.8705864, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 12321, + "time_per_iteration": 3.7310774326324463 + }, + { + "auxiliary_loss_clip": 0.01057025, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_clip": 1.02052963, + "balance_loss_mlp": 1.01699233, + "epoch": 0.7408387193747181, + "flos": 22087312220160.0, + "grad_norm": 2.23583245639914, + "language_loss": 0.73305702, + "learning_rate": 6.640171778917727e-07, + "loss": 0.75411808, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.40039062, + "step": 12322, + "time_per_iteration": 2.3885817527770996 + }, + { + "auxiliary_loss_clip": 0.01053062, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.01217914, + "balance_loss_mlp": 1.01663494, + "epoch": 0.7408988426273861, + "flos": 24235762688640.0, + "grad_norm": 1.6102347956366845, + "language_loss": 0.65126592, + "learning_rate": 6.637273779206183e-07, + "loss": 0.67216259, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 12323, + "time_per_iteration": 2.4394478797912598 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.01263928, + "balance_loss_mlp": 1.01651001, + "epoch": 0.7409589658800542, + "flos": 29021726217600.0, + "grad_norm": 1.435176551450173, + "language_loss": 0.76843822, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78935057, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.375, + "step": 12324, + "time_per_iteration": 2.4576776027679443 + }, + { + "auxiliary_loss_clip": 0.01053354, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01157069, + "balance_loss_mlp": 1.0161556, + "epoch": 0.7410190891327221, + "flos": 19350435830400.0, + "grad_norm": 1.5909859358250389, + "language_loss": 0.75416207, + "learning_rate": 6.63147930004073e-07, + "loss": 0.77504814, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 12325, + "time_per_iteration": 2.3958988189697266 + }, + { + "auxiliary_loss_clip": 0.01055674, + "auxiliary_loss_mlp": 0.01039762, + "balance_loss_clip": 1.01528871, + "balance_loss_mlp": 1.01669312, + "epoch": 0.7410792123853901, + "flos": 22746542112000.0, + "grad_norm": 2.843428424858071, + "language_loss": 0.69652534, + "learning_rate": 6.628582820806545e-07, + "loss": 0.71747971, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.390625, + "step": 12326, + "time_per_iteration": 2.364818811416626 + }, + { + "auxiliary_loss_clip": 0.01054225, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.01429486, + "balance_loss_mlp": 1.01707518, + "epoch": 0.741139335638058, + "flos": 25371312526080.0, + "grad_norm": 1.8991485734240783, + "language_loss": 0.90676212, + "learning_rate": 6.625686848617835e-07, + "loss": 0.92768598, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 12327, + "time_per_iteration": 2.4255197048187256 + }, + { + "auxiliary_loss_clip": 0.01053734, + "auxiliary_loss_mlp": 0.01041922, + "balance_loss_clip": 1.01676869, + "balance_loss_mlp": 1.0164516, + "epoch": 0.741199458890726, + "flos": 18584720691840.0, + "grad_norm": 7.852205349883515, + "language_loss": 0.86401272, + "learning_rate": 6.62279138358442e-07, + "loss": 0.88496923, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 12328, + "time_per_iteration": 2.337928056716919 + }, + { + "auxiliary_loss_clip": 0.01050957, + "auxiliary_loss_mlp": 0.01039749, + "balance_loss_clip": 1.01386893, + "balance_loss_mlp": 1.01526654, + "epoch": 0.7412595821433939, + "flos": 22125995873280.0, + "grad_norm": 1.7806623842358433, + "language_loss": 0.68253714, + "learning_rate": 6.619896425816103e-07, + "loss": 0.70344424, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.35546875, + "step": 12329, + "time_per_iteration": 2.3928465843200684 + }, + { + "auxiliary_loss_clip": 0.01056745, + "auxiliary_loss_mlp": 0.01039983, + "balance_loss_clip": 1.01351857, + "balance_loss_mlp": 1.017874, + "epoch": 0.741319705396062, + "flos": 29168396305920.0, + "grad_norm": 1.6191955276238443, + "language_loss": 0.67933249, + "learning_rate": 6.617001975422647e-07, + "loss": 0.70029974, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38867188, + "step": 12330, + "time_per_iteration": 2.442047357559204 + }, + { + "auxiliary_loss_clip": 0.01057387, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.01301932, + "balance_loss_mlp": 1.01814318, + "epoch": 0.7413798286487299, + "flos": 20666451818880.0, + "grad_norm": 3.261086818843126, + "language_loss": 0.86500913, + "learning_rate": 6.614108032513823e-07, + "loss": 0.88597971, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 12331, + "time_per_iteration": 2.4118335247039795 + }, + { + "auxiliary_loss_clip": 0.01052483, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.01367521, + "balance_loss_mlp": 1.01587069, + "epoch": 0.7414399519013979, + "flos": 16397970595200.0, + "grad_norm": 2.088278761056931, + "language_loss": 0.71522546, + "learning_rate": 6.611214597199364e-07, + "loss": 0.73612392, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 12332, + "time_per_iteration": 2.3334925174713135 + }, + { + "auxiliary_loss_clip": 0.01053865, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.01240301, + "balance_loss_mlp": 1.01711249, + "epoch": 0.7415000751540658, + "flos": 25629041249280.0, + "grad_norm": 2.0358766828881176, + "language_loss": 0.64458454, + "learning_rate": 6.608321669588984e-07, + "loss": 0.66549176, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 12333, + "time_per_iteration": 3.945812463760376 + }, + { + "auxiliary_loss_clip": 0.0105335, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.01708293, + "balance_loss_mlp": 1.01814353, + "epoch": 0.7415601984067338, + "flos": 24498553559040.0, + "grad_norm": 1.5920177376131113, + "language_loss": 0.72199893, + "learning_rate": 6.605429249792387e-07, + "loss": 0.7429443, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 12334, + "time_per_iteration": 2.395211935043335 + }, + { + "auxiliary_loss_clip": 0.01051372, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.01298189, + "balance_loss_mlp": 1.01556945, + "epoch": 0.7416203216594017, + "flos": 20886090382080.0, + "grad_norm": 1.8313194478437247, + "language_loss": 0.83589768, + "learning_rate": 6.602537337919257e-07, + "loss": 0.85677493, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 12335, + "time_per_iteration": 2.3779118061065674 + }, + { + "auxiliary_loss_clip": 0.01053667, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.01147771, + "balance_loss_mlp": 1.01613462, + "epoch": 0.7416804449120697, + "flos": 15623597439360.0, + "grad_norm": 2.894733825010716, + "language_loss": 0.75575751, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77667058, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 12336, + "time_per_iteration": 2.34045147895813 + }, + { + "auxiliary_loss_clip": 0.01053508, + "auxiliary_loss_mlp": 0.01041129, + "balance_loss_clip": 1.01623774, + "balance_loss_mlp": 1.01658726, + "epoch": 0.7417405681647377, + "flos": 17119765376640.0, + "grad_norm": 1.8622308305333641, + "language_loss": 0.74998736, + "learning_rate": 6.596755038382029e-07, + "loss": 0.77093369, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36914062, + "step": 12337, + "time_per_iteration": 2.4045145511627197 + }, + { + "auxiliary_loss_clip": 0.01051149, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.01515102, + "balance_loss_mlp": 1.01655173, + "epoch": 0.7418006914174057, + "flos": 18879317677440.0, + "grad_norm": 1.9426077341530699, + "language_loss": 0.77716017, + "learning_rate": 6.593864650937186e-07, + "loss": 0.79805517, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34570312, + "step": 12338, + "time_per_iteration": 2.344142198562622 + }, + { + "auxiliary_loss_clip": 0.01051226, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.0096643, + "balance_loss_mlp": 1.01589561, + "epoch": 0.7418608146700737, + "flos": 21579640007040.0, + "grad_norm": 1.6306689457415364, + "language_loss": 0.73552251, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75635254, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 12339, + "time_per_iteration": 2.414616823196411 + }, + { + "auxiliary_loss_clip": 0.01052693, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.0143044, + "balance_loss_mlp": 1.01599729, + "epoch": 0.7419209379227416, + "flos": 22339524948480.0, + "grad_norm": 1.6448661067173334, + "language_loss": 0.81125784, + "learning_rate": 6.588085401243077e-07, + "loss": 0.83216739, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 12340, + "time_per_iteration": 2.36594820022583 + }, + { + "auxiliary_loss_clip": 0.01052852, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.01682019, + "balance_loss_mlp": 1.01627958, + "epoch": 0.7419810611754096, + "flos": 16761381603840.0, + "grad_norm": 1.4595827395578265, + "language_loss": 0.7631948, + "learning_rate": 6.585196539212958e-07, + "loss": 0.78413618, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 12341, + "time_per_iteration": 2.376251459121704 + }, + { + "auxiliary_loss_clip": 0.01049735, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.01498854, + "balance_loss_mlp": 1.01614559, + "epoch": 0.7420411844280775, + "flos": 26211776618880.0, + "grad_norm": 1.4601450328008798, + "language_loss": 0.80502105, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82587922, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.3359375, + "step": 12342, + "time_per_iteration": 2.4010541439056396 + }, + { + "auxiliary_loss_clip": 0.01051378, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.01205206, + "balance_loss_mlp": 1.01577425, + "epoch": 0.7421013076807456, + "flos": 68527208004480.0, + "grad_norm": 1.6391546240381698, + "language_loss": 0.78109562, + "learning_rate": 6.57942034133433e-07, + "loss": 0.80196238, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 12343, + "time_per_iteration": 2.8015100955963135 + }, + { + "auxiliary_loss_clip": 0.01050555, + "auxiliary_loss_mlp": 0.01039737, + "balance_loss_clip": 1.01581192, + "balance_loss_mlp": 1.01475775, + "epoch": 0.7421614309334135, + "flos": 24424188629760.0, + "grad_norm": 1.7234761750976315, + "language_loss": 0.68616974, + "learning_rate": 6.576533005704843e-07, + "loss": 0.70707273, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35742188, + "step": 12344, + "time_per_iteration": 2.422603130340576 + }, + { + "auxiliary_loss_clip": 0.01052777, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.01390195, + "balance_loss_mlp": 1.0156734, + "epoch": 0.7422215541860815, + "flos": 12310304636160.0, + "grad_norm": 2.1599223949429165, + "language_loss": 0.82397503, + "learning_rate": 6.573646179094572e-07, + "loss": 0.84488791, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 12345, + "time_per_iteration": 2.35941743850708 + }, + { + "auxiliary_loss_clip": 0.01053611, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.01507032, + "balance_loss_mlp": 1.01726913, + "epoch": 0.7422816774387494, + "flos": 19644578968320.0, + "grad_norm": 2.5209104723034925, + "language_loss": 0.72414982, + "learning_rate": 6.570759861612988e-07, + "loss": 0.74507457, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 12346, + "time_per_iteration": 2.3424322605133057 + }, + { + "auxiliary_loss_clip": 0.01055033, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.01387596, + "balance_loss_mlp": 1.01902378, + "epoch": 0.7423418006914174, + "flos": 32014585762560.0, + "grad_norm": 1.547598575245381, + "language_loss": 0.74559426, + "learning_rate": 6.56787405336953e-07, + "loss": 0.7665053, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 12347, + "time_per_iteration": 2.497457981109619 + }, + { + "auxiliary_loss_clip": 0.01053532, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.01534033, + "balance_loss_mlp": 1.01587367, + "epoch": 0.7424019239440853, + "flos": 18915941560320.0, + "grad_norm": 1.8362913198629662, + "language_loss": 0.82561374, + "learning_rate": 6.564988754473642e-07, + "loss": 0.84654099, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 12348, + "time_per_iteration": 2.343688726425171 + }, + { + "auxiliary_loss_clip": 0.01051937, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.01344275, + "balance_loss_mlp": 1.01619828, + "epoch": 0.7424620471967533, + "flos": 35875211950080.0, + "grad_norm": 1.7085776743143948, + "language_loss": 0.7368924, + "learning_rate": 6.562103965034724e-07, + "loss": 0.75777173, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 12349, + "time_per_iteration": 2.5097122192382812 + }, + { + "auxiliary_loss_clip": 0.01055356, + "auxiliary_loss_mlp": 0.01047557, + "balance_loss_clip": 1.02108073, + "balance_loss_mlp": 1.0166595, + "epoch": 0.7425221704494213, + "flos": 27015372449280.0, + "grad_norm": 1.948379118141713, + "language_loss": 0.80583537, + "learning_rate": 6.559219685162165e-07, + "loss": 0.82686448, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 12350, + "time_per_iteration": 2.410724401473999 + }, + { + "auxiliary_loss_clip": 0.01051536, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.01341093, + "balance_loss_mlp": 1.01574636, + "epoch": 0.7425822937020893, + "flos": 34165724405760.0, + "grad_norm": 1.5760664330347598, + "language_loss": 0.76249307, + "learning_rate": 6.556335914965343e-07, + "loss": 0.78337604, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 12351, + "time_per_iteration": 2.4868342876434326 + }, + { + "auxiliary_loss_clip": 0.01051972, + "auxiliary_loss_mlp": 0.01032603, + "balance_loss_clip": 1.01048934, + "balance_loss_mlp": 1.01624537, + "epoch": 0.7426424169547573, + "flos": 21282634137600.0, + "grad_norm": 2.162353101103212, + "language_loss": 0.82299042, + "learning_rate": 6.553452654553611e-07, + "loss": 0.84383619, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 12352, + "time_per_iteration": 2.3509767055511475 + }, + { + "auxiliary_loss_clip": 0.0105224, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.01801181, + "balance_loss_mlp": 1.01667476, + "epoch": 0.7427025402074252, + "flos": 22447546295040.0, + "grad_norm": 2.021593502043992, + "language_loss": 0.7304647, + "learning_rate": 6.550569904036307e-07, + "loss": 0.75140369, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 12353, + "time_per_iteration": 2.423694610595703 + }, + { + "auxiliary_loss_clip": 0.01051617, + "auxiliary_loss_mlp": 0.01033741, + "balance_loss_clip": 1.01142526, + "balance_loss_mlp": 1.01662207, + "epoch": 0.7427626634600932, + "flos": 22523621880960.0, + "grad_norm": 1.784544271221922, + "language_loss": 0.73377889, + "learning_rate": 6.547687663522739e-07, + "loss": 0.75463247, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 12354, + "time_per_iteration": 2.3915696144104004 + }, + { + "auxiliary_loss_clip": 0.01008938, + "auxiliary_loss_mlp": 0.01003157, + "balance_loss_clip": 1.00078499, + "balance_loss_mlp": 1.00173426, + "epoch": 0.7428227867127611, + "flos": 67206512027520.0, + "grad_norm": 0.6977736480650875, + "language_loss": 0.59606385, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61618483, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07226562, + "step": 12355, + "time_per_iteration": 4.35331130027771 + }, + { + "auxiliary_loss_clip": 0.01052662, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.01350665, + "balance_loss_mlp": 1.01594234, + "epoch": 0.7428829099654292, + "flos": 14720324077440.0, + "grad_norm": 1.8407729507083679, + "language_loss": 0.68433392, + "learning_rate": 6.541924712943971e-07, + "loss": 0.70524889, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3671875, + "step": 12356, + "time_per_iteration": 2.3380463123321533 + }, + { + "auxiliary_loss_clip": 0.01053442, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.01313806, + "balance_loss_mlp": 1.01544595, + "epoch": 0.7429430332180971, + "flos": 48644834699520.0, + "grad_norm": 1.789260016500064, + "language_loss": 0.73660135, + "learning_rate": 6.539044003097301e-07, + "loss": 0.75750685, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 12357, + "time_per_iteration": 2.6221981048583984 + }, + { + "auxiliary_loss_clip": 0.01051732, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.00859928, + "balance_loss_mlp": 1.01718307, + "epoch": 0.7430031564707651, + "flos": 16763127171840.0, + "grad_norm": 1.7150353191870682, + "language_loss": 0.66033006, + "learning_rate": 6.53616380369143e-07, + "loss": 0.68115693, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34570312, + "step": 12358, + "time_per_iteration": 2.3928449153900146 + }, + { + "auxiliary_loss_clip": 0.01055453, + "auxiliary_loss_mlp": 0.01045143, + "balance_loss_clip": 1.01809406, + "balance_loss_mlp": 1.01748943, + "epoch": 0.743063279723433, + "flos": 23869663505280.0, + "grad_norm": 2.0155731063254576, + "language_loss": 0.81730795, + "learning_rate": 6.533284114835591e-07, + "loss": 0.83831394, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38085938, + "step": 12359, + "time_per_iteration": 3.7173728942871094 + }, + { + "auxiliary_loss_clip": 0.01051511, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.01255393, + "balance_loss_mlp": 1.0149411, + "epoch": 0.743123402976101, + "flos": 14390848776960.0, + "grad_norm": 2.1338565946521832, + "language_loss": 0.68960541, + "learning_rate": 6.530404936638956e-07, + "loss": 0.71048307, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 12360, + "time_per_iteration": 2.387791872024536 + }, + { + "auxiliary_loss_clip": 0.01052487, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.01466465, + "balance_loss_mlp": 1.01624107, + "epoch": 0.7431835262287689, + "flos": 27453078564480.0, + "grad_norm": 1.7053082523738714, + "language_loss": 0.73601925, + "learning_rate": 6.527526269210715e-07, + "loss": 0.75693572, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 12361, + "time_per_iteration": 3.785395860671997 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.01233935, + "balance_loss_mlp": 1.01686573, + "epoch": 0.743243649481437, + "flos": 20958465363840.0, + "grad_norm": 1.933892611428438, + "language_loss": 0.5717535, + "learning_rate": 6.524648112660027e-07, + "loss": 0.59267592, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12362, + "time_per_iteration": 2.374314785003662 + }, + { + "auxiliary_loss_clip": 0.01053333, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.01450896, + "balance_loss_mlp": 1.01728785, + "epoch": 0.7433037727341049, + "flos": 22782083742720.0, + "grad_norm": 1.7393540430537877, + "language_loss": 0.78477496, + "learning_rate": 6.521770467096039e-07, + "loss": 0.8056975, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 12363, + "time_per_iteration": 2.374274253845215 + }, + { + "auxiliary_loss_clip": 0.01052873, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.01088095, + "balance_loss_mlp": 1.01643384, + "epoch": 0.7433638959867729, + "flos": 22195717591680.0, + "grad_norm": 1.7666183646023021, + "language_loss": 0.78874165, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80960131, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 12364, + "time_per_iteration": 2.4018077850341797 + }, + { + "auxiliary_loss_clip": 0.01052486, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.01589251, + "balance_loss_mlp": 1.01605797, + "epoch": 0.7434240192394409, + "flos": 23295586152960.0, + "grad_norm": 2.094247699476492, + "language_loss": 0.80100262, + "learning_rate": 6.516016709364604e-07, + "loss": 0.82191235, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36523438, + "step": 12365, + "time_per_iteration": 2.390216112136841 + }, + { + "auxiliary_loss_clip": 0.01055327, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.00943589, + "balance_loss_mlp": 1.01730251, + "epoch": 0.7434841424921088, + "flos": 54007773223680.0, + "grad_norm": 1.8075821772955203, + "language_loss": 0.77613378, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79703116, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37890625, + "step": 12366, + "time_per_iteration": 2.6761667728424072 + }, + { + "auxiliary_loss_clip": 0.01051862, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.01383638, + "balance_loss_mlp": 1.01688576, + "epoch": 0.7435442657447768, + "flos": 21432900096000.0, + "grad_norm": 1.3986391827696896, + "language_loss": 0.72512901, + "learning_rate": 6.510264996889141e-07, + "loss": 0.74600327, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34960938, + "step": 12367, + "time_per_iteration": 2.3684988021850586 + }, + { + "auxiliary_loss_clip": 0.01053974, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_clip": 1.01876152, + "balance_loss_mlp": 1.01660681, + "epoch": 0.7436043889974447, + "flos": 24498239356800.0, + "grad_norm": 1.6033397570480457, + "language_loss": 0.75456405, + "learning_rate": 6.507389907895038e-07, + "loss": 0.77554643, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12368, + "time_per_iteration": 2.4226863384246826 + }, + { + "auxiliary_loss_clip": 0.0105073, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.01583016, + "balance_loss_mlp": 1.01612699, + "epoch": 0.7436645122501128, + "flos": 40696786932480.0, + "grad_norm": 1.7172499318536323, + "language_loss": 0.70535076, + "learning_rate": 6.50451533054207e-07, + "loss": 0.72621882, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34570312, + "step": 12369, + "time_per_iteration": 2.5139408111572266 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.01368511, + "balance_loss_mlp": 1.01670754, + "epoch": 0.7437246355027807, + "flos": 18908051592960.0, + "grad_norm": 1.8041741136718719, + "language_loss": 0.76403213, + "learning_rate": 6.501641264939233e-07, + "loss": 0.78493643, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 12370, + "time_per_iteration": 2.3806397914886475 + }, + { + "auxiliary_loss_clip": 0.01054424, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.01536322, + "balance_loss_mlp": 1.0176487, + "epoch": 0.7437847587554487, + "flos": 21542736833280.0, + "grad_norm": 1.5294628906323284, + "language_loss": 0.79596376, + "learning_rate": 6.498767711195503e-07, + "loss": 0.81690085, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 12371, + "time_per_iteration": 2.4101314544677734 + }, + { + "auxiliary_loss_clip": 0.01052313, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.0095439, + "balance_loss_mlp": 1.01601243, + "epoch": 0.7438448820081166, + "flos": 27781227233280.0, + "grad_norm": 1.7877383865794667, + "language_loss": 0.70888162, + "learning_rate": 6.495894669419857e-07, + "loss": 0.72973418, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 12372, + "time_per_iteration": 2.420164108276367 + }, + { + "auxiliary_loss_clip": 0.0105048, + "auxiliary_loss_mlp": 0.0104069, + "balance_loss_clip": 1.01720619, + "balance_loss_mlp": 1.01530182, + "epoch": 0.7439050052607846, + "flos": 17966862627840.0, + "grad_norm": 2.6344505033229986, + "language_loss": 0.7679888, + "learning_rate": 6.493022139721245e-07, + "loss": 0.78890049, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 12373, + "time_per_iteration": 3.781064510345459 + }, + { + "auxiliary_loss_clip": 0.01054128, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.01129866, + "balance_loss_mlp": 1.01594639, + "epoch": 0.7439651285134525, + "flos": 22957801948800.0, + "grad_norm": 1.6066675841772189, + "language_loss": 0.78264225, + "learning_rate": 6.49015012220858e-07, + "loss": 0.80355561, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38085938, + "step": 12374, + "time_per_iteration": 2.3914132118225098 + }, + { + "auxiliary_loss_clip": 0.01053884, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.01128483, + "balance_loss_mlp": 1.01678681, + "epoch": 0.7440252517661206, + "flos": 18805790620800.0, + "grad_norm": 1.938198050439268, + "language_loss": 0.77878362, + "learning_rate": 6.487278616990774e-07, + "loss": 0.79968524, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 12375, + "time_per_iteration": 2.3307554721832275 + }, + { + "auxiliary_loss_clip": 0.01050399, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.01272798, + "balance_loss_mlp": 1.01540756, + "epoch": 0.7440853750187885, + "flos": 20265264852480.0, + "grad_norm": 1.8641000532155652, + "language_loss": 0.78305972, + "learning_rate": 6.484407624176733e-07, + "loss": 0.80390745, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34960938, + "step": 12376, + "time_per_iteration": 2.3791542053222656 + }, + { + "auxiliary_loss_clip": 0.01054686, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.0104897, + "balance_loss_mlp": 1.01682377, + "epoch": 0.7441454982714565, + "flos": 25336120008960.0, + "grad_norm": 1.7132908296862734, + "language_loss": 0.80991191, + "learning_rate": 6.481537143875296e-07, + "loss": 0.83080506, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 12377, + "time_per_iteration": 2.4048173427581787 + }, + { + "auxiliary_loss_clip": 0.01054369, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.01250935, + "balance_loss_mlp": 1.0164361, + "epoch": 0.7442056215241245, + "flos": 64478819191680.0, + "grad_norm": 2.09682031153249, + "language_loss": 0.68137443, + "learning_rate": 6.478667176195322e-07, + "loss": 0.70229685, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 12378, + "time_per_iteration": 2.7550666332244873 + }, + { + "auxiliary_loss_clip": 0.01055739, + "auxiliary_loss_mlp": 0.01039727, + "balance_loss_clip": 1.01336944, + "balance_loss_mlp": 1.01768243, + "epoch": 0.7442657447767924, + "flos": 31284028229760.0, + "grad_norm": 1.7347902738127314, + "language_loss": 0.73457348, + "learning_rate": 6.475797721245648e-07, + "loss": 0.75552815, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38085938, + "step": 12379, + "time_per_iteration": 2.436588764190674 + }, + { + "auxiliary_loss_clip": 0.01051544, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.01595807, + "balance_loss_mlp": 1.01526129, + "epoch": 0.7443258680294604, + "flos": 20807012419200.0, + "grad_norm": 1.702551044152397, + "language_loss": 0.67004979, + "learning_rate": 6.472928779135085e-07, + "loss": 0.69096506, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 12380, + "time_per_iteration": 2.374053478240967 + }, + { + "auxiliary_loss_clip": 0.01055112, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.01351321, + "balance_loss_mlp": 1.01746452, + "epoch": 0.7443859912821283, + "flos": 22198754880000.0, + "grad_norm": 2.603210842350457, + "language_loss": 0.79654247, + "learning_rate": 6.470060349972411e-07, + "loss": 0.81748724, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 12381, + "time_per_iteration": 2.382394790649414 + }, + { + "auxiliary_loss_clip": 0.01055683, + "auxiliary_loss_mlp": 0.01039152, + "balance_loss_clip": 1.01358151, + "balance_loss_mlp": 1.01819301, + "epoch": 0.7444461145347964, + "flos": 22016752629120.0, + "grad_norm": 2.1985689671067576, + "language_loss": 0.74396622, + "learning_rate": 6.467192433866411e-07, + "loss": 0.76491451, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12382, + "time_per_iteration": 2.416293144226074 + }, + { + "auxiliary_loss_clip": 0.01008621, + "auxiliary_loss_mlp": 0.01005798, + "balance_loss_clip": 1.00349772, + "balance_loss_mlp": 1.00140762, + "epoch": 0.7445062377874643, + "flos": 70555170038400.0, + "grad_norm": 0.6521747393750501, + "language_loss": 0.54709947, + "learning_rate": 6.464325030925831e-07, + "loss": 0.5672437, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.07226562, + "step": 12383, + "time_per_iteration": 3.166435956954956 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.01252294, + "balance_loss_mlp": 1.01561689, + "epoch": 0.7445663610401323, + "flos": 22163317983360.0, + "grad_norm": 2.4999856358896966, + "language_loss": 0.77297169, + "learning_rate": 6.461458141259395e-07, + "loss": 0.79386353, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 12384, + "time_per_iteration": 2.4000794887542725 + }, + { + "auxiliary_loss_clip": 0.01051835, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.01276231, + "balance_loss_mlp": 1.0160656, + "epoch": 0.7446264842928002, + "flos": 24169113169920.0, + "grad_norm": 2.085130650991691, + "language_loss": 0.80961853, + "learning_rate": 6.458591764975823e-07, + "loss": 0.83049768, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 12385, + "time_per_iteration": 2.3751606941223145 + }, + { + "auxiliary_loss_clip": 0.01056219, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_clip": 1.01778948, + "balance_loss_mlp": 1.01753998, + "epoch": 0.7446866075454682, + "flos": 24133396982400.0, + "grad_norm": 1.5069223172032695, + "language_loss": 0.82778263, + "learning_rate": 6.455725902183813e-07, + "loss": 0.84877509, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 12386, + "time_per_iteration": 2.426088809967041 + }, + { + "auxiliary_loss_clip": 0.01051459, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.0131495, + "balance_loss_mlp": 1.01577902, + "epoch": 0.7447467307981361, + "flos": 23546995920000.0, + "grad_norm": 1.7060408085044414, + "language_loss": 0.72501689, + "learning_rate": 6.452860552992037e-07, + "loss": 0.74589235, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35742188, + "step": 12387, + "time_per_iteration": 2.3630526065826416 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.01711071, + "balance_loss_mlp": 1.01567149, + "epoch": 0.7448068540508042, + "flos": 19566967282560.0, + "grad_norm": 2.108164264658429, + "language_loss": 0.71428734, + "learning_rate": 6.449995717509138e-07, + "loss": 0.73522502, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 12388, + "time_per_iteration": 2.372028350830078 + }, + { + "auxiliary_loss_clip": 0.01053129, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.01402712, + "balance_loss_mlp": 1.01660669, + "epoch": 0.7448669773034721, + "flos": 21838520805120.0, + "grad_norm": 1.7039838150947855, + "language_loss": 0.859707, + "learning_rate": 6.447131395843761e-07, + "loss": 0.88061368, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36523438, + "step": 12389, + "time_per_iteration": 2.3968961238861084 + }, + { + "auxiliary_loss_clip": 0.01052154, + "auxiliary_loss_mlp": 0.01037311, + "balance_loss_clip": 1.01298022, + "balance_loss_mlp": 1.01587987, + "epoch": 0.7449271005561401, + "flos": 25154222492160.0, + "grad_norm": 1.663255110091735, + "language_loss": 0.80076051, + "learning_rate": 6.444267588104526e-07, + "loss": 0.82165515, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 12390, + "time_per_iteration": 2.4019064903259277 + }, + { + "auxiliary_loss_clip": 0.01054134, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.01093388, + "balance_loss_mlp": 1.01681614, + "epoch": 0.7449872238088081, + "flos": 22272247025280.0, + "grad_norm": 1.933938879252367, + "language_loss": 0.85600114, + "learning_rate": 6.441404294400014e-07, + "loss": 0.87691796, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37304688, + "step": 12391, + "time_per_iteration": 2.3619091510772705 + }, + { + "auxiliary_loss_clip": 0.01053019, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.01238012, + "balance_loss_mlp": 1.01711726, + "epoch": 0.745047347061476, + "flos": 20593762634880.0, + "grad_norm": 2.1015099766976935, + "language_loss": 0.74930096, + "learning_rate": 6.438541514838811e-07, + "loss": 0.77018249, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 12392, + "time_per_iteration": 2.374443769454956 + }, + { + "auxiliary_loss_clip": 0.0105184, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.01303315, + "balance_loss_mlp": 1.01644731, + "epoch": 0.745107470314144, + "flos": 22126449720960.0, + "grad_norm": 1.536976181545661, + "language_loss": 0.77774245, + "learning_rate": 6.435679249529487e-07, + "loss": 0.79862261, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35351562, + "step": 12393, + "time_per_iteration": 2.362616539001465 + }, + { + "auxiliary_loss_clip": 0.01053143, + "auxiliary_loss_mlp": 0.01040183, + "balance_loss_clip": 1.01417184, + "balance_loss_mlp": 1.01623762, + "epoch": 0.745167593566812, + "flos": 22235413674240.0, + "grad_norm": 1.916510112714912, + "language_loss": 0.73634279, + "learning_rate": 6.432817498580552e-07, + "loss": 0.75727594, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.36914062, + "step": 12394, + "time_per_iteration": 2.398754596710205 + }, + { + "auxiliary_loss_clip": 0.01054782, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.01124048, + "balance_loss_mlp": 1.01718211, + "epoch": 0.74522771681948, + "flos": 20665229921280.0, + "grad_norm": 1.925012861887302, + "language_loss": 0.82507205, + "learning_rate": 6.429956262100535e-07, + "loss": 0.84597337, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 12395, + "time_per_iteration": 3.6122326850891113 + }, + { + "auxiliary_loss_clip": 0.01056541, + "auxiliary_loss_mlp": 0.01038893, + "balance_loss_clip": 1.01351345, + "balance_loss_mlp": 1.01766396, + "epoch": 0.7452878400721479, + "flos": 21105903502080.0, + "grad_norm": 1.937468894476774, + "language_loss": 0.72569531, + "learning_rate": 6.427095540197937e-07, + "loss": 0.74664968, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38867188, + "step": 12396, + "time_per_iteration": 2.4092438220977783 + }, + { + "auxiliary_loss_clip": 0.01054513, + "auxiliary_loss_mlp": 0.01037372, + "balance_loss_clip": 1.01375675, + "balance_loss_mlp": 1.01623225, + "epoch": 0.7453479633248159, + "flos": 26686839755520.0, + "grad_norm": 2.1400836276841595, + "language_loss": 0.69419158, + "learning_rate": 6.424235332981245e-07, + "loss": 0.71511042, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3828125, + "step": 12397, + "time_per_iteration": 2.416828155517578 + }, + { + "auxiliary_loss_clip": 0.01051985, + "auxiliary_loss_mlp": 0.01038393, + "balance_loss_clip": 1.01394343, + "balance_loss_mlp": 1.01588547, + "epoch": 0.7454080865774838, + "flos": 17015200254720.0, + "grad_norm": 1.7190349195940162, + "language_loss": 0.77937388, + "learning_rate": 6.421375640558908e-07, + "loss": 0.80027771, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 12398, + "time_per_iteration": 2.345754623413086 + }, + { + "auxiliary_loss_clip": 0.01051579, + "auxiliary_loss_mlp": 0.01036374, + "balance_loss_clip": 1.01283073, + "balance_loss_mlp": 1.01618695, + "epoch": 0.7454682098301518, + "flos": 21322853890560.0, + "grad_norm": 1.6833532291748674, + "language_loss": 0.78868937, + "learning_rate": 6.418516463039363e-07, + "loss": 0.80956888, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 12399, + "time_per_iteration": 2.3521833419799805 + }, + { + "auxiliary_loss_clip": 0.0104799, + "auxiliary_loss_mlp": 0.01040262, + "balance_loss_clip": 1.01681328, + "balance_loss_mlp": 1.01448464, + "epoch": 0.7455283330828197, + "flos": 17857375004160.0, + "grad_norm": 2.262590771497466, + "language_loss": 0.74778455, + "learning_rate": 6.415657800531038e-07, + "loss": 0.7686671, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3359375, + "step": 12400, + "time_per_iteration": 5.154583930969238 + }, + { + "auxiliary_loss_clip": 0.01051771, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.01272666, + "balance_loss_mlp": 1.01572084, + "epoch": 0.7455884563354878, + "flos": 30772934703360.0, + "grad_norm": 3.137162313988889, + "language_loss": 0.83278787, + "learning_rate": 6.412799653142327e-07, + "loss": 0.85364717, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.36132812, + "step": 12401, + "time_per_iteration": 2.451383590698242 + }, + { + "auxiliary_loss_clip": 0.01050971, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.01747358, + "balance_loss_mlp": 1.01586986, + "epoch": 0.7456485795881557, + "flos": 23184422784000.0, + "grad_norm": 1.774307292989828, + "language_loss": 0.66462004, + "learning_rate": 6.409942020981611e-07, + "loss": 0.68553317, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 12402, + "time_per_iteration": 2.384031057357788 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.01197946, + "balance_loss_mlp": 1.01606441, + "epoch": 0.7457087028408237, + "flos": 38725625681280.0, + "grad_norm": 1.535604730759279, + "language_loss": 0.74125832, + "learning_rate": 6.407084904157265e-07, + "loss": 0.7621305, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 12403, + "time_per_iteration": 2.503023147583008 + }, + { + "auxiliary_loss_clip": 0.01008268, + "auxiliary_loss_mlp": 0.01006027, + "balance_loss_clip": 1.00363076, + "balance_loss_mlp": 1.00121999, + "epoch": 0.7457688260934917, + "flos": 56041113473280.0, + "grad_norm": 0.8352573538604305, + "language_loss": 0.58929932, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60944223, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.0703125, + "step": 12404, + "time_per_iteration": 2.822359323501587 + }, + { + "auxiliary_loss_clip": 0.01051848, + "auxiliary_loss_mlp": 0.01037995, + "balance_loss_clip": 1.01467717, + "balance_loss_mlp": 1.01633871, + "epoch": 0.7458289493461596, + "flos": 20115243273600.0, + "grad_norm": 1.5373707025809125, + "language_loss": 0.78353882, + "learning_rate": 6.401372216950995e-07, + "loss": 0.80443728, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 12405, + "time_per_iteration": 2.3925161361694336 + }, + { + "auxiliary_loss_clip": 0.01051643, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.0102942, + "balance_loss_mlp": 1.01618886, + "epoch": 0.7458890725988276, + "flos": 20192051998080.0, + "grad_norm": 1.5239705491963482, + "language_loss": 0.6959362, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71679747, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35351562, + "step": 12406, + "time_per_iteration": 2.3469395637512207 + }, + { + "auxiliary_loss_clip": 0.01054979, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.01390266, + "balance_loss_mlp": 1.01666427, + "epoch": 0.7459491958514956, + "flos": 17017818606720.0, + "grad_norm": 1.7541117882371835, + "language_loss": 0.65895623, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67989802, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 12407, + "time_per_iteration": 2.3639416694641113 + }, + { + "auxiliary_loss_clip": 0.01053877, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.01650178, + "balance_loss_mlp": 1.0163151, + "epoch": 0.7460093191041636, + "flos": 25077658147200.0, + "grad_norm": 1.6353743250358053, + "language_loss": 0.73364109, + "learning_rate": 6.392807053872212e-07, + "loss": 0.75459999, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12408, + "time_per_iteration": 2.4010226726531982 + }, + { + "auxiliary_loss_clip": 0.01056788, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.01468539, + "balance_loss_mlp": 1.0181129, + "epoch": 0.7460694423568315, + "flos": 21907439562240.0, + "grad_norm": 2.350989120322418, + "language_loss": 0.74087274, + "learning_rate": 6.38995303134053e-07, + "loss": 0.76183701, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38671875, + "step": 12409, + "time_per_iteration": 2.392117738723755 + }, + { + "auxiliary_loss_clip": 0.01049505, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.01375949, + "balance_loss_mlp": 1.01539969, + "epoch": 0.7461295656094995, + "flos": 21214657987200.0, + "grad_norm": 1.5675124721226341, + "language_loss": 0.66597044, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68681633, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34179688, + "step": 12410, + "time_per_iteration": 2.3584184646606445 + }, + { + "auxiliary_loss_clip": 0.01050841, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.01018953, + "balance_loss_mlp": 1.01620448, + "epoch": 0.7461896888621674, + "flos": 22345739170560.0, + "grad_norm": 2.0192846791070576, + "language_loss": 0.84430289, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86513221, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 12411, + "time_per_iteration": 2.380342721939087 + }, + { + "auxiliary_loss_clip": 0.01052683, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.01231992, + "balance_loss_mlp": 1.01622415, + "epoch": 0.7462498121148354, + "flos": 25481777667840.0, + "grad_norm": 1.4888062649272975, + "language_loss": 0.78840947, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80930078, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 12412, + "time_per_iteration": 3.8741352558135986 + }, + { + "auxiliary_loss_clip": 0.0105343, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.01681066, + "balance_loss_mlp": 1.01667774, + "epoch": 0.7463099353675033, + "flos": 33946539690240.0, + "grad_norm": 1.8024195493762916, + "language_loss": 0.63276732, + "learning_rate": 6.378542103239188e-07, + "loss": 0.65370524, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3671875, + "step": 12413, + "time_per_iteration": 2.469855308532715 + }, + { + "auxiliary_loss_clip": 0.01008267, + "auxiliary_loss_mlp": 0.01009357, + "balance_loss_clip": 1.00697279, + "balance_loss_mlp": 1.00133896, + "epoch": 0.7463700586201714, + "flos": 62764616747520.0, + "grad_norm": 0.723949473974381, + "language_loss": 0.54895949, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56913573, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06933594, + "step": 12414, + "time_per_iteration": 3.070646047592163 + }, + { + "auxiliary_loss_clip": 0.01052548, + "auxiliary_loss_mlp": 0.01034203, + "balance_loss_clip": 1.01019454, + "balance_loss_mlp": 1.0157392, + "epoch": 0.7464301818728393, + "flos": 33431396446080.0, + "grad_norm": 1.7940283134453394, + "language_loss": 0.55926943, + "learning_rate": 6.372839737918154e-07, + "loss": 0.58013701, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 12415, + "time_per_iteration": 2.459868907928467 + }, + { + "auxiliary_loss_clip": 0.01053219, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.01710522, + "balance_loss_mlp": 1.01708388, + "epoch": 0.7464903051255073, + "flos": 26868667449600.0, + "grad_norm": 1.4494394481777901, + "language_loss": 0.75480181, + "learning_rate": 6.369989330318506e-07, + "loss": 0.7757374, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 12416, + "time_per_iteration": 2.419088125228882 + }, + { + "auxiliary_loss_clip": 0.01053778, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.01546681, + "balance_loss_mlp": 1.01686847, + "epoch": 0.7465504283781753, + "flos": 44085387271680.0, + "grad_norm": 1.6744309602791239, + "language_loss": 0.70200956, + "learning_rate": 6.367139439570233e-07, + "loss": 0.72294986, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 12417, + "time_per_iteration": 2.549337387084961 + }, + { + "auxiliary_loss_clip": 0.0105399, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.01438904, + "balance_loss_mlp": 1.01720369, + "epoch": 0.7466105516308432, + "flos": 19675267920000.0, + "grad_norm": 1.731607483149177, + "language_loss": 0.75271821, + "learning_rate": 6.364290065781392e-07, + "loss": 0.77362871, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 12418, + "time_per_iteration": 2.392871379852295 + }, + { + "auxiliary_loss_clip": 0.01052824, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.01574385, + "balance_loss_mlp": 1.0166173, + "epoch": 0.7466706748835112, + "flos": 20519711907840.0, + "grad_norm": 1.5963750114971749, + "language_loss": 0.70328259, + "learning_rate": 6.361441209060039e-07, + "loss": 0.72420198, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36132812, + "step": 12419, + "time_per_iteration": 2.3679373264312744 + }, + { + "auxiliary_loss_clip": 0.01048784, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.01569414, + "balance_loss_mlp": 1.01517236, + "epoch": 0.7467307981361792, + "flos": 21689162542080.0, + "grad_norm": 1.8976279750879554, + "language_loss": 0.75543422, + "learning_rate": 6.358592869514216e-07, + "loss": 0.77629846, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3359375, + "step": 12420, + "time_per_iteration": 2.4110121726989746 + }, + { + "auxiliary_loss_clip": 0.0105532, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.01720059, + "balance_loss_mlp": 1.0181309, + "epoch": 0.7467909213888472, + "flos": 19572657834240.0, + "grad_norm": 3.5091054774431507, + "language_loss": 0.686396, + "learning_rate": 6.355745047251904e-07, + "loss": 0.70737147, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 12421, + "time_per_iteration": 2.379145860671997 + }, + { + "auxiliary_loss_clip": 0.0105454, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.01552677, + "balance_loss_mlp": 1.01647139, + "epoch": 0.7468510446415151, + "flos": 23694119856000.0, + "grad_norm": 1.5988337052739932, + "language_loss": 0.73425555, + "learning_rate": 6.352897742381107e-07, + "loss": 0.75522137, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38085938, + "step": 12422, + "time_per_iteration": 2.4188194274902344 + }, + { + "auxiliary_loss_clip": 0.01051304, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.01118922, + "balance_loss_mlp": 1.01573634, + "epoch": 0.7469111678941831, + "flos": 29314472901120.0, + "grad_norm": 2.3107416866431976, + "language_loss": 0.75640869, + "learning_rate": 6.350050955009796e-07, + "loss": 0.77727193, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 12423, + "time_per_iteration": 2.4373514652252197 + }, + { + "auxiliary_loss_clip": 0.01049723, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.00880194, + "balance_loss_mlp": 1.01443124, + "epoch": 0.746971291146851, + "flos": 21797567913600.0, + "grad_norm": 1.3174064564151, + "language_loss": 0.68717623, + "learning_rate": 6.347204685245929e-07, + "loss": 0.70798206, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 12424, + "time_per_iteration": 2.401118516921997 + }, + { + "auxiliary_loss_clip": 0.01053493, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.01599813, + "balance_loss_mlp": 1.01649928, + "epoch": 0.747031414399519, + "flos": 36243650194560.0, + "grad_norm": 1.7627875263322692, + "language_loss": 0.75738668, + "learning_rate": 6.344358933197418e-07, + "loss": 0.77831644, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 12425, + "time_per_iteration": 2.5140438079833984 + }, + { + "auxiliary_loss_clip": 0.01052063, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.00987649, + "balance_loss_mlp": 1.01748037, + "epoch": 0.7470915376521869, + "flos": 19973879712000.0, + "grad_norm": 1.9387515545433291, + "language_loss": 0.71978164, + "learning_rate": 6.341513698972194e-07, + "loss": 0.74062037, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34570312, + "step": 12426, + "time_per_iteration": 2.360137939453125 + }, + { + "auxiliary_loss_clip": 0.01050162, + "auxiliary_loss_mlp": 0.0104197, + "balance_loss_clip": 1.01853311, + "balance_loss_mlp": 1.0154711, + "epoch": 0.747151660904855, + "flos": 20083262601600.0, + "grad_norm": 1.390102824358665, + "language_loss": 0.65687621, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67779756, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 12427, + "time_per_iteration": 2.389765977859497 + }, + { + "auxiliary_loss_clip": 0.01052617, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.01323509, + "balance_loss_mlp": 1.01672029, + "epoch": 0.7472117841575229, + "flos": 16289425578240.0, + "grad_norm": 1.570123428769494, + "language_loss": 0.75300717, + "learning_rate": 6.335824784423118e-07, + "loss": 0.77390265, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 12428, + "time_per_iteration": 2.3262596130371094 + }, + { + "auxiliary_loss_clip": 0.01055711, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.01227307, + "balance_loss_mlp": 1.01741838, + "epoch": 0.7472719074101909, + "flos": 21389084472960.0, + "grad_norm": 1.9066267886275796, + "language_loss": 0.59155571, + "learning_rate": 6.33298110431499e-07, + "loss": 0.6124922, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 12429, + "time_per_iteration": 2.4026732444763184 + }, + { + "auxiliary_loss_clip": 0.01055651, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.0136466, + "balance_loss_mlp": 1.01707757, + "epoch": 0.7473320306628589, + "flos": 29641993165440.0, + "grad_norm": 1.9292129391456336, + "language_loss": 0.62139767, + "learning_rate": 6.330137942461595e-07, + "loss": 0.64232415, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.38671875, + "step": 12430, + "time_per_iteration": 2.423727035522461 + }, + { + "auxiliary_loss_clip": 0.01052081, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.01335514, + "balance_loss_mlp": 1.01671255, + "epoch": 0.7473921539155268, + "flos": 24134898170880.0, + "grad_norm": 1.3856718291695302, + "language_loss": 0.76050919, + "learning_rate": 6.327295298970734e-07, + "loss": 0.78138417, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 12431, + "time_per_iteration": 2.427079677581787 + }, + { + "auxiliary_loss_clip": 0.01052291, + "auxiliary_loss_mlp": 0.01035312, + "balance_loss_clip": 1.01300776, + "balance_loss_mlp": 1.01569521, + "epoch": 0.7474522771681948, + "flos": 17487156280320.0, + "grad_norm": 1.8915791579097403, + "language_loss": 0.76385272, + "learning_rate": 6.32445317395021e-07, + "loss": 0.78472871, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36523438, + "step": 12432, + "time_per_iteration": 2.320988416671753 + }, + { + "auxiliary_loss_clip": 0.01054875, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.012079, + "balance_loss_mlp": 1.0157907, + "epoch": 0.7475124004208628, + "flos": 16726363643520.0, + "grad_norm": 1.8243589528574065, + "language_loss": 0.70935893, + "learning_rate": 6.321611567507787e-07, + "loss": 0.73030078, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 12433, + "time_per_iteration": 2.384920120239258 + }, + { + "auxiliary_loss_clip": 0.01052227, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.01290917, + "balance_loss_mlp": 1.01570857, + "epoch": 0.7475725236735308, + "flos": 19719188277120.0, + "grad_norm": 1.6712211378964323, + "language_loss": 0.68431354, + "learning_rate": 6.318770479751232e-07, + "loss": 0.70520735, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 12434, + "time_per_iteration": 3.6710000038146973 + }, + { + "auxiliary_loss_clip": 0.01048822, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.01419914, + "balance_loss_mlp": 1.01600528, + "epoch": 0.7476326469261987, + "flos": 26284814916480.0, + "grad_norm": 1.447182308639515, + "language_loss": 0.80543637, + "learning_rate": 6.315929910788263e-07, + "loss": 0.82627392, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.328125, + "step": 12435, + "time_per_iteration": 2.412912130355835 + }, + { + "auxiliary_loss_clip": 0.01054388, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.01302755, + "balance_loss_mlp": 1.01704311, + "epoch": 0.7476927701788667, + "flos": 31830488830080.0, + "grad_norm": 1.6874808513904371, + "language_loss": 0.69138527, + "learning_rate": 6.313089860726604e-07, + "loss": 0.71229106, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37304688, + "step": 12436, + "time_per_iteration": 2.4799132347106934 + }, + { + "auxiliary_loss_clip": 0.0105466, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.01512396, + "balance_loss_mlp": 1.01703715, + "epoch": 0.7477528934315346, + "flos": 31794144238080.0, + "grad_norm": 1.5092589997394859, + "language_loss": 0.71793187, + "learning_rate": 6.31025032967396e-07, + "loss": 0.73887163, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 12437, + "time_per_iteration": 2.4558494091033936 + }, + { + "auxiliary_loss_clip": 0.01049074, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.01346207, + "balance_loss_mlp": 1.01456165, + "epoch": 0.7478130166842026, + "flos": 20371051872000.0, + "grad_norm": 1.9804116280304538, + "language_loss": 0.68755651, + "learning_rate": 6.307411317737986e-07, + "loss": 0.70839167, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34570312, + "step": 12438, + "time_per_iteration": 2.3995018005371094 + }, + { + "auxiliary_loss_clip": 0.01052279, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.01163816, + "balance_loss_mlp": 1.01596272, + "epoch": 0.7478731399368705, + "flos": 18147992094720.0, + "grad_norm": 1.7257608317602304, + "language_loss": 0.812033, + "learning_rate": 6.304572825026344e-07, + "loss": 0.83290362, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 12439, + "time_per_iteration": 5.055356740951538 + }, + { + "auxiliary_loss_clip": 0.01052758, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.01388013, + "balance_loss_mlp": 1.01650214, + "epoch": 0.7479332631895386, + "flos": 15266994145920.0, + "grad_norm": 2.31616099872374, + "language_loss": 0.72367632, + "learning_rate": 6.301734851646674e-07, + "loss": 0.7445612, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 12440, + "time_per_iteration": 2.3562893867492676 + }, + { + "auxiliary_loss_clip": 0.01051119, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.01370311, + "balance_loss_mlp": 1.01716518, + "epoch": 0.7479933864422065, + "flos": 21141445132800.0, + "grad_norm": 1.6439077457450508, + "language_loss": 0.7546128, + "learning_rate": 6.298897397706597e-07, + "loss": 0.77548242, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33984375, + "step": 12441, + "time_per_iteration": 2.361109972000122 + }, + { + "auxiliary_loss_clip": 0.0105451, + "auxiliary_loss_mlp": 0.01038456, + "balance_loss_clip": 1.01488864, + "balance_loss_mlp": 1.01748919, + "epoch": 0.7480535096948745, + "flos": 14391162979200.0, + "grad_norm": 2.001860709397914, + "language_loss": 0.8350122, + "learning_rate": 6.296060463313698e-07, + "loss": 0.85594189, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 12442, + "time_per_iteration": 2.355724334716797 + }, + { + "auxiliary_loss_clip": 0.01053088, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.01629996, + "balance_loss_mlp": 1.01618695, + "epoch": 0.7481136329475425, + "flos": 27343451295360.0, + "grad_norm": 2.46853129116258, + "language_loss": 0.63977551, + "learning_rate": 6.293224048575565e-07, + "loss": 0.66072011, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 12443, + "time_per_iteration": 2.415341854095459 + }, + { + "auxiliary_loss_clip": 0.01050901, + "auxiliary_loss_mlp": 0.01034232, + "balance_loss_clip": 1.01291728, + "balance_loss_mlp": 1.01561356, + "epoch": 0.7481737562002104, + "flos": 19530587779200.0, + "grad_norm": 1.9231871923458341, + "language_loss": 0.72931045, + "learning_rate": 6.29038815359975e-07, + "loss": 0.75016177, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35351562, + "step": 12444, + "time_per_iteration": 2.365589141845703 + }, + { + "auxiliary_loss_clip": 0.01051164, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.01152813, + "balance_loss_mlp": 1.01586771, + "epoch": 0.7482338794528784, + "flos": 21759023905920.0, + "grad_norm": 1.3723783903237237, + "language_loss": 0.69572556, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71659052, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 12445, + "time_per_iteration": 2.376697540283203 + }, + { + "auxiliary_loss_clip": 0.01049497, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.01152372, + "balance_loss_mlp": 1.01508653, + "epoch": 0.7482940027055464, + "flos": 18696372819840.0, + "grad_norm": 1.9580184937349534, + "language_loss": 0.75173855, + "learning_rate": 6.28471792336519e-07, + "loss": 0.77257818, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 12446, + "time_per_iteration": 2.363255262374878 + }, + { + "auxiliary_loss_clip": 0.01055465, + "auxiliary_loss_mlp": 0.01042408, + "balance_loss_clip": 1.01587248, + "balance_loss_mlp": 1.01632833, + "epoch": 0.7483541259582144, + "flos": 15997097831040.0, + "grad_norm": 2.1832138540012824, + "language_loss": 0.74338037, + "learning_rate": 6.281883588321475e-07, + "loss": 0.76435912, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 12447, + "time_per_iteration": 2.3159799575805664 + }, + { + "auxiliary_loss_clip": 0.01051149, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.01484275, + "balance_loss_mlp": 1.01528668, + "epoch": 0.7484142492108823, + "flos": 25555130167680.0, + "grad_norm": 3.4553609144978386, + "language_loss": 0.73699647, + "learning_rate": 6.279049773470109e-07, + "loss": 0.75787526, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 12448, + "time_per_iteration": 2.4142839908599854 + }, + { + "auxiliary_loss_clip": 0.01053014, + "auxiliary_loss_mlp": 0.0104364, + "balance_loss_clip": 1.02051377, + "balance_loss_mlp": 1.01691377, + "epoch": 0.7484743724635503, + "flos": 22886788510080.0, + "grad_norm": 2.476837360108069, + "language_loss": 0.75060284, + "learning_rate": 6.276216478918543e-07, + "loss": 0.77156937, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 12449, + "time_per_iteration": 2.3679232597351074 + }, + { + "auxiliary_loss_clip": 0.0105599, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.01477528, + "balance_loss_mlp": 1.01815343, + "epoch": 0.7485344957162182, + "flos": 25299147012480.0, + "grad_norm": 2.1465246557706044, + "language_loss": 0.6343075, + "learning_rate": 6.273383704774225e-07, + "loss": 0.65524691, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37890625, + "step": 12450, + "time_per_iteration": 2.4396159648895264 + }, + { + "auxiliary_loss_clip": 0.01048308, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01200819, + "balance_loss_mlp": 1.01469994, + "epoch": 0.7485946189688862, + "flos": 27051786864000.0, + "grad_norm": 2.0859385264812094, + "language_loss": 0.71180999, + "learning_rate": 6.270551451144577e-07, + "loss": 0.73261368, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.3359375, + "step": 12451, + "time_per_iteration": 2.399294853210449 + }, + { + "auxiliary_loss_clip": 0.01053514, + "auxiliary_loss_mlp": 0.01038765, + "balance_loss_clip": 1.01418447, + "balance_loss_mlp": 1.01682174, + "epoch": 0.7486547422215541, + "flos": 26905535712000.0, + "grad_norm": 3.8230313605256367, + "language_loss": 0.81181842, + "learning_rate": 6.267719718136988e-07, + "loss": 0.83274126, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 12452, + "time_per_iteration": 3.8183603286743164 + }, + { + "auxiliary_loss_clip": 0.01054638, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.0153327, + "balance_loss_mlp": 1.01743793, + "epoch": 0.7487148654742222, + "flos": 22345180588800.0, + "grad_norm": 2.275808008116949, + "language_loss": 0.72695917, + "learning_rate": 6.264888505858843e-07, + "loss": 0.74789929, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 12453, + "time_per_iteration": 2.3526365756988525 + }, + { + "auxiliary_loss_clip": 0.01052179, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.01577091, + "balance_loss_mlp": 1.01593065, + "epoch": 0.7487749887268901, + "flos": 23037717784320.0, + "grad_norm": 1.7390676564516472, + "language_loss": 0.75031334, + "learning_rate": 6.262057814417517e-07, + "loss": 0.77121258, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 12454, + "time_per_iteration": 2.398498058319092 + }, + { + "auxiliary_loss_clip": 0.01008112, + "auxiliary_loss_mlp": 0.01002309, + "balance_loss_clip": 0.99996078, + "balance_loss_mlp": 1.00126076, + "epoch": 0.7488351119795581, + "flos": 71521915985280.0, + "grad_norm": 0.733774119604931, + "language_loss": 0.59418893, + "learning_rate": 6.259227643920322e-07, + "loss": 0.6142931, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.06835938, + "step": 12455, + "time_per_iteration": 3.165294647216797 + }, + { + "auxiliary_loss_clip": 0.01052135, + "auxiliary_loss_mlp": 0.01042776, + "balance_loss_clip": 1.02082908, + "balance_loss_mlp": 1.01681316, + "epoch": 0.748895235232226, + "flos": 17195456937600.0, + "grad_norm": 1.8571525995965872, + "language_loss": 0.80472636, + "learning_rate": 6.256397994474592e-07, + "loss": 0.82567543, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 12456, + "time_per_iteration": 2.32230281829834 + }, + { + "auxiliary_loss_clip": 0.01008395, + "auxiliary_loss_mlp": 0.01004988, + "balance_loss_clip": 1.0027349, + "balance_loss_mlp": 1.00150442, + "epoch": 0.748955358484894, + "flos": 58976086250880.0, + "grad_norm": 0.8386605832083294, + "language_loss": 0.61447418, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63460803, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.06884766, + "step": 12457, + "time_per_iteration": 2.94266414642334 + }, + { + "auxiliary_loss_clip": 0.01053616, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.0102942, + "balance_loss_mlp": 1.01745999, + "epoch": 0.749015481737562, + "flos": 11359724515200.0, + "grad_norm": 2.9119262264564743, + "language_loss": 0.68418843, + "learning_rate": 6.250740259166711e-07, + "loss": 0.70505208, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 12458, + "time_per_iteration": 2.3119161128997803 + }, + { + "auxiliary_loss_clip": 0.01050815, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.0169282, + "balance_loss_mlp": 1.01585126, + "epoch": 0.74907560499023, + "flos": 21105414743040.0, + "grad_norm": 1.9735432571073708, + "language_loss": 0.8042208, + "learning_rate": 6.247912173519106e-07, + "loss": 0.82511437, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34960938, + "step": 12459, + "time_per_iteration": 2.3782150745391846 + }, + { + "auxiliary_loss_clip": 0.01051433, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_clip": 1.01816797, + "balance_loss_mlp": 1.01625586, + "epoch": 0.749135728242898, + "flos": 22267080144000.0, + "grad_norm": 1.869124539789917, + "language_loss": 0.81040013, + "learning_rate": 6.245084609352043e-07, + "loss": 0.83133078, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 12460, + "time_per_iteration": 2.364835739135742 + }, + { + "auxiliary_loss_clip": 0.01052313, + "auxiliary_loss_mlp": 0.01037768, + "balance_loss_clip": 1.0143795, + "balance_loss_mlp": 1.01628113, + "epoch": 0.7491958514955659, + "flos": 24056483523840.0, + "grad_norm": 1.8676211947868342, + "language_loss": 0.86867917, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88958001, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 12461, + "time_per_iteration": 2.409587860107422 + }, + { + "auxiliary_loss_clip": 0.01050322, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.01645577, + "balance_loss_mlp": 1.01582527, + "epoch": 0.7492559747482339, + "flos": 24491152350720.0, + "grad_norm": 1.9263708291082304, + "language_loss": 0.70792753, + "learning_rate": 6.239431045888435e-07, + "loss": 0.72879946, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.34570312, + "step": 12462, + "time_per_iteration": 2.407344102859497 + }, + { + "auxiliary_loss_clip": 0.01052141, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.01710868, + "balance_loss_mlp": 1.01581931, + "epoch": 0.7493160980009018, + "flos": 27744114591360.0, + "grad_norm": 1.9611855701202572, + "language_loss": 0.71166098, + "learning_rate": 6.236605046806267e-07, + "loss": 0.7326088, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 12463, + "time_per_iteration": 2.41162109375 + }, + { + "auxiliary_loss_clip": 0.01052636, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.0148654, + "balance_loss_mlp": 1.01722932, + "epoch": 0.7493762212535698, + "flos": 30224903091840.0, + "grad_norm": 2.0923394788370313, + "language_loss": 0.79009974, + "learning_rate": 6.233779569633419e-07, + "loss": 0.81099153, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 12464, + "time_per_iteration": 2.4782421588897705 + }, + { + "auxiliary_loss_clip": 0.01051025, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.01506329, + "balance_loss_mlp": 1.01566839, + "epoch": 0.7494363445062378, + "flos": 21943400129280.0, + "grad_norm": 1.8869620798325428, + "language_loss": 0.79940766, + "learning_rate": 6.230954614477034e-07, + "loss": 0.82028669, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35351562, + "step": 12465, + "time_per_iteration": 2.3632140159606934 + }, + { + "auxiliary_loss_clip": 0.01056935, + "auxiliary_loss_mlp": 0.01042149, + "balance_loss_clip": 1.01661479, + "balance_loss_mlp": 1.01754153, + "epoch": 0.7494964677589058, + "flos": 12489653623680.0, + "grad_norm": 2.832878361071903, + "language_loss": 0.75100154, + "learning_rate": 6.22813018144422e-07, + "loss": 0.77199239, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.39453125, + "step": 12466, + "time_per_iteration": 2.3843109607696533 + }, + { + "auxiliary_loss_clip": 0.01052204, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.01327109, + "balance_loss_mlp": 1.01607466, + "epoch": 0.7495565910115737, + "flos": 21651980077440.0, + "grad_norm": 2.2510430722444617, + "language_loss": 0.67295957, + "learning_rate": 6.22530627064209e-07, + "loss": 0.6938349, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36132812, + "step": 12467, + "time_per_iteration": 2.367713212966919 + }, + { + "auxiliary_loss_clip": 0.01051703, + "auxiliary_loss_mlp": 0.01035985, + "balance_loss_clip": 1.01294208, + "balance_loss_mlp": 1.01604986, + "epoch": 0.7496167142642417, + "flos": 15267622550400.0, + "grad_norm": 2.6688479273867705, + "language_loss": 0.78069305, + "learning_rate": 6.222482882177735e-07, + "loss": 0.80156994, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 12468, + "time_per_iteration": 2.3621609210968018 + }, + { + "auxiliary_loss_clip": 0.01050851, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.01297808, + "balance_loss_mlp": 1.01625657, + "epoch": 0.7496768375169096, + "flos": 22053830359680.0, + "grad_norm": 2.2713805863433008, + "language_loss": 0.70285594, + "learning_rate": 6.219660016158201e-07, + "loss": 0.72372091, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 12469, + "time_per_iteration": 2.3959569931030273 + }, + { + "auxiliary_loss_clip": 0.01054407, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.01398873, + "balance_loss_mlp": 1.01721668, + "epoch": 0.7497369607695776, + "flos": 19056187958400.0, + "grad_norm": 1.9426787872142965, + "language_loss": 0.70463216, + "learning_rate": 6.216837672690543e-07, + "loss": 0.7255491, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 12470, + "time_per_iteration": 2.3620617389678955 + }, + { + "auxiliary_loss_clip": 0.01054583, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.01495314, + "balance_loss_mlp": 1.01665473, + "epoch": 0.7497970840222457, + "flos": 21616333712640.0, + "grad_norm": 2.5737424131442, + "language_loss": 0.76029015, + "learning_rate": 6.214015851881793e-07, + "loss": 0.78124368, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 12471, + "time_per_iteration": 2.3594651222229004 + }, + { + "auxiliary_loss_clip": 0.01052717, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.00980222, + "balance_loss_mlp": 1.01635814, + "epoch": 0.7498572072749136, + "flos": 13734725996160.0, + "grad_norm": 4.3338726673471015, + "language_loss": 0.78617728, + "learning_rate": 6.211194553838929e-07, + "loss": 0.807037, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 12472, + "time_per_iteration": 2.3438053131103516 + }, + { + "auxiliary_loss_clip": 0.0105121, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01094723, + "balance_loss_mlp": 1.01539123, + "epoch": 0.7499173305275816, + "flos": 22965412625280.0, + "grad_norm": 1.6119644705502016, + "language_loss": 0.8501358, + "learning_rate": 6.208373778668951e-07, + "loss": 0.87096846, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.359375, + "step": 12473, + "time_per_iteration": 3.6447575092315674 + }, + { + "auxiliary_loss_clip": 0.01055165, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.01585555, + "balance_loss_mlp": 1.01723731, + "epoch": 0.7499774537802495, + "flos": 22739559840000.0, + "grad_norm": 2.3152023353326707, + "language_loss": 0.75504935, + "learning_rate": 6.205553526478829e-07, + "loss": 0.77601898, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 12474, + "time_per_iteration": 2.410468578338623 + }, + { + "auxiliary_loss_clip": 0.01054726, + "auxiliary_loss_mlp": 0.01038052, + "balance_loss_clip": 1.01354241, + "balance_loss_mlp": 1.01633096, + "epoch": 0.7500375770329175, + "flos": 18295569878400.0, + "grad_norm": 1.9714629024381431, + "language_loss": 0.75282323, + "learning_rate": 6.202733797375492e-07, + "loss": 0.77375102, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38476562, + "step": 12475, + "time_per_iteration": 2.3493924140930176 + }, + { + "auxiliary_loss_clip": 0.01055486, + "auxiliary_loss_mlp": 0.01039321, + "balance_loss_clip": 1.01239169, + "balance_loss_mlp": 1.01661849, + "epoch": 0.7500977002855854, + "flos": 19169027072640.0, + "grad_norm": 1.9511248900722415, + "language_loss": 0.81376183, + "learning_rate": 6.199914591465878e-07, + "loss": 0.83470994, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38867188, + "step": 12476, + "time_per_iteration": 2.400913715362549 + }, + { + "auxiliary_loss_clip": 0.01051436, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.01551986, + "balance_loss_mlp": 1.01558018, + "epoch": 0.7501578235382534, + "flos": 22162794312960.0, + "grad_norm": 4.442719893819561, + "language_loss": 0.79156005, + "learning_rate": 6.19709590885688e-07, + "loss": 0.81246734, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 12477, + "time_per_iteration": 2.3656206130981445 + }, + { + "auxiliary_loss_clip": 0.01008814, + "auxiliary_loss_mlp": 0.01002452, + "balance_loss_clip": 1.00007951, + "balance_loss_mlp": 1.00169456, + "epoch": 0.7502179467909214, + "flos": 64462374074880.0, + "grad_norm": 0.8087064905181407, + "language_loss": 0.54541051, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56552315, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.07128906, + "step": 12478, + "time_per_iteration": 4.239579200744629 + }, + { + "auxiliary_loss_clip": 0.01051499, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.01481509, + "balance_loss_mlp": 1.01711679, + "epoch": 0.7502780700435894, + "flos": 20477432384640.0, + "grad_norm": 1.842790307447499, + "language_loss": 0.80996758, + "learning_rate": 6.191460113968272e-07, + "loss": 0.83085388, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 12479, + "time_per_iteration": 3.8660566806793213 + }, + { + "auxiliary_loss_clip": 0.01055275, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.01622939, + "balance_loss_mlp": 1.01799202, + "epoch": 0.7503381932962573, + "flos": 20444334549120.0, + "grad_norm": 1.9689502705691357, + "language_loss": 0.65477443, + "learning_rate": 6.188643001902369e-07, + "loss": 0.67573059, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37304688, + "step": 12480, + "time_per_iteration": 2.411593198776245 + }, + { + "auxiliary_loss_clip": 0.01049942, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.01548076, + "balance_loss_mlp": 1.015764, + "epoch": 0.7503983165489253, + "flos": 22380861864960.0, + "grad_norm": 1.8275288854566887, + "language_loss": 0.78267688, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80353326, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34179688, + "step": 12481, + "time_per_iteration": 2.369314670562744 + }, + { + "auxiliary_loss_clip": 0.01053435, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.01422477, + "balance_loss_mlp": 1.01576877, + "epoch": 0.7504584398015932, + "flos": 24898309159680.0, + "grad_norm": 1.6581793000644778, + "language_loss": 0.72064519, + "learning_rate": 6.183010349061501e-07, + "loss": 0.74156165, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37695312, + "step": 12482, + "time_per_iteration": 2.3868229389190674 + }, + { + "auxiliary_loss_clip": 0.01053257, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.01346529, + "balance_loss_mlp": 1.01712346, + "epoch": 0.7505185630542612, + "flos": 25884046886400.0, + "grad_norm": 1.707358230876113, + "language_loss": 0.7123096, + "learning_rate": 6.180194808500118e-07, + "loss": 0.73323226, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.359375, + "step": 12483, + "time_per_iteration": 2.4292056560516357 + }, + { + "auxiliary_loss_clip": 0.01052223, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.01292121, + "balance_loss_mlp": 1.01629043, + "epoch": 0.7505786863069293, + "flos": 23142876399360.0, + "grad_norm": 1.7537840143334733, + "language_loss": 0.75978637, + "learning_rate": 6.177379791987131e-07, + "loss": 0.78067052, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 12484, + "time_per_iteration": 2.377311944961548 + }, + { + "auxiliary_loss_clip": 0.01051641, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.01667285, + "balance_loss_mlp": 1.01611412, + "epoch": 0.7506388095595972, + "flos": 16982416621440.0, + "grad_norm": 1.8122834202668094, + "language_loss": 0.86052936, + "learning_rate": 6.174565299629295e-07, + "loss": 0.88144308, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 12485, + "time_per_iteration": 2.3662514686584473 + }, + { + "auxiliary_loss_clip": 0.01052026, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.01436436, + "balance_loss_mlp": 1.01687968, + "epoch": 0.7506989328122652, + "flos": 22343923779840.0, + "grad_norm": 1.583703267143101, + "language_loss": 0.78662485, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80750442, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 12486, + "time_per_iteration": 2.362147092819214 + }, + { + "auxiliary_loss_clip": 0.01054819, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.01305366, + "balance_loss_mlp": 1.01734638, + "epoch": 0.7507590560649331, + "flos": 25774873464960.0, + "grad_norm": 2.304987391871904, + "language_loss": 0.73707044, + "learning_rate": 6.168937887805932e-07, + "loss": 0.75797737, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.375, + "step": 12487, + "time_per_iteration": 2.4182982444763184 + }, + { + "auxiliary_loss_clip": 0.01053268, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.01080012, + "balance_loss_mlp": 1.01604152, + "epoch": 0.7508191793176011, + "flos": 24278286591360.0, + "grad_norm": 2.2163661815682905, + "language_loss": 0.68605685, + "learning_rate": 6.166124968553801e-07, + "loss": 0.70692408, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37304688, + "step": 12488, + "time_per_iteration": 2.421172857284546 + }, + { + "auxiliary_loss_clip": 0.01052045, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.01213062, + "balance_loss_mlp": 1.01629996, + "epoch": 0.750879302570269, + "flos": 19898607087360.0, + "grad_norm": 1.9990932064711822, + "language_loss": 0.78583944, + "learning_rate": 6.163312573883592e-07, + "loss": 0.80672693, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 12489, + "time_per_iteration": 2.360384702682495 + }, + { + "auxiliary_loss_clip": 0.01050721, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.01861703, + "balance_loss_mlp": 1.01515269, + "epoch": 0.750939425822937, + "flos": 29204391784320.0, + "grad_norm": 1.8435559595526412, + "language_loss": 0.76464999, + "learning_rate": 6.160500703901956e-07, + "loss": 0.78557611, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 12490, + "time_per_iteration": 2.44028902053833 + }, + { + "auxiliary_loss_clip": 0.01051087, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.01163208, + "balance_loss_mlp": 1.01591396, + "epoch": 0.750999549075605, + "flos": 21141235664640.0, + "grad_norm": 1.5744094111060265, + "language_loss": 0.79617691, + "learning_rate": 6.157689358715527e-07, + "loss": 0.8170312, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 12491, + "time_per_iteration": 3.83137845993042 + }, + { + "auxiliary_loss_clip": 0.0104881, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.01526296, + "balance_loss_mlp": 1.0147028, + "epoch": 0.751059672328273, + "flos": 23546681717760.0, + "grad_norm": 1.869787844391525, + "language_loss": 0.77290964, + "learning_rate": 6.154878538430899e-07, + "loss": 0.79376251, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34179688, + "step": 12492, + "time_per_iteration": 2.401094675064087 + }, + { + "auxiliary_loss_clip": 0.01051019, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.0116334, + "balance_loss_mlp": 1.01558399, + "epoch": 0.7511197955809409, + "flos": 18988735478400.0, + "grad_norm": 1.892584456461371, + "language_loss": 0.7225765, + "learning_rate": 6.152068243154671e-07, + "loss": 0.74341697, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35351562, + "step": 12493, + "time_per_iteration": 2.36411190032959 + }, + { + "auxiliary_loss_clip": 0.01052719, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.01234925, + "balance_loss_mlp": 1.01669669, + "epoch": 0.7511799188336089, + "flos": 22046080037760.0, + "grad_norm": 1.6626626808198401, + "language_loss": 0.81411946, + "learning_rate": 6.149258472993395e-07, + "loss": 0.83499968, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 12494, + "time_per_iteration": 2.38456392288208 + }, + { + "auxiliary_loss_clip": 0.01053972, + "auxiliary_loss_mlp": 0.0104094, + "balance_loss_clip": 1.01532209, + "balance_loss_mlp": 1.01671922, + "epoch": 0.7512400420862768, + "flos": 16466330770560.0, + "grad_norm": 1.7361912941426558, + "language_loss": 0.80148375, + "learning_rate": 6.146449228053634e-07, + "loss": 0.82243288, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37109375, + "step": 12495, + "time_per_iteration": 2.3650412559509277 + }, + { + "auxiliary_loss_clip": 0.01052907, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_clip": 1.01694191, + "balance_loss_mlp": 1.01576662, + "epoch": 0.7513001653389448, + "flos": 20447302014720.0, + "grad_norm": 2.241029849699323, + "language_loss": 0.7152195, + "learning_rate": 6.143640508441898e-07, + "loss": 0.73614109, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.37109375, + "step": 12496, + "time_per_iteration": 2.3668289184570312 + }, + { + "auxiliary_loss_clip": 0.01052679, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.01663876, + "epoch": 0.7513602885916129, + "flos": 23475703190400.0, + "grad_norm": 1.5490331166045441, + "language_loss": 0.79001486, + "learning_rate": 6.140832314264705e-07, + "loss": 0.81091404, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.359375, + "step": 12497, + "time_per_iteration": 2.4206466674804688 + }, + { + "auxiliary_loss_clip": 0.01053035, + "auxiliary_loss_mlp": 0.01043413, + "balance_loss_clip": 1.01932108, + "balance_loss_mlp": 1.01662588, + "epoch": 0.7514204118442808, + "flos": 26796013176960.0, + "grad_norm": 1.6992893310447554, + "language_loss": 0.77896118, + "learning_rate": 6.13802464562855e-07, + "loss": 0.79992568, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 12498, + "time_per_iteration": 2.395012617111206 + }, + { + "auxiliary_loss_clip": 0.01050749, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.01419115, + "balance_loss_mlp": 1.01670146, + "epoch": 0.7514805350969488, + "flos": 19864601556480.0, + "grad_norm": 1.7489822417009924, + "language_loss": 0.74474418, + "learning_rate": 6.135217502639878e-07, + "loss": 0.7656129, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33984375, + "step": 12499, + "time_per_iteration": 2.3836405277252197 + }, + { + "auxiliary_loss_clip": 0.01050739, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.0105288, + "balance_loss_mlp": 1.01510406, + "epoch": 0.7515406583496167, + "flos": 24570404870400.0, + "grad_norm": 2.5471870755155654, + "language_loss": 0.80414855, + "learning_rate": 6.132410885405148e-07, + "loss": 0.82496619, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.35546875, + "step": 12500, + "time_per_iteration": 2.3805432319641113 + }, + { + "auxiliary_loss_clip": 0.01055847, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.01447272, + "balance_loss_mlp": 1.0170784, + "epoch": 0.7516007816022847, + "flos": 20119223168640.0, + "grad_norm": 2.2964602961715994, + "language_loss": 0.74736398, + "learning_rate": 6.129604794030794e-07, + "loss": 0.76834887, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.38671875, + "step": 12501, + "time_per_iteration": 2.3835690021514893 + }, + { + "auxiliary_loss_clip": 0.01051665, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.01069105, + "balance_loss_mlp": 1.01589012, + "epoch": 0.7516609048549526, + "flos": 22783515108480.0, + "grad_norm": 1.705486259478708, + "language_loss": 0.78999037, + "learning_rate": 6.126799228623207e-07, + "loss": 0.81083745, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 12502, + "time_per_iteration": 2.3851442337036133 + }, + { + "auxiliary_loss_clip": 0.0105306, + "auxiliary_loss_mlp": 0.01037579, + "balance_loss_clip": 1.01302183, + "balance_loss_mlp": 1.01632738, + "epoch": 0.7517210281076206, + "flos": 10633251611520.0, + "grad_norm": 2.21844763504999, + "language_loss": 0.7307421, + "learning_rate": 6.123994189288786e-07, + "loss": 0.75164843, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 12503, + "time_per_iteration": 2.3683152198791504 + }, + { + "auxiliary_loss_clip": 0.01008427, + "auxiliary_loss_mlp": 0.01003545, + "balance_loss_clip": 1.00098217, + "balance_loss_mlp": 1.00137234, + "epoch": 0.7517811513602886, + "flos": 66049001078400.0, + "grad_norm": 1.2206335361431635, + "language_loss": 0.64133179, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66145152, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.0703125, + "step": 12504, + "time_per_iteration": 2.9171221256256104 + }, + { + "auxiliary_loss_clip": 0.01049366, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.01344657, + "balance_loss_mlp": 1.01526046, + "epoch": 0.7518412746129566, + "flos": 37266849676800.0, + "grad_norm": 1.678412278779344, + "language_loss": 0.69442666, + "learning_rate": 6.118385689264896e-07, + "loss": 0.71528286, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.33984375, + "step": 12505, + "time_per_iteration": 2.551549196243286 + }, + { + "auxiliary_loss_clip": 0.01008812, + "auxiliary_loss_mlp": 0.01002811, + "balance_loss_clip": 1.00053442, + "balance_loss_mlp": 1.00198722, + "epoch": 0.7519013978656245, + "flos": 60515162115840.0, + "grad_norm": 0.6469383450057987, + "language_loss": 0.55169588, + "learning_rate": 6.11558222878809e-07, + "loss": 0.57181215, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.06835938, + "step": 12506, + "time_per_iteration": 3.181152105331421 + }, + { + "auxiliary_loss_clip": 0.01053246, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.01373875, + "balance_loss_mlp": 1.01722634, + "epoch": 0.7519615211182925, + "flos": 18805895354880.0, + "grad_norm": 1.9794624955949431, + "language_loss": 0.78732973, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80824226, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 12507, + "time_per_iteration": 2.3636245727539062 + }, + { + "auxiliary_loss_clip": 0.01051882, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.01762843, + "balance_loss_mlp": 1.01662087, + "epoch": 0.7520216443709604, + "flos": 14574352216320.0, + "grad_norm": 1.6448454233968035, + "language_loss": 0.72157389, + "learning_rate": 6.10997688743631e-07, + "loss": 0.74247485, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.35351562, + "step": 12508, + "time_per_iteration": 2.3395674228668213 + }, + { + "auxiliary_loss_clip": 0.01049656, + "auxiliary_loss_mlp": 0.01038333, + "balance_loss_clip": 1.01668501, + "balance_loss_mlp": 1.01533604, + "epoch": 0.7520817676236284, + "flos": 17055629475840.0, + "grad_norm": 1.6645974282641212, + "language_loss": 0.72816771, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74904764, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 12509, + "time_per_iteration": 2.39573335647583 + }, + { + "auxiliary_loss_clip": 0.01055317, + "auxiliary_loss_mlp": 0.0104693, + "balance_loss_clip": 1.0210855, + "balance_loss_mlp": 1.01748514, + "epoch": 0.7521418908762965, + "flos": 25665211284480.0, + "grad_norm": 1.593829834748856, + "language_loss": 0.63319081, + "learning_rate": 6.104373652928785e-07, + "loss": 0.65421331, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 12510, + "time_per_iteration": 2.3912620544433594 + }, + { + "auxiliary_loss_clip": 0.01049719, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.01183796, + "balance_loss_mlp": 1.01543307, + "epoch": 0.7522020141289644, + "flos": 20885706357120.0, + "grad_norm": 1.6644618795926276, + "language_loss": 0.8274101, + "learning_rate": 6.10157282600722e-07, + "loss": 0.84825712, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34375, + "step": 12511, + "time_per_iteration": 2.3777265548706055 + }, + { + "auxiliary_loss_clip": 0.01055984, + "auxiliary_loss_mlp": 0.01039397, + "balance_loss_clip": 1.01442301, + "balance_loss_mlp": 1.01712751, + "epoch": 0.7522621373816324, + "flos": 12639500645760.0, + "grad_norm": 2.501082134804211, + "language_loss": 0.77364349, + "learning_rate": 6.098772526115412e-07, + "loss": 0.79459721, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 12512, + "time_per_iteration": 3.8453569412231445 + }, + { + "auxiliary_loss_clip": 0.01048872, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.01615024, + "balance_loss_mlp": 1.01497865, + "epoch": 0.7523222606343003, + "flos": 25625061354240.0, + "grad_norm": 1.6913116739396399, + "language_loss": 0.83044237, + "learning_rate": 6.095972753359537e-07, + "loss": 0.8513056, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 12513, + "time_per_iteration": 2.4073643684387207 + }, + { + "auxiliary_loss_clip": 0.01053451, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.01896679, + "balance_loss_mlp": 1.01634455, + "epoch": 0.7523823838869683, + "flos": 20447860596480.0, + "grad_norm": 1.8945618221370861, + "language_loss": 0.76095283, + "learning_rate": 6.093173507845771e-07, + "loss": 0.78192359, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 12514, + "time_per_iteration": 2.3888628482818604 + }, + { + "auxiliary_loss_clip": 0.01049865, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.01477313, + "balance_loss_mlp": 1.01578176, + "epoch": 0.7524425071396362, + "flos": 14719730584320.0, + "grad_norm": 2.1600545091254686, + "language_loss": 0.70270824, + "learning_rate": 6.090374789680271e-07, + "loss": 0.72356719, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 12515, + "time_per_iteration": 2.3318045139312744 + }, + { + "auxiliary_loss_clip": 0.01052756, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.01015532, + "balance_loss_mlp": 1.01665735, + "epoch": 0.7525026303923043, + "flos": 30590722984320.0, + "grad_norm": 1.7797888344747634, + "language_loss": 0.71904838, + "learning_rate": 6.087576598969137e-07, + "loss": 0.73991334, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 12516, + "time_per_iteration": 2.4364535808563232 + }, + { + "auxiliary_loss_clip": 0.01050901, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.01547313, + "balance_loss_mlp": 1.0168668, + "epoch": 0.7525627536449722, + "flos": 24790567104000.0, + "grad_norm": 1.439568493768495, + "language_loss": 0.90008628, + "learning_rate": 6.084778935818495e-07, + "loss": 0.92096549, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33984375, + "step": 12517, + "time_per_iteration": 2.4371795654296875 + }, + { + "auxiliary_loss_clip": 0.01052595, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.01986718, + "balance_loss_mlp": 1.01610911, + "epoch": 0.7526228768976402, + "flos": 20778662528640.0, + "grad_norm": 1.5002886508535487, + "language_loss": 0.75509942, + "learning_rate": 6.081981800334437e-07, + "loss": 0.77604944, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36523438, + "step": 12518, + "time_per_iteration": 3.8496694564819336 + }, + { + "auxiliary_loss_clip": 0.01008149, + "auxiliary_loss_mlp": 0.01002022, + "balance_loss_clip": 0.99966151, + "balance_loss_mlp": 1.00135398, + "epoch": 0.7526830001503081, + "flos": 66556114709760.0, + "grad_norm": 0.7052594109419351, + "language_loss": 0.55833805, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57843971, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.06835938, + "step": 12519, + "time_per_iteration": 4.463867902755737 + }, + { + "auxiliary_loss_clip": 0.01050832, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.01908946, + "balance_loss_mlp": 1.01572144, + "epoch": 0.7527431234029761, + "flos": 23476750531200.0, + "grad_norm": 1.4308760017128805, + "language_loss": 0.78461421, + "learning_rate": 6.07638911279029e-07, + "loss": 0.8055377, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 12520, + "time_per_iteration": 2.3873491287231445 + }, + { + "auxiliary_loss_clip": 0.01050494, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.01322544, + "balance_loss_mlp": 1.01514089, + "epoch": 0.752803246655644, + "flos": 22048593655680.0, + "grad_norm": 2.132872101161886, + "language_loss": 0.7547555, + "learning_rate": 6.07359356094229e-07, + "loss": 0.77561939, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 12521, + "time_per_iteration": 2.403851270675659 + }, + { + "auxiliary_loss_clip": 0.01054166, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.01932311, + "balance_loss_mlp": 1.01626313, + "epoch": 0.752863369908312, + "flos": 30152493198720.0, + "grad_norm": 1.8078920691633977, + "language_loss": 0.68359113, + "learning_rate": 6.070798537185016e-07, + "loss": 0.70458066, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 12522, + "time_per_iteration": 2.4342448711395264 + }, + { + "auxiliary_loss_clip": 0.01054317, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_clip": 1.02577496, + "balance_loss_mlp": 1.01732492, + "epoch": 0.7529234931609801, + "flos": 24566599532160.0, + "grad_norm": 1.5171288825226044, + "language_loss": 0.79356694, + "learning_rate": 6.068004041624453e-07, + "loss": 0.81458735, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.37109375, + "step": 12523, + "time_per_iteration": 2.49700927734375 + }, + { + "auxiliary_loss_clip": 0.01051982, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.01736271, + "balance_loss_mlp": 1.01647866, + "epoch": 0.752983616413648, + "flos": 23111279752320.0, + "grad_norm": 2.005956101264723, + "language_loss": 0.81010365, + "learning_rate": 6.065210074366571e-07, + "loss": 0.83102715, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 12524, + "time_per_iteration": 2.365661382675171 + }, + { + "auxiliary_loss_clip": 0.01049848, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.01564884, + "balance_loss_mlp": 1.01599145, + "epoch": 0.753043739666316, + "flos": 24315783258240.0, + "grad_norm": 1.7998712914615127, + "language_loss": 0.75009322, + "learning_rate": 6.062416635517326e-07, + "loss": 0.77095985, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33984375, + "step": 12525, + "time_per_iteration": 2.5296785831451416 + }, + { + "auxiliary_loss_clip": 0.01052382, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.01548219, + "balance_loss_mlp": 1.01636517, + "epoch": 0.7531038629189839, + "flos": 24242151467520.0, + "grad_norm": 1.7785876418073239, + "language_loss": 0.73595655, + "learning_rate": 6.059623725182641e-07, + "loss": 0.756859, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.359375, + "step": 12526, + "time_per_iteration": 2.4295523166656494 + }, + { + "auxiliary_loss_clip": 0.01051082, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.01533055, + "balance_loss_mlp": 1.01660705, + "epoch": 0.7531639861716519, + "flos": 30187546070400.0, + "grad_norm": 2.4175987618796806, + "language_loss": 0.73785019, + "learning_rate": 6.056831343468414e-07, + "loss": 0.75873411, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 12527, + "time_per_iteration": 2.487762928009033 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.00987697, + "balance_loss_mlp": 1.01587045, + "epoch": 0.7532241094243198, + "flos": 18222217378560.0, + "grad_norm": 1.9588177759441452, + "language_loss": 0.82358384, + "learning_rate": 6.054039490480539e-07, + "loss": 0.8443923, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34375, + "step": 12528, + "time_per_iteration": 2.3890132904052734 + }, + { + "auxiliary_loss_clip": 0.01053239, + "auxiliary_loss_mlp": 0.01041322, + "balance_loss_clip": 1.01639509, + "balance_loss_mlp": 1.01680732, + "epoch": 0.7532842326769879, + "flos": 20880155450880.0, + "grad_norm": 2.0096926084576205, + "language_loss": 0.86341596, + "learning_rate": 6.051248166324892e-07, + "loss": 0.88436157, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36328125, + "step": 12529, + "time_per_iteration": 2.42287540435791 + }, + { + "auxiliary_loss_clip": 0.01054521, + "auxiliary_loss_mlp": 0.01042496, + "balance_loss_clip": 1.01773667, + "balance_loss_mlp": 1.01724553, + "epoch": 0.7533443559296558, + "flos": 18077676883200.0, + "grad_norm": 1.9710426650878614, + "language_loss": 0.75477099, + "learning_rate": 6.048457371107303e-07, + "loss": 0.7757411, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37304688, + "step": 12530, + "time_per_iteration": 2.4096360206604004 + }, + { + "auxiliary_loss_clip": 0.01008317, + "auxiliary_loss_mlp": 0.01004782, + "balance_loss_clip": 1.00225461, + "balance_loss_mlp": 1.00131083, + "epoch": 0.7534044791823238, + "flos": 50252024494080.0, + "grad_norm": 0.8267312357156381, + "language_loss": 0.63711703, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65724802, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.0703125, + "step": 12531, + "time_per_iteration": 4.293577432632446 + }, + { + "auxiliary_loss_clip": 0.01053858, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.01300812, + "balance_loss_mlp": 1.01640654, + "epoch": 0.7534646024349917, + "flos": 20849222119680.0, + "grad_norm": 1.8150094370486118, + "language_loss": 0.71357971, + "learning_rate": 6.042877367909633e-07, + "loss": 0.73448992, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37304688, + "step": 12532, + "time_per_iteration": 2.3581719398498535 + }, + { + "auxiliary_loss_clip": 0.01050123, + "auxiliary_loss_mlp": 0.01038997, + "balance_loss_clip": 1.01719356, + "balance_loss_mlp": 1.01637125, + "epoch": 0.7535247256876597, + "flos": 23070780708480.0, + "grad_norm": 1.8557082007596366, + "language_loss": 0.78130305, + "learning_rate": 6.040088160141132e-07, + "loss": 0.80219418, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.3359375, + "step": 12533, + "time_per_iteration": 2.4118728637695312 + }, + { + "auxiliary_loss_clip": 0.01008173, + "auxiliary_loss_mlp": 0.01004366, + "balance_loss_clip": 1.00200605, + "balance_loss_mlp": 1.00137746, + "epoch": 0.7535848489403276, + "flos": 58620006627840.0, + "grad_norm": 0.7963334510997042, + "language_loss": 0.57392788, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59405327, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.06787109, + "step": 12534, + "time_per_iteration": 3.081427812576294 + }, + { + "auxiliary_loss_clip": 0.01050216, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.01434469, + "balance_loss_mlp": 1.01480889, + "epoch": 0.7536449721929956, + "flos": 26576688816000.0, + "grad_norm": 1.5620860264675178, + "language_loss": 0.72359294, + "learning_rate": 6.03451133279365e-07, + "loss": 0.74446201, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 12535, + "time_per_iteration": 2.4551658630371094 + }, + { + "auxiliary_loss_clip": 0.01052958, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_clip": 1.01679826, + "balance_loss_mlp": 1.01555717, + "epoch": 0.7537050954456637, + "flos": 25734898091520.0, + "grad_norm": 1.527479504455474, + "language_loss": 0.81787694, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83883739, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.375, + "step": 12536, + "time_per_iteration": 2.4245450496673584 + }, + { + "auxiliary_loss_clip": 0.01049273, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.0131309, + "balance_loss_mlp": 1.015136, + "epoch": 0.7537652186983316, + "flos": 30223192435200.0, + "grad_norm": 1.7186734083211472, + "language_loss": 0.75545728, + "learning_rate": 6.028936623737067e-07, + "loss": 0.77630937, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34179688, + "step": 12537, + "time_per_iteration": 2.4724762439727783 + }, + { + "auxiliary_loss_clip": 0.01052259, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.01098132, + "balance_loss_mlp": 1.01594496, + "epoch": 0.7538253419509996, + "flos": 12640408341120.0, + "grad_norm": 1.8185892959672139, + "language_loss": 0.75447774, + "learning_rate": 6.026150063832111e-07, + "loss": 0.77534813, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 12538, + "time_per_iteration": 2.351973056793213 + }, + { + "auxiliary_loss_clip": 0.01051737, + "auxiliary_loss_mlp": 0.01036793, + "balance_loss_clip": 1.01386857, + "balance_loss_mlp": 1.01636839, + "epoch": 0.7538854652036675, + "flos": 23184841720320.0, + "grad_norm": 1.9871801469524246, + "language_loss": 0.68278456, + "learning_rate": 6.023364033816956e-07, + "loss": 0.70366985, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 12539, + "time_per_iteration": 2.442000150680542 + }, + { + "auxiliary_loss_clip": 0.01051969, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.0124197, + "balance_loss_mlp": 1.01709008, + "epoch": 0.7539455884563355, + "flos": 23185086099840.0, + "grad_norm": 1.6563816130931548, + "language_loss": 0.75696772, + "learning_rate": 6.020578533797229e-07, + "loss": 0.77783871, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 12540, + "time_per_iteration": 2.406379461288452 + }, + { + "auxiliary_loss_clip": 0.01052434, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.01152062, + "balance_loss_mlp": 1.01569963, + "epoch": 0.7540057117090034, + "flos": 13180515073920.0, + "grad_norm": 1.9908026309125533, + "language_loss": 0.74500149, + "learning_rate": 6.017793563878566e-07, + "loss": 0.76587856, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 12541, + "time_per_iteration": 2.375148296356201 + }, + { + "auxiliary_loss_clip": 0.01051298, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.01045096, + "balance_loss_mlp": 1.0162102, + "epoch": 0.7540658349616715, + "flos": 45476396593920.0, + "grad_norm": 1.5880193505441267, + "language_loss": 0.73038387, + "learning_rate": 6.015009124166576e-07, + "loss": 0.75123632, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 12542, + "time_per_iteration": 2.617290496826172 + }, + { + "auxiliary_loss_clip": 0.01050937, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.00939667, + "balance_loss_mlp": 1.01585984, + "epoch": 0.7541259582143394, + "flos": 19929994266240.0, + "grad_norm": 2.550459430125662, + "language_loss": 0.85966635, + "learning_rate": 6.012225214766844e-07, + "loss": 0.88049805, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 12543, + "time_per_iteration": 2.4195051193237305 + }, + { + "auxiliary_loss_clip": 0.01052838, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.01143265, + "balance_loss_mlp": 1.01758838, + "epoch": 0.7541860814670074, + "flos": 27197025586560.0, + "grad_norm": 2.463117662276628, + "language_loss": 0.75048798, + "learning_rate": 6.009441835784927e-07, + "loss": 0.77136612, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 12544, + "time_per_iteration": 2.4434478282928467 + }, + { + "auxiliary_loss_clip": 0.01050577, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.01286364, + "balance_loss_mlp": 1.01535845, + "epoch": 0.7542462047196753, + "flos": 21323098270080.0, + "grad_norm": 1.8823328425445394, + "language_loss": 0.7022832, + "learning_rate": 6.006658987326383e-07, + "loss": 0.7231431, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 12545, + "time_per_iteration": 2.447777271270752 + }, + { + "auxiliary_loss_clip": 0.01050172, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.01436758, + "balance_loss_mlp": 1.01534712, + "epoch": 0.7543063279723433, + "flos": 11940330291840.0, + "grad_norm": 1.9742601714330743, + "language_loss": 0.70754349, + "learning_rate": 6.003876669496728e-07, + "loss": 0.72841656, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 12546, + "time_per_iteration": 2.3720648288726807 + }, + { + "auxiliary_loss_clip": 0.01052968, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.01516747, + "balance_loss_mlp": 1.01623046, + "epoch": 0.7543664512250112, + "flos": 22818882182400.0, + "grad_norm": 2.1237248508987197, + "language_loss": 0.75436819, + "learning_rate": 6.00109488240147e-07, + "loss": 0.77529138, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 12547, + "time_per_iteration": 2.395521640777588 + }, + { + "auxiliary_loss_clip": 0.01052222, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.0133456, + "balance_loss_mlp": 1.01640034, + "epoch": 0.7544265744776792, + "flos": 20922784087680.0, + "grad_norm": 1.8168276630838271, + "language_loss": 0.6847899, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70569366, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 12548, + "time_per_iteration": 2.3880324363708496 + }, + { + "auxiliary_loss_clip": 0.01053457, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.00884986, + "balance_loss_mlp": 1.01615644, + "epoch": 0.7544866977303473, + "flos": 15194584252800.0, + "grad_norm": 1.8912743803450867, + "language_loss": 0.876302, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89717245, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37304688, + "step": 12549, + "time_per_iteration": 2.365236282348633 + }, + { + "auxiliary_loss_clip": 0.01048966, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.01345551, + "balance_loss_mlp": 1.01493835, + "epoch": 0.7545468209830152, + "flos": 27082615461120.0, + "grad_norm": 1.6931000095884086, + "language_loss": 0.78390276, + "learning_rate": 5.992752706576865e-07, + "loss": 0.80475879, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.33984375, + "step": 12550, + "time_per_iteration": 2.4013724327087402 + }, + { + "auxiliary_loss_clip": 0.01051854, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.01026034, + "balance_loss_mlp": 1.01610982, + "epoch": 0.7546069442356832, + "flos": 26870447928960.0, + "grad_norm": 1.4095999150504634, + "language_loss": 0.70162749, + "learning_rate": 5.98997304347386e-07, + "loss": 0.72246826, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35742188, + "step": 12551, + "time_per_iteration": 2.4543628692626953 + }, + { + "auxiliary_loss_clip": 0.01053962, + "auxiliary_loss_mlp": 0.01038258, + "balance_loss_clip": 1.01410615, + "balance_loss_mlp": 1.01766741, + "epoch": 0.7546670674883511, + "flos": 15742196928000.0, + "grad_norm": 2.1097407869695566, + "language_loss": 0.8711127, + "learning_rate": 5.987193911632487e-07, + "loss": 0.89203501, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 12552, + "time_per_iteration": 3.59165096282959 + }, + { + "auxiliary_loss_clip": 0.01052721, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.01450765, + "balance_loss_mlp": 1.01662266, + "epoch": 0.7547271907410191, + "flos": 23476575974400.0, + "grad_norm": 1.6838345207498164, + "language_loss": 0.79692757, + "learning_rate": 5.98441531115812e-07, + "loss": 0.81783587, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 12553, + "time_per_iteration": 2.4197230339050293 + }, + { + "auxiliary_loss_clip": 0.01052372, + "auxiliary_loss_mlp": 0.01040041, + "balance_loss_clip": 1.01588929, + "balance_loss_mlp": 1.01583827, + "epoch": 0.754787313993687, + "flos": 31721455054080.0, + "grad_norm": 2.377922187168272, + "language_loss": 0.64757091, + "learning_rate": 5.981637242156135e-07, + "loss": 0.66849506, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 12554, + "time_per_iteration": 2.4552290439605713 + }, + { + "auxiliary_loss_clip": 0.0105248, + "auxiliary_loss_mlp": 0.01039191, + "balance_loss_clip": 1.01536107, + "balance_loss_mlp": 1.01671255, + "epoch": 0.7548474372463551, + "flos": 27561833049600.0, + "grad_norm": 1.6637711263967532, + "language_loss": 0.74044049, + "learning_rate": 5.978859704731864e-07, + "loss": 0.76135719, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 12555, + "time_per_iteration": 2.460484743118286 + }, + { + "auxiliary_loss_clip": 0.01054608, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.01236069, + "balance_loss_mlp": 1.0171392, + "epoch": 0.754907560499023, + "flos": 19317547463040.0, + "grad_norm": 1.7125545547400447, + "language_loss": 0.80081761, + "learning_rate": 5.976082698990645e-07, + "loss": 0.82174039, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 12556, + "time_per_iteration": 2.3539249897003174 + }, + { + "auxiliary_loss_clip": 0.01008569, + "auxiliary_loss_mlp": 0.01002092, + "balance_loss_clip": 0.99981493, + "balance_loss_mlp": 1.00153208, + "epoch": 0.754967683751691, + "flos": 69741100800000.0, + "grad_norm": 0.7422118952474142, + "language_loss": 0.50469643, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52480304, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.0703125, + "step": 12557, + "time_per_iteration": 4.374576807022095 + }, + { + "auxiliary_loss_clip": 0.01053687, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.00985169, + "balance_loss_mlp": 1.01653719, + "epoch": 0.7550278070043589, + "flos": 24420872050560.0, + "grad_norm": 1.5803874855307685, + "language_loss": 0.72675377, + "learning_rate": 5.970530282978525e-07, + "loss": 0.74764299, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 12558, + "time_per_iteration": 3.8435425758361816 + }, + { + "auxiliary_loss_clip": 0.01051526, + "auxiliary_loss_mlp": 0.01033212, + "balance_loss_clip": 1.01195741, + "balance_loss_mlp": 1.01644361, + "epoch": 0.7550879302570269, + "flos": 32633246787840.0, + "grad_norm": 2.935641351520168, + "language_loss": 0.81219047, + "learning_rate": 5.967754872918187e-07, + "loss": 0.83303785, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.3515625, + "step": 12559, + "time_per_iteration": 2.480257034301758 + }, + { + "auxiliary_loss_clip": 0.01053291, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.01140881, + "balance_loss_mlp": 1.01624417, + "epoch": 0.7551480535096948, + "flos": 21794565536640.0, + "grad_norm": 2.3322215228471843, + "language_loss": 0.79806304, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81894499, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 12560, + "time_per_iteration": 2.3976242542266846 + }, + { + "auxiliary_loss_clip": 0.01050277, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.01543951, + "balance_loss_mlp": 1.01611781, + "epoch": 0.7552081767623628, + "flos": 18514126189440.0, + "grad_norm": 1.6806169942466942, + "language_loss": 0.72221422, + "learning_rate": 5.96220564921515e-07, + "loss": 0.74308419, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34179688, + "step": 12561, + "time_per_iteration": 2.390883445739746 + }, + { + "auxiliary_loss_clip": 0.01051836, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.01212764, + "balance_loss_mlp": 1.01544094, + "epoch": 0.7552683000150308, + "flos": 27633614538240.0, + "grad_norm": 1.6075129737101073, + "language_loss": 0.77034283, + "learning_rate": 5.959431835782889e-07, + "loss": 0.79121089, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 12562, + "time_per_iteration": 2.4276885986328125 + }, + { + "auxiliary_loss_clip": 0.01051104, + "auxiliary_loss_mlp": 0.01038522, + "balance_loss_clip": 1.01532352, + "balance_loss_mlp": 1.01524985, + "epoch": 0.7553284232676988, + "flos": 20301888735360.0, + "grad_norm": 2.049202068091425, + "language_loss": 0.76852477, + "learning_rate": 5.956658554770371e-07, + "loss": 0.78942102, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 12563, + "time_per_iteration": 2.4069366455078125 + }, + { + "auxiliary_loss_clip": 0.01055737, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_clip": 1.02108765, + "balance_loss_mlp": 1.016047, + "epoch": 0.7553885465203668, + "flos": 33254072317440.0, + "grad_norm": 2.301263179847579, + "language_loss": 0.695539, + "learning_rate": 5.953885806282768e-07, + "loss": 0.71657097, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39648438, + "step": 12564, + "time_per_iteration": 2.511375904083252 + }, + { + "auxiliary_loss_clip": 0.01053579, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.01428795, + "balance_loss_mlp": 1.01633191, + "epoch": 0.7554486697730347, + "flos": 21615181637760.0, + "grad_norm": 2.932863721180713, + "language_loss": 0.69791436, + "learning_rate": 5.951113590425228e-07, + "loss": 0.71885026, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 12565, + "time_per_iteration": 2.4608850479125977 + }, + { + "auxiliary_loss_clip": 0.01055591, + "auxiliary_loss_mlp": 0.01038318, + "balance_loss_clip": 1.01316476, + "balance_loss_mlp": 1.01693201, + "epoch": 0.7555087930257027, + "flos": 27631834058880.0, + "grad_norm": 1.5078230082223012, + "language_loss": 0.7577095, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77864861, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 12566, + "time_per_iteration": 2.4740943908691406 + }, + { + "auxiliary_loss_clip": 0.01054749, + "auxiliary_loss_mlp": 0.01046762, + "balance_loss_clip": 1.02059555, + "balance_loss_mlp": 1.0166223, + "epoch": 0.7555689162783706, + "flos": 23620557888000.0, + "grad_norm": 2.3780071904064353, + "language_loss": 0.75050056, + "learning_rate": 5.945570757020789e-07, + "loss": 0.77151573, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 12567, + "time_per_iteration": 2.3886756896972656 + }, + { + "auxiliary_loss_clip": 0.01052621, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.01239562, + "balance_loss_mlp": 1.01659799, + "epoch": 0.7556290395310387, + "flos": 24861929656320.0, + "grad_norm": 1.9924222408612715, + "language_loss": 0.64157999, + "learning_rate": 5.942800139684073e-07, + "loss": 0.662462, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 12568, + "time_per_iteration": 2.4692583084106445 + }, + { + "auxiliary_loss_clip": 0.01051165, + "auxiliary_loss_mlp": 0.0103891, + "balance_loss_clip": 1.01687968, + "balance_loss_mlp": 1.01596165, + "epoch": 0.7556891627837066, + "flos": 43542103605120.0, + "grad_norm": 1.7775138923897402, + "language_loss": 0.67392147, + "learning_rate": 5.940030055397789e-07, + "loss": 0.69482219, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 12569, + "time_per_iteration": 2.571235418319702 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01041746, + "balance_loss_clip": 1.01507854, + "balance_loss_mlp": 1.0168736, + "epoch": 0.7557492860363746, + "flos": 26649727113600.0, + "grad_norm": 1.7797614892613784, + "language_loss": 0.68246788, + "learning_rate": 5.93726050426697e-07, + "loss": 0.70344329, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38867188, + "step": 12570, + "time_per_iteration": 3.9672651290893555 + }, + { + "auxiliary_loss_clip": 0.01053397, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.008196, + "balance_loss_mlp": 1.01675844, + "epoch": 0.7558094092890425, + "flos": 55180889550720.0, + "grad_norm": 1.858876083293177, + "language_loss": 0.72753829, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74841434, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 12571, + "time_per_iteration": 2.697713851928711 + }, + { + "auxiliary_loss_clip": 0.01054112, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.01396847, + "balance_loss_mlp": 1.01614416, + "epoch": 0.7558695325417105, + "flos": 23987145830400.0, + "grad_norm": 1.7703581139518583, + "language_loss": 0.74726391, + "learning_rate": 5.931723001891811e-07, + "loss": 0.76819748, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 12572, + "time_per_iteration": 2.483304977416992 + }, + { + "auxiliary_loss_clip": 0.01055268, + "auxiliary_loss_mlp": 0.01042351, + "balance_loss_clip": 1.01846194, + "balance_loss_mlp": 1.01754117, + "epoch": 0.7559296557943784, + "flos": 14610382606080.0, + "grad_norm": 2.088030550892795, + "language_loss": 0.77809834, + "learning_rate": 5.928955050857456e-07, + "loss": 0.79907453, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37695312, + "step": 12573, + "time_per_iteration": 2.5594263076782227 + }, + { + "auxiliary_loss_clip": 0.01054353, + "auxiliary_loss_mlp": 0.01042548, + "balance_loss_clip": 1.01756132, + "balance_loss_mlp": 1.01548874, + "epoch": 0.7559897790470465, + "flos": 18549528174720.0, + "grad_norm": 1.622856970935265, + "language_loss": 0.6964258, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71739483, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38867188, + "step": 12574, + "time_per_iteration": 2.4467971324920654 + }, + { + "auxiliary_loss_clip": 0.01050958, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.01542652, + "balance_loss_mlp": 1.01556015, + "epoch": 0.7560499022997144, + "flos": 17966897539200.0, + "grad_norm": 2.270159299372539, + "language_loss": 0.73860627, + "learning_rate": 5.923420749619974e-07, + "loss": 0.75950736, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 12575, + "time_per_iteration": 2.4479002952575684 + }, + { + "auxiliary_loss_clip": 0.01051406, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_clip": 1.01928151, + "balance_loss_mlp": 1.01587069, + "epoch": 0.7561100255523824, + "flos": 15737030046720.0, + "grad_norm": 2.2294623127545634, + "language_loss": 0.73786789, + "learning_rate": 5.92065439962673e-07, + "loss": 0.75880462, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 12576, + "time_per_iteration": 2.3828234672546387 + }, + { + "auxiliary_loss_clip": 0.01052182, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.0156188, + "balance_loss_mlp": 1.01669157, + "epoch": 0.7561701488050504, + "flos": 15887191271040.0, + "grad_norm": 1.8308687767308962, + "language_loss": 0.68457657, + "learning_rate": 5.917888583523669e-07, + "loss": 0.7054857, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 12577, + "time_per_iteration": 2.3886446952819824 + }, + { + "auxiliary_loss_clip": 0.01051669, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.01587892, + "balance_loss_mlp": 1.01590014, + "epoch": 0.7562302720577183, + "flos": 20338128593280.0, + "grad_norm": 2.0238211397279677, + "language_loss": 0.79147708, + "learning_rate": 5.915123301415685e-07, + "loss": 0.81237054, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35742188, + "step": 12578, + "time_per_iteration": 2.4017505645751953 + }, + { + "auxiliary_loss_clip": 0.0105217, + "auxiliary_loss_mlp": 0.01040575, + "balance_loss_clip": 1.01629186, + "balance_loss_mlp": 1.0154047, + "epoch": 0.7562903953103863, + "flos": 20811201782400.0, + "grad_norm": 1.5150385376026165, + "language_loss": 0.76423585, + "learning_rate": 5.912358553407641e-07, + "loss": 0.78516328, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 12579, + "time_per_iteration": 2.4347798824310303 + }, + { + "auxiliary_loss_clip": 0.01055519, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.01658428, + "balance_loss_mlp": 1.01640153, + "epoch": 0.7563505185630542, + "flos": 37595487104640.0, + "grad_norm": 1.8717892959707667, + "language_loss": 0.63957512, + "learning_rate": 5.90959433960437e-07, + "loss": 0.66054893, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 12580, + "time_per_iteration": 2.507239818572998 + }, + { + "auxiliary_loss_clip": 0.01053539, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.00909448, + "balance_loss_mlp": 1.01734614, + "epoch": 0.7564106418157223, + "flos": 20229932689920.0, + "grad_norm": 2.189514300205276, + "language_loss": 0.76127678, + "learning_rate": 5.906830660110691e-07, + "loss": 0.78213418, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 12581, + "time_per_iteration": 2.4476778507232666 + }, + { + "auxiliary_loss_clip": 0.01051552, + "auxiliary_loss_mlp": 0.01038651, + "balance_loss_clip": 1.01564336, + "balance_loss_mlp": 1.01503682, + "epoch": 0.7564707650683902, + "flos": 24753698841600.0, + "grad_norm": 2.2629812107112897, + "language_loss": 0.6405021, + "learning_rate": 5.904067515031412e-07, + "loss": 0.66140413, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 12582, + "time_per_iteration": 2.395158290863037 + }, + { + "auxiliary_loss_clip": 0.01008671, + "auxiliary_loss_mlp": 0.01004513, + "balance_loss_clip": 1.00192654, + "balance_loss_mlp": 1.00148892, + "epoch": 0.7565308883210582, + "flos": 48527594887680.0, + "grad_norm": 0.945847273377463, + "language_loss": 0.60670328, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62683511, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.07177734, + "step": 12583, + "time_per_iteration": 2.8325090408325195 + }, + { + "auxiliary_loss_clip": 0.0105409, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.01651323, + "balance_loss_mlp": 1.01642299, + "epoch": 0.7565910115737261, + "flos": 12494261923200.0, + "grad_norm": 1.9747874918623742, + "language_loss": 0.80435079, + "learning_rate": 5.898542828535125e-07, + "loss": 0.82530266, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 12584, + "time_per_iteration": 2.3499972820281982 + }, + { + "auxiliary_loss_clip": 0.010507, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.01217163, + "balance_loss_mlp": 1.01583719, + "epoch": 0.7566511348263941, + "flos": 21172099173120.0, + "grad_norm": 2.4801759039610745, + "language_loss": 0.78104901, + "learning_rate": 5.895781287327612e-07, + "loss": 0.80190063, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 12585, + "time_per_iteration": 2.423314094543457 + }, + { + "auxiliary_loss_clip": 0.01055152, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.01374674, + "balance_loss_mlp": 1.01740324, + "epoch": 0.756711258079062, + "flos": 21753961758720.0, + "grad_norm": 1.6882541611631476, + "language_loss": 0.84630626, + "learning_rate": 5.893020280953493e-07, + "loss": 0.86723912, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37695312, + "step": 12586, + "time_per_iteration": 2.3527650833129883 + }, + { + "auxiliary_loss_clip": 0.0105433, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.01291347, + "balance_loss_mlp": 1.01696277, + "epoch": 0.75677138133173, + "flos": 22381804471680.0, + "grad_norm": 2.324012972716028, + "language_loss": 0.85135579, + "learning_rate": 5.890259809517459e-07, + "loss": 0.87226224, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 12587, + "time_per_iteration": 2.4331281185150146 + }, + { + "auxiliary_loss_clip": 0.01051219, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.01185489, + "balance_loss_mlp": 1.01578295, + "epoch": 0.756831504584398, + "flos": 22707928281600.0, + "grad_norm": 1.792715653228071, + "language_loss": 0.71916819, + "learning_rate": 5.88749987312418e-07, + "loss": 0.74003196, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 12588, + "time_per_iteration": 2.5149829387664795 + }, + { + "auxiliary_loss_clip": 0.01053482, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.01347971, + "balance_loss_mlp": 1.01662016, + "epoch": 0.756891627837066, + "flos": 24097192035840.0, + "grad_norm": 3.9273913905253828, + "language_loss": 0.70407307, + "learning_rate": 5.884740471878327e-07, + "loss": 0.72498226, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36914062, + "step": 12589, + "time_per_iteration": 2.4207379817962646 + }, + { + "auxiliary_loss_clip": 0.01053824, + "auxiliary_loss_mlp": 0.01038761, + "balance_loss_clip": 1.01407242, + "balance_loss_mlp": 1.01634479, + "epoch": 0.756951751089734, + "flos": 19748166572160.0, + "grad_norm": 1.6924315941977943, + "language_loss": 0.93045932, + "learning_rate": 5.881981605884522e-07, + "loss": 0.95138514, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.375, + "step": 12590, + "time_per_iteration": 2.405069351196289 + }, + { + "auxiliary_loss_clip": 0.01051004, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.01295137, + "balance_loss_mlp": 1.01588237, + "epoch": 0.7570118743424019, + "flos": 35077830341760.0, + "grad_norm": 1.9847884533682028, + "language_loss": 0.66234982, + "learning_rate": 5.879223275247391e-07, + "loss": 0.68321633, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 12591, + "time_per_iteration": 3.7293288707733154 + }, + { + "auxiliary_loss_clip": 0.01052289, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.00929523, + "balance_loss_mlp": 1.0169189, + "epoch": 0.7570719975950699, + "flos": 25593325061760.0, + "grad_norm": 1.5421673371489821, + "language_loss": 0.75709641, + "learning_rate": 5.876465480071528e-07, + "loss": 0.77793223, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 12592, + "time_per_iteration": 2.4673094749450684 + }, + { + "auxiliary_loss_clip": 0.01052302, + "auxiliary_loss_mlp": 0.01040255, + "balance_loss_clip": 1.01623404, + "balance_loss_mlp": 1.01570153, + "epoch": 0.7571321208477378, + "flos": 10815463330560.0, + "grad_norm": 2.5496933745105093, + "language_loss": 0.72908777, + "learning_rate": 5.873708220461522e-07, + "loss": 0.75001335, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 12593, + "time_per_iteration": 2.416538953781128 + }, + { + "auxiliary_loss_clip": 0.01053755, + "auxiliary_loss_mlp": 0.01040627, + "balance_loss_clip": 1.01623702, + "balance_loss_mlp": 1.01605904, + "epoch": 0.7571922441004059, + "flos": 18259120552320.0, + "grad_norm": 2.344541736379826, + "language_loss": 0.67487955, + "learning_rate": 5.870951496521903e-07, + "loss": 0.69582337, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37695312, + "step": 12594, + "time_per_iteration": 2.3820931911468506 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.01409698, + "balance_loss_mlp": 1.01680291, + "epoch": 0.7572523673530738, + "flos": 22889476684800.0, + "grad_norm": 1.9751125145847035, + "language_loss": 0.81531274, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83624077, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37695312, + "step": 12595, + "time_per_iteration": 2.4210867881774902 + }, + { + "auxiliary_loss_clip": 0.01052201, + "auxiliary_loss_mlp": 0.01036397, + "balance_loss_clip": 1.01445031, + "balance_loss_mlp": 1.01651359, + "epoch": 0.7573124906057418, + "flos": 20995263803520.0, + "grad_norm": 2.0090268050523368, + "language_loss": 0.72715831, + "learning_rate": 5.865439656071993e-07, + "loss": 0.74804425, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35742188, + "step": 12596, + "time_per_iteration": 2.4222259521484375 + }, + { + "auxiliary_loss_clip": 0.01051053, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.01026917, + "balance_loss_mlp": 1.01596403, + "epoch": 0.7573726138584097, + "flos": 20885252509440.0, + "grad_norm": 3.1984769106012743, + "language_loss": 0.82085133, + "learning_rate": 5.862684539770706e-07, + "loss": 0.84169167, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 12597, + "time_per_iteration": 3.8836419582366943 + }, + { + "auxiliary_loss_clip": 0.01056108, + "auxiliary_loss_mlp": 0.01039344, + "balance_loss_clip": 1.01262879, + "balance_loss_mlp": 1.01807892, + "epoch": 0.7574327371110777, + "flos": 24529486890240.0, + "grad_norm": 1.8450985515331628, + "language_loss": 0.83532596, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85628051, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38085938, + "step": 12598, + "time_per_iteration": 2.4708263874053955 + }, + { + "auxiliary_loss_clip": 0.01051977, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.01028943, + "balance_loss_mlp": 1.01669455, + "epoch": 0.7574928603637456, + "flos": 23363492480640.0, + "grad_norm": 1.564969128359294, + "language_loss": 0.63856089, + "learning_rate": 5.857175915537845e-07, + "loss": 0.65940422, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 12599, + "time_per_iteration": 2.4094395637512207 + }, + { + "auxiliary_loss_clip": 0.01055481, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.0120846, + "balance_loss_mlp": 1.01743495, + "epoch": 0.7575529836164137, + "flos": 13515436546560.0, + "grad_norm": 2.3364843261246495, + "language_loss": 0.6543417, + "learning_rate": 5.854422407815161e-07, + "loss": 0.67527902, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 12600, + "time_per_iteration": 2.408433675765991 + }, + { + "auxiliary_loss_clip": 0.01051727, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.01246071, + "balance_loss_mlp": 1.01682782, + "epoch": 0.7576131068690816, + "flos": 19645556486400.0, + "grad_norm": 2.104252771477835, + "language_loss": 0.67357445, + "learning_rate": 5.851669436494191e-07, + "loss": 0.69442779, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34960938, + "step": 12601, + "time_per_iteration": 2.388345718383789 + }, + { + "auxiliary_loss_clip": 0.01051628, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.01816809, + "balance_loss_mlp": 1.01666594, + "epoch": 0.7576732301217496, + "flos": 20047197300480.0, + "grad_norm": 1.5828370897945256, + "language_loss": 0.69280565, + "learning_rate": 5.848917001679335e-07, + "loss": 0.71371269, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34960938, + "step": 12602, + "time_per_iteration": 2.4322757720947266 + }, + { + "auxiliary_loss_clip": 0.01051189, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.01554418, + "balance_loss_mlp": 1.01590216, + "epoch": 0.7577333533744176, + "flos": 15376202478720.0, + "grad_norm": 1.7995432895370804, + "language_loss": 0.68332636, + "learning_rate": 5.846165103474967e-07, + "loss": 0.70423448, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35351562, + "step": 12603, + "time_per_iteration": 2.380073308944702 + }, + { + "auxiliary_loss_clip": 0.01049417, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.01425028, + "balance_loss_mlp": 1.01514983, + "epoch": 0.7577934766270855, + "flos": 17893894152960.0, + "grad_norm": 1.9597389789789155, + "language_loss": 0.63141966, + "learning_rate": 5.843413741985439e-07, + "loss": 0.6522665, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34179688, + "step": 12604, + "time_per_iteration": 2.353097915649414 + }, + { + "auxiliary_loss_clip": 0.0105283, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.0128032, + "balance_loss_mlp": 1.01679075, + "epoch": 0.7578535998797535, + "flos": 21612772753920.0, + "grad_norm": 1.862593364300812, + "language_loss": 0.81059808, + "learning_rate": 5.840662917315076e-07, + "loss": 0.83147967, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.359375, + "step": 12605, + "time_per_iteration": 2.4288647174835205 + }, + { + "auxiliary_loss_clip": 0.01054543, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.01623964, + "balance_loss_mlp": 1.01564264, + "epoch": 0.7579137231324214, + "flos": 18477397572480.0, + "grad_norm": 3.3023209589049607, + "language_loss": 0.80740035, + "learning_rate": 5.837912629568198e-07, + "loss": 0.82836521, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 12606, + "time_per_iteration": 2.3573949337005615 + }, + { + "auxiliary_loss_clip": 0.01051101, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.01592255, + "balance_loss_mlp": 1.01666009, + "epoch": 0.7579738463850895, + "flos": 23254004856960.0, + "grad_norm": 1.3766187284215885, + "language_loss": 0.73606139, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75692368, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.34375, + "step": 12607, + "time_per_iteration": 2.427187204360962 + }, + { + "auxiliary_loss_clip": 0.01053003, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.01741052, + "balance_loss_mlp": 1.01545894, + "epoch": 0.7580339696377574, + "flos": 14026180959360.0, + "grad_norm": 1.959243076848414, + "language_loss": 0.75718975, + "learning_rate": 5.83241366526202e-07, + "loss": 0.77814007, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 12608, + "time_per_iteration": 2.394636869430542 + }, + { + "auxiliary_loss_clip": 0.01051763, + "auxiliary_loss_mlp": 0.01040891, + "balance_loss_clip": 1.01552355, + "balance_loss_mlp": 1.01688516, + "epoch": 0.7580940928904254, + "flos": 25081603130880.0, + "grad_norm": 1.7068109579343937, + "language_loss": 0.72890043, + "learning_rate": 5.829664988911245e-07, + "loss": 0.74982691, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.34960938, + "step": 12609, + "time_per_iteration": 2.43148136138916 + }, + { + "auxiliary_loss_clip": 0.01053412, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.01278579, + "balance_loss_mlp": 1.01562297, + "epoch": 0.7581542161430933, + "flos": 23835448506240.0, + "grad_norm": 1.6308651912054826, + "language_loss": 0.82232749, + "learning_rate": 5.826916849901007e-07, + "loss": 0.843243, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 12610, + "time_per_iteration": 3.9108715057373047 + }, + { + "auxiliary_loss_clip": 0.01054926, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.01442933, + "balance_loss_mlp": 1.01681376, + "epoch": 0.7582143393957613, + "flos": 22235902433280.0, + "grad_norm": 1.6711675764362164, + "language_loss": 0.71260351, + "learning_rate": 5.824169248335488e-07, + "loss": 0.73353726, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38085938, + "step": 12611, + "time_per_iteration": 2.4045212268829346 + }, + { + "auxiliary_loss_clip": 0.01052426, + "auxiliary_loss_mlp": 0.01033261, + "balance_loss_clip": 1.01027727, + "balance_loss_mlp": 1.01645172, + "epoch": 0.7582744626484292, + "flos": 21105310008960.0, + "grad_norm": 1.7245988890993413, + "language_loss": 0.71947581, + "learning_rate": 5.821422184318893e-07, + "loss": 0.74033272, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 12612, + "time_per_iteration": 2.4358608722686768 + }, + { + "auxiliary_loss_clip": 0.01055341, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.01925921, + "balance_loss_mlp": 1.01781678, + "epoch": 0.7583345859010973, + "flos": 24603712174080.0, + "grad_norm": 1.463220223020773, + "language_loss": 0.60627794, + "learning_rate": 5.818675657955397e-07, + "loss": 0.62725866, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 12613, + "time_per_iteration": 2.422016143798828 + }, + { + "auxiliary_loss_clip": 0.0105301, + "auxiliary_loss_mlp": 0.01038531, + "balance_loss_clip": 1.01476097, + "balance_loss_mlp": 1.0163517, + "epoch": 0.7583947091537652, + "flos": 33545422546560.0, + "grad_norm": 1.8184402422983665, + "language_loss": 0.60951322, + "learning_rate": 5.815929669349135e-07, + "loss": 0.63042867, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 12614, + "time_per_iteration": 2.526916742324829 + }, + { + "auxiliary_loss_clip": 0.0105302, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.01357484, + "balance_loss_mlp": 1.01632905, + "epoch": 0.7584548324064332, + "flos": 20119956307200.0, + "grad_norm": 1.6742489226330188, + "language_loss": 0.73890758, + "learning_rate": 5.813184218604246e-07, + "loss": 0.7597968, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3671875, + "step": 12615, + "time_per_iteration": 2.3701961040496826 + }, + { + "auxiliary_loss_clip": 0.01008792, + "auxiliary_loss_mlp": 0.01004399, + "balance_loss_clip": 1.00197923, + "balance_loss_mlp": 1.0016222, + "epoch": 0.7585149556591012, + "flos": 70399004060160.0, + "grad_norm": 0.813807366047582, + "language_loss": 0.67798907, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69812095, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.07177734, + "step": 12616, + "time_per_iteration": 3.0687966346740723 + }, + { + "auxiliary_loss_clip": 0.01055517, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.01478815, + "balance_loss_mlp": 1.01730728, + "epoch": 0.7585750789117691, + "flos": 16142860224000.0, + "grad_norm": 1.8058177760210061, + "language_loss": 0.85245633, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87341964, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 12617, + "time_per_iteration": 2.3727877140045166 + }, + { + "auxiliary_loss_clip": 0.01054382, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.01549602, + "balance_loss_mlp": 1.01693702, + "epoch": 0.7586352021644371, + "flos": 17492218427520.0, + "grad_norm": 2.26123035208268, + "language_loss": 0.76460135, + "learning_rate": 5.804951094578757e-07, + "loss": 0.78552377, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.375, + "step": 12618, + "time_per_iteration": 2.3974733352661133 + }, + { + "auxiliary_loss_clip": 0.01054742, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.01857233, + "balance_loss_mlp": 1.01617122, + "epoch": 0.758695325417105, + "flos": 17274220698240.0, + "grad_norm": 1.9713821444274184, + "language_loss": 0.79036748, + "learning_rate": 5.802207796320209e-07, + "loss": 0.81135571, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 12619, + "time_per_iteration": 2.362710475921631 + }, + { + "auxiliary_loss_clip": 0.01052881, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.01062429, + "balance_loss_mlp": 1.01685262, + "epoch": 0.7587554486697731, + "flos": 29494415381760.0, + "grad_norm": 1.6271759503793115, + "language_loss": 0.83167231, + "learning_rate": 5.79946503644337e-07, + "loss": 0.85255718, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 12620, + "time_per_iteration": 2.4521214962005615 + }, + { + "auxiliary_loss_clip": 0.01054066, + "auxiliary_loss_mlp": 0.01041763, + "balance_loss_clip": 1.01591861, + "balance_loss_mlp": 1.01609373, + "epoch": 0.758815571922441, + "flos": 16100057030400.0, + "grad_norm": 4.942242850813006, + "language_loss": 0.83610427, + "learning_rate": 5.796722815052242e-07, + "loss": 0.85706258, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 12621, + "time_per_iteration": 2.333015203475952 + }, + { + "auxiliary_loss_clip": 0.01052588, + "auxiliary_loss_mlp": 0.01039462, + "balance_loss_clip": 1.01598942, + "balance_loss_mlp": 1.01632345, + "epoch": 0.758875695175109, + "flos": 16142790401280.0, + "grad_norm": 2.050092714871132, + "language_loss": 0.75111836, + "learning_rate": 5.7939811322508e-07, + "loss": 0.77203882, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 12622, + "time_per_iteration": 2.410367488861084 + }, + { + "auxiliary_loss_clip": 0.01008556, + "auxiliary_loss_mlp": 0.01002838, + "balance_loss_clip": 1.00047719, + "balance_loss_mlp": 1.00146306, + "epoch": 0.7589358184277769, + "flos": 68458671406080.0, + "grad_norm": 0.8440297375844196, + "language_loss": 0.60833216, + "learning_rate": 5.791239988143024e-07, + "loss": 0.6284461, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.07080078, + "step": 12623, + "time_per_iteration": 3.03436017036438 + }, + { + "auxiliary_loss_clip": 0.01052269, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.01352501, + "balance_loss_mlp": 1.01703107, + "epoch": 0.7589959416804449, + "flos": 20046289605120.0, + "grad_norm": 2.4516521903050514, + "language_loss": 0.68688917, + "learning_rate": 5.788499382832847e-07, + "loss": 0.70775962, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 12624, + "time_per_iteration": 2.44975209236145 + }, + { + "auxiliary_loss_clip": 0.01050946, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.01595283, + "balance_loss_mlp": 1.01644194, + "epoch": 0.7590560649331128, + "flos": 18770772660480.0, + "grad_norm": 1.7763545047130584, + "language_loss": 0.77273613, + "learning_rate": 5.785759316424196e-07, + "loss": 0.7936179, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 12625, + "time_per_iteration": 2.5047059059143066 + }, + { + "auxiliary_loss_clip": 0.01053531, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.01632881, + "balance_loss_mlp": 1.01752877, + "epoch": 0.7591161881857809, + "flos": 29823995416320.0, + "grad_norm": 1.993863544775772, + "language_loss": 0.64106381, + "learning_rate": 5.783019789020977e-07, + "loss": 0.66200584, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 12626, + "time_per_iteration": 2.6232433319091797 + }, + { + "auxiliary_loss_clip": 0.01053527, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.01711726, + "balance_loss_mlp": 1.0172205, + "epoch": 0.7591763114384488, + "flos": 20301679267200.0, + "grad_norm": 1.9218332381082242, + "language_loss": 0.75191373, + "learning_rate": 5.780280800727084e-07, + "loss": 0.77289426, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.36328125, + "step": 12627, + "time_per_iteration": 2.473642110824585 + }, + { + "auxiliary_loss_clip": 0.01054867, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.01159668, + "balance_loss_mlp": 1.0163312, + "epoch": 0.7592364346911168, + "flos": 20812563325440.0, + "grad_norm": 6.511439016625779, + "language_loss": 0.69724894, + "learning_rate": 5.777542351646356e-07, + "loss": 0.71815097, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38476562, + "step": 12628, + "time_per_iteration": 2.4487459659576416 + }, + { + "auxiliary_loss_clip": 0.01057347, + "auxiliary_loss_mlp": 0.01043985, + "balance_loss_clip": 1.01744878, + "balance_loss_mlp": 1.01712036, + "epoch": 0.7592965579437848, + "flos": 21250443997440.0, + "grad_norm": 2.006054057483096, + "language_loss": 0.63970375, + "learning_rate": 5.774804441882648e-07, + "loss": 0.66071707, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.40234375, + "step": 12629, + "time_per_iteration": 2.4446065425872803 + }, + { + "auxiliary_loss_clip": 0.01049798, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.01417887, + "balance_loss_mlp": 1.01516557, + "epoch": 0.7593566811964527, + "flos": 26212405023360.0, + "grad_norm": 1.6522217497268992, + "language_loss": 0.79197502, + "learning_rate": 5.772067071539786e-07, + "loss": 0.8128444, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34765625, + "step": 12630, + "time_per_iteration": 2.626588821411133 + }, + { + "auxiliary_loss_clip": 0.01008581, + "auxiliary_loss_mlp": 0.01005769, + "balance_loss_clip": 1.00357556, + "balance_loss_mlp": 1.00147486, + "epoch": 0.7594168044491207, + "flos": 71233777601280.0, + "grad_norm": 0.9982977486307736, + "language_loss": 0.61576617, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63590968, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.07128906, + "step": 12631, + "time_per_iteration": 4.304818391799927 + }, + { + "auxiliary_loss_clip": 0.01056864, + "auxiliary_loss_mlp": 0.0104535, + "balance_loss_clip": 1.01729989, + "balance_loss_mlp": 1.018538, + "epoch": 0.7594769277017887, + "flos": 26612160624000.0, + "grad_norm": 1.780526134385108, + "language_loss": 0.75197297, + "learning_rate": 5.766593949531767e-07, + "loss": 0.77299511, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.3828125, + "step": 12632, + "time_per_iteration": 2.42234206199646 + }, + { + "auxiliary_loss_clip": 0.01054323, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.01317966, + "balance_loss_mlp": 1.01730394, + "epoch": 0.7595370509544567, + "flos": 17595177626880.0, + "grad_norm": 2.056443341151545, + "language_loss": 0.75332689, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77425253, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 12633, + "time_per_iteration": 2.400239944458008 + }, + { + "auxiliary_loss_clip": 0.01051119, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.01375985, + "balance_loss_mlp": 1.01603961, + "epoch": 0.7595971742071246, + "flos": 18002020233600.0, + "grad_norm": 2.158577484574086, + "language_loss": 0.74417555, + "learning_rate": 5.76112298645246e-07, + "loss": 0.76504749, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 12634, + "time_per_iteration": 2.370774507522583 + }, + { + "auxiliary_loss_clip": 0.01053613, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.01259017, + "balance_loss_mlp": 1.01739931, + "epoch": 0.7596572974597926, + "flos": 28839060650880.0, + "grad_norm": 1.7350354429533523, + "language_loss": 0.65959454, + "learning_rate": 5.758388314770408e-07, + "loss": 0.68049878, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 12635, + "time_per_iteration": 2.4188039302825928 + }, + { + "auxiliary_loss_clip": 0.01055022, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.01518369, + "balance_loss_mlp": 1.01711726, + "epoch": 0.7597174207124605, + "flos": 14281954646400.0, + "grad_norm": 1.686316050820359, + "language_loss": 0.70490086, + "learning_rate": 5.7556541831317e-07, + "loss": 0.72586656, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 12636, + "time_per_iteration": 3.799367666244507 + }, + { + "auxiliary_loss_clip": 0.01054583, + "auxiliary_loss_mlp": 0.01042425, + "balance_loss_clip": 1.01778412, + "balance_loss_mlp": 1.01727688, + "epoch": 0.7597775439651285, + "flos": 21687870821760.0, + "grad_norm": 1.8044394555550034, + "language_loss": 0.82159936, + "learning_rate": 5.752920591640018e-07, + "loss": 0.84256941, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 12637, + "time_per_iteration": 3.907252073287964 + }, + { + "auxiliary_loss_clip": 0.01053812, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.01610446, + "balance_loss_mlp": 1.01671195, + "epoch": 0.7598376672177964, + "flos": 36099773015040.0, + "grad_norm": 1.8018279202719802, + "language_loss": 0.67252958, + "learning_rate": 5.750187540399017e-07, + "loss": 0.69347489, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 12638, + "time_per_iteration": 2.566556930541992 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01043044, + "balance_loss_clip": 1.01684213, + "balance_loss_mlp": 1.01693928, + "epoch": 0.7598977904704645, + "flos": 18331355888640.0, + "grad_norm": 2.285753481577789, + "language_loss": 0.6710968, + "learning_rate": 5.747455029512323e-07, + "loss": 0.69206417, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 12639, + "time_per_iteration": 2.360905408859253 + }, + { + "auxiliary_loss_clip": 0.01053212, + "auxiliary_loss_mlp": 0.01037067, + "balance_loss_clip": 1.01152062, + "balance_loss_mlp": 1.01638043, + "epoch": 0.7599579137231324, + "flos": 20191633061760.0, + "grad_norm": 2.1959489046411393, + "language_loss": 0.71130884, + "learning_rate": 5.744723059083572e-07, + "loss": 0.73221171, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 12640, + "time_per_iteration": 2.3644821643829346 + }, + { + "auxiliary_loss_clip": 0.01054343, + "auxiliary_loss_mlp": 0.0103934, + "balance_loss_clip": 1.01310182, + "balance_loss_mlp": 1.01648211, + "epoch": 0.7600180369758004, + "flos": 24023699890560.0, + "grad_norm": 1.9185500389468169, + "language_loss": 0.67818403, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69912088, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 12641, + "time_per_iteration": 2.427910327911377 + }, + { + "auxiliary_loss_clip": 0.01053606, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.01531243, + "balance_loss_mlp": 1.0157361, + "epoch": 0.7600781602284684, + "flos": 18988526010240.0, + "grad_norm": 2.5604654026865483, + "language_loss": 0.68470198, + "learning_rate": 5.73926074001422e-07, + "loss": 0.70564634, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 12642, + "time_per_iteration": 2.3517918586730957 + }, + { + "auxiliary_loss_clip": 0.01052074, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.01248264, + "balance_loss_mlp": 1.0166316, + "epoch": 0.7601382834811363, + "flos": 26066328428160.0, + "grad_norm": 2.00950115997072, + "language_loss": 0.77358735, + "learning_rate": 5.736530391580765e-07, + "loss": 0.79447162, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35351562, + "step": 12643, + "time_per_iteration": 2.4327619075775146 + }, + { + "auxiliary_loss_clip": 0.01054657, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.01412654, + "balance_loss_mlp": 1.017537, + "epoch": 0.7601984067338043, + "flos": 18843217464960.0, + "grad_norm": 1.6706111747663748, + "language_loss": 0.79388511, + "learning_rate": 5.733800584019508e-07, + "loss": 0.81481797, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 12644, + "time_per_iteration": 2.3700451850891113 + }, + { + "auxiliary_loss_clip": 0.01052778, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.01605749, + "balance_loss_mlp": 1.01629782, + "epoch": 0.7602585299864723, + "flos": 24645188736000.0, + "grad_norm": 1.7147490522588766, + "language_loss": 0.80883205, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82975054, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 12645, + "time_per_iteration": 2.428832769393921 + }, + { + "auxiliary_loss_clip": 0.01054254, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.01752329, + "balance_loss_mlp": 1.01669109, + "epoch": 0.7603186532391403, + "flos": 23840964501120.0, + "grad_norm": 1.9175231543078288, + "language_loss": 0.74124062, + "learning_rate": 5.728342591927611e-07, + "loss": 0.76220739, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 12646, + "time_per_iteration": 2.3785500526428223 + }, + { + "auxiliary_loss_clip": 0.01051008, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.01483464, + "balance_loss_mlp": 1.01622188, + "epoch": 0.7603787764918082, + "flos": 22198824702720.0, + "grad_norm": 2.0389568632739925, + "language_loss": 0.69068867, + "learning_rate": 5.725614407603949e-07, + "loss": 0.71158218, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.34765625, + "step": 12647, + "time_per_iteration": 2.393958330154419 + }, + { + "auxiliary_loss_clip": 0.01008247, + "auxiliary_loss_mlp": 0.01002945, + "balance_loss_clip": 1.00053668, + "balance_loss_mlp": 1.00108314, + "epoch": 0.7604388997444762, + "flos": 54083951677440.0, + "grad_norm": 0.7221198814539381, + "language_loss": 0.4903208, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51043272, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.07177734, + "step": 12648, + "time_per_iteration": 2.9570586681365967 + }, + { + "auxiliary_loss_clip": 0.01051681, + "auxiliary_loss_mlp": 0.01037851, + "balance_loss_clip": 1.01461744, + "balance_loss_mlp": 1.01636279, + "epoch": 0.7604990229971441, + "flos": 19680923560320.0, + "grad_norm": 1.674493310670297, + "language_loss": 0.77286011, + "learning_rate": 5.720159662918451e-07, + "loss": 0.79375541, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 12649, + "time_per_iteration": 2.399400472640991 + }, + { + "auxiliary_loss_clip": 0.01051372, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.0148977, + "balance_loss_mlp": 1.01568043, + "epoch": 0.7605591462498121, + "flos": 25226876764800.0, + "grad_norm": 1.5391888866727546, + "language_loss": 0.69696319, + "learning_rate": 5.717433102763462e-07, + "loss": 0.71785152, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 12650, + "time_per_iteration": 3.9584317207336426 + }, + { + "auxiliary_loss_clip": 0.01008004, + "auxiliary_loss_mlp": 0.01003619, + "balance_loss_clip": 1.00102043, + "balance_loss_mlp": 1.00092804, + "epoch": 0.76061926950248, + "flos": 66780466306560.0, + "grad_norm": 0.7537745542921246, + "language_loss": 0.62835455, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64847076, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.07080078, + "step": 12651, + "time_per_iteration": 3.0211620330810547 + }, + { + "auxiliary_loss_clip": 0.01052206, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.01495171, + "balance_loss_mlp": 1.01643407, + "epoch": 0.7606793927551481, + "flos": 25337167349760.0, + "grad_norm": 1.6700590408743354, + "language_loss": 0.72378695, + "learning_rate": 5.711981607345951e-07, + "loss": 0.74469721, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 12652, + "time_per_iteration": 2.4361419677734375 + }, + { + "auxiliary_loss_clip": 0.01055433, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.01552081, + "balance_loss_mlp": 1.01802921, + "epoch": 0.760739516007816, + "flos": 18222636314880.0, + "grad_norm": 2.068787206577919, + "language_loss": 0.8095659, + "learning_rate": 5.709256672290152e-07, + "loss": 0.83053142, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12653, + "time_per_iteration": 2.350820541381836 + }, + { + "auxiliary_loss_clip": 0.01056099, + "auxiliary_loss_mlp": 0.01040059, + "balance_loss_clip": 1.01569247, + "balance_loss_mlp": 1.01719344, + "epoch": 0.760799639260484, + "flos": 22558185993600.0, + "grad_norm": 1.6524386643108755, + "language_loss": 0.81139696, + "learning_rate": 5.706532279140785e-07, + "loss": 0.83235848, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38867188, + "step": 12654, + "time_per_iteration": 2.4235339164733887 + }, + { + "auxiliary_loss_clip": 0.0105471, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_clip": 1.01912034, + "balance_loss_mlp": 1.01678848, + "epoch": 0.760859762513152, + "flos": 22308242503680.0, + "grad_norm": 2.0710776742886705, + "language_loss": 0.80286604, + "learning_rate": 5.703808428001136e-07, + "loss": 0.82386792, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 12655, + "time_per_iteration": 2.3636739253997803 + }, + { + "auxiliary_loss_clip": 0.01050151, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.01034093, + "balance_loss_mlp": 1.01604056, + "epoch": 0.7609198857658199, + "flos": 24862732617600.0, + "grad_norm": 1.850155118476888, + "language_loss": 0.69280386, + "learning_rate": 5.701085118974505e-07, + "loss": 0.71361041, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.34179688, + "step": 12656, + "time_per_iteration": 2.487635612487793 + }, + { + "auxiliary_loss_clip": 0.01052765, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.01617026, + "balance_loss_mlp": 1.01562667, + "epoch": 0.760980009018488, + "flos": 16835851267200.0, + "grad_norm": 2.1153968165761174, + "language_loss": 0.74830973, + "learning_rate": 5.698362352164164e-07, + "loss": 0.76923746, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 12657, + "time_per_iteration": 2.357116460800171 + }, + { + "auxiliary_loss_clip": 0.01007789, + "auxiliary_loss_mlp": 0.0100329, + "balance_loss_clip": 1.00065541, + "balance_loss_mlp": 1.00072777, + "epoch": 0.7610401322711559, + "flos": 61227670475520.0, + "grad_norm": 0.8659976329393893, + "language_loss": 0.65004146, + "learning_rate": 5.695640127673347e-07, + "loss": 0.67015219, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.0703125, + "step": 12658, + "time_per_iteration": 2.997544288635254 + }, + { + "auxiliary_loss_clip": 0.01051116, + "auxiliary_loss_mlp": 0.01037361, + "balance_loss_clip": 1.01548624, + "balance_loss_mlp": 1.01597106, + "epoch": 0.7611002555238239, + "flos": 19639865934720.0, + "grad_norm": 1.620463708611102, + "language_loss": 0.80464888, + "learning_rate": 5.692918445605293e-07, + "loss": 0.82553363, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 12659, + "time_per_iteration": 2.363365888595581 + }, + { + "auxiliary_loss_clip": 0.0105017, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.01508904, + "balance_loss_mlp": 1.01515615, + "epoch": 0.7611603787764918, + "flos": 26870936688000.0, + "grad_norm": 1.5576986862404827, + "language_loss": 0.69783539, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71870852, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 12660, + "time_per_iteration": 2.468965768814087 + }, + { + "auxiliary_loss_clip": 0.0105238, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.01120007, + "balance_loss_mlp": 1.01555443, + "epoch": 0.7612205020291598, + "flos": 27343032359040.0, + "grad_norm": 1.8998456713682055, + "language_loss": 0.71587253, + "learning_rate": 5.687476709150281e-07, + "loss": 0.73675036, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 12661, + "time_per_iteration": 2.4072518348693848 + }, + { + "auxiliary_loss_clip": 0.01052249, + "auxiliary_loss_mlp": 0.01035876, + "balance_loss_clip": 1.01233244, + "balance_loss_mlp": 1.01580095, + "epoch": 0.7612806252818277, + "flos": 29313320826240.0, + "grad_norm": 1.5795593840352138, + "language_loss": 0.84286189, + "learning_rate": 5.68475665496966e-07, + "loss": 0.86374319, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 12662, + "time_per_iteration": 2.462442398071289 + }, + { + "auxiliary_loss_clip": 0.01051646, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.02306712, + "balance_loss_mlp": 1.01527989, + "epoch": 0.7613407485344957, + "flos": 19025045159040.0, + "grad_norm": 1.7813096401851325, + "language_loss": 0.69413006, + "learning_rate": 5.682037143624505e-07, + "loss": 0.71509743, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 12663, + "time_per_iteration": 2.3431339263916016 + }, + { + "auxiliary_loss_clip": 0.01051323, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.00562406, + "balance_loss_mlp": 1.01632166, + "epoch": 0.7614008717871636, + "flos": 23254982375040.0, + "grad_norm": 1.644411530972499, + "language_loss": 0.70525503, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72603947, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 12664, + "time_per_iteration": 2.409648895263672 + }, + { + "auxiliary_loss_clip": 0.01055933, + "auxiliary_loss_mlp": 0.01045747, + "balance_loss_clip": 1.01948488, + "balance_loss_mlp": 1.0177232, + "epoch": 0.7614609950398317, + "flos": 21578837045760.0, + "grad_norm": 1.715878078569503, + "language_loss": 0.7996192, + "learning_rate": 5.676599749853066e-07, + "loss": 0.82063603, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3828125, + "step": 12665, + "time_per_iteration": 2.3870623111724854 + }, + { + "auxiliary_loss_clip": 0.01052246, + "auxiliary_loss_mlp": 0.01040645, + "balance_loss_clip": 1.01844859, + "balance_loss_mlp": 1.01681733, + "epoch": 0.7615211182924996, + "flos": 29276627120640.0, + "grad_norm": 1.8813872264883427, + "language_loss": 0.88796765, + "learning_rate": 5.673881867632959e-07, + "loss": 0.90889657, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35351562, + "step": 12666, + "time_per_iteration": 2.4651947021484375 + }, + { + "auxiliary_loss_clip": 0.0105337, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.01398396, + "balance_loss_mlp": 1.01658595, + "epoch": 0.7615812415451676, + "flos": 13260291264000.0, + "grad_norm": 2.122823261277655, + "language_loss": 0.84252107, + "learning_rate": 5.671164528660693e-07, + "loss": 0.86343282, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 12667, + "time_per_iteration": 2.343019723892212 + }, + { + "auxiliary_loss_clip": 0.01051894, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02120638, + "balance_loss_mlp": 1.01700044, + "epoch": 0.7616413647978356, + "flos": 18583847907840.0, + "grad_norm": 1.5688748420895675, + "language_loss": 0.79450893, + "learning_rate": 5.668447733039296e-07, + "loss": 0.81546706, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 12668, + "time_per_iteration": 2.383962631225586 + }, + { + "auxiliary_loss_clip": 0.01050482, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.01336908, + "balance_loss_mlp": 1.01562929, + "epoch": 0.7617014880505035, + "flos": 18515173530240.0, + "grad_norm": 1.7488940507255015, + "language_loss": 0.64905894, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66991794, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 12669, + "time_per_iteration": 2.3505423069000244 + }, + { + "auxiliary_loss_clip": 0.0105445, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.01361012, + "balance_loss_mlp": 1.01691127, + "epoch": 0.7617616113031715, + "flos": 24972010773120.0, + "grad_norm": 1.6617799422315742, + "language_loss": 0.67075706, + "learning_rate": 5.663015772261202e-07, + "loss": 0.69168746, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 12670, + "time_per_iteration": 3.744530439376831 + }, + { + "auxiliary_loss_clip": 0.01053099, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.0184375, + "balance_loss_mlp": 1.01583827, + "epoch": 0.7618217345558395, + "flos": 23293910407680.0, + "grad_norm": 1.658549459691781, + "language_loss": 0.73993468, + "learning_rate": 5.660300607310493e-07, + "loss": 0.76089871, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 12671, + "time_per_iteration": 2.4137401580810547 + }, + { + "auxiliary_loss_clip": 0.01050789, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.01511621, + "balance_loss_mlp": 1.01563096, + "epoch": 0.7618818578085075, + "flos": 25481742756480.0, + "grad_norm": 1.6175520579860723, + "language_loss": 0.73781574, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75870085, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 12672, + "time_per_iteration": 2.447244167327881 + }, + { + "auxiliary_loss_clip": 0.01008351, + "auxiliary_loss_mlp": 0.01002794, + "balance_loss_clip": 1.00027835, + "balance_loss_mlp": 1.00116289, + "epoch": 0.7619419810611754, + "flos": 61149220917120.0, + "grad_norm": 0.7646802531981837, + "language_loss": 0.56747508, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58758652, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.02514648, + "router_z_loss_mlp": 0.07226562, + "step": 12673, + "time_per_iteration": 3.0157690048217773 + }, + { + "auxiliary_loss_clip": 0.01053711, + "auxiliary_loss_mlp": 0.01040532, + "balance_loss_clip": 1.01550984, + "balance_loss_mlp": 1.01642966, + "epoch": 0.7620021043138434, + "flos": 23257530904320.0, + "grad_norm": 2.1786857019842096, + "language_loss": 0.76026046, + "learning_rate": 5.652158375447102e-07, + "loss": 0.78120291, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 12674, + "time_per_iteration": 2.4133248329162598 + }, + { + "auxiliary_loss_clip": 0.01049761, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01278901, + "balance_loss_mlp": 1.0150423, + "epoch": 0.7620622275665113, + "flos": 25081323840000.0, + "grad_norm": 2.3052539006888044, + "language_loss": 0.74017286, + "learning_rate": 5.649445386165286e-07, + "loss": 0.76100349, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34765625, + "step": 12675, + "time_per_iteration": 2.4667890071868896 + }, + { + "auxiliary_loss_clip": 0.01050983, + "auxiliary_loss_mlp": 0.01036662, + "balance_loss_clip": 1.01503754, + "balance_loss_mlp": 1.01637733, + "epoch": 0.7621223508191793, + "flos": 20154031660800.0, + "grad_norm": 2.160556439869312, + "language_loss": 0.73988503, + "learning_rate": 5.646732941057936e-07, + "loss": 0.7607615, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 12676, + "time_per_iteration": 3.776190996170044 + }, + { + "auxiliary_loss_clip": 0.01054767, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.01528633, + "balance_loss_mlp": 1.01585901, + "epoch": 0.7621824740718472, + "flos": 17999332058880.0, + "grad_norm": 2.556186114783881, + "language_loss": 0.55824554, + "learning_rate": 5.644021040227927e-07, + "loss": 0.57918644, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38867188, + "step": 12677, + "time_per_iteration": 3.7703723907470703 + }, + { + "auxiliary_loss_clip": 0.01053659, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.01527929, + "balance_loss_mlp": 1.01642418, + "epoch": 0.7622425973245153, + "flos": 21724599438720.0, + "grad_norm": 2.1460659306714165, + "language_loss": 0.81995821, + "learning_rate": 5.641309683778064e-07, + "loss": 0.84089768, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 12678, + "time_per_iteration": 2.383782148361206 + }, + { + "auxiliary_loss_clip": 0.01054095, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.0140667, + "balance_loss_mlp": 1.01650095, + "epoch": 0.7623027205771832, + "flos": 19717547443200.0, + "grad_norm": 1.7439544716004574, + "language_loss": 0.77446562, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79539549, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 12679, + "time_per_iteration": 2.414151191711426 + }, + { + "auxiliary_loss_clip": 0.01052277, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.01313865, + "balance_loss_mlp": 1.0163213, + "epoch": 0.7623628438298512, + "flos": 23987669500800.0, + "grad_norm": 1.3672936133807874, + "language_loss": 0.80233836, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82320994, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.359375, + "step": 12680, + "time_per_iteration": 2.394313335418701 + }, + { + "auxiliary_loss_clip": 0.01053949, + "auxiliary_loss_mlp": 0.01041309, + "balance_loss_clip": 1.01600051, + "balance_loss_mlp": 1.01687336, + "epoch": 0.7624229670825191, + "flos": 22344622007040.0, + "grad_norm": 2.3173460082403796, + "language_loss": 0.64383471, + "learning_rate": 5.633178881737493e-07, + "loss": 0.66478723, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 12681, + "time_per_iteration": 2.4145023822784424 + }, + { + "auxiliary_loss_clip": 0.01051902, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.01259184, + "balance_loss_mlp": 1.01705849, + "epoch": 0.7624830903351871, + "flos": 22710651367680.0, + "grad_norm": 2.175298268097647, + "language_loss": 0.77113378, + "learning_rate": 5.63046970383622e-07, + "loss": 0.79199988, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 12682, + "time_per_iteration": 2.3797357082366943 + }, + { + "auxiliary_loss_clip": 0.01050432, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.01017404, + "balance_loss_mlp": 1.01598167, + "epoch": 0.7625432135878552, + "flos": 25592522100480.0, + "grad_norm": 1.4641744586198198, + "language_loss": 0.69250208, + "learning_rate": 5.627761070828974e-07, + "loss": 0.71331829, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34375, + "step": 12683, + "time_per_iteration": 2.452089786529541 + }, + { + "auxiliary_loss_clip": 0.0105237, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.01524949, + "balance_loss_mlp": 1.01641273, + "epoch": 0.7626033368405231, + "flos": 23986517425920.0, + "grad_norm": 2.615747008894074, + "language_loss": 0.8436445, + "learning_rate": 5.625052982818472e-07, + "loss": 0.86455059, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 12684, + "time_per_iteration": 2.3668100833892822 + }, + { + "auxiliary_loss_clip": 0.01053623, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.01182246, + "balance_loss_mlp": 1.01718593, + "epoch": 0.7626634600931911, + "flos": 12598443020160.0, + "grad_norm": 1.8425421971329572, + "language_loss": 0.83634102, + "learning_rate": 5.622345439907396e-07, + "loss": 0.85722399, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 12685, + "time_per_iteration": 2.399744987487793 + }, + { + "auxiliary_loss_clip": 0.01052708, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.01225805, + "balance_loss_mlp": 1.01573634, + "epoch": 0.762723583345859, + "flos": 26321403888000.0, + "grad_norm": 1.9236919545568707, + "language_loss": 0.78966963, + "learning_rate": 5.619638442198422e-07, + "loss": 0.81055415, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 12686, + "time_per_iteration": 2.396284580230713 + }, + { + "auxiliary_loss_clip": 0.01054465, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.01317191, + "balance_loss_mlp": 1.01659989, + "epoch": 0.762783706598527, + "flos": 21906008196480.0, + "grad_norm": 2.009977745773024, + "language_loss": 0.73793316, + "learning_rate": 5.616931989794198e-07, + "loss": 0.75888932, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.37890625, + "step": 12687, + "time_per_iteration": 2.4410672187805176 + }, + { + "auxiliary_loss_clip": 0.01053326, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_clip": 1.02089953, + "balance_loss_mlp": 1.01578426, + "epoch": 0.7628438298511949, + "flos": 15338915280000.0, + "grad_norm": 1.7891592602529882, + "language_loss": 0.66085035, + "learning_rate": 5.614226082797369e-07, + "loss": 0.6818524, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 12688, + "time_per_iteration": 2.3476152420043945 + }, + { + "auxiliary_loss_clip": 0.0105138, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.01863372, + "balance_loss_mlp": 1.01604903, + "epoch": 0.7629039531038629, + "flos": 13005460183680.0, + "grad_norm": 2.066488529333495, + "language_loss": 0.71912789, + "learning_rate": 5.611520721310515e-07, + "loss": 0.74005747, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 12689, + "time_per_iteration": 3.817654609680176 + }, + { + "auxiliary_loss_clip": 0.01056861, + "auxiliary_loss_mlp": 0.01041311, + "balance_loss_clip": 1.01574063, + "balance_loss_mlp": 1.01825047, + "epoch": 0.7629640763565309, + "flos": 26170614259200.0, + "grad_norm": 1.8497906731091749, + "language_loss": 0.70415008, + "learning_rate": 5.608815905436238e-07, + "loss": 0.72513175, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 12690, + "time_per_iteration": 2.4117329120635986 + }, + { + "auxiliary_loss_clip": 0.01052813, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.01567245, + "balance_loss_mlp": 1.01615202, + "epoch": 0.7630241996091989, + "flos": 36792240387840.0, + "grad_norm": 1.4887908522426623, + "language_loss": 0.70714259, + "learning_rate": 5.606111635277109e-07, + "loss": 0.72806537, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 12691, + "time_per_iteration": 2.541783332824707 + }, + { + "auxiliary_loss_clip": 0.01052112, + "auxiliary_loss_mlp": 0.01040665, + "balance_loss_clip": 1.01864743, + "balance_loss_mlp": 1.01636231, + "epoch": 0.7630843228618668, + "flos": 21834087062400.0, + "grad_norm": 1.5785602620097348, + "language_loss": 0.83230388, + "learning_rate": 5.603407910935662e-07, + "loss": 0.85323167, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35742188, + "step": 12692, + "time_per_iteration": 2.3721070289611816 + }, + { + "auxiliary_loss_clip": 0.01056218, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.01442075, + "balance_loss_mlp": 1.01842856, + "epoch": 0.7631444461145348, + "flos": 12639710113920.0, + "grad_norm": 2.2031498521762383, + "language_loss": 0.78318578, + "learning_rate": 5.600704732514438e-07, + "loss": 0.80411869, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37890625, + "step": 12693, + "time_per_iteration": 2.376164197921753 + }, + { + "auxiliary_loss_clip": 0.01052133, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.01079655, + "balance_loss_mlp": 1.01608157, + "epoch": 0.7632045693672027, + "flos": 16835676710400.0, + "grad_norm": 2.5020725687393632, + "language_loss": 0.7513935, + "learning_rate": 5.598002100115933e-07, + "loss": 0.77226877, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 12694, + "time_per_iteration": 2.3585779666900635 + }, + { + "auxiliary_loss_clip": 0.01050588, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.01561213, + "balance_loss_mlp": 1.01577139, + "epoch": 0.7632646926198707, + "flos": 22016263870080.0, + "grad_norm": 2.497537138623823, + "language_loss": 0.71363729, + "learning_rate": 5.595300013842625e-07, + "loss": 0.73452318, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 12695, + "time_per_iteration": 2.403942346572876 + }, + { + "auxiliary_loss_clip": 0.01053045, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.01167846, + "balance_loss_mlp": 1.01670122, + "epoch": 0.7633248158725388, + "flos": 23112850763520.0, + "grad_norm": 1.5496705724898812, + "language_loss": 0.73109579, + "learning_rate": 5.592598473796985e-07, + "loss": 0.75198174, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 12696, + "time_per_iteration": 2.4131603240966797 + }, + { + "auxiliary_loss_clip": 0.01053403, + "auxiliary_loss_mlp": 0.01039751, + "balance_loss_clip": 1.01505041, + "balance_loss_mlp": 1.01664853, + "epoch": 0.7633849391252067, + "flos": 10889060209920.0, + "grad_norm": 2.270334776894465, + "language_loss": 0.73081052, + "learning_rate": 5.589897480081453e-07, + "loss": 0.75174206, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 12697, + "time_per_iteration": 2.4284164905548096 + }, + { + "auxiliary_loss_clip": 0.01052769, + "auxiliary_loss_mlp": 0.01040537, + "balance_loss_clip": 1.01619446, + "balance_loss_mlp": 1.01663971, + "epoch": 0.7634450623778747, + "flos": 20993169121920.0, + "grad_norm": 1.9814556628050048, + "language_loss": 0.67730367, + "learning_rate": 5.587197032798461e-07, + "loss": 0.69823676, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 12698, + "time_per_iteration": 2.357182502746582 + }, + { + "auxiliary_loss_clip": 0.01053092, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.01724982, + "balance_loss_mlp": 1.01628065, + "epoch": 0.7635051856305426, + "flos": 18880993422720.0, + "grad_norm": 1.6141414593281105, + "language_loss": 0.73200005, + "learning_rate": 5.5844971320504e-07, + "loss": 0.75294572, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 12699, + "time_per_iteration": 2.4349634647369385 + }, + { + "auxiliary_loss_clip": 0.01051737, + "auxiliary_loss_mlp": 0.01037664, + "balance_loss_clip": 1.01530051, + "balance_loss_mlp": 1.01650023, + "epoch": 0.7635653088832106, + "flos": 34785572417280.0, + "grad_norm": 1.6658134833266927, + "language_loss": 0.74098921, + "learning_rate": 5.581797777939648e-07, + "loss": 0.76188314, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 12700, + "time_per_iteration": 2.4759881496429443 + }, + { + "auxiliary_loss_clip": 0.01052458, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.01067638, + "balance_loss_mlp": 1.01572704, + "epoch": 0.7636254321358785, + "flos": 23177510334720.0, + "grad_norm": 2.0645689459595227, + "language_loss": 0.69883156, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71968311, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3671875, + "step": 12701, + "time_per_iteration": 2.387096881866455 + }, + { + "auxiliary_loss_clip": 0.01053389, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.01695752, + "balance_loss_mlp": 1.01742661, + "epoch": 0.7636855553885465, + "flos": 21324145610880.0, + "grad_norm": 1.6513640265973635, + "language_loss": 0.65743196, + "learning_rate": 5.576400710039508e-07, + "loss": 0.67837399, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 12702, + "time_per_iteration": 2.36795711517334 + }, + { + "auxiliary_loss_clip": 0.01051651, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.01145864, + "balance_loss_mlp": 1.01587427, + "epoch": 0.7637456786412145, + "flos": 28656814020480.0, + "grad_norm": 2.296785135016134, + "language_loss": 0.67187858, + "learning_rate": 5.57370299645477e-07, + "loss": 0.69273263, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 12703, + "time_per_iteration": 2.449291229248047 + }, + { + "auxiliary_loss_clip": 0.01052012, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01213789, + "balance_loss_mlp": 1.01635885, + "epoch": 0.7638058018938825, + "flos": 21906217664640.0, + "grad_norm": 1.8020107562838503, + "language_loss": 0.85654652, + "learning_rate": 5.571005829916668e-07, + "loss": 0.87740862, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 12704, + "time_per_iteration": 2.365377426147461 + }, + { + "auxiliary_loss_clip": 0.01055244, + "auxiliary_loss_mlp": 0.01038715, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.01771307, + "epoch": 0.7638659251465504, + "flos": 29642586658560.0, + "grad_norm": 1.5806921423314881, + "language_loss": 0.68311286, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70405239, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 12705, + "time_per_iteration": 2.4613256454467773 + }, + { + "auxiliary_loss_clip": 0.01051241, + "auxiliary_loss_mlp": 0.01038204, + "balance_loss_clip": 1.01446915, + "balance_loss_mlp": 1.01567221, + "epoch": 0.7639260483992184, + "flos": 26139960218880.0, + "grad_norm": 1.7547885622047517, + "language_loss": 0.74671996, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76761436, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 12706, + "time_per_iteration": 2.4175801277160645 + }, + { + "auxiliary_loss_clip": 0.01053402, + "auxiliary_loss_mlp": 0.01036456, + "balance_loss_clip": 1.01323438, + "balance_loss_mlp": 1.01640916, + "epoch": 0.7639861716518863, + "flos": 20155672494720.0, + "grad_norm": 2.179134861409055, + "language_loss": 0.79184139, + "learning_rate": 5.562917613604781e-07, + "loss": 0.81274003, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 12707, + "time_per_iteration": 2.4321393966674805 + }, + { + "auxiliary_loss_clip": 0.01051687, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.0140835, + "balance_loss_mlp": 1.01560378, + "epoch": 0.7640462949045543, + "flos": 18582276896640.0, + "grad_norm": 1.707753405944436, + "language_loss": 0.81012082, + "learning_rate": 5.560222636275751e-07, + "loss": 0.83100307, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 12708, + "time_per_iteration": 2.350581407546997 + }, + { + "auxiliary_loss_clip": 0.01008398, + "auxiliary_loss_mlp": 0.01004435, + "balance_loss_clip": 1.00213444, + "balance_loss_mlp": 1.00116038, + "epoch": 0.7641064181572224, + "flos": 68318494830720.0, + "grad_norm": 0.8172562742613345, + "language_loss": 0.56749892, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58762717, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.07226562, + "step": 12709, + "time_per_iteration": 3.073061466217041 + }, + { + "auxiliary_loss_clip": 0.01054787, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_clip": 1.02034688, + "balance_loss_mlp": 1.01683807, + "epoch": 0.7641665414098903, + "flos": 17967979791360.0, + "grad_norm": 1.911256015092136, + "language_loss": 0.65651661, + "learning_rate": 5.554834324393271e-07, + "loss": 0.67753643, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 12710, + "time_per_iteration": 3.671189308166504 + }, + { + "auxiliary_loss_clip": 0.01053545, + "auxiliary_loss_mlp": 0.0103986, + "balance_loss_clip": 1.01490903, + "balance_loss_mlp": 1.01545584, + "epoch": 0.7642266646625583, + "flos": 21251002579200.0, + "grad_norm": 2.5982299650325276, + "language_loss": 0.66108644, + "learning_rate": 5.552140990044154e-07, + "loss": 0.68202049, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 12711, + "time_per_iteration": 2.4329936504364014 + }, + { + "auxiliary_loss_clip": 0.01051692, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.01414454, + "balance_loss_mlp": 1.01592588, + "epoch": 0.7642867879152262, + "flos": 22746681757440.0, + "grad_norm": 1.4733777421418268, + "language_loss": 0.73927653, + "learning_rate": 5.549448203559293e-07, + "loss": 0.7601499, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35742188, + "step": 12712, + "time_per_iteration": 2.3774375915527344 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.01130366, + "balance_loss_mlp": 1.01630974, + "epoch": 0.7643469111678942, + "flos": 23330988138240.0, + "grad_norm": 1.4898537185426444, + "language_loss": 0.81991601, + "learning_rate": 5.546755965040804e-07, + "loss": 0.84076989, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 12713, + "time_per_iteration": 2.419034957885742 + }, + { + "auxiliary_loss_clip": 0.01054532, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.01927161, + "balance_loss_mlp": 1.0169425, + "epoch": 0.7644070344205621, + "flos": 19856292652800.0, + "grad_norm": 2.122576191121344, + "language_loss": 0.84195119, + "learning_rate": 5.544064274590776e-07, + "loss": 0.86293298, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37695312, + "step": 12714, + "time_per_iteration": 2.3625900745391846 + }, + { + "auxiliary_loss_clip": 0.01055568, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.01873171, + "balance_loss_mlp": 1.01780045, + "epoch": 0.7644671576732301, + "flos": 22089546547200.0, + "grad_norm": 1.492321626693684, + "language_loss": 0.73872554, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75973058, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37695312, + "step": 12715, + "time_per_iteration": 3.8207969665527344 + }, + { + "auxiliary_loss_clip": 0.01051021, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01361263, + "balance_loss_mlp": 1.0153538, + "epoch": 0.7645272809258981, + "flos": 25480311390720.0, + "grad_norm": 1.8674236514965161, + "language_loss": 0.64298147, + "learning_rate": 5.538682538304376e-07, + "loss": 0.6638487, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 12716, + "time_per_iteration": 3.8544297218322754 + }, + { + "auxiliary_loss_clip": 0.01055434, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.02093482, + "balance_loss_mlp": 1.01746929, + "epoch": 0.7645874041785661, + "flos": 21540851619840.0, + "grad_norm": 1.543555107035806, + "language_loss": 0.80581546, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82682967, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 12717, + "time_per_iteration": 2.4164652824401855 + }, + { + "auxiliary_loss_clip": 0.01052391, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.0149827, + "balance_loss_mlp": 1.01648092, + "epoch": 0.764647527431234, + "flos": 20629862847360.0, + "grad_norm": 2.2652140878879115, + "language_loss": 0.67677051, + "learning_rate": 5.53330299551638e-07, + "loss": 0.69766575, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 12718, + "time_per_iteration": 2.4390506744384766 + }, + { + "auxiliary_loss_clip": 0.01049466, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.01352239, + "balance_loss_mlp": 1.01526213, + "epoch": 0.764707650683902, + "flos": 21433004830080.0, + "grad_norm": 1.9507226433490688, + "language_loss": 0.79005158, + "learning_rate": 5.530614046939286e-07, + "loss": 0.81091017, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34179688, + "step": 12719, + "time_per_iteration": 2.39414381980896 + }, + { + "auxiliary_loss_clip": 0.01052328, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.01040184, + "balance_loss_mlp": 1.01563716, + "epoch": 0.7647677739365699, + "flos": 22710092785920.0, + "grad_norm": 3.7017122461565286, + "language_loss": 0.71426004, + "learning_rate": 5.527925647042754e-07, + "loss": 0.73511112, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3671875, + "step": 12720, + "time_per_iteration": 2.412700653076172 + }, + { + "auxiliary_loss_clip": 0.01053899, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.01838088, + "balance_loss_mlp": 1.01674628, + "epoch": 0.7648278971892379, + "flos": 21323063358720.0, + "grad_norm": 1.621789069798318, + "language_loss": 0.74662113, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76758868, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37109375, + "step": 12721, + "time_per_iteration": 2.3606297969818115 + }, + { + "auxiliary_loss_clip": 0.01051855, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.01094389, + "balance_loss_mlp": 1.01632452, + "epoch": 0.764888020441906, + "flos": 20666312173440.0, + "grad_norm": 1.7707633347751002, + "language_loss": 0.74651849, + "learning_rate": 5.522550493699163e-07, + "loss": 0.76737612, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 12722, + "time_per_iteration": 2.3680193424224854 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.01433647, + "balance_loss_mlp": 1.0154599, + "epoch": 0.7649481436945739, + "flos": 25081358751360.0, + "grad_norm": 2.0910766918435897, + "language_loss": 0.75219625, + "learning_rate": 5.519863740455912e-07, + "loss": 0.77308261, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36523438, + "step": 12723, + "time_per_iteration": 2.402522087097168 + }, + { + "auxiliary_loss_clip": 0.01052632, + "auxiliary_loss_mlp": 0.01035864, + "balance_loss_clip": 1.01106882, + "balance_loss_mlp": 1.01495934, + "epoch": 0.7650082669472419, + "flos": 24899705614080.0, + "grad_norm": 1.934159051449939, + "language_loss": 0.73682135, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75770628, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 12724, + "time_per_iteration": 2.4418442249298096 + }, + { + "auxiliary_loss_clip": 0.01050584, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.00979424, + "balance_loss_mlp": 1.01622891, + "epoch": 0.7650683901999098, + "flos": 14646517729920.0, + "grad_norm": 1.9568577826270541, + "language_loss": 0.84919059, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86999917, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 12725, + "time_per_iteration": 2.3476407527923584 + }, + { + "auxiliary_loss_clip": 0.01050666, + "auxiliary_loss_mlp": 0.01038888, + "balance_loss_clip": 1.01516545, + "balance_loss_mlp": 1.01515424, + "epoch": 0.7651285134525778, + "flos": 26351429523840.0, + "grad_norm": 1.7784091742384471, + "language_loss": 0.78904974, + "learning_rate": 5.511806775662901e-07, + "loss": 0.80994529, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 12726, + "time_per_iteration": 2.4907591342926025 + }, + { + "auxiliary_loss_clip": 0.01052049, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.01745319, + "balance_loss_mlp": 1.015697, + "epoch": 0.7651886367052457, + "flos": 26645782129920.0, + "grad_norm": 1.6751538669022377, + "language_loss": 0.7135675, + "learning_rate": 5.509122219383615e-07, + "loss": 0.73449451, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 12727, + "time_per_iteration": 2.4009079933166504 + }, + { + "auxiliary_loss_clip": 0.01050588, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.01109195, + "balance_loss_mlp": 1.01615882, + "epoch": 0.7652487599579137, + "flos": 25701660610560.0, + "grad_norm": 1.632910883198225, + "language_loss": 0.80403376, + "learning_rate": 5.506438212599864e-07, + "loss": 0.82485998, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34375, + "step": 12728, + "time_per_iteration": 2.4325737953186035 + }, + { + "auxiliary_loss_clip": 0.01053783, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.01014352, + "balance_loss_mlp": 1.01675272, + "epoch": 0.7653088832105817, + "flos": 28584299393280.0, + "grad_norm": 1.9625544226296037, + "language_loss": 0.57336682, + "learning_rate": 5.503754755413424e-07, + "loss": 0.59424853, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 12729, + "time_per_iteration": 3.8606715202331543 + }, + { + "auxiliary_loss_clip": 0.01051346, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.01439285, + "balance_loss_mlp": 1.01554883, + "epoch": 0.7653690064632497, + "flos": 23365656984960.0, + "grad_norm": 1.743031029575185, + "language_loss": 0.79082859, + "learning_rate": 5.501071847926055e-07, + "loss": 0.81170797, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 12730, + "time_per_iteration": 2.4229512214660645 + }, + { + "auxiliary_loss_clip": 0.01056335, + "auxiliary_loss_mlp": 0.01041904, + "balance_loss_clip": 1.01685798, + "balance_loss_mlp": 1.01863694, + "epoch": 0.7654291297159176, + "flos": 15773130259200.0, + "grad_norm": 1.6531436130550428, + "language_loss": 0.69279879, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71378118, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 12731, + "time_per_iteration": 2.375032663345337 + }, + { + "auxiliary_loss_clip": 0.01053286, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.01475048, + "balance_loss_mlp": 1.01679683, + "epoch": 0.7654892529685856, + "flos": 18033023387520.0, + "grad_norm": 1.9164556269048885, + "language_loss": 0.71480262, + "learning_rate": 5.495707682455471e-07, + "loss": 0.73570824, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36523438, + "step": 12732, + "time_per_iteration": 2.3706295490264893 + }, + { + "auxiliary_loss_clip": 0.01053568, + "auxiliary_loss_mlp": 0.01042387, + "balance_loss_clip": 1.01740098, + "balance_loss_mlp": 1.01611948, + "epoch": 0.7655493762212535, + "flos": 27234766632960.0, + "grad_norm": 1.9880893527950658, + "language_loss": 0.79184449, + "learning_rate": 5.493026424675653e-07, + "loss": 0.8128041, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 12733, + "time_per_iteration": 2.422466278076172 + }, + { + "auxiliary_loss_clip": 0.01051967, + "auxiliary_loss_mlp": 0.01040562, + "balance_loss_clip": 1.0172565, + "balance_loss_mlp": 1.01661301, + "epoch": 0.7656094994739215, + "flos": 20773006888320.0, + "grad_norm": 1.8561667467510132, + "language_loss": 0.78583199, + "learning_rate": 5.490345717001726e-07, + "loss": 0.80675733, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 12734, + "time_per_iteration": 2.3961362838745117 + }, + { + "auxiliary_loss_clip": 0.01054104, + "auxiliary_loss_mlp": 0.01037764, + "balance_loss_clip": 1.01280212, + "balance_loss_mlp": 1.01610255, + "epoch": 0.7656696227265896, + "flos": 23038136720640.0, + "grad_norm": 2.040923309914301, + "language_loss": 0.74535573, + "learning_rate": 5.48766555953535e-07, + "loss": 0.76627433, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 12735, + "time_per_iteration": 2.4182705879211426 + }, + { + "auxiliary_loss_clip": 0.01051407, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.01411986, + "balance_loss_mlp": 1.01610637, + "epoch": 0.7657297459792575, + "flos": 27524441116800.0, + "grad_norm": 10.337123658166268, + "language_loss": 0.73430741, + "learning_rate": 5.484985952378145e-07, + "loss": 0.75519854, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 12736, + "time_per_iteration": 2.448190927505493 + }, + { + "auxiliary_loss_clip": 0.0105571, + "auxiliary_loss_mlp": 0.01048461, + "balance_loss_clip": 1.01946974, + "balance_loss_mlp": 1.01716483, + "epoch": 0.7657898692319255, + "flos": 17127515698560.0, + "grad_norm": 2.2458308097029835, + "language_loss": 0.79294765, + "learning_rate": 5.482306895631728e-07, + "loss": 0.81398934, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.38671875, + "step": 12737, + "time_per_iteration": 2.353928565979004 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01039997, + "balance_loss_clip": 1.0140934, + "balance_loss_mlp": 1.01684535, + "epoch": 0.7658499924845934, + "flos": 21464810945280.0, + "grad_norm": 1.7936976658135209, + "language_loss": 0.77156317, + "learning_rate": 5.479628389397699e-07, + "loss": 0.79250777, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 12738, + "time_per_iteration": 2.3898069858551025 + }, + { + "auxiliary_loss_clip": 0.01054284, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.00972116, + "balance_loss_mlp": 1.01719928, + "epoch": 0.7659101157372614, + "flos": 29495392899840.0, + "grad_norm": 1.9773731579296154, + "language_loss": 0.63672233, + "learning_rate": 5.476950433777603e-07, + "loss": 0.65761364, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 12739, + "time_per_iteration": 2.418248176574707 + }, + { + "auxiliary_loss_clip": 0.01054076, + "auxiliary_loss_mlp": 0.0104074, + "balance_loss_clip": 1.01611125, + "balance_loss_mlp": 1.01684892, + "epoch": 0.7659702389899293, + "flos": 18550819895040.0, + "grad_norm": 1.9853247425954526, + "language_loss": 0.81421787, + "learning_rate": 5.474273028873004e-07, + "loss": 0.83516604, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 12740, + "time_per_iteration": 2.391996383666992 + }, + { + "auxiliary_loss_clip": 0.0105375, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.01196003, + "balance_loss_mlp": 1.01706684, + "epoch": 0.7660303622425974, + "flos": 23548078172160.0, + "grad_norm": 1.5648815822931679, + "language_loss": 0.66120541, + "learning_rate": 5.471596174785429e-07, + "loss": 0.68212223, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 12741, + "time_per_iteration": 2.417964220046997 + }, + { + "auxiliary_loss_clip": 0.01051397, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.01083708, + "balance_loss_mlp": 1.01589942, + "epoch": 0.7660904854952653, + "flos": 18915732092160.0, + "grad_norm": 1.5957519796768864, + "language_loss": 0.7690801, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78994244, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 12742, + "time_per_iteration": 2.5376827716827393 + }, + { + "auxiliary_loss_clip": 0.01050977, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.01755178, + "balance_loss_mlp": 1.01632416, + "epoch": 0.7661506087479333, + "flos": 23146437358080.0, + "grad_norm": 1.3737761269619064, + "language_loss": 0.77106643, + "learning_rate": 5.46624411946736e-07, + "loss": 0.79198599, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 12743, + "time_per_iteration": 2.398345470428467 + }, + { + "auxiliary_loss_clip": 0.010519, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.01754963, + "balance_loss_mlp": 1.01584959, + "epoch": 0.7662107320006012, + "flos": 17564837788800.0, + "grad_norm": 1.7956684689844167, + "language_loss": 0.7585845, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77950442, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.359375, + "step": 12744, + "time_per_iteration": 2.361865997314453 + }, + { + "auxiliary_loss_clip": 0.01053309, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.00914311, + "balance_loss_mlp": 1.01622605, + "epoch": 0.7662708552532692, + "flos": 22302167927040.0, + "grad_norm": 2.460852042496772, + "language_loss": 0.7218529, + "learning_rate": 5.460894268635181e-07, + "loss": 0.74273801, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37109375, + "step": 12745, + "time_per_iteration": 2.512939453125 + }, + { + "auxiliary_loss_clip": 0.01053182, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_clip": 1.01866293, + "balance_loss_mlp": 1.01669538, + "epoch": 0.7663309785059371, + "flos": 15741149587200.0, + "grad_norm": 2.50198242975992, + "language_loss": 0.79356194, + "learning_rate": 5.458220170154896e-07, + "loss": 0.81453395, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 12746, + "time_per_iteration": 2.5211589336395264 + }, + { + "auxiliary_loss_clip": 0.01008351, + "auxiliary_loss_mlp": 0.01002605, + "balance_loss_clip": 1.00020897, + "balance_loss_mlp": 1.00146973, + "epoch": 0.7663911017586051, + "flos": 62159780620800.0, + "grad_norm": 0.6676416255623655, + "language_loss": 0.56862473, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58873427, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.06884766, + "step": 12747, + "time_per_iteration": 3.0681021213531494 + }, + { + "auxiliary_loss_clip": 0.01049923, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.01720786, + "balance_loss_mlp": 1.01512587, + "epoch": 0.7664512250112732, + "flos": 26504802593280.0, + "grad_norm": 1.5479623995954555, + "language_loss": 0.73195833, + "learning_rate": 5.452873627572956e-07, + "loss": 0.75283682, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34765625, + "step": 12748, + "time_per_iteration": 2.407489776611328 + }, + { + "auxiliary_loss_clip": 0.01052213, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.01020527, + "balance_loss_mlp": 1.01650405, + "epoch": 0.7665113482639411, + "flos": 16248717066240.0, + "grad_norm": 2.6606575171335156, + "language_loss": 0.71115494, + "learning_rate": 5.450201183674052e-07, + "loss": 0.73203409, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.35742188, + "step": 12749, + "time_per_iteration": 3.6445515155792236 + }, + { + "auxiliary_loss_clip": 0.01051716, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.01557803, + "balance_loss_mlp": 1.01540399, + "epoch": 0.7665714715166091, + "flos": 27196676472960.0, + "grad_norm": 1.5818622426615363, + "language_loss": 0.74772489, + "learning_rate": 5.447529291504967e-07, + "loss": 0.76864564, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 12750, + "time_per_iteration": 2.4174180030822754 + }, + { + "auxiliary_loss_clip": 0.01050568, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.01162612, + "balance_loss_mlp": 1.01595283, + "epoch": 0.766631594769277, + "flos": 21066766001280.0, + "grad_norm": 2.3122808269325748, + "language_loss": 0.76593566, + "learning_rate": 5.444857951167026e-07, + "loss": 0.78676552, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34570312, + "step": 12751, + "time_per_iteration": 2.3904788494110107 + }, + { + "auxiliary_loss_clip": 0.01053473, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.01643991, + "balance_loss_mlp": 1.01690125, + "epoch": 0.766691718021945, + "flos": 24096808010880.0, + "grad_norm": 1.815475002164816, + "language_loss": 0.6259141, + "learning_rate": 5.442187162761537e-07, + "loss": 0.64685166, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 12752, + "time_per_iteration": 2.4050722122192383 + }, + { + "auxiliary_loss_clip": 0.01053801, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.01278305, + "balance_loss_mlp": 1.01643038, + "epoch": 0.7667518412746129, + "flos": 23439533155200.0, + "grad_norm": 1.9116312387886978, + "language_loss": 0.70526487, + "learning_rate": 5.439516926389767e-07, + "loss": 0.72620612, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.375, + "step": 12753, + "time_per_iteration": 2.3904943466186523 + }, + { + "auxiliary_loss_clip": 0.01052105, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.02245212, + "balance_loss_mlp": 1.01566136, + "epoch": 0.766811964527281, + "flos": 18147852449280.0, + "grad_norm": 2.712687608843921, + "language_loss": 0.63473821, + "learning_rate": 5.436847242152971e-07, + "loss": 0.6557281, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 12754, + "time_per_iteration": 3.7526795864105225 + }, + { + "auxiliary_loss_clip": 0.01052283, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.00932491, + "balance_loss_mlp": 1.0164268, + "epoch": 0.7668720877799489, + "flos": 19535056433280.0, + "grad_norm": 3.035460768437019, + "language_loss": 0.81481415, + "learning_rate": 5.434178110152401e-07, + "loss": 0.83567291, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.359375, + "step": 12755, + "time_per_iteration": 2.4135172367095947 + }, + { + "auxiliary_loss_clip": 0.01052055, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.0164938, + "balance_loss_mlp": 1.01632559, + "epoch": 0.7669322110326169, + "flos": 22673224523520.0, + "grad_norm": 1.868365133016885, + "language_loss": 0.71516085, + "learning_rate": 5.431509530489242e-07, + "loss": 0.73609376, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.35546875, + "step": 12756, + "time_per_iteration": 3.757382869720459 + }, + { + "auxiliary_loss_clip": 0.01051801, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_clip": 1.0194788, + "balance_loss_mlp": 1.01563263, + "epoch": 0.7669923342852848, + "flos": 26468178710400.0, + "grad_norm": 1.5892951497498515, + "language_loss": 0.70694375, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72788805, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 12757, + "time_per_iteration": 2.4543678760528564 + }, + { + "auxiliary_loss_clip": 0.01052853, + "auxiliary_loss_mlp": 0.01039961, + "balance_loss_clip": 1.01455772, + "balance_loss_mlp": 1.01682651, + "epoch": 0.7670524575379528, + "flos": 22855052217600.0, + "grad_norm": 1.9865678024804194, + "language_loss": 0.7767092, + "learning_rate": 5.426174028579955e-07, + "loss": 0.79763734, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.359375, + "step": 12758, + "time_per_iteration": 2.3725173473358154 + }, + { + "auxiliary_loss_clip": 0.01051244, + "auxiliary_loss_mlp": 0.01041858, + "balance_loss_clip": 1.01894557, + "balance_loss_mlp": 1.01602697, + "epoch": 0.7671125807906207, + "flos": 22451142165120.0, + "grad_norm": 3.333139095823116, + "language_loss": 0.77201629, + "learning_rate": 5.423507106536156e-07, + "loss": 0.79294729, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 12759, + "time_per_iteration": 2.4158780574798584 + }, + { + "auxiliary_loss_clip": 0.01052527, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.00863838, + "balance_loss_mlp": 1.01582098, + "epoch": 0.7671727040432887, + "flos": 35370088266240.0, + "grad_norm": 2.099909262958268, + "language_loss": 0.69422191, + "learning_rate": 5.420840737234425e-07, + "loss": 0.71506578, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 12760, + "time_per_iteration": 2.4774861335754395 + }, + { + "auxiliary_loss_clip": 0.01054594, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.01443958, + "balance_loss_mlp": 1.01788092, + "epoch": 0.7672328272959568, + "flos": 22493770801920.0, + "grad_norm": 1.5219371926517717, + "language_loss": 0.80042726, + "learning_rate": 5.418174920775871e-07, + "loss": 0.82137018, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3671875, + "step": 12761, + "time_per_iteration": 2.4428563117980957 + }, + { + "auxiliary_loss_clip": 0.01051065, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.0179801, + "balance_loss_mlp": 1.01562738, + "epoch": 0.7672929505486247, + "flos": 22814588085120.0, + "grad_norm": 2.507823835597998, + "language_loss": 0.67715144, + "learning_rate": 5.415509657261589e-07, + "loss": 0.69807804, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 12762, + "time_per_iteration": 2.3837387561798096 + }, + { + "auxiliary_loss_clip": 0.01053182, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.01094604, + "balance_loss_mlp": 1.0154078, + "epoch": 0.7673530738012927, + "flos": 20337814391040.0, + "grad_norm": 1.6921245092834227, + "language_loss": 0.7565887, + "learning_rate": 5.412844946792639e-07, + "loss": 0.77747488, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37695312, + "step": 12763, + "time_per_iteration": 2.3854806423187256 + }, + { + "auxiliary_loss_clip": 0.01053091, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.01896775, + "balance_loss_mlp": 1.01640809, + "epoch": 0.7674131970539606, + "flos": 34932137771520.0, + "grad_norm": 1.5086013313442899, + "language_loss": 0.71658784, + "learning_rate": 5.410180789470067e-07, + "loss": 0.73754781, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 12764, + "time_per_iteration": 2.479351758956909 + }, + { + "auxiliary_loss_clip": 0.01052038, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.01549363, + "balance_loss_mlp": 1.01591229, + "epoch": 0.7674733203066286, + "flos": 28327618010880.0, + "grad_norm": 1.560054839445609, + "language_loss": 0.69897044, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71986562, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36132812, + "step": 12765, + "time_per_iteration": 2.4481985569000244 + }, + { + "auxiliary_loss_clip": 0.01049319, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.0114007, + "balance_loss_mlp": 1.01422358, + "epoch": 0.7675334435592965, + "flos": 16288797173760.0, + "grad_norm": 2.529219918468953, + "language_loss": 0.62026083, + "learning_rate": 5.404854134668162e-07, + "loss": 0.64108604, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 12766, + "time_per_iteration": 2.3527235984802246 + }, + { + "auxiliary_loss_clip": 0.01008075, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 0.99994349, + "balance_loss_mlp": 1.001001, + "epoch": 0.7675935668119646, + "flos": 64822641194880.0, + "grad_norm": 0.7367794062559335, + "language_loss": 0.6086452, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62874877, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.07080078, + "step": 12767, + "time_per_iteration": 3.156628370285034 + }, + { + "auxiliary_loss_clip": 0.01050727, + "auxiliary_loss_mlp": 0.01032974, + "balance_loss_clip": 1.01075339, + "balance_loss_mlp": 1.01627517, + "epoch": 0.7676536900646325, + "flos": 22674271864320.0, + "grad_norm": 1.8341283066430407, + "language_loss": 0.69992578, + "learning_rate": 5.399529693663801e-07, + "loss": 0.72076285, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 12768, + "time_per_iteration": 3.8093857765197754 + }, + { + "auxiliary_loss_clip": 0.01055718, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.01184928, + "balance_loss_mlp": 1.01674533, + "epoch": 0.7677138133173005, + "flos": 26938563724800.0, + "grad_norm": 1.7207351733516836, + "language_loss": 0.71781516, + "learning_rate": 5.3968683035881e-07, + "loss": 0.73874217, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.390625, + "step": 12769, + "time_per_iteration": 2.419064521789551 + }, + { + "auxiliary_loss_clip": 0.01053173, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.01538157, + "balance_loss_mlp": 1.01616681, + "epoch": 0.7677739365699684, + "flos": 23798580243840.0, + "grad_norm": 2.030546886349247, + "language_loss": 0.81322211, + "learning_rate": 5.394207467264611e-07, + "loss": 0.83415186, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 12770, + "time_per_iteration": 2.4146625995635986 + }, + { + "auxiliary_loss_clip": 0.01051657, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.01426876, + "balance_loss_mlp": 1.01649952, + "epoch": 0.7678340598226364, + "flos": 34454176992000.0, + "grad_norm": 1.600062406333276, + "language_loss": 0.79435003, + "learning_rate": 5.391547184794245e-07, + "loss": 0.81521958, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.3515625, + "step": 12771, + "time_per_iteration": 2.492295742034912 + }, + { + "auxiliary_loss_clip": 0.0105236, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.01364243, + "balance_loss_mlp": 1.01622462, + "epoch": 0.7678941830753043, + "flos": 23840615387520.0, + "grad_norm": 1.4538967992058485, + "language_loss": 0.69252163, + "learning_rate": 5.388887456277876e-07, + "loss": 0.71340853, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 12772, + "time_per_iteration": 2.440007448196411 + }, + { + "auxiliary_loss_clip": 0.01049769, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.00861669, + "balance_loss_mlp": 1.01546884, + "epoch": 0.7679543063279723, + "flos": 25409751799680.0, + "grad_norm": 1.6942182341047893, + "language_loss": 0.74205542, + "learning_rate": 5.386228281816349e-07, + "loss": 0.76286, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 12773, + "time_per_iteration": 2.411992073059082 + }, + { + "auxiliary_loss_clip": 0.01050269, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.01236486, + "balance_loss_mlp": 1.01566625, + "epoch": 0.7680144295806404, + "flos": 27961204625280.0, + "grad_norm": 2.053486309858787, + "language_loss": 0.81806445, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83889526, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34570312, + "step": 12774, + "time_per_iteration": 2.423240900039673 + }, + { + "auxiliary_loss_clip": 0.01053097, + "auxiliary_loss_mlp": 0.01036342, + "balance_loss_clip": 1.0126555, + "balance_loss_mlp": 1.01693726, + "epoch": 0.7680745528333083, + "flos": 20411760384000.0, + "grad_norm": 1.8165699595014853, + "language_loss": 0.71028203, + "learning_rate": 5.380911595461177e-07, + "loss": 0.73117638, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36132812, + "step": 12775, + "time_per_iteration": 2.371366262435913 + }, + { + "auxiliary_loss_clip": 0.01008171, + "auxiliary_loss_mlp": 0.01003181, + "balance_loss_clip": 1.00071323, + "balance_loss_mlp": 1.00117838, + "epoch": 0.7681346760859763, + "flos": 68397433148160.0, + "grad_norm": 0.6958216998138518, + "language_loss": 0.56889433, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58900785, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.0246582, + "router_z_loss_mlp": 0.0703125, + "step": 12776, + "time_per_iteration": 3.1368296146392822 + }, + { + "auxiliary_loss_clip": 0.01051858, + "auxiliary_loss_mlp": 0.0103732, + "balance_loss_clip": 1.01447916, + "balance_loss_mlp": 1.0160985, + "epoch": 0.7681947993386442, + "flos": 21250409086080.0, + "grad_norm": 2.049606451018449, + "language_loss": 0.75509572, + "learning_rate": 5.375597126535188e-07, + "loss": 0.77598751, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 12777, + "time_per_iteration": 2.387273073196411 + }, + { + "auxiliary_loss_clip": 0.01052812, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.01244199, + "balance_loss_mlp": 1.01743245, + "epoch": 0.7682549225913122, + "flos": 21396625326720.0, + "grad_norm": 2.1388700507850777, + "language_loss": 0.71925449, + "learning_rate": 5.372940723860043e-07, + "loss": 0.74012744, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 12778, + "time_per_iteration": 2.3946659564971924 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0142138, + "balance_loss_mlp": 1.01670337, + "epoch": 0.7683150458439801, + "flos": 23037822518400.0, + "grad_norm": 1.8574763484267862, + "language_loss": 0.7132566, + "learning_rate": 5.37028487584446e-07, + "loss": 0.73416054, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 12779, + "time_per_iteration": 2.414494276046753 + }, + { + "auxiliary_loss_clip": 0.01054077, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.01272559, + "balance_loss_mlp": 1.01746321, + "epoch": 0.7683751690966482, + "flos": 67330070795520.0, + "grad_norm": 1.695606352259959, + "language_loss": 0.59771073, + "learning_rate": 5.367629582589133e-07, + "loss": 0.61862046, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 12780, + "time_per_iteration": 2.7964818477630615 + }, + { + "auxiliary_loss_clip": 0.01054772, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.01462722, + "balance_loss_mlp": 1.01718545, + "epoch": 0.7684352923493161, + "flos": 21797812293120.0, + "grad_norm": 1.8521937739231058, + "language_loss": 0.69332767, + "learning_rate": 5.364974844194759e-07, + "loss": 0.714275, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 12781, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.010513, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.01294804, + "balance_loss_mlp": 1.01588929, + "epoch": 0.7684954156019841, + "flos": 25846445485440.0, + "grad_norm": 1.7980633099844932, + "language_loss": 0.80329686, + "learning_rate": 5.362320660762016e-07, + "loss": 0.82417142, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 12782, + "time_per_iteration": 2.4206135272979736 + }, + { + "auxiliary_loss_clip": 0.01053342, + "auxiliary_loss_mlp": 0.01042063, + "balance_loss_clip": 1.01524091, + "balance_loss_mlp": 1.01630044, + "epoch": 0.768555538854652, + "flos": 25446201125760.0, + "grad_norm": 1.7644960834046488, + "language_loss": 0.68392754, + "learning_rate": 5.35966703239153e-07, + "loss": 0.70488161, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.37109375, + "step": 12783, + "time_per_iteration": 2.397552251815796 + }, + { + "auxiliary_loss_clip": 0.01052853, + "auxiliary_loss_mlp": 0.01044204, + "balance_loss_clip": 1.0191102, + "balance_loss_mlp": 1.01587963, + "epoch": 0.76861566210732, + "flos": 19645347018240.0, + "grad_norm": 1.609071405988719, + "language_loss": 0.70643139, + "learning_rate": 5.357013959183938e-07, + "loss": 0.72740197, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 12784, + "time_per_iteration": 2.4014434814453125 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.00801122, + "balance_loss_mlp": 1.01540089, + "epoch": 0.7686757853599879, + "flos": 22417939595520.0, + "grad_norm": 1.6769175162814445, + "language_loss": 0.81331909, + "learning_rate": 5.354361441239843e-07, + "loss": 0.83411002, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3515625, + "step": 12785, + "time_per_iteration": 2.3646156787872314 + }, + { + "auxiliary_loss_clip": 0.0105335, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.01857734, + "balance_loss_mlp": 1.01689184, + "epoch": 0.768735908612656, + "flos": 47772529580160.0, + "grad_norm": 1.6468120002815054, + "language_loss": 0.78399998, + "learning_rate": 5.351709478659836e-07, + "loss": 0.80498111, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 12786, + "time_per_iteration": 2.639465570449829 + }, + { + "auxiliary_loss_clip": 0.01052215, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.01282048, + "balance_loss_mlp": 1.01581752, + "epoch": 0.7687960318653239, + "flos": 30261876088320.0, + "grad_norm": 2.046991099904427, + "language_loss": 0.59640676, + "learning_rate": 5.349058071544468e-07, + "loss": 0.61730397, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36523438, + "step": 12787, + "time_per_iteration": 2.424450159072876 + }, + { + "auxiliary_loss_clip": 0.01050458, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.01504838, + "balance_loss_mlp": 1.01470113, + "epoch": 0.7688561551179919, + "flos": 19572413454720.0, + "grad_norm": 1.970844897828011, + "language_loss": 0.77272522, + "learning_rate": 5.346407219994292e-07, + "loss": 0.7936002, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35742188, + "step": 12788, + "time_per_iteration": 2.389676809310913 + }, + { + "auxiliary_loss_clip": 0.01051697, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.0156225, + "balance_loss_mlp": 1.01529145, + "epoch": 0.7689162783706599, + "flos": 22782677235840.0, + "grad_norm": 1.7346455297798944, + "language_loss": 0.68451273, + "learning_rate": 5.343756924109821e-07, + "loss": 0.70543289, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 12789, + "time_per_iteration": 3.615783452987671 + }, + { + "auxiliary_loss_clip": 0.01052431, + "auxiliary_loss_mlp": 0.01037637, + "balance_loss_clip": 1.01365185, + "balance_loss_mlp": 1.01651621, + "epoch": 0.7689764016233278, + "flos": 34202767224960.0, + "grad_norm": 1.8013026820104745, + "language_loss": 0.70215207, + "learning_rate": 5.341107183991553e-07, + "loss": 0.72305274, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 12790, + "time_per_iteration": 2.4757652282714844 + }, + { + "auxiliary_loss_clip": 0.01050289, + "auxiliary_loss_mlp": 0.01037339, + "balance_loss_clip": 1.01476121, + "balance_loss_mlp": 1.01525199, + "epoch": 0.7690365248759958, + "flos": 17273522471040.0, + "grad_norm": 1.6952376053007376, + "language_loss": 0.69511169, + "learning_rate": 5.338457999739969e-07, + "loss": 0.71598804, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 12791, + "time_per_iteration": 2.3556394577026367 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.01588511, + "balance_loss_mlp": 1.01618099, + "epoch": 0.7690966481286637, + "flos": 18222182467200.0, + "grad_norm": 1.815549463488252, + "language_loss": 0.8039391, + "learning_rate": 5.335809371455526e-07, + "loss": 0.82482409, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 12792, + "time_per_iteration": 2.3782742023468018 + }, + { + "auxiliary_loss_clip": 0.01056239, + "auxiliary_loss_mlp": 0.01045584, + "balance_loss_clip": 1.01789141, + "balance_loss_mlp": 1.01783717, + "epoch": 0.7691567713813318, + "flos": 21536662256640.0, + "grad_norm": 1.767144221021261, + "language_loss": 0.73774689, + "learning_rate": 5.333161299238673e-07, + "loss": 0.75876516, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38476562, + "step": 12793, + "time_per_iteration": 2.371781826019287 + }, + { + "auxiliary_loss_clip": 0.01052133, + "auxiliary_loss_mlp": 0.0103915, + "balance_loss_clip": 1.01516533, + "balance_loss_mlp": 1.01632786, + "epoch": 0.7692168946339997, + "flos": 39378571528320.0, + "grad_norm": 2.4227261962831648, + "language_loss": 0.64831752, + "learning_rate": 5.330513783189803e-07, + "loss": 0.66923034, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35742188, + "step": 12794, + "time_per_iteration": 3.9140408039093018 + }, + { + "auxiliary_loss_clip": 0.01054649, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.01381373, + "balance_loss_mlp": 1.01742339, + "epoch": 0.7692770178866677, + "flos": 25008774301440.0, + "grad_norm": 1.3884154727048397, + "language_loss": 0.77108246, + "learning_rate": 5.327866823409319e-07, + "loss": 0.79201746, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 12795, + "time_per_iteration": 3.8382973670959473 + }, + { + "auxiliary_loss_clip": 0.0105209, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.01190352, + "balance_loss_mlp": 1.01565361, + "epoch": 0.7693371411393356, + "flos": 24715154833920.0, + "grad_norm": 1.6099524937954728, + "language_loss": 0.72300649, + "learning_rate": 5.325220419997601e-07, + "loss": 0.74388766, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 12796, + "time_per_iteration": 2.3973636627197266 + }, + { + "auxiliary_loss_clip": 0.01051456, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.01287484, + "balance_loss_mlp": 1.01645398, + "epoch": 0.7693972643920036, + "flos": 15923884976640.0, + "grad_norm": 3.21601125369956, + "language_loss": 0.66374737, + "learning_rate": 5.32257457305499e-07, + "loss": 0.68459994, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34960938, + "step": 12797, + "time_per_iteration": 2.3735015392303467 + }, + { + "auxiliary_loss_clip": 0.01053117, + "auxiliary_loss_mlp": 0.01042494, + "balance_loss_clip": 1.0186646, + "balance_loss_mlp": 1.01658368, + "epoch": 0.7694573876446715, + "flos": 25404864209280.0, + "grad_norm": 1.9611755044806252, + "language_loss": 0.92037773, + "learning_rate": 5.319929282681823e-07, + "loss": 0.94133389, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 12798, + "time_per_iteration": 2.395881175994873 + }, + { + "auxiliary_loss_clip": 0.01052868, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.01115918, + "balance_loss_mlp": 1.01631832, + "epoch": 0.7695175108973396, + "flos": 16653290434560.0, + "grad_norm": 2.090271963159474, + "language_loss": 0.83160913, + "learning_rate": 5.317284548978418e-07, + "loss": 0.85247737, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 12799, + "time_per_iteration": 2.362508535385132 + }, + { + "auxiliary_loss_clip": 0.01053787, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.01410174, + "balance_loss_mlp": 1.01654649, + "epoch": 0.7695776341500075, + "flos": 13625657308800.0, + "grad_norm": 2.7519058004355155, + "language_loss": 0.79213768, + "learning_rate": 5.314640372045045e-07, + "loss": 0.81305879, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 12800, + "time_per_iteration": 2.3397159576416016 + }, + { + "auxiliary_loss_clip": 0.01055653, + "auxiliary_loss_mlp": 0.01039673, + "balance_loss_clip": 1.01307797, + "balance_loss_mlp": 1.01671731, + "epoch": 0.7696377574026755, + "flos": 24275633328000.0, + "grad_norm": 2.884319974988817, + "language_loss": 0.84863222, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86958551, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 12801, + "time_per_iteration": 2.4041638374328613 + }, + { + "auxiliary_loss_clip": 0.01051626, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.01523709, + "balance_loss_mlp": 1.01626956, + "epoch": 0.7696978806553435, + "flos": 20922085860480.0, + "grad_norm": 1.899259041430755, + "language_loss": 0.729397, + "learning_rate": 5.30935368888947e-07, + "loss": 0.75029194, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 12802, + "time_per_iteration": 2.3861377239227295 + }, + { + "auxiliary_loss_clip": 0.01050767, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.01573288, + "balance_loss_mlp": 1.01587737, + "epoch": 0.7697580039080114, + "flos": 22928509451520.0, + "grad_norm": 1.951869540926632, + "language_loss": 0.76845777, + "learning_rate": 5.306711182867747e-07, + "loss": 0.7893464, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 12803, + "time_per_iteration": 2.4393162727355957 + }, + { + "auxiliary_loss_clip": 0.01008295, + "auxiliary_loss_mlp": 0.01003285, + "balance_loss_clip": 1.00047207, + "balance_loss_mlp": 1.00118542, + "epoch": 0.7698181271606794, + "flos": 68714305447680.0, + "grad_norm": 0.7280577046798192, + "language_loss": 0.55849123, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57860696, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.02807617, + "router_z_loss_mlp": 0.07128906, + "step": 12804, + "time_per_iteration": 3.026589870452881 + }, + { + "auxiliary_loss_clip": 0.01007847, + "auxiliary_loss_mlp": 0.01003421, + "balance_loss_clip": 1.00127542, + "balance_loss_mlp": 1.00103855, + "epoch": 0.7698782504133473, + "flos": 67406249249280.0, + "grad_norm": 0.736818595495299, + "language_loss": 0.54085201, + "learning_rate": 5.301427842437429e-07, + "loss": 0.5609647, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06835938, + "step": 12805, + "time_per_iteration": 3.176511764526367 + }, + { + "auxiliary_loss_clip": 0.01055776, + "auxiliary_loss_mlp": 0.01039282, + "balance_loss_clip": 1.01384318, + "balance_loss_mlp": 1.01837063, + "epoch": 0.7699383736660154, + "flos": 22487835870720.0, + "grad_norm": 2.024235065155715, + "language_loss": 0.74210751, + "learning_rate": 5.298787008229187e-07, + "loss": 0.76305807, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 12806, + "time_per_iteration": 2.3517298698425293 + }, + { + "auxiliary_loss_clip": 0.01051571, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.01499653, + "balance_loss_mlp": 1.01558053, + "epoch": 0.7699984969186833, + "flos": 21538756938240.0, + "grad_norm": 1.7947686277494967, + "language_loss": 0.75643051, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77733582, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 12807, + "time_per_iteration": 2.368196487426758 + }, + { + "auxiliary_loss_clip": 0.01055154, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_clip": 1.01760662, + "balance_loss_mlp": 1.01698804, + "epoch": 0.7700586201713513, + "flos": 21718210659840.0, + "grad_norm": 2.100813498011537, + "language_loss": 0.81017172, + "learning_rate": 5.293507012327218e-07, + "loss": 0.83114666, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3828125, + "step": 12808, + "time_per_iteration": 3.868474245071411 + }, + { + "auxiliary_loss_clip": 0.01054371, + "auxiliary_loss_mlp": 0.01041406, + "balance_loss_clip": 1.01591897, + "balance_loss_mlp": 1.01693463, + "epoch": 0.7701187434240192, + "flos": 27854754289920.0, + "grad_norm": 2.529079254403235, + "language_loss": 0.80043608, + "learning_rate": 5.290867850833718e-07, + "loss": 0.82139385, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 12809, + "time_per_iteration": 2.4175643920898438 + }, + { + "auxiliary_loss_clip": 0.01048572, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.01142585, + "balance_loss_mlp": 1.01515698, + "epoch": 0.7701788666766872, + "flos": 28620050492160.0, + "grad_norm": 1.490041181068458, + "language_loss": 0.71194577, + "learning_rate": 5.288229247111993e-07, + "loss": 0.73277056, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.33398438, + "step": 12810, + "time_per_iteration": 2.442531108856201 + }, + { + "auxiliary_loss_clip": 0.01055775, + "auxiliary_loss_mlp": 0.01040094, + "balance_loss_clip": 1.01349819, + "balance_loss_mlp": 1.01788187, + "epoch": 0.7702389899293551, + "flos": 14245575143040.0, + "grad_norm": 4.34944225124783, + "language_loss": 0.79704463, + "learning_rate": 5.285591201262079e-07, + "loss": 0.81800336, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 12811, + "time_per_iteration": 2.3264055252075195 + }, + { + "auxiliary_loss_clip": 0.01007816, + "auxiliary_loss_mlp": 0.01003356, + "balance_loss_clip": 1.00087607, + "balance_loss_mlp": 1.00091696, + "epoch": 0.7702991131820232, + "flos": 70570847105280.0, + "grad_norm": 0.8147873411196881, + "language_loss": 0.56761456, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58772629, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.06884766, + "step": 12812, + "time_per_iteration": 3.0539534091949463 + }, + { + "auxiliary_loss_clip": 0.01052914, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.01537657, + "balance_loss_mlp": 1.01627398, + "epoch": 0.7703592364346911, + "flos": 25478949847680.0, + "grad_norm": 1.7214166247144893, + "language_loss": 0.72989988, + "learning_rate": 5.280316783577836e-07, + "loss": 0.75083041, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 12813, + "time_per_iteration": 2.3852429389953613 + }, + { + "auxiliary_loss_clip": 0.01052656, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.01178277, + "balance_loss_mlp": 1.01614904, + "epoch": 0.7704193596873591, + "flos": 19279911150720.0, + "grad_norm": 1.7031747956235483, + "language_loss": 0.68124878, + "learning_rate": 5.27768041194351e-07, + "loss": 0.70214868, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36523438, + "step": 12814, + "time_per_iteration": 2.4301040172576904 + }, + { + "auxiliary_loss_clip": 0.01052043, + "auxiliary_loss_mlp": 0.01037493, + "balance_loss_clip": 1.01396132, + "balance_loss_mlp": 1.01634622, + "epoch": 0.7704794829400271, + "flos": 23657356327680.0, + "grad_norm": 3.958660952340384, + "language_loss": 0.67156947, + "learning_rate": 5.275044598581018e-07, + "loss": 0.69246483, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35742188, + "step": 12815, + "time_per_iteration": 2.392043352127075 + }, + { + "auxiliary_loss_clip": 0.01052334, + "auxiliary_loss_mlp": 0.01035346, + "balance_loss_clip": 1.01137257, + "balance_loss_mlp": 1.01583147, + "epoch": 0.770539606192695, + "flos": 18988316542080.0, + "grad_norm": 2.5264897192823055, + "language_loss": 0.67347741, + "learning_rate": 5.272409343590322e-07, + "loss": 0.69435424, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 12816, + "time_per_iteration": 2.405233144760132 + }, + { + "auxiliary_loss_clip": 0.01052986, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.01495171, + "balance_loss_mlp": 1.01714921, + "epoch": 0.770599729445363, + "flos": 11829585859200.0, + "grad_norm": 6.186916995105536, + "language_loss": 0.73441637, + "learning_rate": 5.26977464707133e-07, + "loss": 0.7553314, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 12817, + "time_per_iteration": 2.325200319290161 + }, + { + "auxiliary_loss_clip": 0.01052022, + "auxiliary_loss_mlp": 0.01039689, + "balance_loss_clip": 1.01649094, + "balance_loss_mlp": 1.0164001, + "epoch": 0.770659852698031, + "flos": 17821623905280.0, + "grad_norm": 1.8060620214351653, + "language_loss": 0.62463856, + "learning_rate": 5.267140509123957e-07, + "loss": 0.64555568, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 12818, + "time_per_iteration": 2.3811862468719482 + }, + { + "auxiliary_loss_clip": 0.01051828, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.01047909, + "balance_loss_mlp": 1.0168364, + "epoch": 0.770719975950699, + "flos": 21870885502080.0, + "grad_norm": 1.637011415399494, + "language_loss": 0.67693859, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69776505, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34960938, + "step": 12819, + "time_per_iteration": 2.39805006980896 + }, + { + "auxiliary_loss_clip": 0.01053966, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.01509929, + "balance_loss_mlp": 1.01700592, + "epoch": 0.7707800992033669, + "flos": 21323971054080.0, + "grad_norm": 2.620768702349678, + "language_loss": 0.58366895, + "learning_rate": 5.261873909343608e-07, + "loss": 0.60460377, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 12820, + "time_per_iteration": 2.364400863647461 + }, + { + "auxiliary_loss_clip": 0.01051014, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.01211905, + "balance_loss_mlp": 1.01512384, + "epoch": 0.7708402224560349, + "flos": 28178294659200.0, + "grad_norm": 2.202115342948981, + "language_loss": 0.8205328, + "learning_rate": 5.259241447710343e-07, + "loss": 0.8414138, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 12821, + "time_per_iteration": 2.4637911319732666 + }, + { + "auxiliary_loss_clip": 0.01051888, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.01013827, + "balance_loss_mlp": 1.01594996, + "epoch": 0.7709003457087028, + "flos": 15376167567360.0, + "grad_norm": 2.2035550323589224, + "language_loss": 0.69788241, + "learning_rate": 5.256609545048114e-07, + "loss": 0.71873701, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 12822, + "time_per_iteration": 2.3525612354278564 + }, + { + "auxiliary_loss_clip": 0.01051598, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.01427472, + "balance_loss_mlp": 1.01634955, + "epoch": 0.7709604689613708, + "flos": 30620713708800.0, + "grad_norm": 1.551146522078857, + "language_loss": 0.72752166, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74842095, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35351562, + "step": 12823, + "time_per_iteration": 2.4726462364196777 + }, + { + "auxiliary_loss_clip": 0.01057196, + "auxiliary_loss_mlp": 0.01038389, + "balance_loss_clip": 1.01199651, + "balance_loss_mlp": 1.01756465, + "epoch": 0.7710205922140387, + "flos": 20300282812800.0, + "grad_norm": 2.9006244831450836, + "language_loss": 0.77478862, + "learning_rate": 5.251347417035969e-07, + "loss": 0.79574448, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39648438, + "step": 12824, + "time_per_iteration": 2.366564989089966 + }, + { + "auxiliary_loss_clip": 0.0105437, + "auxiliary_loss_mlp": 0.01036239, + "balance_loss_clip": 1.01223063, + "balance_loss_mlp": 1.01767492, + "epoch": 0.7710807154667068, + "flos": 19643252336640.0, + "grad_norm": 2.1421638816044433, + "language_loss": 0.74010253, + "learning_rate": 5.248717191885592e-07, + "loss": 0.76100862, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 12825, + "time_per_iteration": 2.403416395187378 + }, + { + "auxiliary_loss_clip": 0.01050288, + "auxiliary_loss_mlp": 0.01034467, + "balance_loss_clip": 1.01430857, + "balance_loss_mlp": 1.01624751, + "epoch": 0.7711408387193747, + "flos": 20005441447680.0, + "grad_norm": 1.4544235905386564, + "language_loss": 0.74494129, + "learning_rate": 5.246087526105343e-07, + "loss": 0.76578885, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 12826, + "time_per_iteration": 2.4249658584594727 + }, + { + "auxiliary_loss_clip": 0.01053496, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.01367497, + "balance_loss_mlp": 1.01696479, + "epoch": 0.7712009619720427, + "flos": 24970020825600.0, + "grad_norm": 1.5528736633342006, + "language_loss": 0.82364774, + "learning_rate": 5.243458419794933e-07, + "loss": 0.8445704, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 12827, + "time_per_iteration": 2.43949556350708 + }, + { + "auxiliary_loss_clip": 0.01007692, + "auxiliary_loss_mlp": 0.01003163, + "balance_loss_clip": 1.00077868, + "balance_loss_mlp": 1.00072134, + "epoch": 0.7712610852247107, + "flos": 63246347953920.0, + "grad_norm": 0.8638866278180034, + "language_loss": 0.55187881, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57198739, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06982422, + "step": 12828, + "time_per_iteration": 4.457131385803223 + }, + { + "auxiliary_loss_clip": 0.01051115, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.01060438, + "balance_loss_mlp": 1.01625443, + "epoch": 0.7713212084773786, + "flos": 18696861578880.0, + "grad_norm": 1.9755529744036866, + "language_loss": 0.70944107, + "learning_rate": 5.23820188598238e-07, + "loss": 0.73027515, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 12829, + "time_per_iteration": 2.3447306156158447 + }, + { + "auxiliary_loss_clip": 0.01054902, + "auxiliary_loss_mlp": 0.01041772, + "balance_loss_clip": 1.01706004, + "balance_loss_mlp": 1.01716113, + "epoch": 0.7713813317300466, + "flos": 14172501934080.0, + "grad_norm": 2.367205141035658, + "language_loss": 0.81077015, + "learning_rate": 5.235574458679579e-07, + "loss": 0.83173692, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 12830, + "time_per_iteration": 2.3815901279449463 + }, + { + "auxiliary_loss_clip": 0.01054868, + "auxiliary_loss_mlp": 0.01042693, + "balance_loss_clip": 1.01535797, + "balance_loss_mlp": 1.01671195, + "epoch": 0.7714414549827145, + "flos": 25702742862720.0, + "grad_norm": 1.6010278965915659, + "language_loss": 0.79221988, + "learning_rate": 5.232947591245269e-07, + "loss": 0.81319547, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38085938, + "step": 12831, + "time_per_iteration": 2.3912479877471924 + }, + { + "auxiliary_loss_clip": 0.01051865, + "auxiliary_loss_mlp": 0.01038088, + "balance_loss_clip": 1.01452041, + "balance_loss_mlp": 1.01604009, + "epoch": 0.7715015782353826, + "flos": 30553994367360.0, + "grad_norm": 1.510688916968866, + "language_loss": 0.62101132, + "learning_rate": 5.230321283779071e-07, + "loss": 0.64191085, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35742188, + "step": 12832, + "time_per_iteration": 2.4633495807647705 + }, + { + "auxiliary_loss_clip": 0.01054016, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.01489639, + "balance_loss_mlp": 1.01562142, + "epoch": 0.7715617014880505, + "flos": 20228326767360.0, + "grad_norm": 1.6790874095979706, + "language_loss": 0.80527878, + "learning_rate": 5.227695536380572e-07, + "loss": 0.8262068, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3828125, + "step": 12833, + "time_per_iteration": 3.771522045135498 + }, + { + "auxiliary_loss_clip": 0.01008084, + "auxiliary_loss_mlp": 0.01003116, + "balance_loss_clip": 1.00067246, + "balance_loss_mlp": 1.00103617, + "epoch": 0.7716218247407185, + "flos": 63662059912320.0, + "grad_norm": 0.8592873767362482, + "language_loss": 0.55643904, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57655096, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.0703125, + "step": 12834, + "time_per_iteration": 4.357520818710327 + }, + { + "auxiliary_loss_clip": 0.0105288, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.01244736, + "balance_loss_mlp": 1.01676393, + "epoch": 0.7716819479933864, + "flos": 19790795208960.0, + "grad_norm": 2.6026694299596067, + "language_loss": 0.7439667, + "learning_rate": 5.222445722184903e-07, + "loss": 0.7648502, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 12835, + "time_per_iteration": 2.4197685718536377 + }, + { + "auxiliary_loss_clip": 0.01053278, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.01435328, + "balance_loss_mlp": 1.01655853, + "epoch": 0.7717420712460544, + "flos": 18441192625920.0, + "grad_norm": 1.8750246564537822, + "language_loss": 0.72284818, + "learning_rate": 5.219821655586814e-07, + "loss": 0.74378467, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 12836, + "time_per_iteration": 2.3590691089630127 + }, + { + "auxiliary_loss_clip": 0.01050776, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.01252723, + "balance_loss_mlp": 1.01593161, + "epoch": 0.7718021944987223, + "flos": 35188016192640.0, + "grad_norm": 2.333447654738038, + "language_loss": 0.61298108, + "learning_rate": 5.217198149454575e-07, + "loss": 0.63384688, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34765625, + "step": 12837, + "time_per_iteration": 2.53525710105896 + }, + { + "auxiliary_loss_clip": 0.01008152, + "auxiliary_loss_mlp": 0.01002423, + "balance_loss_clip": 0.99984759, + "balance_loss_mlp": 1.00119495, + "epoch": 0.7718623177513904, + "flos": 67919612014080.0, + "grad_norm": 0.8615660925894095, + "language_loss": 0.5589202, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57902598, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.02575684, + "router_z_loss_mlp": 0.06933594, + "step": 12838, + "time_per_iteration": 2.989227294921875 + }, + { + "auxiliary_loss_clip": 0.0105077, + "auxiliary_loss_mlp": 0.01036132, + "balance_loss_clip": 1.01366091, + "balance_loss_mlp": 1.01586413, + "epoch": 0.7719224410040583, + "flos": 18580601151360.0, + "grad_norm": 2.8110305887644365, + "language_loss": 0.70369387, + "learning_rate": 5.211952818985538e-07, + "loss": 0.72456294, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 12839, + "time_per_iteration": 2.34385085105896 + }, + { + "auxiliary_loss_clip": 0.01051562, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.01264715, + "balance_loss_mlp": 1.01718462, + "epoch": 0.7719825642567263, + "flos": 23074690780800.0, + "grad_norm": 1.9464568162900922, + "language_loss": 0.81200135, + "learning_rate": 5.209330994847647e-07, + "loss": 0.83286554, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 12840, + "time_per_iteration": 2.3669679164886475 + }, + { + "auxiliary_loss_clip": 0.01053553, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.01494551, + "balance_loss_mlp": 1.01712787, + "epoch": 0.7720426875093943, + "flos": 20338058770560.0, + "grad_norm": 2.078362666931096, + "language_loss": 0.80924785, + "learning_rate": 5.206709731573402e-07, + "loss": 0.83016264, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 12841, + "time_per_iteration": 2.384835720062256 + }, + { + "auxiliary_loss_clip": 0.01051281, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.0136013, + "balance_loss_mlp": 1.01572669, + "epoch": 0.7721028107620622, + "flos": 23879508508800.0, + "grad_norm": 1.4600064951566638, + "language_loss": 0.77899218, + "learning_rate": 5.204089029262208e-07, + "loss": 0.79987442, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 12842, + "time_per_iteration": 2.4024016857147217 + }, + { + "auxiliary_loss_clip": 0.01054298, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.01781881, + "balance_loss_mlp": 1.01725483, + "epoch": 0.7721629340147302, + "flos": 26650355518080.0, + "grad_norm": 1.908205047247588, + "language_loss": 0.69685727, + "learning_rate": 5.201468888013445e-07, + "loss": 0.71781003, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 12843, + "time_per_iteration": 2.444014072418213 + }, + { + "auxiliary_loss_clip": 0.01054906, + "auxiliary_loss_mlp": 0.01037002, + "balance_loss_clip": 1.01392317, + "balance_loss_mlp": 1.01672316, + "epoch": 0.7722230572673981, + "flos": 21177789724800.0, + "grad_norm": 2.9027927685910573, + "language_loss": 0.75909221, + "learning_rate": 5.198849307926465e-07, + "loss": 0.7800113, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3828125, + "step": 12844, + "time_per_iteration": 2.363124370574951 + }, + { + "auxiliary_loss_clip": 0.01050981, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.01330721, + "balance_loss_mlp": 1.01595199, + "epoch": 0.7722831805200662, + "flos": 27963404040960.0, + "grad_norm": 1.5045451223593904, + "language_loss": 0.72815681, + "learning_rate": 5.196230289100596e-07, + "loss": 0.74902022, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 12845, + "time_per_iteration": 2.4450411796569824 + }, + { + "auxiliary_loss_clip": 0.01049609, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.01503301, + "balance_loss_mlp": 1.01552212, + "epoch": 0.7723433037727341, + "flos": 33874164708480.0, + "grad_norm": 1.7363221210393276, + "language_loss": 0.65487683, + "learning_rate": 5.193611831635159e-07, + "loss": 0.67573404, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 12846, + "time_per_iteration": 2.5328209400177 + }, + { + "auxiliary_loss_clip": 0.01008172, + "auxiliary_loss_mlp": 0.01004369, + "balance_loss_clip": 1.00212765, + "balance_loss_mlp": 1.00130725, + "epoch": 0.7724034270254021, + "flos": 62844951519360.0, + "grad_norm": 0.7804982211816042, + "language_loss": 0.6186744, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63879979, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06884766, + "step": 12847, + "time_per_iteration": 2.972306728363037 + }, + { + "auxiliary_loss_clip": 0.0105099, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01093304, + "balance_loss_mlp": 1.01496732, + "epoch": 0.77246355027807, + "flos": 23294329344000.0, + "grad_norm": 1.7862877024684414, + "language_loss": 0.80250037, + "learning_rate": 5.188376601182732e-07, + "loss": 0.82334346, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.359375, + "step": 12848, + "time_per_iteration": 3.8425590991973877 + }, + { + "auxiliary_loss_clip": 0.01053662, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.0188334, + "balance_loss_mlp": 1.01611149, + "epoch": 0.772523673530738, + "flos": 20120235598080.0, + "grad_norm": 1.7261201562374693, + "language_loss": 0.74056941, + "learning_rate": 5.185759828394261e-07, + "loss": 0.76152551, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.375, + "step": 12849, + "time_per_iteration": 2.3561737537384033 + }, + { + "auxiliary_loss_clip": 0.0105089, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.00927091, + "balance_loss_mlp": 1.01519334, + "epoch": 0.7725837967834059, + "flos": 17819180110080.0, + "grad_norm": 1.8588748537990467, + "language_loss": 0.79715395, + "learning_rate": 5.183143617363261e-07, + "loss": 0.81798303, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35742188, + "step": 12850, + "time_per_iteration": 2.3530564308166504 + }, + { + "auxiliary_loss_clip": 0.0105368, + "auxiliary_loss_mlp": 0.01040719, + "balance_loss_clip": 1.01662707, + "balance_loss_mlp": 1.01634538, + "epoch": 0.772643920036074, + "flos": 27197688902400.0, + "grad_norm": 1.6952698406929767, + "language_loss": 0.80780017, + "learning_rate": 5.180527968188935e-07, + "loss": 0.82874417, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37304688, + "step": 12851, + "time_per_iteration": 2.4131436347961426 + }, + { + "auxiliary_loss_clip": 0.01051516, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_clip": 1.0198046, + "balance_loss_mlp": 1.01619339, + "epoch": 0.7727040432887419, + "flos": 21578453020800.0, + "grad_norm": 1.7032113347659417, + "language_loss": 0.75116277, + "learning_rate": 5.177912880970474e-07, + "loss": 0.7721113, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 12852, + "time_per_iteration": 2.3662424087524414 + }, + { + "auxiliary_loss_clip": 0.01050599, + "auxiliary_loss_mlp": 0.01041933, + "balance_loss_clip": 1.01948595, + "balance_loss_mlp": 1.01495886, + "epoch": 0.7727641665414099, + "flos": 22235553319680.0, + "grad_norm": 1.698800758306886, + "language_loss": 0.82873321, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84965861, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 12853, + "time_per_iteration": 2.3861634731292725 + }, + { + "auxiliary_loss_clip": 0.01007661, + "auxiliary_loss_mlp": 0.01001397, + "balance_loss_clip": 0.99915552, + "balance_loss_mlp": 1.00070858, + "epoch": 0.7728242897940779, + "flos": 54828822867840.0, + "grad_norm": 0.8933027017113838, + "language_loss": 0.54607069, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56616122, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06933594, + "step": 12854, + "time_per_iteration": 3.0723183155059814 + }, + { + "auxiliary_loss_clip": 0.0105312, + "auxiliary_loss_mlp": 0.01036372, + "balance_loss_clip": 1.01239955, + "balance_loss_mlp": 1.01586354, + "epoch": 0.7728844130467458, + "flos": 34460461036800.0, + "grad_norm": 1.564832345684066, + "language_loss": 0.7306686, + "learning_rate": 5.170070992041826e-07, + "loss": 0.75156355, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37304688, + "step": 12855, + "time_per_iteration": 2.4712162017822266 + }, + { + "auxiliary_loss_clip": 0.01051935, + "auxiliary_loss_mlp": 0.01037957, + "balance_loss_clip": 1.01359081, + "balance_loss_mlp": 1.01620448, + "epoch": 0.7729445362994138, + "flos": 18915348067200.0, + "grad_norm": 1.6112444619286324, + "language_loss": 0.69203258, + "learning_rate": 5.167458153638254e-07, + "loss": 0.71293151, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35742188, + "step": 12856, + "time_per_iteration": 2.3786211013793945 + }, + { + "auxiliary_loss_clip": 0.0105293, + "auxiliary_loss_mlp": 0.01040752, + "balance_loss_clip": 1.01692176, + "balance_loss_mlp": 1.01620889, + "epoch": 0.7730046595520818, + "flos": 22198964348160.0, + "grad_norm": 1.7862089198120918, + "language_loss": 0.80312634, + "learning_rate": 5.164845877686162e-07, + "loss": 0.82406312, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 12857, + "time_per_iteration": 2.369840145111084 + }, + { + "auxiliary_loss_clip": 0.01052001, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.01228678, + "balance_loss_mlp": 1.01559269, + "epoch": 0.7730647828047498, + "flos": 13551501847680.0, + "grad_norm": 1.7513197449145859, + "language_loss": 0.79368901, + "learning_rate": 5.162234164284591e-07, + "loss": 0.81458032, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 12858, + "time_per_iteration": 2.317845344543457 + }, + { + "auxiliary_loss_clip": 0.01050231, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.00836277, + "balance_loss_mlp": 1.01442778, + "epoch": 0.7731249060574177, + "flos": 21975101510400.0, + "grad_norm": 2.3901923447547655, + "language_loss": 0.78583872, + "learning_rate": 5.159623013532591e-07, + "loss": 0.80664909, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 12859, + "time_per_iteration": 2.3920135498046875 + }, + { + "auxiliary_loss_clip": 0.01048989, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.01375723, + "balance_loss_mlp": 1.01603341, + "epoch": 0.7731850293100857, + "flos": 22600709896320.0, + "grad_norm": 1.416694556896704, + "language_loss": 0.6877768, + "learning_rate": 5.157012425529186e-07, + "loss": 0.70860374, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33007812, + "step": 12860, + "time_per_iteration": 2.3788037300109863 + }, + { + "auxiliary_loss_clip": 0.01053428, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.0149473, + "balance_loss_mlp": 1.01617646, + "epoch": 0.7732451525627536, + "flos": 14097613334400.0, + "grad_norm": 2.3162201134554885, + "language_loss": 0.77072227, + "learning_rate": 5.154402400373343e-07, + "loss": 0.79165304, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37304688, + "step": 12861, + "time_per_iteration": 2.3552823066711426 + }, + { + "auxiliary_loss_clip": 0.01053551, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.01285052, + "balance_loss_mlp": 1.01643538, + "epoch": 0.7733052758154216, + "flos": 21468965397120.0, + "grad_norm": 1.4950012925137461, + "language_loss": 0.7585361, + "learning_rate": 5.15179293816405e-07, + "loss": 0.77943498, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 12862, + "time_per_iteration": 2.4103405475616455 + }, + { + "auxiliary_loss_clip": 0.01050814, + "auxiliary_loss_mlp": 0.01038001, + "balance_loss_clip": 1.01789093, + "balance_loss_mlp": 1.01627231, + "epoch": 0.7733653990680895, + "flos": 21393308747520.0, + "grad_norm": 1.851449653157231, + "language_loss": 0.83214897, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85303712, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.34570312, + "step": 12863, + "time_per_iteration": 2.383023738861084 + }, + { + "auxiliary_loss_clip": 0.01050595, + "auxiliary_loss_mlp": 0.01038791, + "balance_loss_clip": 1.01618934, + "balance_loss_mlp": 1.01570368, + "epoch": 0.7734255223207576, + "flos": 17675093462400.0, + "grad_norm": 1.7240945722970054, + "language_loss": 0.74317873, + "learning_rate": 5.146575702980898e-07, + "loss": 0.76407254, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 12864, + "time_per_iteration": 2.3390071392059326 + }, + { + "auxiliary_loss_clip": 0.01050948, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.01510549, + "balance_loss_mlp": 1.01517391, + "epoch": 0.7734856455734255, + "flos": 25229599850880.0, + "grad_norm": 2.0591636920830747, + "language_loss": 0.82701981, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84790003, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35742188, + "step": 12865, + "time_per_iteration": 2.4269559383392334 + }, + { + "auxiliary_loss_clip": 0.01055531, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.01928377, + "balance_loss_mlp": 1.01718795, + "epoch": 0.7735457688260935, + "flos": 23432201769600.0, + "grad_norm": 2.2329713907406044, + "language_loss": 0.73490822, + "learning_rate": 5.141360720771077e-07, + "loss": 0.75592256, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 12866, + "time_per_iteration": 2.376915454864502 + }, + { + "auxiliary_loss_clip": 0.01054194, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.01439476, + "balance_loss_mlp": 1.01681185, + "epoch": 0.7736058920787615, + "flos": 18728388403200.0, + "grad_norm": 2.682029344651614, + "language_loss": 0.66259706, + "learning_rate": 5.138754074778371e-07, + "loss": 0.68353999, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 12867, + "time_per_iteration": 2.3612287044525146 + }, + { + "auxiliary_loss_clip": 0.01051564, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.0200603, + "balance_loss_mlp": 1.01567614, + "epoch": 0.7736660153314294, + "flos": 22892199770880.0, + "grad_norm": 1.4504178332617506, + "language_loss": 0.7155931, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73655343, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 12868, + "time_per_iteration": 3.6806933879852295 + }, + { + "auxiliary_loss_clip": 0.01053692, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.01620507, + "balance_loss_mlp": 1.01662111, + "epoch": 0.7737261385840974, + "flos": 13800258351360.0, + "grad_norm": 2.053678107914755, + "language_loss": 0.7815389, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80248463, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 12869, + "time_per_iteration": 2.351701498031616 + }, + { + "auxiliary_loss_clip": 0.01050009, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.01185179, + "balance_loss_mlp": 1.01527619, + "epoch": 0.7737862618367654, + "flos": 28729468293120.0, + "grad_norm": 1.5966029996509639, + "language_loss": 0.75044596, + "learning_rate": 5.130937518435124e-07, + "loss": 0.77128136, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 12870, + "time_per_iteration": 2.425358533859253 + }, + { + "auxiliary_loss_clip": 0.01052687, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.01413512, + "balance_loss_mlp": 1.01629448, + "epoch": 0.7738463850894334, + "flos": 17017644049920.0, + "grad_norm": 2.430606237463681, + "language_loss": 0.75929749, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78019476, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 12871, + "time_per_iteration": 2.357271671295166 + }, + { + "auxiliary_loss_clip": 0.01052032, + "auxiliary_loss_mlp": 0.01038181, + "balance_loss_clip": 1.01538849, + "balance_loss_mlp": 1.0160737, + "epoch": 0.7739065083421013, + "flos": 20702970967680.0, + "grad_norm": 1.7179228867822032, + "language_loss": 0.69838125, + "learning_rate": 5.12572929988999e-07, + "loss": 0.7192834, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 12872, + "time_per_iteration": 2.3658108711242676 + }, + { + "auxiliary_loss_clip": 0.01051548, + "auxiliary_loss_mlp": 0.01039732, + "balance_loss_clip": 1.01516342, + "balance_loss_mlp": 1.01551116, + "epoch": 0.7739666315947693, + "flos": 20696372720640.0, + "grad_norm": 2.38281755833363, + "language_loss": 0.85829043, + "learning_rate": 5.123126036618804e-07, + "loss": 0.8792032, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 12873, + "time_per_iteration": 3.778946876525879 + }, + { + "auxiliary_loss_clip": 0.01052889, + "auxiliary_loss_mlp": 0.01039862, + "balance_loss_clip": 1.01612771, + "balance_loss_mlp": 1.01628578, + "epoch": 0.7740267548474372, + "flos": 29569373804160.0, + "grad_norm": 2.601235743130282, + "language_loss": 0.6711762, + "learning_rate": 5.120523337480174e-07, + "loss": 0.69210368, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 12874, + "time_per_iteration": 3.856391668319702 + }, + { + "auxiliary_loss_clip": 0.01051127, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.01334202, + "balance_loss_mlp": 1.01653934, + "epoch": 0.7740868781001052, + "flos": 23657984732160.0, + "grad_norm": 1.6729804350386472, + "language_loss": 0.63168305, + "learning_rate": 5.117921202572785e-07, + "loss": 0.65254551, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34570312, + "step": 12875, + "time_per_iteration": 2.424463987350464 + }, + { + "auxiliary_loss_clip": 0.01052858, + "auxiliary_loss_mlp": 0.01036744, + "balance_loss_clip": 1.01404631, + "balance_loss_mlp": 1.0168556, + "epoch": 0.7741470013527731, + "flos": 24716167263360.0, + "grad_norm": 1.8044391197565974, + "language_loss": 0.66467619, + "learning_rate": 5.115319631995318e-07, + "loss": 0.68557221, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 12876, + "time_per_iteration": 2.440645694732666 + }, + { + "auxiliary_loss_clip": 0.01049717, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.01380932, + "balance_loss_mlp": 1.01551795, + "epoch": 0.7742071246054412, + "flos": 21870571299840.0, + "grad_norm": 2.6720451575143946, + "language_loss": 0.72567046, + "learning_rate": 5.112718625846433e-07, + "loss": 0.74652803, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34179688, + "step": 12877, + "time_per_iteration": 2.3866336345672607 + }, + { + "auxiliary_loss_clip": 0.01052598, + "auxiliary_loss_mlp": 0.01039998, + "balance_loss_clip": 1.0155127, + "balance_loss_mlp": 1.01602983, + "epoch": 0.7742672478581091, + "flos": 22673154700800.0, + "grad_norm": 1.7526462561663059, + "language_loss": 0.84164, + "learning_rate": 5.110118184224736e-07, + "loss": 0.86256593, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 12878, + "time_per_iteration": 2.371530055999756 + }, + { + "auxiliary_loss_clip": 0.01054018, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.01676428, + "balance_loss_mlp": 1.01649809, + "epoch": 0.7743273711107771, + "flos": 18839970708480.0, + "grad_norm": 1.7068660672880127, + "language_loss": 0.74375457, + "learning_rate": 5.10751830722885e-07, + "loss": 0.76471555, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 12879, + "time_per_iteration": 2.3368301391601562 + }, + { + "auxiliary_loss_clip": 0.01049731, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.01338243, + "balance_loss_mlp": 1.0155251, + "epoch": 0.7743874943634451, + "flos": 28728106750080.0, + "grad_norm": 1.5812827137235541, + "language_loss": 0.8054828, + "learning_rate": 5.104918994957364e-07, + "loss": 0.82632828, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 12880, + "time_per_iteration": 2.444201707839966 + }, + { + "auxiliary_loss_clip": 0.01052826, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.01253724, + "balance_loss_mlp": 1.01735306, + "epoch": 0.774447617616113, + "flos": 21908521814400.0, + "grad_norm": 1.4484094464069734, + "language_loss": 0.7172482, + "learning_rate": 5.102320247508847e-07, + "loss": 0.73814523, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 12881, + "time_per_iteration": 2.3950397968292236 + }, + { + "auxiliary_loss_clip": 0.01054341, + "auxiliary_loss_mlp": 0.01047917, + "balance_loss_clip": 1.02005744, + "balance_loss_mlp": 1.01693726, + "epoch": 0.774507740868781, + "flos": 19499619536640.0, + "grad_norm": 1.817782284184215, + "language_loss": 0.85813612, + "learning_rate": 5.099722064981832e-07, + "loss": 0.87915874, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.375, + "step": 12882, + "time_per_iteration": 2.382051944732666 + }, + { + "auxiliary_loss_clip": 0.01007603, + "auxiliary_loss_mlp": 0.01004043, + "balance_loss_clip": 1.00155175, + "balance_loss_mlp": 1.00086057, + "epoch": 0.774567864121449, + "flos": 59423113699200.0, + "grad_norm": 0.7666922445536866, + "language_loss": 0.60586309, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62597954, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.06738281, + "step": 12883, + "time_per_iteration": 2.9852066040039062 + }, + { + "auxiliary_loss_clip": 0.01054172, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.01513469, + "balance_loss_mlp": 1.01699328, + "epoch": 0.774627987374117, + "flos": 13224470342400.0, + "grad_norm": 1.8923077093120344, + "language_loss": 0.74163908, + "learning_rate": 5.094527395086416e-07, + "loss": 0.762586, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 12884, + "time_per_iteration": 2.365237236022949 + }, + { + "auxiliary_loss_clip": 0.01050848, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.01186299, + "balance_loss_mlp": 1.01616085, + "epoch": 0.7746881106267849, + "flos": 21393064368000.0, + "grad_norm": 1.5608728541543393, + "language_loss": 0.81751239, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83834827, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 12885, + "time_per_iteration": 2.3604304790496826 + }, + { + "auxiliary_loss_clip": 0.01051075, + "auxiliary_loss_mlp": 0.01034004, + "balance_loss_clip": 1.01214075, + "balance_loss_mlp": 1.0157392, + "epoch": 0.7747482338794529, + "flos": 25628168465280.0, + "grad_norm": 1.8140905135789172, + "language_loss": 0.65662062, + "learning_rate": 5.089334986059029e-07, + "loss": 0.67747134, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 12886, + "time_per_iteration": 2.420694351196289 + }, + { + "auxiliary_loss_clip": 0.01051683, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.01363075, + "balance_loss_mlp": 1.0159514, + "epoch": 0.7748083571321208, + "flos": 11546125597440.0, + "grad_norm": 1.8134728979125818, + "language_loss": 0.7076689, + "learning_rate": 5.086739629616987e-07, + "loss": 0.72853041, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.359375, + "step": 12887, + "time_per_iteration": 3.81010103225708 + }, + { + "auxiliary_loss_clip": 0.01051121, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.01273751, + "balance_loss_mlp": 1.01646733, + "epoch": 0.7748684803847888, + "flos": 19061424662400.0, + "grad_norm": 1.7831589608994476, + "language_loss": 0.71947134, + "learning_rate": 5.084144838687275e-07, + "loss": 0.74031883, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34570312, + "step": 12888, + "time_per_iteration": 2.361963987350464 + }, + { + "auxiliary_loss_clip": 0.01053148, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.0142616, + "balance_loss_mlp": 1.0160532, + "epoch": 0.7749286036374567, + "flos": 22272072468480.0, + "grad_norm": 1.5906132683902054, + "language_loss": 0.8264569, + "learning_rate": 5.081550613368279e-07, + "loss": 0.84737885, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 12889, + "time_per_iteration": 2.3900370597839355 + }, + { + "auxiliary_loss_clip": 0.01053011, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.01475644, + "balance_loss_mlp": 1.01736975, + "epoch": 0.7749887268901248, + "flos": 20191458504960.0, + "grad_norm": 1.9042650216199808, + "language_loss": 0.8053987, + "learning_rate": 5.07895695375838e-07, + "loss": 0.82630742, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 12890, + "time_per_iteration": 2.3576319217681885 + }, + { + "auxiliary_loss_clip": 0.01055421, + "auxiliary_loss_mlp": 0.01044779, + "balance_loss_clip": 1.02068639, + "balance_loss_mlp": 1.01797342, + "epoch": 0.7750488501427927, + "flos": 20336557582080.0, + "grad_norm": 1.991701789202154, + "language_loss": 0.6786406, + "learning_rate": 5.076363859955932e-07, + "loss": 0.69964254, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 12891, + "time_per_iteration": 2.3536574840545654 + }, + { + "auxiliary_loss_clip": 0.01050777, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.01260304, + "balance_loss_mlp": 1.01577067, + "epoch": 0.7751089733954607, + "flos": 28362845439360.0, + "grad_norm": 1.3634390965247747, + "language_loss": 0.79309654, + "learning_rate": 5.073771332059257e-07, + "loss": 0.81395125, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 12892, + "time_per_iteration": 2.467600107192993 + }, + { + "auxiliary_loss_clip": 0.01054083, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.0178653, + "balance_loss_mlp": 1.01681471, + "epoch": 0.7751690966481286, + "flos": 16942930007040.0, + "grad_norm": 2.366343943947146, + "language_loss": 0.69445324, + "learning_rate": 5.071179370166669e-07, + "loss": 0.71540725, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37304688, + "step": 12893, + "time_per_iteration": 2.342428207397461 + }, + { + "auxiliary_loss_clip": 0.01007667, + "auxiliary_loss_mlp": 0.01003193, + "balance_loss_clip": 1.00082064, + "balance_loss_mlp": 1.00093341, + "epoch": 0.7752292199007966, + "flos": 65664362096640.0, + "grad_norm": 0.8042684634737156, + "language_loss": 0.58592463, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60603321, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.06738281, + "step": 12894, + "time_per_iteration": 3.0615978240966797 + }, + { + "auxiliary_loss_clip": 0.01052932, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.01438546, + "balance_loss_mlp": 1.01647782, + "epoch": 0.7752893431534646, + "flos": 20593622989440.0, + "grad_norm": 2.037849871611893, + "language_loss": 0.79056108, + "learning_rate": 5.065997144786895e-07, + "loss": 0.81148601, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 12895, + "time_per_iteration": 2.356987237930298 + }, + { + "auxiliary_loss_clip": 0.01053698, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.01686835, + "balance_loss_mlp": 1.01726604, + "epoch": 0.7753494664061326, + "flos": 20484309922560.0, + "grad_norm": 1.7329966931794394, + "language_loss": 0.68774021, + "learning_rate": 5.063406881496209e-07, + "loss": 0.70869589, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 12896, + "time_per_iteration": 2.3843705654144287 + }, + { + "auxiliary_loss_clip": 0.01052034, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.01598787, + "balance_loss_mlp": 1.01599514, + "epoch": 0.7754095896588006, + "flos": 20264880827520.0, + "grad_norm": 2.090730312411665, + "language_loss": 0.70021105, + "learning_rate": 5.060817184602629e-07, + "loss": 0.72111237, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 12897, + "time_per_iteration": 2.361423969268799 + }, + { + "auxiliary_loss_clip": 0.01054072, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.01558101, + "balance_loss_mlp": 1.01711845, + "epoch": 0.7754697129114685, + "flos": 23329975708800.0, + "grad_norm": 1.8815937487118626, + "language_loss": 0.7603507, + "learning_rate": 5.058228054204364e-07, + "loss": 0.78131616, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.36914062, + "step": 12898, + "time_per_iteration": 2.4217164516448975 + }, + { + "auxiliary_loss_clip": 0.01052372, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.00960016, + "balance_loss_mlp": 1.01645589, + "epoch": 0.7755298361641365, + "flos": 17346665502720.0, + "grad_norm": 2.31457495365626, + "language_loss": 0.71052235, + "learning_rate": 5.055639490399588e-07, + "loss": 0.73139668, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.359375, + "step": 12899, + "time_per_iteration": 2.316455125808716 + }, + { + "auxiliary_loss_clip": 0.0105176, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.01448011, + "balance_loss_mlp": 1.01614618, + "epoch": 0.7755899594168044, + "flos": 19644858259200.0, + "grad_norm": 2.0723310012022007, + "language_loss": 0.76098967, + "learning_rate": 5.053051493286453e-07, + "loss": 0.7818948, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 12900, + "time_per_iteration": 2.364990234375 + }, + { + "auxiliary_loss_clip": 0.01051247, + "auxiliary_loss_mlp": 0.01044344, + "balance_loss_clip": 1.02301741, + "balance_loss_mlp": 1.01602983, + "epoch": 0.7756500826694724, + "flos": 27413312659200.0, + "grad_norm": 1.699091932561673, + "language_loss": 0.78321576, + "learning_rate": 5.050464062963113e-07, + "loss": 0.80417168, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 12901, + "time_per_iteration": 2.4199023246765137 + }, + { + "auxiliary_loss_clip": 0.01052656, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.01516247, + "balance_loss_mlp": 1.01682055, + "epoch": 0.7757102059221404, + "flos": 28729258824960.0, + "grad_norm": 2.1326453517712767, + "language_loss": 0.77868909, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79958791, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 12902, + "time_per_iteration": 2.418944835662842 + }, + { + "auxiliary_loss_clip": 0.01051899, + "auxiliary_loss_mlp": 0.01037938, + "balance_loss_clip": 1.01537168, + "balance_loss_mlp": 1.01650405, + "epoch": 0.7757703291748084, + "flos": 22485845923200.0, + "grad_norm": 1.7511137470427962, + "language_loss": 0.74280632, + "learning_rate": 5.045290903078215e-07, + "loss": 0.76370466, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 12903, + "time_per_iteration": 2.381988763809204 + }, + { + "auxiliary_loss_clip": 0.01051066, + "auxiliary_loss_mlp": 0.01034016, + "balance_loss_clip": 1.01103234, + "balance_loss_mlp": 1.01593673, + "epoch": 0.7758304524274763, + "flos": 21429199491840.0, + "grad_norm": 1.968568562417605, + "language_loss": 0.77174616, + "learning_rate": 5.042705173712835e-07, + "loss": 0.79259706, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 12904, + "time_per_iteration": 2.3860630989074707 + }, + { + "auxiliary_loss_clip": 0.01048592, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.0112505, + "balance_loss_mlp": 1.01469374, + "epoch": 0.7758905756801443, + "flos": 23658124377600.0, + "grad_norm": 2.0397343292278527, + "language_loss": 0.69438595, + "learning_rate": 5.040120011529576e-07, + "loss": 0.71519214, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33984375, + "step": 12905, + "time_per_iteration": 2.377030611038208 + }, + { + "auxiliary_loss_clip": 0.01050525, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01116836, + "balance_loss_mlp": 1.01572418, + "epoch": 0.7759506989328122, + "flos": 28364241893760.0, + "grad_norm": 1.580558261177534, + "language_loss": 0.68506843, + "learning_rate": 5.037535416626459e-07, + "loss": 0.70590377, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 12906, + "time_per_iteration": 2.454740047454834 + }, + { + "auxiliary_loss_clip": 0.01052335, + "auxiliary_loss_mlp": 0.01040902, + "balance_loss_clip": 1.01759624, + "balance_loss_mlp": 1.01627171, + "epoch": 0.7760108221854802, + "flos": 14901907392000.0, + "grad_norm": 1.8837795102591517, + "language_loss": 0.82568949, + "learning_rate": 5.034951389101498e-07, + "loss": 0.84662181, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 12907, + "time_per_iteration": 3.640040397644043 + }, + { + "auxiliary_loss_clip": 0.01050686, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.01698637, + "balance_loss_mlp": 1.01687217, + "epoch": 0.7760709454381483, + "flos": 14791651718400.0, + "grad_norm": 2.2882048483231068, + "language_loss": 0.68491274, + "learning_rate": 5.032367929052685e-07, + "loss": 0.70580661, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33789062, + "step": 12908, + "time_per_iteration": 2.3452656269073486 + }, + { + "auxiliary_loss_clip": 0.01054359, + "auxiliary_loss_mlp": 0.01041312, + "balance_loss_clip": 1.016505, + "balance_loss_mlp": 1.01830888, + "epoch": 0.7761310686908162, + "flos": 17378995288320.0, + "grad_norm": 1.760629757446218, + "language_loss": 0.71051896, + "learning_rate": 5.029785036577976e-07, + "loss": 0.73147571, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 12909, + "time_per_iteration": 2.3647639751434326 + }, + { + "auxiliary_loss_clip": 0.01050609, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.01477981, + "balance_loss_mlp": 1.0159049, + "epoch": 0.7761911919434842, + "flos": 25555374547200.0, + "grad_norm": 1.585784995960741, + "language_loss": 0.68751609, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70839328, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 12910, + "time_per_iteration": 2.438021421432495 + }, + { + "auxiliary_loss_clip": 0.0105327, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.02063596, + "balance_loss_mlp": 1.01724195, + "epoch": 0.7762513151961521, + "flos": 23178802055040.0, + "grad_norm": 1.6899973124752887, + "language_loss": 0.72882444, + "learning_rate": 5.024620954742646e-07, + "loss": 0.74980563, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 12911, + "time_per_iteration": 2.371218681335449 + }, + { + "auxiliary_loss_clip": 0.01054078, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.01570928, + "balance_loss_mlp": 1.0172981, + "epoch": 0.7763114384488201, + "flos": 21688534137600.0, + "grad_norm": 2.640333321389839, + "language_loss": 0.64535403, + "learning_rate": 5.022039765577836e-07, + "loss": 0.66630596, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3671875, + "step": 12912, + "time_per_iteration": 2.3640151023864746 + }, + { + "auxiliary_loss_clip": 0.0100786, + "auxiliary_loss_mlp": 0.0100178, + "balance_loss_clip": 0.99945509, + "balance_loss_mlp": 1.00119972, + "epoch": 0.776371561701488, + "flos": 69021749813760.0, + "grad_norm": 0.7791379205014731, + "language_loss": 0.53334588, + "learning_rate": 5.019459144378779e-07, + "loss": 0.5534423, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.06640625, + "step": 12913, + "time_per_iteration": 4.469132661819458 + }, + { + "auxiliary_loss_clip": 0.01052144, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.01429844, + "balance_loss_mlp": 1.01637781, + "epoch": 0.776431684954156, + "flos": 22892793264000.0, + "grad_norm": 1.669233211173606, + "language_loss": 0.63244879, + "learning_rate": 5.016879091243338e-07, + "loss": 0.6533587, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 12914, + "time_per_iteration": 3.8122642040252686 + }, + { + "auxiliary_loss_clip": 0.01051852, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.01218832, + "balance_loss_mlp": 1.01609564, + "epoch": 0.776491808206824, + "flos": 20260656552960.0, + "grad_norm": 1.9886137369188122, + "language_loss": 0.83642626, + "learning_rate": 5.014299606269339e-07, + "loss": 0.85730439, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 12915, + "time_per_iteration": 2.350306749343872 + }, + { + "auxiliary_loss_clip": 0.01054006, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.02065325, + "balance_loss_mlp": 1.01657176, + "epoch": 0.776551931459492, + "flos": 26757888105600.0, + "grad_norm": 1.7303296645467678, + "language_loss": 0.76195043, + "learning_rate": 5.011720689554603e-07, + "loss": 0.78293884, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 12916, + "time_per_iteration": 2.402024984359741 + }, + { + "auxiliary_loss_clip": 0.01052286, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.01443398, + "balance_loss_mlp": 1.01588154, + "epoch": 0.7766120547121599, + "flos": 52663162965120.0, + "grad_norm": 1.4543409344000637, + "language_loss": 0.6622197, + "learning_rate": 5.009142341196919e-07, + "loss": 0.68311566, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 12917, + "time_per_iteration": 2.6602914333343506 + }, + { + "auxiliary_loss_clip": 0.01051156, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.01231313, + "balance_loss_mlp": 1.01488924, + "epoch": 0.7766721779648279, + "flos": 25155025453440.0, + "grad_norm": 1.5187151199136055, + "language_loss": 0.65021104, + "learning_rate": 5.006564561294065e-07, + "loss": 0.6710692, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36328125, + "step": 12918, + "time_per_iteration": 2.422973394393921 + }, + { + "auxiliary_loss_clip": 0.01051621, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.01236558, + "balance_loss_mlp": 1.01653481, + "epoch": 0.7767323012174958, + "flos": 23759861679360.0, + "grad_norm": 2.844342850974983, + "language_loss": 0.74163324, + "learning_rate": 5.003987349943777e-07, + "loss": 0.76248062, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3515625, + "step": 12919, + "time_per_iteration": 2.4093081951141357 + }, + { + "auxiliary_loss_clip": 0.01052617, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.01180971, + "balance_loss_mlp": 1.01588702, + "epoch": 0.7767924244701638, + "flos": 22085671386240.0, + "grad_norm": 2.1266588183221917, + "language_loss": 0.80562222, + "learning_rate": 5.001410707243792e-07, + "loss": 0.82652283, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 12920, + "time_per_iteration": 2.3687024116516113 + }, + { + "auxiliary_loss_clip": 0.01053648, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.01328683, + "balance_loss_mlp": 1.01756573, + "epoch": 0.7768525477228319, + "flos": 21980547682560.0, + "grad_norm": 3.9206323607336686, + "language_loss": 0.7127676, + "learning_rate": 4.998834633291829e-07, + "loss": 0.7336638, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 12921, + "time_per_iteration": 2.430094003677368 + }, + { + "auxiliary_loss_clip": 0.01055958, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01268721, + "balance_loss_mlp": 1.01696181, + "epoch": 0.7769126709754998, + "flos": 21793622929920.0, + "grad_norm": 1.8951623079927071, + "language_loss": 0.77146804, + "learning_rate": 4.996259128185547e-07, + "loss": 0.79241604, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 12922, + "time_per_iteration": 2.373117685317993 + }, + { + "auxiliary_loss_clip": 0.01053727, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01300287, + "balance_loss_mlp": 1.01678216, + "epoch": 0.7769727942281678, + "flos": 20046952920960.0, + "grad_norm": 1.6840534402370615, + "language_loss": 0.82196158, + "learning_rate": 4.993684192022625e-07, + "loss": 0.84285533, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 12923, + "time_per_iteration": 2.3952529430389404 + }, + { + "auxiliary_loss_clip": 0.01053184, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.01399207, + "balance_loss_mlp": 1.01694977, + "epoch": 0.7770329174808357, + "flos": 21685776140160.0, + "grad_norm": 1.8730452819331227, + "language_loss": 0.93310988, + "learning_rate": 4.991109824900699e-07, + "loss": 0.95400119, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 12924, + "time_per_iteration": 2.342617988586426 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.01416862, + "balance_loss_mlp": 1.01581335, + "epoch": 0.7770930407335037, + "flos": 25848051408000.0, + "grad_norm": 2.1253082185761896, + "language_loss": 0.67157412, + "learning_rate": 4.988536026917401e-07, + "loss": 0.69248402, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 12925, + "time_per_iteration": 2.421358346939087 + }, + { + "auxiliary_loss_clip": 0.01053701, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.01560104, + "balance_loss_mlp": 1.01760602, + "epoch": 0.7771531639861716, + "flos": 24346856234880.0, + "grad_norm": 1.9325192512002993, + "language_loss": 0.73424125, + "learning_rate": 4.985962798170314e-07, + "loss": 0.75516832, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 12926, + "time_per_iteration": 2.3806920051574707 + }, + { + "auxiliary_loss_clip": 0.01053831, + "auxiliary_loss_mlp": 0.01041031, + "balance_loss_clip": 1.01542532, + "balance_loss_mlp": 1.01630688, + "epoch": 0.7772132872388396, + "flos": 25628761958400.0, + "grad_norm": 1.7355182327894574, + "language_loss": 0.66986299, + "learning_rate": 4.983390138757027e-07, + "loss": 0.69081163, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 12927, + "time_per_iteration": 3.848069190979004 + }, + { + "auxiliary_loss_clip": 0.01053571, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_clip": 1.0141511, + "balance_loss_mlp": 1.01646817, + "epoch": 0.7772734104915076, + "flos": 26066223694080.0, + "grad_norm": 1.9326274239754369, + "language_loss": 0.73113, + "learning_rate": 4.980818048775093e-07, + "loss": 0.75208932, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.37109375, + "step": 12928, + "time_per_iteration": 2.455474615097046 + }, + { + "auxiliary_loss_clip": 0.01050206, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.01832342, + "balance_loss_mlp": 1.01493597, + "epoch": 0.7773335337441756, + "flos": 22924075708800.0, + "grad_norm": 1.5824460832897658, + "language_loss": 0.75124943, + "learning_rate": 4.978246528322036e-07, + "loss": 0.77216512, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 12929, + "time_per_iteration": 2.3935930728912354 + }, + { + "auxiliary_loss_clip": 0.01051711, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.01394033, + "balance_loss_mlp": 1.0163157, + "epoch": 0.7773936569968435, + "flos": 20775729974400.0, + "grad_norm": 1.9584208957755669, + "language_loss": 0.78431565, + "learning_rate": 4.975675577495377e-07, + "loss": 0.80521441, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35351562, + "step": 12930, + "time_per_iteration": 2.3866143226623535 + }, + { + "auxiliary_loss_clip": 0.0105302, + "auxiliary_loss_mlp": 0.01040381, + "balance_loss_clip": 1.01650357, + "balance_loss_mlp": 1.01702523, + "epoch": 0.7774537802495115, + "flos": 20371331162880.0, + "grad_norm": 2.0416206354074995, + "language_loss": 0.80485678, + "learning_rate": 4.973105196392613e-07, + "loss": 0.82579082, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 12931, + "time_per_iteration": 2.401240348815918 + }, + { + "auxiliary_loss_clip": 0.01008899, + "auxiliary_loss_mlp": 0.01003083, + "balance_loss_clip": 1.00085366, + "balance_loss_mlp": 1.00188041, + "epoch": 0.7775139035021794, + "flos": 53908931698560.0, + "grad_norm": 0.8115363137729467, + "language_loss": 0.59847128, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61859107, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.0703125, + "step": 12932, + "time_per_iteration": 2.9641058444976807 + }, + { + "auxiliary_loss_clip": 0.01054412, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.0158484, + "balance_loss_mlp": 1.01710749, + "epoch": 0.7775740267548474, + "flos": 28841155332480.0, + "grad_norm": 1.526981197217689, + "language_loss": 0.77455181, + "learning_rate": 4.967966143748595e-07, + "loss": 0.79550898, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 12933, + "time_per_iteration": 2.4534475803375244 + }, + { + "auxiliary_loss_clip": 0.01053541, + "auxiliary_loss_mlp": 0.01043737, + "balance_loss_clip": 1.01685584, + "balance_loss_mlp": 1.01583254, + "epoch": 0.7776341500075155, + "flos": 21871374261120.0, + "grad_norm": 1.956232730229949, + "language_loss": 0.74476123, + "learning_rate": 4.965397472402215e-07, + "loss": 0.76573396, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37695312, + "step": 12934, + "time_per_iteration": 2.3785595893859863 + }, + { + "auxiliary_loss_clip": 0.01052617, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.01417327, + "balance_loss_mlp": 1.0155735, + "epoch": 0.7776942732601834, + "flos": 20228815526400.0, + "grad_norm": 1.7391452333577067, + "language_loss": 0.71727246, + "learning_rate": 4.962829371169475e-07, + "loss": 0.73819774, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 12935, + "time_per_iteration": 2.3711256980895996 + }, + { + "auxiliary_loss_clip": 0.01052192, + "auxiliary_loss_mlp": 0.01039814, + "balance_loss_clip": 1.01572227, + "balance_loss_mlp": 1.01569521, + "epoch": 0.7777543965128514, + "flos": 22230875197440.0, + "grad_norm": 1.9660806122941459, + "language_loss": 0.84575409, + "learning_rate": 4.960261840147746e-07, + "loss": 0.86667418, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 12936, + "time_per_iteration": 2.3894755840301514 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.01414907, + "balance_loss_mlp": 1.01686907, + "epoch": 0.7778145197655193, + "flos": 14501069539200.0, + "grad_norm": 2.0885410986212536, + "language_loss": 0.69395691, + "learning_rate": 4.957694879434397e-07, + "loss": 0.71486509, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.37304688, + "step": 12937, + "time_per_iteration": 2.368121862411499 + }, + { + "auxiliary_loss_clip": 0.01054773, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.01244724, + "balance_loss_mlp": 1.01728237, + "epoch": 0.7778746430181873, + "flos": 21139280628480.0, + "grad_norm": 1.4273398449966652, + "language_loss": 0.882065, + "learning_rate": 4.955128489126777e-07, + "loss": 0.90298289, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 12938, + "time_per_iteration": 2.369689464569092 + }, + { + "auxiliary_loss_clip": 0.01053278, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.01443791, + "balance_loss_mlp": 1.01622415, + "epoch": 0.7779347662708552, + "flos": 20265334675200.0, + "grad_norm": 2.22685019474679, + "language_loss": 0.86355805, + "learning_rate": 4.95256266932218e-07, + "loss": 0.88447428, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 12939, + "time_per_iteration": 2.388915538787842 + }, + { + "auxiliary_loss_clip": 0.01049379, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.01426697, + "balance_loss_mlp": 1.01508498, + "epoch": 0.7779948895235232, + "flos": 19207990016640.0, + "grad_norm": 2.468166313043386, + "language_loss": 0.70473742, + "learning_rate": 4.949997420117915e-07, + "loss": 0.72559905, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 12940, + "time_per_iteration": 2.3624417781829834 + }, + { + "auxiliary_loss_clip": 0.01052745, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.01026094, + "balance_loss_mlp": 1.01617098, + "epoch": 0.7780550127761912, + "flos": 23913583862400.0, + "grad_norm": 1.5420912309938426, + "language_loss": 0.78676128, + "learning_rate": 4.947432741611255e-07, + "loss": 0.80764204, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 12941, + "time_per_iteration": 2.430081367492676 + }, + { + "auxiliary_loss_clip": 0.01054714, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.01390433, + "balance_loss_mlp": 1.01644063, + "epoch": 0.7781151360288592, + "flos": 32414585742720.0, + "grad_norm": 2.490981068374838, + "language_loss": 0.73807395, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75902927, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3828125, + "step": 12942, + "time_per_iteration": 2.4553780555725098 + }, + { + "auxiliary_loss_clip": 0.01052177, + "auxiliary_loss_mlp": 0.01041254, + "balance_loss_clip": 1.01763916, + "balance_loss_mlp": 1.01602149, + "epoch": 0.7781752592815271, + "flos": 22345285322880.0, + "grad_norm": 4.078136476515427, + "language_loss": 0.69030631, + "learning_rate": 4.942305097079751e-07, + "loss": 0.71124059, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 12943, + "time_per_iteration": 2.4106791019439697 + }, + { + "auxiliary_loss_clip": 0.01007945, + "auxiliary_loss_mlp": 0.01003103, + "balance_loss_clip": 1.00079048, + "balance_loss_mlp": 1.00096869, + "epoch": 0.7782353825341951, + "flos": 70457030472960.0, + "grad_norm": 0.7906399592527179, + "language_loss": 0.58646619, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60657668, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.06982422, + "step": 12944, + "time_per_iteration": 3.1938228607177734 + }, + { + "auxiliary_loss_clip": 0.01053983, + "auxiliary_loss_mlp": 0.0104485, + "balance_loss_clip": 1.01765847, + "balance_loss_mlp": 1.01596546, + "epoch": 0.778295505786863, + "flos": 19061564307840.0, + "grad_norm": 2.387708957732708, + "language_loss": 0.69024652, + "learning_rate": 4.937179736505428e-07, + "loss": 0.71123487, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.37890625, + "step": 12945, + "time_per_iteration": 2.3716681003570557 + }, + { + "auxiliary_loss_clip": 0.01053905, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.01662505, + "balance_loss_mlp": 1.01733637, + "epoch": 0.778355629039531, + "flos": 20998580382720.0, + "grad_norm": 2.038182182658981, + "language_loss": 0.70411515, + "learning_rate": 4.93461791294516e-07, + "loss": 0.72506928, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36523438, + "step": 12946, + "time_per_iteration": 2.4352924823760986 + }, + { + "auxiliary_loss_clip": 0.01052036, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.01328814, + "balance_loss_mlp": 1.01542687, + "epoch": 0.7784157522921991, + "flos": 21397009351680.0, + "grad_norm": 1.794963237878672, + "language_loss": 0.65869302, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67959428, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 12947, + "time_per_iteration": 3.6577954292297363 + }, + { + "auxiliary_loss_clip": 0.01052636, + "auxiliary_loss_mlp": 0.0103966, + "balance_loss_clip": 1.01519799, + "balance_loss_mlp": 1.01603734, + "epoch": 0.778475875544867, + "flos": 20812807704960.0, + "grad_norm": 1.9009834586929546, + "language_loss": 0.66661572, + "learning_rate": 4.929495979764147e-07, + "loss": 0.68753874, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 12948, + "time_per_iteration": 2.37751841545105 + }, + { + "auxiliary_loss_clip": 0.01054268, + "auxiliary_loss_mlp": 0.01042037, + "balance_loss_clip": 1.01480985, + "balance_loss_mlp": 1.01685274, + "epoch": 0.778535998797535, + "flos": 14354504184960.0, + "grad_norm": 1.699825791111126, + "language_loss": 0.75743449, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77839756, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.375, + "step": 12949, + "time_per_iteration": 2.354609727859497 + }, + { + "auxiliary_loss_clip": 0.01055931, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.01539946, + "balance_loss_mlp": 1.01769698, + "epoch": 0.7785961220502029, + "flos": 19208513687040.0, + "grad_norm": 1.6619516410191302, + "language_loss": 0.69546062, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71643984, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 12950, + "time_per_iteration": 2.371218681335449 + }, + { + "auxiliary_loss_clip": 0.01053981, + "auxiliary_loss_mlp": 0.01037432, + "balance_loss_clip": 1.01386452, + "balance_loss_mlp": 1.01672292, + "epoch": 0.7786562453028709, + "flos": 25737586266240.0, + "grad_norm": 1.815805072154146, + "language_loss": 0.72807407, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74898815, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 12951, + "time_per_iteration": 2.3961660861968994 + }, + { + "auxiliary_loss_clip": 0.01052327, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.01489663, + "balance_loss_mlp": 1.01681077, + "epoch": 0.7787163685555388, + "flos": 25738249582080.0, + "grad_norm": 1.798974456971515, + "language_loss": 0.66949618, + "learning_rate": 4.919258971878877e-07, + "loss": 0.69043028, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.35546875, + "step": 12952, + "time_per_iteration": 2.4522953033447266 + }, + { + "auxiliary_loss_clip": 0.0104772, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.01453948, + "balance_loss_mlp": 1.0142833, + "epoch": 0.7787764918082068, + "flos": 22746611934720.0, + "grad_norm": 1.5875385924726528, + "language_loss": 0.82116336, + "learning_rate": 4.916701149323022e-07, + "loss": 0.84200627, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33398438, + "step": 12953, + "time_per_iteration": 3.7386562824249268 + }, + { + "auxiliary_loss_clip": 0.01055239, + "auxiliary_loss_mlp": 0.01041218, + "balance_loss_clip": 1.01641035, + "balance_loss_mlp": 1.01745546, + "epoch": 0.7788366150608748, + "flos": 15190080687360.0, + "grad_norm": 3.256129659782537, + "language_loss": 0.78317726, + "learning_rate": 4.91414389872737e-07, + "loss": 0.80414182, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 12954, + "time_per_iteration": 3.830127239227295 + }, + { + "auxiliary_loss_clip": 0.01053095, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.0124507, + "balance_loss_mlp": 1.01643622, + "epoch": 0.7788967383135428, + "flos": 21209316549120.0, + "grad_norm": 1.5773422814989122, + "language_loss": 0.73576027, + "learning_rate": 4.911587220188905e-07, + "loss": 0.75664741, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36523438, + "step": 12955, + "time_per_iteration": 2.390669584274292 + }, + { + "auxiliary_loss_clip": 0.0105378, + "auxiliary_loss_mlp": 0.01035983, + "balance_loss_clip": 1.01137817, + "balance_loss_mlp": 1.01636863, + "epoch": 0.7789568615662107, + "flos": 21682075536000.0, + "grad_norm": 1.477404264375413, + "language_loss": 0.69020224, + "learning_rate": 4.909031113804551e-07, + "loss": 0.71109986, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 12956, + "time_per_iteration": 2.3824710845947266 + }, + { + "auxiliary_loss_clip": 0.01053766, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.0123024, + "balance_loss_mlp": 1.01749682, + "epoch": 0.7790169848188787, + "flos": 26359144934400.0, + "grad_norm": 1.474200238222114, + "language_loss": 0.77228093, + "learning_rate": 4.906475579671252e-07, + "loss": 0.79316783, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 12957, + "time_per_iteration": 2.455156087875366 + }, + { + "auxiliary_loss_clip": 0.01052599, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.0098207, + "balance_loss_mlp": 1.01675606, + "epoch": 0.7790771080715466, + "flos": 25515119882880.0, + "grad_norm": 1.4829990844731398, + "language_loss": 0.78314668, + "learning_rate": 4.903920617885917e-07, + "loss": 0.80402613, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.359375, + "step": 12958, + "time_per_iteration": 2.4076333045959473 + }, + { + "auxiliary_loss_clip": 0.0105351, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.01308489, + "balance_loss_mlp": 1.01670825, + "epoch": 0.7791372313242146, + "flos": 16033267866240.0, + "grad_norm": 1.8609690398876075, + "language_loss": 0.72703373, + "learning_rate": 4.901366228545418e-07, + "loss": 0.74794519, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 12959, + "time_per_iteration": 2.447665214538574 + }, + { + "auxiliary_loss_clip": 0.01052693, + "auxiliary_loss_mlp": 0.01043243, + "balance_loss_clip": 1.01795852, + "balance_loss_mlp": 1.01620078, + "epoch": 0.7791973545768827, + "flos": 23841069235200.0, + "grad_norm": 1.6081758495031038, + "language_loss": 0.78718901, + "learning_rate": 4.898812411746632e-07, + "loss": 0.80814838, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 12960, + "time_per_iteration": 2.392622947692871 + }, + { + "auxiliary_loss_clip": 0.01054583, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.01460719, + "balance_loss_mlp": 1.01705563, + "epoch": 0.7792574778295506, + "flos": 24167297779200.0, + "grad_norm": 1.8656938016393536, + "language_loss": 0.76292652, + "learning_rate": 4.896259167586385e-07, + "loss": 0.78389299, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.375, + "step": 12961, + "time_per_iteration": 2.4209365844726562 + }, + { + "auxiliary_loss_clip": 0.01051106, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.01360238, + "balance_loss_mlp": 1.01690292, + "epoch": 0.7793176010822186, + "flos": 21464007984000.0, + "grad_norm": 1.6368517716269988, + "language_loss": 0.74833816, + "learning_rate": 4.893706496161511e-07, + "loss": 0.76921582, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34179688, + "step": 12962, + "time_per_iteration": 2.376387596130371 + }, + { + "auxiliary_loss_clip": 0.01052202, + "auxiliary_loss_mlp": 0.01032801, + "balance_loss_clip": 1.01105762, + "balance_loss_mlp": 1.01687801, + "epoch": 0.7793777243348865, + "flos": 20665683768960.0, + "grad_norm": 1.8482307900987534, + "language_loss": 0.71038151, + "learning_rate": 4.891154397568795e-07, + "loss": 0.73123151, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35351562, + "step": 12963, + "time_per_iteration": 2.362128496170044 + }, + { + "auxiliary_loss_clip": 0.0105313, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.01587498, + "balance_loss_mlp": 1.01692557, + "epoch": 0.7794378475875545, + "flos": 27124545870720.0, + "grad_norm": 1.7765380299037952, + "language_loss": 0.6478045, + "learning_rate": 4.888602871905019e-07, + "loss": 0.66874826, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 12964, + "time_per_iteration": 2.472013235092163 + }, + { + "auxiliary_loss_clip": 0.01053426, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.01552522, + "balance_loss_mlp": 1.0160954, + "epoch": 0.7794979708402224, + "flos": 28072891664640.0, + "grad_norm": 1.6241874677285475, + "language_loss": 0.77382791, + "learning_rate": 4.88605191926694e-07, + "loss": 0.79475337, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37304688, + "step": 12965, + "time_per_iteration": 2.433488368988037 + }, + { + "auxiliary_loss_clip": 0.01049298, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.01499248, + "balance_loss_mlp": 1.01591814, + "epoch": 0.7795580940928905, + "flos": 26868353247360.0, + "grad_norm": 1.6171405143071382, + "language_loss": 0.73185658, + "learning_rate": 4.883501539751289e-07, + "loss": 0.7527163, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33398438, + "step": 12966, + "time_per_iteration": 2.4479498863220215 + }, + { + "auxiliary_loss_clip": 0.01050149, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.01360607, + "balance_loss_mlp": 1.01552904, + "epoch": 0.7796182173455584, + "flos": 23834436076800.0, + "grad_norm": 1.4486078474039017, + "language_loss": 0.75028342, + "learning_rate": 4.880951733454768e-07, + "loss": 0.77112663, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34570312, + "step": 12967, + "time_per_iteration": 3.8182566165924072 + }, + { + "auxiliary_loss_clip": 0.01055445, + "auxiliary_loss_mlp": 0.01037497, + "balance_loss_clip": 1.01272476, + "balance_loss_mlp": 1.01722348, + "epoch": 0.7796783405982264, + "flos": 19791214145280.0, + "grad_norm": 2.3678204411972055, + "language_loss": 0.74042755, + "learning_rate": 4.878402500474073e-07, + "loss": 0.76135695, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 12968, + "time_per_iteration": 2.3740506172180176 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01044912, + "balance_loss_clip": 1.01947284, + "balance_loss_mlp": 1.01764297, + "epoch": 0.7797384638508943, + "flos": 15449310599040.0, + "grad_norm": 2.0432609050183843, + "language_loss": 0.62341404, + "learning_rate": 4.875853840905874e-07, + "loss": 0.64440417, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 12969, + "time_per_iteration": 2.3779804706573486 + }, + { + "auxiliary_loss_clip": 0.01050887, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.01487613, + "balance_loss_mlp": 1.01614404, + "epoch": 0.7797985871035623, + "flos": 20921701835520.0, + "grad_norm": 1.7124044813444863, + "language_loss": 0.70941275, + "learning_rate": 4.873305754846811e-07, + "loss": 0.73029304, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 12970, + "time_per_iteration": 2.344909429550171 + }, + { + "auxiliary_loss_clip": 0.01053964, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02113593, + "balance_loss_mlp": 1.01740074, + "epoch": 0.7798587103562302, + "flos": 36935803365120.0, + "grad_norm": 2.1315656591042416, + "language_loss": 0.72758532, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74858999, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 12971, + "time_per_iteration": 2.5021302700042725 + }, + { + "auxiliary_loss_clip": 0.0105545, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.01434326, + "balance_loss_mlp": 1.01717758, + "epoch": 0.7799188336088982, + "flos": 22418183975040.0, + "grad_norm": 1.7142791028704145, + "language_loss": 0.75154686, + "learning_rate": 4.868211303642578e-07, + "loss": 0.77250332, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 12972, + "time_per_iteration": 2.372159242630005 + }, + { + "auxiliary_loss_clip": 0.01053163, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.00985861, + "balance_loss_mlp": 1.01620138, + "epoch": 0.7799789568615663, + "flos": 18879457322880.0, + "grad_norm": 1.958113986023157, + "language_loss": 0.73226428, + "learning_rate": 4.865664938690584e-07, + "loss": 0.75313276, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 12973, + "time_per_iteration": 2.376121759414673 + }, + { + "auxiliary_loss_clip": 0.0105039, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.01345682, + "balance_loss_mlp": 1.01546931, + "epoch": 0.7800390801142342, + "flos": 20261354780160.0, + "grad_norm": 1.9297566212048989, + "language_loss": 0.7890622, + "learning_rate": 4.863119147634089e-07, + "loss": 0.80992728, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 12974, + "time_per_iteration": 2.3408312797546387 + }, + { + "auxiliary_loss_clip": 0.0105153, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.01076019, + "balance_loss_mlp": 1.01589251, + "epoch": 0.7800992033669022, + "flos": 16689390647040.0, + "grad_norm": 1.6567666565964025, + "language_loss": 0.70193344, + "learning_rate": 4.86057393056964e-07, + "loss": 0.72279561, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35546875, + "step": 12975, + "time_per_iteration": 2.3465535640716553 + }, + { + "auxiliary_loss_clip": 0.01051774, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.01679122, + "balance_loss_mlp": 1.01670349, + "epoch": 0.7801593266195701, + "flos": 18584301755520.0, + "grad_norm": 2.0487160554580797, + "language_loss": 0.83767319, + "learning_rate": 4.858029287593739e-07, + "loss": 0.85859233, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3515625, + "step": 12976, + "time_per_iteration": 2.355829954147339 + }, + { + "auxiliary_loss_clip": 0.01055106, + "auxiliary_loss_mlp": 0.010385, + "balance_loss_clip": 1.01108146, + "balance_loss_mlp": 1.01603127, + "epoch": 0.7802194498722381, + "flos": 25483732704000.0, + "grad_norm": 1.6664681951412832, + "language_loss": 0.66384941, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68478549, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.390625, + "step": 12977, + "time_per_iteration": 2.3913071155548096 + }, + { + "auxiliary_loss_clip": 0.01052103, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.01358175, + "balance_loss_mlp": 1.01668048, + "epoch": 0.780279573124906, + "flos": 31174959542400.0, + "grad_norm": 2.2973478430694723, + "language_loss": 0.75258064, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77344465, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.35546875, + "step": 12978, + "time_per_iteration": 2.4588351249694824 + }, + { + "auxiliary_loss_clip": 0.01054493, + "auxiliary_loss_mlp": 0.01041627, + "balance_loss_clip": 1.01463783, + "balance_loss_mlp": 1.01634359, + "epoch": 0.780339696377574, + "flos": 26942787999360.0, + "grad_norm": 1.79945675806762, + "language_loss": 0.62852883, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64949, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38085938, + "step": 12979, + "time_per_iteration": 2.4333090782165527 + }, + { + "auxiliary_loss_clip": 0.01052827, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.01477528, + "balance_loss_mlp": 1.01720965, + "epoch": 0.780399819630242, + "flos": 27956386857600.0, + "grad_norm": 2.0123200799250625, + "language_loss": 0.78312516, + "learning_rate": 4.847856458505217e-07, + "loss": 0.80403316, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 12980, + "time_per_iteration": 2.491596221923828 + }, + { + "auxiliary_loss_clip": 0.01054613, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.01784658, + "balance_loss_mlp": 1.01722956, + "epoch": 0.78045994288291, + "flos": 22485845923200.0, + "grad_norm": 1.8391255266892814, + "language_loss": 0.7817173, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80268532, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 12981, + "time_per_iteration": 2.363982677459717 + }, + { + "auxiliary_loss_clip": 0.01055554, + "auxiliary_loss_mlp": 0.0103877, + "balance_loss_clip": 1.01350975, + "balance_loss_mlp": 1.01853967, + "epoch": 0.7805200661355779, + "flos": 20849780701440.0, + "grad_norm": 1.7791610757518848, + "language_loss": 0.73769152, + "learning_rate": 4.842773491000067e-07, + "loss": 0.75863481, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 12982, + "time_per_iteration": 2.412245988845825 + }, + { + "auxiliary_loss_clip": 0.01052536, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.01105034, + "balance_loss_mlp": 1.01601315, + "epoch": 0.7805801893882459, + "flos": 25664792348160.0, + "grad_norm": 1.3614217809370464, + "language_loss": 0.74254191, + "learning_rate": 4.840232869344636e-07, + "loss": 0.76341271, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36523438, + "step": 12983, + "time_per_iteration": 2.398818016052246 + }, + { + "auxiliary_loss_clip": 0.01052953, + "auxiliary_loss_mlp": 0.0104049, + "balance_loss_clip": 1.01606369, + "balance_loss_mlp": 1.01626945, + "epoch": 0.7806403126409138, + "flos": 11327010704640.0, + "grad_norm": 1.8921076450987002, + "language_loss": 0.75738883, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77832329, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 12984, + "time_per_iteration": 2.3546581268310547 + }, + { + "auxiliary_loss_clip": 0.01050236, + "auxiliary_loss_mlp": 0.01038581, + "balance_loss_clip": 1.015347, + "balance_loss_mlp": 1.01542032, + "epoch": 0.7807004358935818, + "flos": 19572343632000.0, + "grad_norm": 2.147578814122469, + "language_loss": 0.82743549, + "learning_rate": 4.835153350709746e-07, + "loss": 0.8483237, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 12985, + "time_per_iteration": 2.332937240600586 + }, + { + "auxiliary_loss_clip": 0.01050877, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.0185591, + "balance_loss_mlp": 1.01553583, + "epoch": 0.7807605591462499, + "flos": 19134812073600.0, + "grad_norm": 1.7646744407216968, + "language_loss": 0.77927089, + "learning_rate": 4.832614453922915e-07, + "loss": 0.80019319, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 12986, + "time_per_iteration": 2.3722445964813232 + }, + { + "auxiliary_loss_clip": 0.01052894, + "auxiliary_loss_mlp": 0.01041126, + "balance_loss_clip": 1.01697373, + "balance_loss_mlp": 1.01662993, + "epoch": 0.7808206823989178, + "flos": 32373423383040.0, + "grad_norm": 1.7025866819770958, + "language_loss": 0.75962085, + "learning_rate": 4.830076132284859e-07, + "loss": 0.78056103, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 12987, + "time_per_iteration": 3.8618781566619873 + }, + { + "auxiliary_loss_clip": 0.01008477, + "auxiliary_loss_mlp": 0.01002049, + "balance_loss_clip": 0.99991506, + "balance_loss_mlp": 1.00106966, + "epoch": 0.7808808056515858, + "flos": 55046855508480.0, + "grad_norm": 0.7273101898096401, + "language_loss": 0.55157185, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57167709, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.07421875, + "step": 12988, + "time_per_iteration": 3.009838819503784 + }, + { + "auxiliary_loss_clip": 0.0105223, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.01401591, + "balance_loss_mlp": 1.01677454, + "epoch": 0.7809409289042537, + "flos": 12858650449920.0, + "grad_norm": 2.462816083519718, + "language_loss": 0.81373769, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83463037, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 12989, + "time_per_iteration": 2.354731798171997 + }, + { + "auxiliary_loss_clip": 0.01050396, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.01158714, + "balance_loss_mlp": 1.01513982, + "epoch": 0.7810010521569217, + "flos": 21686229987840.0, + "grad_norm": 1.520186259180861, + "language_loss": 0.71930969, + "learning_rate": 4.822464619225806e-07, + "loss": 0.74017674, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3515625, + "step": 12990, + "time_per_iteration": 2.35052490234375 + }, + { + "auxiliary_loss_clip": 0.01052931, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.01297069, + "balance_loss_mlp": 1.01603234, + "epoch": 0.7810611754095896, + "flos": 16756319456640.0, + "grad_norm": 2.0728907077183862, + "language_loss": 0.78788888, + "learning_rate": 4.819928599145184e-07, + "loss": 0.80880368, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37109375, + "step": 12991, + "time_per_iteration": 2.3377022743225098 + }, + { + "auxiliary_loss_clip": 0.01052533, + "auxiliary_loss_mlp": 0.01038214, + "balance_loss_clip": 1.01428843, + "balance_loss_mlp": 1.0162673, + "epoch": 0.7811212986622577, + "flos": 43505793924480.0, + "grad_norm": 1.4981515046829033, + "language_loss": 0.67180073, + "learning_rate": 4.817393154694398e-07, + "loss": 0.69270819, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 12992, + "time_per_iteration": 2.548750162124634 + }, + { + "auxiliary_loss_clip": 0.01053069, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.01424837, + "balance_loss_mlp": 1.01683867, + "epoch": 0.7811814219149256, + "flos": 21756754667520.0, + "grad_norm": 3.457431991218914, + "language_loss": 0.63152206, + "learning_rate": 4.814858285969578e-07, + "loss": 0.65243876, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 12993, + "time_per_iteration": 3.8565962314605713 + }, + { + "auxiliary_loss_clip": 0.01052447, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.01146221, + "balance_loss_mlp": 1.01702154, + "epoch": 0.7812415451675936, + "flos": 24060358684800.0, + "grad_norm": 1.504628641647026, + "language_loss": 0.70003659, + "learning_rate": 4.812323993066862e-07, + "loss": 0.72091854, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 12994, + "time_per_iteration": 3.695828676223755 + }, + { + "auxiliary_loss_clip": 0.01050551, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.01334596, + "balance_loss_mlp": 1.01562238, + "epoch": 0.7813016684202615, + "flos": 18988700567040.0, + "grad_norm": 1.984117456026829, + "language_loss": 0.70793998, + "learning_rate": 4.809790276082335e-07, + "loss": 0.72879791, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 12995, + "time_per_iteration": 2.331050395965576 + }, + { + "auxiliary_loss_clip": 0.01050364, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.01304734, + "balance_loss_mlp": 1.01518536, + "epoch": 0.7813617916729295, + "flos": 25259730220800.0, + "grad_norm": 1.6833966057689018, + "language_loss": 0.76165801, + "learning_rate": 4.807257135112088e-07, + "loss": 0.78250933, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 12996, + "time_per_iteration": 2.393113136291504 + }, + { + "auxiliary_loss_clip": 0.01056003, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.01198316, + "balance_loss_mlp": 1.01783383, + "epoch": 0.7814219149255974, + "flos": 17965117059840.0, + "grad_norm": 3.2422271928248234, + "language_loss": 0.71272677, + "learning_rate": 4.804724570252167e-07, + "loss": 0.73365378, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3828125, + "step": 12997, + "time_per_iteration": 2.3168907165527344 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.01270616, + "balance_loss_mlp": 1.016572, + "epoch": 0.7814820381782654, + "flos": 25774978199040.0, + "grad_norm": 1.7040123812732635, + "language_loss": 0.82643092, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84736037, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 12998, + "time_per_iteration": 2.3963725566864014 + }, + { + "auxiliary_loss_clip": 0.01053375, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.01739478, + "balance_loss_mlp": 1.01626563, + "epoch": 0.7815421614309335, + "flos": 20518594744320.0, + "grad_norm": 1.9075536354174656, + "language_loss": 0.75770849, + "learning_rate": 4.799661169247453e-07, + "loss": 0.77866721, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 12999, + "time_per_iteration": 2.3990745544433594 + }, + { + "auxiliary_loss_clip": 0.01056141, + "auxiliary_loss_mlp": 0.01046254, + "balance_loss_clip": 1.02173269, + "balance_loss_mlp": 1.01805043, + "epoch": 0.7816022846836014, + "flos": 21286614032640.0, + "grad_norm": 1.543685426734998, + "language_loss": 0.85354936, + "learning_rate": 4.797130333294652e-07, + "loss": 0.87457335, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38085938, + "step": 13000, + "time_per_iteration": 2.3662493228912354 + }, + { + "auxiliary_loss_clip": 0.01053074, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.01209497, + "balance_loss_mlp": 1.01608229, + "epoch": 0.7816624079362694, + "flos": 19207396523520.0, + "grad_norm": 2.322699996032797, + "language_loss": 0.67208648, + "learning_rate": 4.794600073836192e-07, + "loss": 0.69297624, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 13001, + "time_per_iteration": 2.3683700561523438 + }, + { + "auxiliary_loss_clip": 0.01053219, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.01699901, + "balance_loss_mlp": 1.01655293, + "epoch": 0.7817225311889373, + "flos": 26103475981440.0, + "grad_norm": 1.5890480476548756, + "language_loss": 0.67368996, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69462353, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 13002, + "time_per_iteration": 2.3884780406951904 + }, + { + "auxiliary_loss_clip": 0.01054266, + "auxiliary_loss_mlp": 0.01041314, + "balance_loss_clip": 1.01488507, + "balance_loss_mlp": 1.01647758, + "epoch": 0.7817826544416053, + "flos": 21249885415680.0, + "grad_norm": 3.7697316708603386, + "language_loss": 0.7498455, + "learning_rate": 4.78954128478607e-07, + "loss": 0.77080137, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 13003, + "time_per_iteration": 2.361257553100586 + }, + { + "auxiliary_loss_clip": 0.01054896, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.01549923, + "balance_loss_mlp": 1.01768208, + "epoch": 0.7818427776942732, + "flos": 19931320897920.0, + "grad_norm": 1.7306638562413317, + "language_loss": 0.62836438, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64931035, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 13004, + "time_per_iteration": 2.3482108116149902 + }, + { + "auxiliary_loss_clip": 0.01047103, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.00999522, + "balance_loss_mlp": 1.01411724, + "epoch": 0.7819029009469413, + "flos": 11362971271680.0, + "grad_norm": 1.7784787287866872, + "language_loss": 0.83469927, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85548246, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33007812, + "step": 13005, + "time_per_iteration": 2.346139669418335 + }, + { + "auxiliary_loss_clip": 0.01050456, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.01671052, + "balance_loss_mlp": 1.01487947, + "epoch": 0.7819630241996092, + "flos": 24278146945920.0, + "grad_norm": 1.888021263342438, + "language_loss": 0.73758239, + "learning_rate": 4.781957427316432e-07, + "loss": 0.75849402, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 13006, + "time_per_iteration": 3.796691417694092 + }, + { + "auxiliary_loss_clip": 0.01055457, + "auxiliary_loss_mlp": 0.01041767, + "balance_loss_clip": 1.01455116, + "balance_loss_mlp": 1.01755035, + "epoch": 0.7820231474522772, + "flos": 22707858458880.0, + "grad_norm": 1.5899160664711856, + "language_loss": 0.72480214, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74577439, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37890625, + "step": 13007, + "time_per_iteration": 2.3757400512695312 + }, + { + "auxiliary_loss_clip": 0.01054272, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.01514864, + "balance_loss_mlp": 1.01667309, + "epoch": 0.7820832707049451, + "flos": 20046394339200.0, + "grad_norm": 2.0757124608080795, + "language_loss": 0.70181072, + "learning_rate": 4.776904407525397e-07, + "loss": 0.72276413, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 13008, + "time_per_iteration": 2.363776206970215 + }, + { + "auxiliary_loss_clip": 0.01052158, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.0122447, + "balance_loss_mlp": 1.01573217, + "epoch": 0.7821433939576131, + "flos": 27161553778560.0, + "grad_norm": 1.8192873198521333, + "language_loss": 0.71012348, + "learning_rate": 4.774378763473954e-07, + "loss": 0.73101294, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 13009, + "time_per_iteration": 2.445284366607666 + }, + { + "auxiliary_loss_clip": 0.01051311, + "auxiliary_loss_mlp": 0.01040356, + "balance_loss_clip": 1.01809955, + "balance_loss_mlp": 1.01533937, + "epoch": 0.782203517210281, + "flos": 22600954275840.0, + "grad_norm": 1.6641998872744002, + "language_loss": 0.8236649, + "learning_rate": 4.771853696779586e-07, + "loss": 0.8445816, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 13010, + "time_per_iteration": 2.3654022216796875 + }, + { + "auxiliary_loss_clip": 0.01049711, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.01613569, + "balance_loss_mlp": 1.01509464, + "epoch": 0.782263640462949, + "flos": 29058524657280.0, + "grad_norm": 2.1978155193701157, + "language_loss": 0.63017571, + "learning_rate": 4.76932920753806e-07, + "loss": 0.65105087, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 13011, + "time_per_iteration": 2.4796652793884277 + }, + { + "auxiliary_loss_clip": 0.01049795, + "auxiliary_loss_mlp": 0.01029549, + "balance_loss_clip": 1.00962949, + "balance_loss_mlp": 1.01524043, + "epoch": 0.782323763715617, + "flos": 25298378962560.0, + "grad_norm": 2.073686044641185, + "language_loss": 0.70683527, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72762865, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.34570312, + "step": 13012, + "time_per_iteration": 2.388181686401367 + }, + { + "auxiliary_loss_clip": 0.01008746, + "auxiliary_loss_mlp": 0.01003265, + "balance_loss_clip": 1.00098777, + "balance_loss_mlp": 1.00137901, + "epoch": 0.782383886968285, + "flos": 65190695414400.0, + "grad_norm": 0.7054228088600454, + "language_loss": 0.55074829, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57086837, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.07373047, + "step": 13013, + "time_per_iteration": 3.1194424629211426 + }, + { + "auxiliary_loss_clip": 0.01054295, + "auxiliary_loss_mlp": 0.01039559, + "balance_loss_clip": 1.01508498, + "balance_loss_mlp": 1.01677489, + "epoch": 0.782444010220953, + "flos": 18404464008960.0, + "grad_norm": 1.7167253766814454, + "language_loss": 0.66672647, + "learning_rate": 4.76175920548765e-07, + "loss": 0.68766505, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 13014, + "time_per_iteration": 2.3541159629821777 + }, + { + "auxiliary_loss_clip": 0.01008337, + "auxiliary_loss_mlp": 0.01002856, + "balance_loss_clip": 1.00061476, + "balance_loss_mlp": 1.00118518, + "epoch": 0.7825041334736209, + "flos": 63951313593600.0, + "grad_norm": 0.7308556053282091, + "language_loss": 0.58571571, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60582763, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.07128906, + "step": 13015, + "time_per_iteration": 3.112224578857422 + }, + { + "auxiliary_loss_clip": 0.01051479, + "auxiliary_loss_mlp": 0.01038316, + "balance_loss_clip": 1.01622701, + "balance_loss_mlp": 1.01608551, + "epoch": 0.7825642567262889, + "flos": 20338338061440.0, + "grad_norm": 1.615309249854176, + "language_loss": 0.75211251, + "learning_rate": 4.756715426472666e-07, + "loss": 0.77301049, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 13016, + "time_per_iteration": 2.358203172683716 + }, + { + "auxiliary_loss_clip": 0.01052659, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.01433647, + "balance_loss_mlp": 1.01598823, + "epoch": 0.7826243799789568, + "flos": 20262018096000.0, + "grad_norm": 1.6321316400525547, + "language_loss": 0.76250046, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.78342295, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 13017, + "time_per_iteration": 2.3476481437683105 + }, + { + "auxiliary_loss_clip": 0.0105387, + "auxiliary_loss_mlp": 0.01041735, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.01582432, + "epoch": 0.7826845032316249, + "flos": 21132123799680.0, + "grad_norm": 2.0122831293630212, + "language_loss": 0.76448011, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.78543615, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.38085938, + "step": 13018, + "time_per_iteration": 2.3517165184020996 + }, + { + "auxiliary_loss_clip": 0.01052468, + "auxiliary_loss_mlp": 0.01037804, + "balance_loss_clip": 1.01544023, + "balance_loss_mlp": 1.01629245, + "epoch": 0.7827446264842928, + "flos": 22491152449920.0, + "grad_norm": 1.4420829899616083, + "language_loss": 0.77757972, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79848254, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36132812, + "step": 13019, + "time_per_iteration": 2.370814800262451 + }, + { + "auxiliary_loss_clip": 0.01049971, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.0108583, + "balance_loss_mlp": 1.01520348, + "epoch": 0.7828047497369608, + "flos": 28839374853120.0, + "grad_norm": 1.5515975900530117, + "language_loss": 0.68316472, + "learning_rate": 4.746634805529852e-07, + "loss": 0.70400429, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 13020, + "time_per_iteration": 2.4722111225128174 + }, + { + "auxiliary_loss_clip": 0.01052839, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.01811314, + "balance_loss_mlp": 1.01674151, + "epoch": 0.7828648729896287, + "flos": 23256588297600.0, + "grad_norm": 3.5672304469740226, + "language_loss": 0.635364, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.65630162, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 13021, + "time_per_iteration": 2.353741407394409 + }, + { + "auxiliary_loss_clip": 0.01050677, + "auxiliary_loss_mlp": 0.01038582, + "balance_loss_clip": 1.01679039, + "balance_loss_mlp": 1.01612568, + "epoch": 0.7829249962422967, + "flos": 25264478165760.0, + "grad_norm": 1.564535045723806, + "language_loss": 0.70229626, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.72318876, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 13022, + "time_per_iteration": 2.41540789604187 + }, + { + "auxiliary_loss_clip": 0.01008292, + "auxiliary_loss_mlp": 0.01009261, + "balance_loss_clip": 1.00709176, + "balance_loss_mlp": 1.00132942, + "epoch": 0.7829851194949646, + "flos": 70718704179840.0, + "grad_norm": 0.6456304402234454, + "language_loss": 0.56220782, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58238328, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.06982422, + "step": 13023, + "time_per_iteration": 3.143916606903076 + }, + { + "auxiliary_loss_clip": 0.0104795, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.01076591, + "balance_loss_mlp": 1.01408887, + "epoch": 0.7830452427476327, + "flos": 25659765112320.0, + "grad_norm": 1.6769362943205355, + "language_loss": 0.67621005, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69699317, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.33789062, + "step": 13024, + "time_per_iteration": 2.4001219272613525 + }, + { + "auxiliary_loss_clip": 0.01053768, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.01081097, + "balance_loss_mlp": 1.0163635, + "epoch": 0.7831053660003006, + "flos": 22783200906240.0, + "grad_norm": 1.5933757518534784, + "language_loss": 0.78760844, + "learning_rate": 4.734047044272498e-07, + "loss": 0.80850172, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 13025, + "time_per_iteration": 2.3863184452056885 + }, + { + "auxiliary_loss_clip": 0.01052777, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.01543546, + "balance_loss_mlp": 1.01687622, + "epoch": 0.7831654892529686, + "flos": 25811078411520.0, + "grad_norm": 1.737249421752044, + "language_loss": 0.79358274, + "learning_rate": 4.731531228298673e-07, + "loss": 0.81448299, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 13026, + "time_per_iteration": 3.6791322231292725 + }, + { + "auxiliary_loss_clip": 0.01051039, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.00929463, + "balance_loss_mlp": 1.0168345, + "epoch": 0.7832256125056366, + "flos": 20770667827200.0, + "grad_norm": 1.9919547802765962, + "language_loss": 0.7596162, + "learning_rate": 4.729015991306715e-07, + "loss": 0.78043985, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34179688, + "step": 13027, + "time_per_iteration": 2.3278372287750244 + }, + { + "auxiliary_loss_clip": 0.01052414, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.01553631, + "balance_loss_mlp": 1.01714361, + "epoch": 0.7832857357583045, + "flos": 21505484545920.0, + "grad_norm": 1.8131012619816327, + "language_loss": 0.71487939, + "learning_rate": 4.726501333391997e-07, + "loss": 0.7357887, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 13028, + "time_per_iteration": 2.3874857425689697 + }, + { + "auxiliary_loss_clip": 0.01053215, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.01534581, + "balance_loss_mlp": 1.01655936, + "epoch": 0.7833458590109725, + "flos": 18076804099200.0, + "grad_norm": 1.921685083288741, + "language_loss": 0.69525027, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71619171, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 13029, + "time_per_iteration": 2.3223965167999268 + }, + { + "auxiliary_loss_clip": 0.01053329, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.01913249, + "balance_loss_mlp": 1.01606917, + "epoch": 0.7834059822636404, + "flos": 28287607726080.0, + "grad_norm": 2.709339855277154, + "language_loss": 0.82230043, + "learning_rate": 4.721473755175698e-07, + "loss": 0.84328353, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 13030, + "time_per_iteration": 2.5288212299346924 + }, + { + "auxiliary_loss_clip": 0.01053832, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.01063585, + "balance_loss_mlp": 1.01645947, + "epoch": 0.7834661055163085, + "flos": 31684866082560.0, + "grad_norm": 1.990914456147986, + "language_loss": 0.7228893, + "learning_rate": 4.71896083506476e-07, + "loss": 0.74375927, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.375, + "step": 13031, + "time_per_iteration": 2.450366735458374 + }, + { + "auxiliary_loss_clip": 0.01051695, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.01443195, + "balance_loss_mlp": 1.01553297, + "epoch": 0.7835262287689764, + "flos": 12932352063360.0, + "grad_norm": 1.7303883207339177, + "language_loss": 0.80208516, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.82298797, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 13032, + "time_per_iteration": 3.765632390975952 + }, + { + "auxiliary_loss_clip": 0.01055537, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.01751423, + "balance_loss_mlp": 1.01761103, + "epoch": 0.7835863520216444, + "flos": 16142301642240.0, + "grad_norm": 2.151569053410961, + "language_loss": 0.64346468, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.66442883, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37890625, + "step": 13033, + "time_per_iteration": 3.8196401596069336 + }, + { + "auxiliary_loss_clip": 0.01052522, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.01359153, + "balance_loss_mlp": 1.0165, + "epoch": 0.7836464752743123, + "flos": 11509117689600.0, + "grad_norm": 1.6986639487001598, + "language_loss": 0.7272895, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74820882, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.359375, + "step": 13034, + "time_per_iteration": 2.342808723449707 + }, + { + "auxiliary_loss_clip": 0.01054845, + "auxiliary_loss_mlp": 0.0104546, + "balance_loss_clip": 1.02046239, + "balance_loss_mlp": 1.0180459, + "epoch": 0.7837065985269803, + "flos": 18222706137600.0, + "grad_norm": 1.7841344524585203, + "language_loss": 0.73579574, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.75679886, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 13035, + "time_per_iteration": 2.3902299404144287 + }, + { + "auxiliary_loss_clip": 0.01053535, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.01576948, + "balance_loss_mlp": 1.01701558, + "epoch": 0.7837667217796482, + "flos": 24753244993920.0, + "grad_norm": 1.9708394179175979, + "language_loss": 0.67412829, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.69508541, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36523438, + "step": 13036, + "time_per_iteration": 2.420297384262085 + }, + { + "auxiliary_loss_clip": 0.01056107, + "auxiliary_loss_mlp": 0.01040986, + "balance_loss_clip": 1.01440227, + "balance_loss_mlp": 1.01752913, + "epoch": 0.7838268450323163, + "flos": 22382013939840.0, + "grad_norm": 3.7765432074586878, + "language_loss": 0.73163444, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75260538, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38476562, + "step": 13037, + "time_per_iteration": 2.381239652633667 + }, + { + "auxiliary_loss_clip": 0.01051304, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.0171783, + "balance_loss_mlp": 1.016078, + "epoch": 0.7838869682849842, + "flos": 19499270423040.0, + "grad_norm": 2.681516541368443, + "language_loss": 0.62095875, + "learning_rate": 4.701386624460717e-07, + "loss": 0.64186901, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 13038, + "time_per_iteration": 2.35034441947937 + }, + { + "auxiliary_loss_clip": 0.01050776, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.0146991, + "balance_loss_mlp": 1.01576757, + "epoch": 0.7839470915376522, + "flos": 32891394447360.0, + "grad_norm": 1.4974727359841413, + "language_loss": 0.69120222, + "learning_rate": 4.698878342684349e-07, + "loss": 0.71208143, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13039, + "time_per_iteration": 2.504504442214966 + }, + { + "auxiliary_loss_clip": 0.01049326, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.01015782, + "balance_loss_mlp": 1.01459658, + "epoch": 0.7840072147903202, + "flos": 29674811710080.0, + "grad_norm": 2.214159387984537, + "language_loss": 0.70378006, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.72458375, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 13040, + "time_per_iteration": 2.41644287109375 + }, + { + "auxiliary_loss_clip": 0.01052548, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.01844072, + "balance_loss_mlp": 1.01603103, + "epoch": 0.7840673380429881, + "flos": 18185768052480.0, + "grad_norm": 1.5040597121049684, + "language_loss": 0.69166911, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.71260631, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36523438, + "step": 13041, + "time_per_iteration": 2.40739369392395 + }, + { + "auxiliary_loss_clip": 0.0100828, + "auxiliary_loss_mlp": 0.01002302, + "balance_loss_clip": 1.00010848, + "balance_loss_mlp": 1.00103045, + "epoch": 0.7841274612956561, + "flos": 66342725280000.0, + "grad_norm": 0.806008262590338, + "language_loss": 0.57505232, + "learning_rate": 4.691356979055998e-07, + "loss": 0.5951581, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.07226562, + "step": 13042, + "time_per_iteration": 2.9753613471984863 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01035214, + "balance_loss_clip": 1.01157475, + "balance_loss_mlp": 1.01658368, + "epoch": 0.784187584548324, + "flos": 26647353141120.0, + "grad_norm": 2.438606993246624, + "language_loss": 0.85278559, + "learning_rate": 4.688851018730369e-07, + "loss": 0.87366199, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 13043, + "time_per_iteration": 2.4851784706115723 + }, + { + "auxiliary_loss_clip": 0.01050392, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.01533067, + "balance_loss_mlp": 1.01557791, + "epoch": 0.7842477078009921, + "flos": 25738947809280.0, + "grad_norm": 1.401136453681194, + "language_loss": 0.88979518, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.91066563, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34765625, + "step": 13044, + "time_per_iteration": 2.404571294784546 + }, + { + "auxiliary_loss_clip": 0.01055037, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.01824474, + "balance_loss_mlp": 1.01710296, + "epoch": 0.78430783105366, + "flos": 21979884366720.0, + "grad_norm": 1.7122297087846543, + "language_loss": 0.80347717, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.82446373, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 13045, + "time_per_iteration": 2.4352242946624756 + }, + { + "auxiliary_loss_clip": 0.01050653, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.0114975, + "balance_loss_mlp": 1.01561081, + "epoch": 0.784367954306328, + "flos": 23841139057920.0, + "grad_norm": 1.5395511279233067, + "language_loss": 0.73390871, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.75474596, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34960938, + "step": 13046, + "time_per_iteration": 3.941368341445923 + }, + { + "auxiliary_loss_clip": 0.01051464, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.01492178, + "balance_loss_mlp": 1.01670456, + "epoch": 0.7844280775589959, + "flos": 24825515241600.0, + "grad_norm": 1.6142623351116727, + "language_loss": 0.63719612, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65809834, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34765625, + "step": 13047, + "time_per_iteration": 2.463311195373535 + }, + { + "auxiliary_loss_clip": 0.01052122, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.01160312, + "balance_loss_mlp": 1.01638985, + "epoch": 0.7844882008116639, + "flos": 22454563478400.0, + "grad_norm": 1.5167197139199908, + "language_loss": 0.73745096, + "learning_rate": 4.676329928006515e-07, + "loss": 0.7583077, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35742188, + "step": 13048, + "time_per_iteration": 2.441051721572876 + }, + { + "auxiliary_loss_clip": 0.01053693, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.01282883, + "balance_loss_mlp": 1.01766062, + "epoch": 0.7845483240643318, + "flos": 26102847576960.0, + "grad_norm": 1.9682780060945169, + "language_loss": 0.75212604, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.77304828, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36132812, + "step": 13049, + "time_per_iteration": 2.431675672531128 + }, + { + "auxiliary_loss_clip": 0.01053972, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.01483107, + "balance_loss_mlp": 1.01597083, + "epoch": 0.7846084473169999, + "flos": 19353298561920.0, + "grad_norm": 1.7080967357273877, + "language_loss": 0.73565543, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.7566148, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.38085938, + "step": 13050, + "time_per_iteration": 2.362901449203491 + }, + { + "auxiliary_loss_clip": 0.01051462, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.02072561, + "balance_loss_mlp": 1.01639581, + "epoch": 0.7846685705696678, + "flos": 23324843738880.0, + "grad_norm": 2.0862481426322366, + "language_loss": 0.75144833, + "learning_rate": 4.668824245713825e-07, + "loss": 0.77238655, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34960938, + "step": 13051, + "time_per_iteration": 2.401057243347168 + }, + { + "auxiliary_loss_clip": 0.01053745, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.01226163, + "balance_loss_mlp": 1.01703787, + "epoch": 0.7847286938223358, + "flos": 35808073672320.0, + "grad_norm": 2.882832962588486, + "language_loss": 0.73877299, + "learning_rate": 4.666323514209227e-07, + "loss": 0.75970101, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3671875, + "step": 13052, + "time_per_iteration": 2.4924302101135254 + }, + { + "auxiliary_loss_clip": 0.01048885, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.0149653, + "balance_loss_mlp": 1.0150131, + "epoch": 0.7847888170750038, + "flos": 18477188104320.0, + "grad_norm": 1.8583694523886467, + "language_loss": 0.70631516, + "learning_rate": 4.663823364159183e-07, + "loss": 0.72716719, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33984375, + "step": 13053, + "time_per_iteration": 2.485466241836548 + }, + { + "auxiliary_loss_clip": 0.0105021, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.00992513, + "balance_loss_mlp": 1.01619518, + "epoch": 0.7848489403276717, + "flos": 25117982634240.0, + "grad_norm": 2.0774545171421432, + "language_loss": 0.70989227, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.73070961, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 13054, + "time_per_iteration": 2.4205853939056396 + }, + { + "auxiliary_loss_clip": 0.01054518, + "auxiliary_loss_mlp": 0.01041165, + "balance_loss_clip": 1.01613069, + "balance_loss_mlp": 1.01658106, + "epoch": 0.7849090635803397, + "flos": 26501311457280.0, + "grad_norm": 1.5500339872307878, + "language_loss": 0.76782793, + "learning_rate": 4.658824808801938e-07, + "loss": 0.78878474, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 13055, + "time_per_iteration": 2.450080633163452 + }, + { + "auxiliary_loss_clip": 0.01056146, + "auxiliary_loss_mlp": 0.01042025, + "balance_loss_clip": 1.01546526, + "balance_loss_mlp": 1.01693654, + "epoch": 0.7849691868330076, + "flos": 20958605009280.0, + "grad_norm": 1.9502113610033256, + "language_loss": 0.7578243, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77880597, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 13056, + "time_per_iteration": 2.3737223148345947 + }, + { + "auxiliary_loss_clip": 0.01051977, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.01063669, + "balance_loss_mlp": 1.01622415, + "epoch": 0.7850293100856757, + "flos": 26066293516800.0, + "grad_norm": 1.642859046430362, + "language_loss": 0.71396005, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.73482519, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 13057, + "time_per_iteration": 2.432526111602783 + }, + { + "auxiliary_loss_clip": 0.01050699, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.00913465, + "balance_loss_mlp": 1.01560473, + "epoch": 0.7850894333383436, + "flos": 22490803336320.0, + "grad_norm": 1.7718411885756047, + "language_loss": 0.7796911, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.80051422, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 13058, + "time_per_iteration": 2.3655998706817627 + }, + { + "auxiliary_loss_clip": 0.01053516, + "auxiliary_loss_mlp": 0.01043427, + "balance_loss_clip": 1.01891708, + "balance_loss_mlp": 1.01745391, + "epoch": 0.7851495565910116, + "flos": 20557592599680.0, + "grad_norm": 2.0340615255799634, + "language_loss": 0.71981347, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.74078292, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36132812, + "step": 13059, + "time_per_iteration": 2.421283721923828 + }, + { + "auxiliary_loss_clip": 0.01055414, + "auxiliary_loss_mlp": 0.01042103, + "balance_loss_clip": 1.01591253, + "balance_loss_mlp": 1.01746058, + "epoch": 0.7852096798436795, + "flos": 15923919888000.0, + "grad_norm": 1.8584801608138541, + "language_loss": 0.77628922, + "learning_rate": 4.646338602497144e-07, + "loss": 0.7972644, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 13060, + "time_per_iteration": 2.3508081436157227 + }, + { + "auxiliary_loss_clip": 0.01054091, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.01391172, + "balance_loss_mlp": 1.01754785, + "epoch": 0.7852698030963475, + "flos": 19061285016960.0, + "grad_norm": 2.5356908410048664, + "language_loss": 0.77512443, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79605758, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 13061, + "time_per_iteration": 2.350389003753662 + }, + { + "auxiliary_loss_clip": 0.01051785, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.01632524, + "balance_loss_mlp": 1.01584339, + "epoch": 0.7853299263490154, + "flos": 24643233699840.0, + "grad_norm": 1.893948364582214, + "language_loss": 0.75009966, + "learning_rate": 4.641348194799164e-07, + "loss": 0.77102149, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 13062, + "time_per_iteration": 2.380682945251465 + }, + { + "auxiliary_loss_clip": 0.01049909, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.01372766, + "balance_loss_mlp": 1.01567841, + "epoch": 0.7853900496016835, + "flos": 22016892274560.0, + "grad_norm": 1.5440720420859588, + "language_loss": 0.70105278, + "learning_rate": 4.638853864505297e-07, + "loss": 0.72189879, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 13063, + "time_per_iteration": 2.388108968734741 + }, + { + "auxiliary_loss_clip": 0.01052838, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.02008963, + "balance_loss_mlp": 1.01771057, + "epoch": 0.7854501728543514, + "flos": 30226090078080.0, + "grad_norm": 1.864747014705704, + "language_loss": 0.74627858, + "learning_rate": 4.636360116707625e-07, + "loss": 0.76723683, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 13064, + "time_per_iteration": 2.4215452671051025 + }, + { + "auxiliary_loss_clip": 0.01054541, + "auxiliary_loss_mlp": 0.01038582, + "balance_loss_clip": 1.0153842, + "balance_loss_mlp": 1.01733482, + "epoch": 0.7855102961070194, + "flos": 18842693794560.0, + "grad_norm": 1.5710335813462553, + "language_loss": 0.68899775, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70992899, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 13065, + "time_per_iteration": 3.5785601139068604 + }, + { + "auxiliary_loss_clip": 0.01052898, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.01968169, + "balance_loss_mlp": 1.01690495, + "epoch": 0.7855704193596874, + "flos": 22308870908160.0, + "grad_norm": 1.7667515823612099, + "language_loss": 0.77220309, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.79316175, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 13066, + "time_per_iteration": 2.36228084564209 + }, + { + "auxiliary_loss_clip": 0.01008124, + "auxiliary_loss_mlp": 0.01003202, + "balance_loss_clip": 1.00081742, + "balance_loss_mlp": 1.00112677, + "epoch": 0.7856305426123553, + "flos": 60001136035200.0, + "grad_norm": 0.8813964842510358, + "language_loss": 0.5343895, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55450279, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.0703125, + "step": 13067, + "time_per_iteration": 3.0697269439697266 + }, + { + "auxiliary_loss_clip": 0.01051177, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.02020812, + "balance_loss_mlp": 1.01545763, + "epoch": 0.7856906658650233, + "flos": 21867603834240.0, + "grad_norm": 1.4626506944138644, + "language_loss": 0.6804359, + "learning_rate": 4.62639095236989e-07, + "loss": 0.70138502, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35742188, + "step": 13068, + "time_per_iteration": 2.3634679317474365 + }, + { + "auxiliary_loss_clip": 0.01051294, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.01173544, + "balance_loss_mlp": 1.01660693, + "epoch": 0.7857507891176913, + "flos": 23621814696960.0, + "grad_norm": 1.7463223755337627, + "language_loss": 0.6906597, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.71150815, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34765625, + "step": 13069, + "time_per_iteration": 2.388317346572876 + }, + { + "auxiliary_loss_clip": 0.01052699, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.01230288, + "balance_loss_mlp": 1.01705956, + "epoch": 0.7858109123703593, + "flos": 25518890309760.0, + "grad_norm": 1.6791370314961107, + "language_loss": 0.77150494, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.7923882, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 13070, + "time_per_iteration": 2.383430004119873 + }, + { + "auxiliary_loss_clip": 0.0105129, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.01245999, + "balance_loss_mlp": 1.01617754, + "epoch": 0.7858710356230272, + "flos": 17456432417280.0, + "grad_norm": 1.639937926779383, + "language_loss": 0.6651777, + "learning_rate": 4.618920199958083e-07, + "loss": 0.68603623, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13071, + "time_per_iteration": 2.3524727821350098 + }, + { + "auxiliary_loss_clip": 0.01052099, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.01422644, + "balance_loss_mlp": 1.01630378, + "epoch": 0.7859311588756952, + "flos": 24678565862400.0, + "grad_norm": 1.5455397300355838, + "language_loss": 0.75047386, + "learning_rate": 4.616431115532442e-07, + "loss": 0.77137184, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35742188, + "step": 13072, + "time_per_iteration": 3.8536064624786377 + }, + { + "auxiliary_loss_clip": 0.01052764, + "auxiliary_loss_mlp": 0.01038558, + "balance_loss_clip": 1.0122962, + "balance_loss_mlp": 1.01658773, + "epoch": 0.7859912821283631, + "flos": 21798056672640.0, + "grad_norm": 1.7122432014676567, + "language_loss": 0.72359234, + "learning_rate": 4.613942614453268e-07, + "loss": 0.74450552, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 13073, + "time_per_iteration": 3.6731374263763428 + }, + { + "auxiliary_loss_clip": 0.0105179, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.01246011, + "balance_loss_mlp": 1.01631045, + "epoch": 0.7860514053810311, + "flos": 20846324476800.0, + "grad_norm": 1.6080883264708346, + "language_loss": 0.77651227, + "learning_rate": 4.611454696814938e-07, + "loss": 0.79739398, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35546875, + "step": 13074, + "time_per_iteration": 2.369936227798462 + }, + { + "auxiliary_loss_clip": 0.01049639, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.01168501, + "balance_loss_mlp": 1.01566434, + "epoch": 0.786111528633699, + "flos": 24314561360640.0, + "grad_norm": 1.8461845892796624, + "language_loss": 0.76040113, + "learning_rate": 4.608967362711782e-07, + "loss": 0.78122747, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 13075, + "time_per_iteration": 2.3769848346710205 + }, + { + "auxiliary_loss_clip": 0.01051077, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.01200175, + "balance_loss_mlp": 1.01603723, + "epoch": 0.7861716518863671, + "flos": 24352023116160.0, + "grad_norm": 1.7402161765821813, + "language_loss": 0.70226657, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.72310752, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34960938, + "step": 13076, + "time_per_iteration": 2.408473491668701 + }, + { + "auxiliary_loss_clip": 0.01050048, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.01474595, + "balance_loss_mlp": 1.01575065, + "epoch": 0.786231775139035, + "flos": 14021677393920.0, + "grad_norm": 1.967780386936859, + "language_loss": 0.8166151, + "learning_rate": 4.603994445488282e-07, + "loss": 0.83750069, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34375, + "step": 13077, + "time_per_iteration": 2.347256660461426 + }, + { + "auxiliary_loss_clip": 0.01051062, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.01294422, + "balance_loss_mlp": 1.01581311, + "epoch": 0.786291898391703, + "flos": 33722991054720.0, + "grad_norm": 1.682118932510479, + "language_loss": 0.71205139, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73291117, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 13078, + "time_per_iteration": 2.5068202018737793 + }, + { + "auxiliary_loss_clip": 0.0105126, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.01466119, + "balance_loss_mlp": 1.01640201, + "epoch": 0.786352021644371, + "flos": 25810310361600.0, + "grad_norm": 1.5264657681525122, + "language_loss": 0.82308853, + "learning_rate": 4.599023863537039e-07, + "loss": 0.84397554, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 13079, + "time_per_iteration": 2.382204294204712 + }, + { + "auxiliary_loss_clip": 0.01049543, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.01411879, + "balance_loss_mlp": 1.01614285, + "epoch": 0.7864121448970389, + "flos": 28909620241920.0, + "grad_norm": 1.5966149409264057, + "language_loss": 0.69008911, + "learning_rate": 4.596539448524146e-07, + "loss": 0.71094042, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33398438, + "step": 13080, + "time_per_iteration": 2.4558568000793457 + }, + { + "auxiliary_loss_clip": 0.01052192, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.01321507, + "balance_loss_mlp": 1.01587784, + "epoch": 0.7864722681497069, + "flos": 19207815459840.0, + "grad_norm": 1.6437292341142, + "language_loss": 0.71028119, + "learning_rate": 4.594055617612016e-07, + "loss": 0.73116481, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 13081, + "time_per_iteration": 2.4095816612243652 + }, + { + "auxiliary_loss_clip": 0.01050751, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.01548982, + "balance_loss_mlp": 1.01567686, + "epoch": 0.7865323914023749, + "flos": 21870501477120.0, + "grad_norm": 1.4814050838637678, + "language_loss": 0.69421005, + "learning_rate": 4.591572370894838e-07, + "loss": 0.71509361, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13082, + "time_per_iteration": 2.4049739837646484 + }, + { + "auxiliary_loss_clip": 0.01052916, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.01578879, + "balance_loss_mlp": 1.01679003, + "epoch": 0.7865925146550429, + "flos": 25519134689280.0, + "grad_norm": 1.8120468943595218, + "language_loss": 0.67559838, + "learning_rate": 4.589089708466789e-07, + "loss": 0.69650447, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 13083, + "time_per_iteration": 2.396791934967041 + }, + { + "auxiliary_loss_clip": 0.01054612, + "auxiliary_loss_mlp": 0.01039331, + "balance_loss_clip": 1.01452363, + "balance_loss_mlp": 1.01739001, + "epoch": 0.7866526379077108, + "flos": 19096407711360.0, + "grad_norm": 2.256877418901593, + "language_loss": 0.76725686, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.78819633, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 13084, + "time_per_iteration": 2.3946433067321777 + }, + { + "auxiliary_loss_clip": 0.01051233, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.01557124, + "balance_loss_mlp": 1.01687729, + "epoch": 0.7867127611603788, + "flos": 16173025505280.0, + "grad_norm": 1.8368751499841338, + "language_loss": 0.71262664, + "learning_rate": 4.584126136854591e-07, + "loss": 0.73350692, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 13085, + "time_per_iteration": 3.7150564193725586 + }, + { + "auxiliary_loss_clip": 0.0105199, + "auxiliary_loss_mlp": 0.01036232, + "balance_loss_clip": 1.0124017, + "balance_loss_mlp": 1.01566529, + "epoch": 0.7867728844130467, + "flos": 20772692686080.0, + "grad_norm": 1.9572648997454458, + "language_loss": 0.73083544, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.75171769, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 13086, + "time_per_iteration": 2.3497791290283203 + }, + { + "auxiliary_loss_clip": 0.01050966, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.01330495, + "balance_loss_mlp": 1.0162704, + "epoch": 0.7868330076657147, + "flos": 21759093728640.0, + "grad_norm": 1.7871805258435856, + "language_loss": 0.75335598, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77420354, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34765625, + "step": 13087, + "time_per_iteration": 2.35309100151062 + }, + { + "auxiliary_loss_clip": 0.01051605, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.01533115, + "balance_loss_mlp": 1.01623464, + "epoch": 0.7868931309183826, + "flos": 25699565928960.0, + "grad_norm": 1.5893351560019862, + "language_loss": 0.71819979, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73908931, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 13088, + "time_per_iteration": 2.3978638648986816 + }, + { + "auxiliary_loss_clip": 0.01007937, + "auxiliary_loss_mlp": 0.01001784, + "balance_loss_clip": 0.99976945, + "balance_loss_mlp": 1.00116611, + "epoch": 0.7869532541710507, + "flos": 64641267348480.0, + "grad_norm": 0.6733849772074044, + "language_loss": 0.55560607, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57570332, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.06787109, + "step": 13089, + "time_per_iteration": 3.0756068229675293 + }, + { + "auxiliary_loss_clip": 0.01008047, + "auxiliary_loss_mlp": 0.0100223, + "balance_loss_clip": 0.99998885, + "balance_loss_mlp": 1.00120878, + "epoch": 0.7870133774237186, + "flos": 67449925226880.0, + "grad_norm": 0.7326027158982577, + "language_loss": 0.50175047, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52185327, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06835938, + "step": 13090, + "time_per_iteration": 3.1257426738739014 + }, + { + "auxiliary_loss_clip": 0.01049743, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.01427865, + "balance_loss_mlp": 1.01571727, + "epoch": 0.7870735006763866, + "flos": 26067096478080.0, + "grad_norm": 1.7013846994246846, + "language_loss": 0.84805304, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.86890566, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33984375, + "step": 13091, + "time_per_iteration": 2.420112133026123 + }, + { + "auxiliary_loss_clip": 0.01007986, + "auxiliary_loss_mlp": 0.01002369, + "balance_loss_clip": 1.00004411, + "balance_loss_mlp": 1.0012598, + "epoch": 0.7871336239290546, + "flos": 70286095123200.0, + "grad_norm": 0.7179978777224387, + "language_loss": 0.63997459, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66007817, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.06738281, + "step": 13092, + "time_per_iteration": 3.0552520751953125 + }, + { + "auxiliary_loss_clip": 0.01053069, + "auxiliary_loss_mlp": 0.01043049, + "balance_loss_clip": 1.0189929, + "balance_loss_mlp": 1.01686287, + "epoch": 0.7871937471817225, + "flos": 15777668736000.0, + "grad_norm": 1.9819361630003371, + "language_loss": 0.80222678, + "learning_rate": 4.564295240788285e-07, + "loss": 0.82318795, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 13093, + "time_per_iteration": 2.3523361682891846 + }, + { + "auxiliary_loss_clip": 0.01050786, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.01073861, + "balance_loss_mlp": 1.01637375, + "epoch": 0.7872538704343905, + "flos": 20484205188480.0, + "grad_norm": 2.0353476908739125, + "language_loss": 0.77170861, + "learning_rate": 4.561819011749106e-07, + "loss": 0.79254007, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 13094, + "time_per_iteration": 2.3489818572998047 + }, + { + "auxiliary_loss_clip": 0.01053139, + "auxiliary_loss_mlp": 0.01036671, + "balance_loss_clip": 1.01458097, + "balance_loss_mlp": 1.01714551, + "epoch": 0.7873139936870585, + "flos": 25081498396800.0, + "grad_norm": 1.7904680846948264, + "language_loss": 0.80082059, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.82171869, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 13095, + "time_per_iteration": 2.4295496940612793 + }, + { + "auxiliary_loss_clip": 0.01053859, + "auxiliary_loss_mlp": 0.01037782, + "balance_loss_clip": 1.01434588, + "balance_loss_mlp": 1.01679885, + "epoch": 0.7873741169397265, + "flos": 30881863745280.0, + "grad_norm": 1.9429240520599127, + "language_loss": 0.69096887, + "learning_rate": 4.556868310016715e-07, + "loss": 0.71188533, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 13096, + "time_per_iteration": 2.439267158508301 + }, + { + "auxiliary_loss_clip": 0.0104867, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.0107131, + "balance_loss_mlp": 1.01505208, + "epoch": 0.7874342401923944, + "flos": 46790178255360.0, + "grad_norm": 1.9710723788122153, + "language_loss": 0.70784414, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72863746, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.3359375, + "step": 13097, + "time_per_iteration": 2.6201696395874023 + }, + { + "auxiliary_loss_clip": 0.01052939, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.01435137, + "balance_loss_mlp": 1.01712251, + "epoch": 0.7874943634450624, + "flos": 23583480157440.0, + "grad_norm": 1.687857701535057, + "language_loss": 0.82022941, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.84113336, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 13098, + "time_per_iteration": 2.383824586868286 + }, + { + "auxiliary_loss_clip": 0.01050305, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.01130509, + "balance_loss_mlp": 1.01599538, + "epoch": 0.7875544866977303, + "flos": 20190201696000.0, + "grad_norm": 2.4211965624443605, + "language_loss": 0.7542699, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.77510166, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 13099, + "time_per_iteration": 2.421232223510742 + }, + { + "auxiliary_loss_clip": 0.01051914, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.01242995, + "balance_loss_mlp": 1.016325, + "epoch": 0.7876146099503983, + "flos": 22601443034880.0, + "grad_norm": 1.5558773667089265, + "language_loss": 0.78647089, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80735874, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 13100, + "time_per_iteration": 2.3794920444488525 + }, + { + "auxiliary_loss_clip": 0.01056916, + "auxiliary_loss_mlp": 0.01035098, + "balance_loss_clip": 1.0096823, + "balance_loss_mlp": 1.01769781, + "epoch": 0.7876747332030662, + "flos": 10705102922880.0, + "grad_norm": 2.2445722479293457, + "language_loss": 0.67953789, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.70045799, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.390625, + "step": 13101, + "time_per_iteration": 2.3516077995300293 + }, + { + "auxiliary_loss_clip": 0.01050548, + "auxiliary_loss_mlp": 0.0104208, + "balance_loss_clip": 1.01979971, + "balance_loss_mlp": 1.015396, + "epoch": 0.7877348564557343, + "flos": 38397791214720.0, + "grad_norm": 1.4086819059027418, + "language_loss": 0.78668201, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.80760825, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 13102, + "time_per_iteration": 2.527534246444702 + }, + { + "auxiliary_loss_clip": 0.01052072, + "auxiliary_loss_mlp": 0.01036087, + "balance_loss_clip": 1.01473689, + "balance_loss_mlp": 1.01659536, + "epoch": 0.7877949797084022, + "flos": 18328632802560.0, + "grad_norm": 1.926526288258514, + "language_loss": 0.8301053, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.8509869, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35546875, + "step": 13103, + "time_per_iteration": 2.3296546936035156 + }, + { + "auxiliary_loss_clip": 0.01054394, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.01668227, + "balance_loss_mlp": 1.01791251, + "epoch": 0.7878551029610702, + "flos": 25805702062080.0, + "grad_norm": 2.1766963061682394, + "language_loss": 0.8184312, + "learning_rate": 4.537088934794913e-07, + "loss": 0.83937645, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36523438, + "step": 13104, + "time_per_iteration": 2.4109456539154053 + }, + { + "auxiliary_loss_clip": 0.01051675, + "auxiliary_loss_mlp": 0.01038935, + "balance_loss_clip": 1.01681018, + "balance_loss_mlp": 1.01577902, + "epoch": 0.7879152262137382, + "flos": 22341689452800.0, + "grad_norm": 2.3284621814727737, + "language_loss": 0.74878001, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.7696861, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 13105, + "time_per_iteration": 3.590615749359131 + }, + { + "auxiliary_loss_clip": 0.01053073, + "auxiliary_loss_mlp": 0.01044349, + "balance_loss_clip": 1.02011371, + "balance_loss_mlp": 1.01622987, + "epoch": 0.7879753494664061, + "flos": 24784317970560.0, + "grad_norm": 1.9370318236933564, + "language_loss": 0.7664668, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.78744096, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 13106, + "time_per_iteration": 2.3771698474884033 + }, + { + "auxiliary_loss_clip": 0.01053937, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.01250982, + "balance_loss_mlp": 1.01758265, + "epoch": 0.7880354727190741, + "flos": 16908156426240.0, + "grad_norm": 5.522013413253047, + "language_loss": 0.74648869, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.76740009, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 13107, + "time_per_iteration": 2.367156505584717 + }, + { + "auxiliary_loss_clip": 0.01050116, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.01514304, + "balance_loss_mlp": 1.01523352, + "epoch": 0.7880955959717421, + "flos": 22229583477120.0, + "grad_norm": 2.43072378147805, + "language_loss": 0.74471951, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.76559663, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13108, + "time_per_iteration": 2.3572044372558594 + }, + { + "auxiliary_loss_clip": 0.01007702, + "auxiliary_loss_mlp": 0.01003476, + "balance_loss_clip": 1.00136626, + "balance_loss_mlp": 1.00091481, + "epoch": 0.7881557192244101, + "flos": 69180082162560.0, + "grad_norm": 0.8823129055400022, + "language_loss": 0.6038065, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62391818, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06787109, + "step": 13109, + "time_per_iteration": 2.9582979679107666 + }, + { + "auxiliary_loss_clip": 0.01049663, + "auxiliary_loss_mlp": 0.01038901, + "balance_loss_clip": 1.01807535, + "balance_loss_mlp": 1.01560163, + "epoch": 0.788215842477078, + "flos": 24934304638080.0, + "grad_norm": 1.9327957359709333, + "language_loss": 0.73287439, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.75376004, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 13110, + "time_per_iteration": 2.4374961853027344 + }, + { + "auxiliary_loss_clip": 0.01050012, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.01160204, + "balance_loss_mlp": 1.0160749, + "epoch": 0.788275965729746, + "flos": 26105221549440.0, + "grad_norm": 1.3631490689235013, + "language_loss": 0.75903934, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77986503, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33984375, + "step": 13111, + "time_per_iteration": 2.447052240371704 + }, + { + "auxiliary_loss_clip": 0.01051959, + "auxiliary_loss_mlp": 0.0103882, + "balance_loss_clip": 1.01622963, + "balance_loss_mlp": 1.01672256, + "epoch": 0.7883360889824139, + "flos": 21213750291840.0, + "grad_norm": 1.8219600402557927, + "language_loss": 0.62956607, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.65047389, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 13112, + "time_per_iteration": 5.254083156585693 + }, + { + "auxiliary_loss_clip": 0.01051069, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.01621878, + "balance_loss_mlp": 1.01495183, + "epoch": 0.7883962122350819, + "flos": 21141480044160.0, + "grad_norm": 2.340802663422589, + "language_loss": 0.68977153, + "learning_rate": 4.514881996216644e-07, + "loss": 0.71066523, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36132812, + "step": 13113, + "time_per_iteration": 2.383671522140503 + }, + { + "auxiliary_loss_clip": 0.01051387, + "auxiliary_loss_mlp": 0.01036388, + "balance_loss_clip": 1.01327312, + "balance_loss_mlp": 1.01669097, + "epoch": 0.7884563354877498, + "flos": 15302047017600.0, + "grad_norm": 2.776431184247072, + "language_loss": 0.60928011, + "learning_rate": 4.5124174933361e-07, + "loss": 0.63015789, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34765625, + "step": 13114, + "time_per_iteration": 2.38053297996521 + }, + { + "auxiliary_loss_clip": 0.01053374, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.01719308, + "balance_loss_mlp": 1.01708031, + "epoch": 0.7885164587404179, + "flos": 24387180721920.0, + "grad_norm": 1.621968881487676, + "language_loss": 0.66944063, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69038975, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 13115, + "time_per_iteration": 2.389820098876953 + }, + { + "auxiliary_loss_clip": 0.01051934, + "auxiliary_loss_mlp": 0.01035701, + "balance_loss_clip": 1.01357603, + "balance_loss_mlp": 1.01638472, + "epoch": 0.7885765819930858, + "flos": 14385193136640.0, + "grad_norm": 1.8128310065291144, + "language_loss": 0.88783896, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90871525, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 13116, + "time_per_iteration": 2.376133441925049 + }, + { + "auxiliary_loss_clip": 0.01056228, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.01978612, + "balance_loss_mlp": 1.01709843, + "epoch": 0.7886367052457538, + "flos": 18258945995520.0, + "grad_norm": 1.7954699259788616, + "language_loss": 0.73776799, + "learning_rate": 4.505027508812245e-07, + "loss": 0.75879371, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.390625, + "step": 13117, + "time_per_iteration": 2.3407537937164307 + }, + { + "auxiliary_loss_clip": 0.01049885, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01307368, + "balance_loss_mlp": 1.01629186, + "epoch": 0.7886968284984217, + "flos": 15304176610560.0, + "grad_norm": 1.4032688456367244, + "language_loss": 0.80989122, + "learning_rate": 4.502565355654926e-07, + "loss": 0.83074582, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3359375, + "step": 13118, + "time_per_iteration": 2.39129638671875 + }, + { + "auxiliary_loss_clip": 0.01050735, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.01177669, + "balance_loss_mlp": 1.01553559, + "epoch": 0.7887569517510897, + "flos": 21214378696320.0, + "grad_norm": 1.6724015476361158, + "language_loss": 0.74436247, + "learning_rate": 4.500103790161878e-07, + "loss": 0.76521981, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 13119, + "time_per_iteration": 2.38012957572937 + }, + { + "auxiliary_loss_clip": 0.01053391, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.01389647, + "balance_loss_mlp": 1.01701355, + "epoch": 0.7888170750037578, + "flos": 22710127697280.0, + "grad_norm": 1.3864186895434452, + "language_loss": 0.73204195, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.75295216, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 13120, + "time_per_iteration": 2.4412171840667725 + }, + { + "auxiliary_loss_clip": 0.01051512, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.01607192, + "balance_loss_mlp": 1.01585615, + "epoch": 0.7888771982564257, + "flos": 36427677304320.0, + "grad_norm": 1.4864492503930322, + "language_loss": 0.79700148, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.81791598, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 13121, + "time_per_iteration": 2.49934720993042 + }, + { + "auxiliary_loss_clip": 0.01050276, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.01163697, + "balance_loss_mlp": 1.01552427, + "epoch": 0.7889373215090937, + "flos": 27308712625920.0, + "grad_norm": 1.3574115686135173, + "language_loss": 0.80964649, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.83050048, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 13122, + "time_per_iteration": 2.4726414680480957 + }, + { + "auxiliary_loss_clip": 0.01051818, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.01288056, + "balance_loss_mlp": 1.01553631, + "epoch": 0.7889974447617616, + "flos": 19827977673600.0, + "grad_norm": 2.0469188013146358, + "language_loss": 0.78674906, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80761504, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 13123, + "time_per_iteration": 2.359224319458008 + }, + { + "auxiliary_loss_clip": 0.01053716, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.01044595, + "balance_loss_mlp": 1.01669896, + "epoch": 0.7890575680144296, + "flos": 17270345537280.0, + "grad_norm": 1.7961183703832058, + "language_loss": 0.68260324, + "learning_rate": 4.487804780926985e-07, + "loss": 0.70349801, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 13124, + "time_per_iteration": 2.350379467010498 + }, + { + "auxiliary_loss_clip": 0.01053626, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.01166224, + "balance_loss_mlp": 1.01704597, + "epoch": 0.7891176912670975, + "flos": 27598910780160.0, + "grad_norm": 1.9592621888966097, + "language_loss": 0.73419499, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.7550931, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36523438, + "step": 13125, + "time_per_iteration": 3.906547784805298 + }, + { + "auxiliary_loss_clip": 0.01052088, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.0127939, + "balance_loss_mlp": 1.0152868, + "epoch": 0.7891778145197655, + "flos": 22710546633600.0, + "grad_norm": 1.9006836912922955, + "language_loss": 0.73861682, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.75949383, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 13126, + "time_per_iteration": 2.4091665744781494 + }, + { + "auxiliary_loss_clip": 0.01053007, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.00777829, + "balance_loss_mlp": 1.01608682, + "epoch": 0.7892379377724335, + "flos": 17309832151680.0, + "grad_norm": 1.816202912893336, + "language_loss": 0.77789676, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79875779, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 13127, + "time_per_iteration": 2.330759286880493 + }, + { + "auxiliary_loss_clip": 0.01050681, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.01411164, + "balance_loss_mlp": 1.01653779, + "epoch": 0.7892980610251015, + "flos": 25774489440000.0, + "grad_norm": 1.5641376762579695, + "language_loss": 0.86398518, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88486159, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34179688, + "step": 13128, + "time_per_iteration": 2.432013750076294 + }, + { + "auxiliary_loss_clip": 0.01051161, + "auxiliary_loss_mlp": 0.01039059, + "balance_loss_clip": 1.01646852, + "balance_loss_mlp": 1.01628184, + "epoch": 0.7893581842777694, + "flos": 21578871957120.0, + "grad_norm": 2.099712731481739, + "language_loss": 0.7067216, + "learning_rate": 4.475520477290904e-07, + "loss": 0.72762382, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 13129, + "time_per_iteration": 2.36267352104187 + }, + { + "auxiliary_loss_clip": 0.01008138, + "auxiliary_loss_mlp": 0.0100193, + "balance_loss_clip": 0.99984419, + "balance_loss_mlp": 1.00113475, + "epoch": 0.7894183075304374, + "flos": 69012917239680.0, + "grad_norm": 0.7239290699226015, + "language_loss": 0.61704433, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63714504, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.0703125, + "step": 13130, + "time_per_iteration": 3.018737316131592 + }, + { + "auxiliary_loss_clip": 0.01053417, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.00959349, + "balance_loss_mlp": 1.01755273, + "epoch": 0.7894784307831053, + "flos": 24242116556160.0, + "grad_norm": 4.920897525876791, + "language_loss": 0.74324864, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76410806, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 13131, + "time_per_iteration": 2.4414162635803223 + }, + { + "auxiliary_loss_clip": 0.01057637, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.00943112, + "balance_loss_mlp": 1.01723003, + "epoch": 0.7895385540357733, + "flos": 20265509232000.0, + "grad_norm": 2.9615169153554666, + "language_loss": 0.71552098, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.73646176, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.40429688, + "step": 13132, + "time_per_iteration": 2.398754119873047 + }, + { + "auxiliary_loss_clip": 0.0105509, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.01996517, + "balance_loss_mlp": 1.01843858, + "epoch": 0.7895986772884414, + "flos": 20995508183040.0, + "grad_norm": 1.864015286096193, + "language_loss": 0.62986702, + "learning_rate": 4.465703630239468e-07, + "loss": 0.65088713, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3671875, + "step": 13133, + "time_per_iteration": 2.3838300704956055 + }, + { + "auxiliary_loss_clip": 0.01055531, + "auxiliary_loss_mlp": 0.01039823, + "balance_loss_clip": 1.013978, + "balance_loss_mlp": 1.0169853, + "epoch": 0.7896588005411093, + "flos": 18657095673600.0, + "grad_norm": 2.0506530195686348, + "language_loss": 0.81527114, + "learning_rate": 4.463250890899195e-07, + "loss": 0.83622468, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38476562, + "step": 13134, + "time_per_iteration": 2.373037338256836 + }, + { + "auxiliary_loss_clip": 0.01052759, + "auxiliary_loss_mlp": 0.01040788, + "balance_loss_clip": 1.01757765, + "balance_loss_mlp": 1.0170486, + "epoch": 0.7897189237937773, + "flos": 18404917856640.0, + "grad_norm": 1.7509394274774932, + "language_loss": 0.81635243, + "learning_rate": 4.460798740713998e-07, + "loss": 0.8372879, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35742188, + "step": 13135, + "time_per_iteration": 2.370577096939087 + }, + { + "auxiliary_loss_clip": 0.01051384, + "auxiliary_loss_mlp": 0.01041571, + "balance_loss_clip": 1.01659632, + "balance_loss_mlp": 1.01543379, + "epoch": 0.7897790470464452, + "flos": 23730499359360.0, + "grad_norm": 1.692349122535223, + "language_loss": 0.72926819, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.75019777, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.359375, + "step": 13136, + "time_per_iteration": 2.44268798828125 + }, + { + "auxiliary_loss_clip": 0.01056598, + "auxiliary_loss_mlp": 0.01042394, + "balance_loss_clip": 1.01616836, + "balance_loss_mlp": 1.01642692, + "epoch": 0.7898391702991132, + "flos": 15918194424960.0, + "grad_norm": 2.220319343795838, + "language_loss": 0.72598553, + "learning_rate": 4.455896208180778e-07, + "loss": 0.74697542, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.40234375, + "step": 13137, + "time_per_iteration": 2.358534574508667 + }, + { + "auxiliary_loss_clip": 0.01050756, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.01661336, + "balance_loss_mlp": 1.01579583, + "epoch": 0.7898992935517811, + "flos": 19828012584960.0, + "grad_norm": 1.8653329526671472, + "language_loss": 0.75592989, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.77683628, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34960938, + "step": 13138, + "time_per_iteration": 2.4070839881896973 + }, + { + "auxiliary_loss_clip": 0.0105186, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.01386237, + "balance_loss_mlp": 1.01561236, + "epoch": 0.7899594168044491, + "flos": 16215339939840.0, + "grad_norm": 2.2085614442270125, + "language_loss": 0.69705069, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.71793735, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 13139, + "time_per_iteration": 2.3544600009918213 + }, + { + "auxiliary_loss_clip": 0.01008043, + "auxiliary_loss_mlp": 0.01004179, + "balance_loss_clip": 1.0023309, + "balance_loss_mlp": 1.00107837, + "epoch": 0.790019540057117, + "flos": 68327257582080.0, + "grad_norm": 0.8741404112023985, + "language_loss": 0.60288221, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62300444, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.06933594, + "step": 13140, + "time_per_iteration": 3.1090009212493896 + }, + { + "auxiliary_loss_clip": 0.01052773, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.01681221, + "balance_loss_mlp": 1.01645684, + "epoch": 0.7900796633097851, + "flos": 30331562895360.0, + "grad_norm": 1.609796185412189, + "language_loss": 0.77899468, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.7999258, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 13141, + "time_per_iteration": 2.4656901359558105 + }, + { + "auxiliary_loss_clip": 0.01054251, + "auxiliary_loss_mlp": 0.01043354, + "balance_loss_clip": 1.01978588, + "balance_loss_mlp": 1.01691246, + "epoch": 0.790139786562453, + "flos": 22125716582400.0, + "grad_norm": 1.9501693479290214, + "language_loss": 0.69355023, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.7145263, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 13142, + "time_per_iteration": 2.3962888717651367 + }, + { + "auxiliary_loss_clip": 0.01007714, + "auxiliary_loss_mlp": 0.01002997, + "balance_loss_clip": 1.00080395, + "balance_loss_mlp": 1.00091171, + "epoch": 0.790199909815121, + "flos": 58204296535680.0, + "grad_norm": 0.8128750405646906, + "language_loss": 0.60113227, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62123936, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06787109, + "step": 13143, + "time_per_iteration": 2.830991268157959 + }, + { + "auxiliary_loss_clip": 0.01054602, + "auxiliary_loss_mlp": 0.01036951, + "balance_loss_clip": 1.01283526, + "balance_loss_mlp": 1.01750004, + "epoch": 0.7902600330677889, + "flos": 34531858500480.0, + "grad_norm": 1.8528931249460574, + "language_loss": 0.75698566, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.77790117, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 13144, + "time_per_iteration": 2.5148165225982666 + }, + { + "auxiliary_loss_clip": 0.01055378, + "auxiliary_loss_mlp": 0.01039694, + "balance_loss_clip": 1.0139209, + "balance_loss_mlp": 1.01698279, + "epoch": 0.7903201563204569, + "flos": 22345285322880.0, + "grad_norm": 1.891803898500093, + "language_loss": 0.84017539, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.86112607, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 13145, + "time_per_iteration": 3.6872386932373047 + }, + { + "auxiliary_loss_clip": 0.01048442, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.01142466, + "balance_loss_mlp": 1.01441324, + "epoch": 0.790380279573125, + "flos": 22052468816640.0, + "grad_norm": 1.627646293057472, + "language_loss": 0.74121606, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.76202118, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33984375, + "step": 13146, + "time_per_iteration": 2.386735439300537 + }, + { + "auxiliary_loss_clip": 0.01053745, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.01394594, + "balance_loss_mlp": 1.01623857, + "epoch": 0.7904404028257929, + "flos": 20301574533120.0, + "grad_norm": 1.9101783126782728, + "language_loss": 0.76474506, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.78566289, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.375, + "step": 13147, + "time_per_iteration": 2.3629133701324463 + }, + { + "auxiliary_loss_clip": 0.01051712, + "auxiliary_loss_mlp": 0.01042798, + "balance_loss_clip": 1.01759684, + "balance_loss_mlp": 1.01596212, + "epoch": 0.7905005260784609, + "flos": 20007955065600.0, + "grad_norm": 1.8147438417060116, + "language_loss": 0.73103535, + "learning_rate": 4.428974443697087e-07, + "loss": 0.75198054, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35742188, + "step": 13148, + "time_per_iteration": 2.386448621749878 + }, + { + "auxiliary_loss_clip": 0.01051397, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.01564717, + "balance_loss_mlp": 1.01483488, + "epoch": 0.7905606493311288, + "flos": 26904732750720.0, + "grad_norm": 1.6104705326619932, + "language_loss": 0.72916663, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.75006783, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 13149, + "time_per_iteration": 2.4476826190948486 + }, + { + "auxiliary_loss_clip": 0.01054639, + "auxiliary_loss_mlp": 0.01041772, + "balance_loss_clip": 1.01481915, + "balance_loss_mlp": 1.01753139, + "epoch": 0.7906207725837968, + "flos": 23695097374080.0, + "grad_norm": 1.9280106282514626, + "language_loss": 0.66010129, + "learning_rate": 4.424087249723225e-07, + "loss": 0.68106538, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37109375, + "step": 13150, + "time_per_iteration": 2.3981781005859375 + }, + { + "auxiliary_loss_clip": 0.01051524, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.01157272, + "balance_loss_mlp": 1.01590824, + "epoch": 0.7906808958364647, + "flos": 20847825665280.0, + "grad_norm": 1.739017235685906, + "language_loss": 0.71336436, + "learning_rate": 4.421644538650231e-07, + "loss": 0.73422223, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13151, + "time_per_iteration": 3.807577610015869 + }, + { + "auxiliary_loss_clip": 0.01054872, + "auxiliary_loss_mlp": 0.01047052, + "balance_loss_clip": 1.02067113, + "balance_loss_mlp": 1.01691103, + "epoch": 0.7907410190891327, + "flos": 40733585372160.0, + "grad_norm": 1.7948704825507629, + "language_loss": 0.71017283, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.73119205, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 13152, + "time_per_iteration": 3.8467752933502197 + }, + { + "auxiliary_loss_clip": 0.01052528, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01147389, + "balance_loss_mlp": 1.01713204, + "epoch": 0.7908011423418007, + "flos": 13260326175360.0, + "grad_norm": 1.7719191729595685, + "language_loss": 0.73787349, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.75874436, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 13153, + "time_per_iteration": 2.3916752338409424 + }, + { + "auxiliary_loss_clip": 0.0105195, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.00866699, + "balance_loss_mlp": 1.01597345, + "epoch": 0.7908612655944687, + "flos": 19753752389760.0, + "grad_norm": 1.5204414732063258, + "language_loss": 0.79609656, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81693661, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 13154, + "time_per_iteration": 2.3500359058380127 + }, + { + "auxiliary_loss_clip": 0.01056239, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.01307154, + "balance_loss_mlp": 1.01737535, + "epoch": 0.7909213888471366, + "flos": 21286683855360.0, + "grad_norm": 2.122793219092856, + "language_loss": 0.7122348, + "learning_rate": 4.411879602612185e-07, + "loss": 0.73319316, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38867188, + "step": 13155, + "time_per_iteration": 2.38899827003479 + }, + { + "auxiliary_loss_clip": 0.0105269, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.00829017, + "balance_loss_mlp": 1.01635623, + "epoch": 0.7909815120998046, + "flos": 22527776332800.0, + "grad_norm": 1.5589271657483865, + "language_loss": 0.78022236, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.80107093, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 13156, + "time_per_iteration": 2.364368200302124 + }, + { + "auxiliary_loss_clip": 0.01051596, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.00939655, + "balance_loss_mlp": 1.01607168, + "epoch": 0.7910416353524725, + "flos": 26726396192640.0, + "grad_norm": 2.91416012422953, + "language_loss": 0.67056084, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.69139779, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13157, + "time_per_iteration": 2.50380539894104 + }, + { + "auxiliary_loss_clip": 0.01053651, + "auxiliary_loss_mlp": 0.01040102, + "balance_loss_clip": 1.01595044, + "balance_loss_mlp": 1.01715994, + "epoch": 0.7911017586051405, + "flos": 24643687547520.0, + "grad_norm": 1.7318591129986136, + "language_loss": 0.75544858, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.77638614, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 13158, + "time_per_iteration": 2.393826723098755 + }, + { + "auxiliary_loss_clip": 0.01049007, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.01213253, + "balance_loss_mlp": 1.01501441, + "epoch": 0.7911618818578086, + "flos": 17564558497920.0, + "grad_norm": 2.0311819075175648, + "language_loss": 0.68602896, + "learning_rate": 4.40212412422309e-07, + "loss": 0.7068401, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.33984375, + "step": 13159, + "time_per_iteration": 2.4017014503479004 + }, + { + "auxiliary_loss_clip": 0.01052432, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.01352644, + "balance_loss_mlp": 1.01697421, + "epoch": 0.7912220051104765, + "flos": 16720882560000.0, + "grad_norm": 1.689647334024438, + "language_loss": 0.6791535, + "learning_rate": 4.399686733077206e-07, + "loss": 0.70003664, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 13160, + "time_per_iteration": 2.3631880283355713 + }, + { + "auxiliary_loss_clip": 0.01049237, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01261234, + "balance_loss_mlp": 1.01541138, + "epoch": 0.7912821283631445, + "flos": 13697892645120.0, + "grad_norm": 1.9281286032409604, + "language_loss": 0.73644626, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75727272, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33789062, + "step": 13161, + "time_per_iteration": 2.4395992755889893 + }, + { + "auxiliary_loss_clip": 0.01051033, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.01022804, + "balance_loss_mlp": 1.01586318, + "epoch": 0.7913422516158124, + "flos": 23767891292160.0, + "grad_norm": 1.5900258742175382, + "language_loss": 0.74707425, + "learning_rate": 4.39481372557418e-07, + "loss": 0.76792681, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 13162, + "time_per_iteration": 2.383955240249634 + }, + { + "auxiliary_loss_clip": 0.01053095, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.01362371, + "balance_loss_mlp": 1.01621687, + "epoch": 0.7914023748684804, + "flos": 19937220917760.0, + "grad_norm": 1.6632050340241853, + "language_loss": 0.72456253, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74547708, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 13163, + "time_per_iteration": 2.4014806747436523 + }, + { + "auxiliary_loss_clip": 0.01051113, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.01306164, + "balance_loss_mlp": 1.01515412, + "epoch": 0.7914624981211483, + "flos": 20593762634880.0, + "grad_norm": 1.9813852279033242, + "language_loss": 0.71193159, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.73280978, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 13164, + "time_per_iteration": 2.3362114429473877 + }, + { + "auxiliary_loss_clip": 0.01051464, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.01490903, + "balance_loss_mlp": 1.01555693, + "epoch": 0.7915226213738163, + "flos": 21798370874880.0, + "grad_norm": 1.953204233391065, + "language_loss": 0.68153822, + "learning_rate": 4.387508652677177e-07, + "loss": 0.70243943, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 13165, + "time_per_iteration": 3.770310401916504 + }, + { + "auxiliary_loss_clip": 0.0104885, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.01286542, + "balance_loss_mlp": 1.01490402, + "epoch": 0.7915827446264843, + "flos": 16287470542080.0, + "grad_norm": 1.7626309783929734, + "language_loss": 0.7333138, + "learning_rate": 4.385074812309557e-07, + "loss": 0.75414479, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33984375, + "step": 13166, + "time_per_iteration": 2.315321445465088 + }, + { + "auxiliary_loss_clip": 0.01050645, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.01486409, + "balance_loss_mlp": 1.01478457, + "epoch": 0.7916428678791523, + "flos": 25701416231040.0, + "grad_norm": 1.7400472570786014, + "language_loss": 0.78686684, + "learning_rate": 4.382641564061462e-07, + "loss": 0.80777323, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.359375, + "step": 13167, + "time_per_iteration": 2.410093307495117 + }, + { + "auxiliary_loss_clip": 0.0105081, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.01218152, + "balance_loss_mlp": 1.01597726, + "epoch": 0.7917029911318202, + "flos": 23877378915840.0, + "grad_norm": 1.5988320687027526, + "language_loss": 0.85265124, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.87350273, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34960938, + "step": 13168, + "time_per_iteration": 2.3919177055358887 + }, + { + "auxiliary_loss_clip": 0.010527, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.01030159, + "balance_loss_mlp": 1.0162816, + "epoch": 0.7917631143844882, + "flos": 21645696032640.0, + "grad_norm": 2.631962691081977, + "language_loss": 0.73616087, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.75703311, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 13169, + "time_per_iteration": 2.361884832382202 + }, + { + "auxiliary_loss_clip": 0.01054635, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.01543295, + "balance_loss_mlp": 1.01675665, + "epoch": 0.7918232376371561, + "flos": 38872644883200.0, + "grad_norm": 1.6941845569344933, + "language_loss": 0.68614775, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.70711833, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 13170, + "time_per_iteration": 2.5291528701782227 + }, + { + "auxiliary_loss_clip": 0.01052629, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.01132393, + "balance_loss_mlp": 1.01582301, + "epoch": 0.7918833608898241, + "flos": 20774542988160.0, + "grad_norm": 1.6173816799506826, + "language_loss": 0.71457916, + "learning_rate": 4.372914494109412e-07, + "loss": 0.73543388, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.36914062, + "step": 13171, + "time_per_iteration": 2.386094570159912 + }, + { + "auxiliary_loss_clip": 0.01051212, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.01281857, + "balance_loss_mlp": 1.01565385, + "epoch": 0.7919434841424922, + "flos": 33908763732480.0, + "grad_norm": 1.8872248507030527, + "language_loss": 0.68349189, + "learning_rate": 4.370484207842553e-07, + "loss": 0.70438051, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35546875, + "step": 13172, + "time_per_iteration": 2.457155227661133 + }, + { + "auxiliary_loss_clip": 0.01051473, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.01208663, + "balance_loss_mlp": 1.01571465, + "epoch": 0.7920036073951601, + "flos": 21063728712960.0, + "grad_norm": 1.6971220705413375, + "language_loss": 0.80506819, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.82593405, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 13173, + "time_per_iteration": 2.4437193870544434 + }, + { + "auxiliary_loss_clip": 0.01052051, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.01479387, + "balance_loss_mlp": 1.01621366, + "epoch": 0.7920637306478281, + "flos": 23654947443840.0, + "grad_norm": 1.8152328390984418, + "language_loss": 0.78048539, + "learning_rate": 4.365625413419365e-07, + "loss": 0.80137849, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 13174, + "time_per_iteration": 2.4004623889923096 + }, + { + "auxiliary_loss_clip": 0.01049307, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.01597619, + "balance_loss_mlp": 1.01531458, + "epoch": 0.792123853900496, + "flos": 27194302500480.0, + "grad_norm": 1.7631516706074781, + "language_loss": 0.72776866, + "learning_rate": 4.363196905447297e-07, + "loss": 0.74864513, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.33984375, + "step": 13175, + "time_per_iteration": 2.570491075515747 + }, + { + "auxiliary_loss_clip": 0.01052528, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.01652837, + "balance_loss_mlp": 1.01613426, + "epoch": 0.792183977153164, + "flos": 19097664520320.0, + "grad_norm": 2.920604200740116, + "language_loss": 0.60317653, + "learning_rate": 4.360768990424364e-07, + "loss": 0.62410814, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 13176, + "time_per_iteration": 2.3686742782592773 + }, + { + "auxiliary_loss_clip": 0.01052966, + "auxiliary_loss_mlp": 0.01037515, + "balance_loss_clip": 1.01419783, + "balance_loss_mlp": 1.01693892, + "epoch": 0.7922441004058319, + "flos": 17127899723520.0, + "grad_norm": 1.6907674350315618, + "language_loss": 0.74645841, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.76736331, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 13177, + "time_per_iteration": 2.3941731452941895 + }, + { + "auxiliary_loss_clip": 0.01052038, + "auxiliary_loss_mlp": 0.01038522, + "balance_loss_clip": 1.0147754, + "balance_loss_mlp": 1.01713979, + "epoch": 0.7923042236585, + "flos": 17820681298560.0, + "grad_norm": 8.830392110926448, + "language_loss": 0.65161747, + "learning_rate": 4.355914939594174e-07, + "loss": 0.67252302, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34765625, + "step": 13178, + "time_per_iteration": 2.3704211711883545 + }, + { + "auxiliary_loss_clip": 0.01051816, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.01218033, + "balance_loss_mlp": 1.01617765, + "epoch": 0.7923643469111679, + "flos": 29933901976320.0, + "grad_norm": 1.480724941203536, + "language_loss": 0.69477057, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.7156207, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.35546875, + "step": 13179, + "time_per_iteration": 2.4881720542907715 + }, + { + "auxiliary_loss_clip": 0.01050888, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.01145411, + "balance_loss_mlp": 1.01535177, + "epoch": 0.7924244701638359, + "flos": 22673608548480.0, + "grad_norm": 2.1054216243464645, + "language_loss": 0.75616181, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.77701604, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 13180, + "time_per_iteration": 2.3702096939086914 + }, + { + "auxiliary_loss_clip": 0.01053816, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.01268411, + "balance_loss_mlp": 1.01677394, + "epoch": 0.7924845934165038, + "flos": 17967176830080.0, + "grad_norm": 2.0059046304641543, + "language_loss": 0.82907581, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.84997892, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 13181, + "time_per_iteration": 2.3908777236938477 + }, + { + "auxiliary_loss_clip": 0.0105154, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_clip": 1.01804042, + "balance_loss_mlp": 1.01534986, + "epoch": 0.7925447166691718, + "flos": 23475842835840.0, + "grad_norm": 1.8147211796278988, + "language_loss": 0.7834174, + "learning_rate": 4.346213957372895e-07, + "loss": 0.80435622, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 13182, + "time_per_iteration": 2.4420676231384277 + }, + { + "auxiliary_loss_clip": 0.01056353, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_clip": 1.01596785, + "balance_loss_mlp": 1.01723135, + "epoch": 0.7926048399218397, + "flos": 20446568876160.0, + "grad_norm": 2.412028183117106, + "language_loss": 0.75988305, + "learning_rate": 4.34379019557056e-07, + "loss": 0.78086996, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.390625, + "step": 13183, + "time_per_iteration": 2.4970364570617676 + }, + { + "auxiliary_loss_clip": 0.01050601, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.01122618, + "balance_loss_mlp": 1.01485634, + "epoch": 0.7926649631745077, + "flos": 37158514128000.0, + "grad_norm": 2.696979578853507, + "language_loss": 0.70025992, + "learning_rate": 4.341367027453264e-07, + "loss": 0.72111887, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35742188, + "step": 13184, + "time_per_iteration": 3.7900891304016113 + }, + { + "auxiliary_loss_clip": 0.01053778, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.014117, + "balance_loss_mlp": 1.0167284, + "epoch": 0.7927250864271758, + "flos": 17017678961280.0, + "grad_norm": 2.0346207222938477, + "language_loss": 0.72058332, + "learning_rate": 4.338944453112907e-07, + "loss": 0.74149507, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 13185, + "time_per_iteration": 2.3930413722991943 + }, + { + "auxiliary_loss_clip": 0.01053, + "auxiliary_loss_mlp": 0.01043321, + "balance_loss_clip": 1.01770246, + "balance_loss_mlp": 1.01564586, + "epoch": 0.7927852096798437, + "flos": 17748236494080.0, + "grad_norm": 2.002771972067062, + "language_loss": 0.66977918, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.69074237, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 13186, + "time_per_iteration": 2.347581148147583 + }, + { + "auxiliary_loss_clip": 0.01050671, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.01689792, + "balance_loss_mlp": 1.01623154, + "epoch": 0.7928453329325117, + "flos": 23837403542400.0, + "grad_norm": 1.802596826220055, + "language_loss": 0.77500284, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79588729, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 13187, + "time_per_iteration": 2.410280227661133 + }, + { + "auxiliary_loss_clip": 0.01050975, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.01333523, + "balance_loss_mlp": 1.01600718, + "epoch": 0.7929054561851796, + "flos": 17454023533440.0, + "grad_norm": 2.223922955777871, + "language_loss": 0.7349422, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.7558006, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 13188, + "time_per_iteration": 2.3225107192993164 + }, + { + "auxiliary_loss_clip": 0.01054417, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.01945901, + "balance_loss_mlp": 1.01634967, + "epoch": 0.7929655794378476, + "flos": 21980198568960.0, + "grad_norm": 2.5216683712881562, + "language_loss": 0.64399487, + "learning_rate": 4.329260095357725e-07, + "loss": 0.66499507, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38085938, + "step": 13189, + "time_per_iteration": 2.436572551727295 + }, + { + "auxiliary_loss_clip": 0.01051396, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.01522374, + "balance_loss_mlp": 1.01637387, + "epoch": 0.7930257026905155, + "flos": 17272998800640.0, + "grad_norm": 1.8553171385029397, + "language_loss": 0.74057662, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.76147711, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 13190, + "time_per_iteration": 2.3367631435394287 + }, + { + "auxiliary_loss_clip": 0.01049571, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.01580477, + "balance_loss_mlp": 1.01567268, + "epoch": 0.7930858259431836, + "flos": 27299565849600.0, + "grad_norm": 1.739541764862739, + "language_loss": 0.7394352, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.76029515, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.33984375, + "step": 13191, + "time_per_iteration": 3.831942319869995 + }, + { + "auxiliary_loss_clip": 0.01050869, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.01699543, + "balance_loss_mlp": 1.01522946, + "epoch": 0.7931459491958515, + "flos": 19862751254400.0, + "grad_norm": 1.7816878512190157, + "language_loss": 0.70457256, + "learning_rate": 4.322003066198219e-07, + "loss": 0.72549045, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35546875, + "step": 13192, + "time_per_iteration": 3.713895797729492 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.01266336, + "balance_loss_mlp": 1.01630509, + "epoch": 0.7932060724485195, + "flos": 23146053333120.0, + "grad_norm": 1.7455021671000863, + "language_loss": 0.75942492, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.78029728, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 13193, + "time_per_iteration": 2.3818862438201904 + }, + { + "auxiliary_loss_clip": 0.01053614, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.01456392, + "balance_loss_mlp": 1.01764631, + "epoch": 0.7932661957011874, + "flos": 29933552862720.0, + "grad_norm": 1.8368595266389598, + "language_loss": 0.73221481, + "learning_rate": 4.317168019161741e-07, + "loss": 0.75314885, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.359375, + "step": 13194, + "time_per_iteration": 2.4834189414978027 + }, + { + "auxiliary_loss_clip": 0.01053973, + "auxiliary_loss_mlp": 0.01038791, + "balance_loss_clip": 1.01297045, + "balance_loss_mlp": 1.016186, + "epoch": 0.7933263189538554, + "flos": 22558185993600.0, + "grad_norm": 2.0182067654354228, + "language_loss": 0.70719105, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72811866, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 13195, + "time_per_iteration": 2.393211841583252 + }, + { + "auxiliary_loss_clip": 0.01051682, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.01188302, + "balance_loss_mlp": 1.01615477, + "epoch": 0.7933864422065233, + "flos": 25478007240960.0, + "grad_norm": 1.5707810157651836, + "language_loss": 0.77736688, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79823458, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 13196, + "time_per_iteration": 2.426161050796509 + }, + { + "auxiliary_loss_clip": 0.01054214, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.01617837, + "balance_loss_mlp": 1.01699054, + "epoch": 0.7934465654591913, + "flos": 33581767138560.0, + "grad_norm": 1.5579550222133354, + "language_loss": 0.69811469, + "learning_rate": 4.309919909045268e-07, + "loss": 0.7190733, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 13197, + "time_per_iteration": 2.5184993743896484 + }, + { + "auxiliary_loss_clip": 0.01051739, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.01223278, + "balance_loss_mlp": 1.01618254, + "epoch": 0.7935066887118594, + "flos": 31431152165760.0, + "grad_norm": 2.3523200804655473, + "language_loss": 0.65952468, + "learning_rate": 4.30750506215646e-07, + "loss": 0.68039048, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13198, + "time_per_iteration": 2.476816415786743 + }, + { + "auxiliary_loss_clip": 0.01056227, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.01820266, + "balance_loss_mlp": 1.01829696, + "epoch": 0.7935668119645273, + "flos": 14681780069760.0, + "grad_norm": 2.0238564274784, + "language_loss": 0.73973191, + "learning_rate": 4.30509081032864e-07, + "loss": 0.76073503, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 13199, + "time_per_iteration": 2.3761396408081055 + }, + { + "auxiliary_loss_clip": 0.01052558, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.01660466, + "balance_loss_mlp": 1.01632595, + "epoch": 0.7936269352171953, + "flos": 18003277042560.0, + "grad_norm": 2.040327663530415, + "language_loss": 0.81556559, + "learning_rate": 4.302677153653349e-07, + "loss": 0.83649677, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 13200, + "time_per_iteration": 2.4006898403167725 + }, + { + "auxiliary_loss_clip": 0.01051348, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.01517045, + "balance_loss_mlp": 1.01633179, + "epoch": 0.7936870584698632, + "flos": 18879212943360.0, + "grad_norm": 1.796680248725001, + "language_loss": 0.77799594, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79888976, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 13201, + "time_per_iteration": 2.387350082397461 + }, + { + "auxiliary_loss_clip": 0.01051333, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.01644921, + "balance_loss_mlp": 1.01588511, + "epoch": 0.7937471817225312, + "flos": 23365901364480.0, + "grad_norm": 1.6711730482115423, + "language_loss": 0.67435521, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69526529, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 13202, + "time_per_iteration": 2.4267404079437256 + }, + { + "auxiliary_loss_clip": 0.01053334, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.0111239, + "balance_loss_mlp": 1.01617336, + "epoch": 0.7938073049751991, + "flos": 22673329257600.0, + "grad_norm": 1.9564292706442066, + "language_loss": 0.75462556, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77551305, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 13203, + "time_per_iteration": 2.391225576400757 + }, + { + "auxiliary_loss_clip": 0.01051256, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.01195955, + "balance_loss_mlp": 1.01566947, + "epoch": 0.7938674282278672, + "flos": 22850478829440.0, + "grad_norm": 1.9098061085753986, + "language_loss": 0.67777848, + "learning_rate": 4.293028480307643e-07, + "loss": 0.6986438, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 13204, + "time_per_iteration": 2.3808417320251465 + }, + { + "auxiliary_loss_clip": 0.01050124, + "auxiliary_loss_mlp": 0.01042127, + "balance_loss_clip": 1.01898825, + "balance_loss_mlp": 1.01518679, + "epoch": 0.7939275514805351, + "flos": 27011392554240.0, + "grad_norm": 1.408836384406725, + "language_loss": 0.79867774, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81960022, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 13205, + "time_per_iteration": 3.8605754375457764 + }, + { + "auxiliary_loss_clip": 0.01049723, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.0168314, + "balance_loss_mlp": 1.014606, + "epoch": 0.7939876747332031, + "flos": 21141759335040.0, + "grad_norm": 1.8606133237099647, + "language_loss": 0.78945887, + "learning_rate": 4.28820771692858e-07, + "loss": 0.81034875, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 13206, + "time_per_iteration": 2.447221517562866 + }, + { + "auxiliary_loss_clip": 0.01055401, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.01328743, + "balance_loss_mlp": 1.01742601, + "epoch": 0.794047797985871, + "flos": 23288115121920.0, + "grad_norm": 2.3552185659632117, + "language_loss": 0.8065694, + "learning_rate": 4.285798228882456e-07, + "loss": 0.82752448, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.37890625, + "step": 13207, + "time_per_iteration": 2.405446767807007 + }, + { + "auxiliary_loss_clip": 0.01051711, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.0155623, + "balance_loss_mlp": 1.01556909, + "epoch": 0.794107921238539, + "flos": 24606924019200.0, + "grad_norm": 1.8478018583175289, + "language_loss": 0.84746504, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86836427, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 13208, + "time_per_iteration": 2.4263768196105957 + }, + { + "auxiliary_loss_clip": 0.01007821, + "auxiliary_loss_mlp": 0.01003184, + "balance_loss_clip": 1.00091875, + "balance_loss_mlp": 1.00110936, + "epoch": 0.7941680444912069, + "flos": 64090198448640.0, + "grad_norm": 0.775098410322917, + "language_loss": 0.58350468, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60361469, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.06738281, + "step": 13209, + "time_per_iteration": 3.1642396450042725 + }, + { + "auxiliary_loss_clip": 0.01054896, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.01476312, + "balance_loss_mlp": 1.01740599, + "epoch": 0.794228167743875, + "flos": 24387704392320.0, + "grad_norm": 2.173564310863978, + "language_loss": 0.64073908, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.66169012, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 13210, + "time_per_iteration": 2.427150249481201 + }, + { + "auxiliary_loss_clip": 0.01051567, + "auxiliary_loss_mlp": 0.01041571, + "balance_loss_clip": 1.02011323, + "balance_loss_mlp": 1.01584697, + "epoch": 0.794288290996543, + "flos": 28511226184320.0, + "grad_norm": 1.5172336907186137, + "language_loss": 0.70157486, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.72250617, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35742188, + "step": 13211, + "time_per_iteration": 2.430877923965454 + }, + { + "auxiliary_loss_clip": 0.01054366, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.02134967, + "balance_loss_mlp": 1.01683474, + "epoch": 0.7943484142492109, + "flos": 25920915148800.0, + "grad_norm": 1.6605743558221266, + "language_loss": 0.73420042, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.75521708, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 13212, + "time_per_iteration": 2.422306776046753 + }, + { + "auxiliary_loss_clip": 0.01050819, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.01150155, + "balance_loss_mlp": 1.01629496, + "epoch": 0.7944085375018789, + "flos": 23914142444160.0, + "grad_norm": 1.5723543833328346, + "language_loss": 0.80848604, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82932574, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 13213, + "time_per_iteration": 2.3897955417633057 + }, + { + "auxiliary_loss_clip": 0.010548, + "auxiliary_loss_mlp": 0.01037432, + "balance_loss_clip": 1.01213622, + "balance_loss_mlp": 1.01679349, + "epoch": 0.7944686607545468, + "flos": 20228920260480.0, + "grad_norm": 2.389444456671732, + "language_loss": 0.69294846, + "learning_rate": 4.268948502428327e-07, + "loss": 0.71387076, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 13214, + "time_per_iteration": 2.365450859069824 + }, + { + "auxiliary_loss_clip": 0.01050161, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.01050961, + "balance_loss_mlp": 1.01525354, + "epoch": 0.7945287840072148, + "flos": 21979919278080.0, + "grad_norm": 1.756461279648147, + "language_loss": 0.73187661, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.75270265, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34960938, + "step": 13215, + "time_per_iteration": 2.3719539642333984 + }, + { + "auxiliary_loss_clip": 0.01052655, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.01303554, + "balance_loss_mlp": 1.01691866, + "epoch": 0.7945889072598827, + "flos": 26396467044480.0, + "grad_norm": 1.5997530441770806, + "language_loss": 0.79929084, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.82017016, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 13216, + "time_per_iteration": 2.4333584308624268 + }, + { + "auxiliary_loss_clip": 0.01051795, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.01314199, + "balance_loss_mlp": 1.01576269, + "epoch": 0.7946490305125508, + "flos": 25809123375360.0, + "grad_norm": 1.831357188272053, + "language_loss": 0.74851483, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76939726, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 13217, + "time_per_iteration": 2.3970069885253906 + }, + { + "auxiliary_loss_clip": 0.01050269, + "auxiliary_loss_mlp": 0.01037234, + "balance_loss_clip": 1.01591933, + "balance_loss_mlp": 1.01602292, + "epoch": 0.7947091537652187, + "flos": 15960055011840.0, + "grad_norm": 1.897438560728471, + "language_loss": 0.7477839, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76865888, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 13218, + "time_per_iteration": 2.3656723499298096 + }, + { + "auxiliary_loss_clip": 0.01054954, + "auxiliary_loss_mlp": 0.01047184, + "balance_loss_clip": 1.02247202, + "balance_loss_mlp": 1.0168153, + "epoch": 0.7947692770178867, + "flos": 18586885196160.0, + "grad_norm": 1.933882970918329, + "language_loss": 0.85320532, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.87422669, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 13219, + "time_per_iteration": 2.340045690536499 + }, + { + "auxiliary_loss_clip": 0.01055467, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.01361918, + "balance_loss_mlp": 1.01762938, + "epoch": 0.7948294002705546, + "flos": 20441367083520.0, + "grad_norm": 1.9232535381441154, + "language_loss": 0.76423484, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.7852006, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.37890625, + "step": 13220, + "time_per_iteration": 2.3990962505340576 + }, + { + "auxiliary_loss_clip": 0.01054244, + "auxiliary_loss_mlp": 0.01038269, + "balance_loss_clip": 1.01465368, + "balance_loss_mlp": 1.01706946, + "epoch": 0.7948895235232226, + "flos": 38179653840000.0, + "grad_norm": 1.7769890584710024, + "language_loss": 0.73338026, + "learning_rate": 4.252128005599176e-07, + "loss": 0.75430536, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 13221, + "time_per_iteration": 2.498739719390869 + }, + { + "auxiliary_loss_clip": 0.01050023, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.01358521, + "balance_loss_mlp": 1.01564944, + "epoch": 0.7949496467758905, + "flos": 15558902956800.0, + "grad_norm": 1.9211936492821031, + "language_loss": 0.76051068, + "learning_rate": 4.249727465395634e-07, + "loss": 0.78136647, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 13222, + "time_per_iteration": 2.400449514389038 + }, + { + "auxiliary_loss_clip": 0.01007787, + "auxiliary_loss_mlp": 0.0100253, + "balance_loss_clip": 1.00082564, + "balance_loss_mlp": 1.00099754, + "epoch": 0.7950097700285585, + "flos": 70893898715520.0, + "grad_norm": 0.769830955911655, + "language_loss": 0.67202109, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69212425, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.06787109, + "step": 13223, + "time_per_iteration": 2.8637583255767822 + }, + { + "auxiliary_loss_clip": 0.01052431, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.01495767, + "balance_loss_mlp": 1.0162077, + "epoch": 0.7950698932812266, + "flos": 23950487036160.0, + "grad_norm": 1.8422853974359592, + "language_loss": 0.72251922, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.74343985, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 13224, + "time_per_iteration": 3.612887382507324 + }, + { + "auxiliary_loss_clip": 0.01007425, + "auxiliary_loss_mlp": 0.01002199, + "balance_loss_clip": 1.00006533, + "balance_loss_mlp": 1.00076056, + "epoch": 0.7951300165338945, + "flos": 60279638883840.0, + "grad_norm": 0.6692413730035823, + "language_loss": 0.55124021, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57133639, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.06640625, + "step": 13225, + "time_per_iteration": 3.093139410018921 + }, + { + "auxiliary_loss_clip": 0.01049337, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.01215839, + "balance_loss_mlp": 1.01472759, + "epoch": 0.7951901397865625, + "flos": 22817939575680.0, + "grad_norm": 1.8993546841533853, + "language_loss": 0.65856194, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67940247, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34570312, + "step": 13226, + "time_per_iteration": 2.3524067401885986 + }, + { + "auxiliary_loss_clip": 0.01053841, + "auxiliary_loss_mlp": 0.01042971, + "balance_loss_clip": 1.01860476, + "balance_loss_mlp": 1.01629472, + "epoch": 0.7952502630392304, + "flos": 35694326862720.0, + "grad_norm": 2.313079447413372, + "language_loss": 0.71219361, + "learning_rate": 4.237733724976349e-07, + "loss": 0.73316169, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 13227, + "time_per_iteration": 2.5145812034606934 + }, + { + "auxiliary_loss_clip": 0.01049158, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.01843512, + "balance_loss_mlp": 1.01528084, + "epoch": 0.7953103862918984, + "flos": 25628657224320.0, + "grad_norm": 2.935239577945484, + "language_loss": 0.71335185, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.73424047, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 13228, + "time_per_iteration": 2.4003708362579346 + }, + { + "auxiliary_loss_clip": 0.01053136, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.0195539, + "balance_loss_mlp": 1.01625752, + "epoch": 0.7953705095445663, + "flos": 40550396135040.0, + "grad_norm": 1.4565672181769005, + "language_loss": 0.71362174, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73458701, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 13229, + "time_per_iteration": 2.567638635635376 + }, + { + "auxiliary_loss_clip": 0.01055448, + "auxiliary_loss_mlp": 0.01040148, + "balance_loss_clip": 1.01441038, + "balance_loss_mlp": 1.0171423, + "epoch": 0.7954306327972344, + "flos": 27635429928960.0, + "grad_norm": 2.907856099591437, + "language_loss": 0.73052227, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.75147825, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 13230, + "time_per_iteration": 3.812774419784546 + }, + { + "auxiliary_loss_clip": 0.01008005, + "auxiliary_loss_mlp": 0.0100514, + "balance_loss_clip": 1.00305426, + "balance_loss_mlp": 1.00077677, + "epoch": 0.7954907560499023, + "flos": 59500481005440.0, + "grad_norm": 0.8985191451752992, + "language_loss": 0.63660574, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65673721, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.07226562, + "step": 13231, + "time_per_iteration": 4.457201242446899 + }, + { + "auxiliary_loss_clip": 0.01053436, + "auxiliary_loss_mlp": 0.01038656, + "balance_loss_clip": 1.01595879, + "balance_loss_mlp": 1.01733518, + "epoch": 0.7955508793025703, + "flos": 20119502459520.0, + "grad_norm": 1.638163003632879, + "language_loss": 0.70327628, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72419721, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 13232, + "time_per_iteration": 2.4699573516845703 + }, + { + "auxiliary_loss_clip": 0.01051284, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.00853133, + "balance_loss_mlp": 1.01577961, + "epoch": 0.7956110025552382, + "flos": 26504174188800.0, + "grad_norm": 1.5809829052088047, + "language_loss": 0.78815711, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80898631, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 13233, + "time_per_iteration": 2.3929152488708496 + }, + { + "auxiliary_loss_clip": 0.01052868, + "auxiliary_loss_mlp": 0.01041092, + "balance_loss_clip": 1.01608229, + "balance_loss_mlp": 1.01634395, + "epoch": 0.7956711258079062, + "flos": 22564365304320.0, + "grad_norm": 2.7164320185865467, + "language_loss": 0.79070431, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81164396, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 13234, + "time_per_iteration": 2.3869435787200928 + }, + { + "auxiliary_loss_clip": 0.010511, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.01234722, + "balance_loss_mlp": 1.01632571, + "epoch": 0.7957312490605741, + "flos": 17378192327040.0, + "grad_norm": 1.5711083594471142, + "language_loss": 0.70821357, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72905642, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34765625, + "step": 13235, + "time_per_iteration": 2.324753522872925 + }, + { + "auxiliary_loss_clip": 0.01052122, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.00793254, + "balance_loss_mlp": 1.01581573, + "epoch": 0.7957913723132422, + "flos": 22490349488640.0, + "grad_norm": 1.7230194039366533, + "language_loss": 0.69516337, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.71600878, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36328125, + "step": 13236, + "time_per_iteration": 2.382568836212158 + }, + { + "auxiliary_loss_clip": 0.0105116, + "auxiliary_loss_mlp": 0.01036975, + "balance_loss_clip": 1.01337147, + "balance_loss_mlp": 1.01626348, + "epoch": 0.7958514955659101, + "flos": 22636984665600.0, + "grad_norm": 1.579744751238117, + "language_loss": 0.7598331, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.78071451, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34765625, + "step": 13237, + "time_per_iteration": 2.3580968379974365 + }, + { + "auxiliary_loss_clip": 0.01052985, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.01157308, + "balance_loss_mlp": 1.01681471, + "epoch": 0.7959116188185781, + "flos": 20703180435840.0, + "grad_norm": 1.926957361610192, + "language_loss": 0.72006035, + "learning_rate": 4.211400110229175e-07, + "loss": 0.74095392, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 13238, + "time_per_iteration": 2.364130973815918 + }, + { + "auxiliary_loss_clip": 0.01053811, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.01198828, + "balance_loss_mlp": 1.01676846, + "epoch": 0.7959717420712461, + "flos": 19023718527360.0, + "grad_norm": 1.6268585816864745, + "language_loss": 0.75529695, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.77618468, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36914062, + "step": 13239, + "time_per_iteration": 2.3379557132720947 + }, + { + "auxiliary_loss_clip": 0.01053731, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.01411319, + "balance_loss_mlp": 1.0164485, + "epoch": 0.796031865323914, + "flos": 26355514152960.0, + "grad_norm": 1.8427786457791275, + "language_loss": 0.71509844, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.73601663, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37304688, + "step": 13240, + "time_per_iteration": 2.4375221729278564 + }, + { + "auxiliary_loss_clip": 0.01007564, + "auxiliary_loss_mlp": 0.01002352, + "balance_loss_clip": 1.00037301, + "balance_loss_mlp": 1.00069809, + "epoch": 0.796091988576582, + "flos": 62066493734400.0, + "grad_norm": 0.8910606473468836, + "language_loss": 0.58797693, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.6080761, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06884766, + "step": 13241, + "time_per_iteration": 2.820869207382202 + }, + { + "auxiliary_loss_clip": 0.01051979, + "auxiliary_loss_mlp": 0.0103466, + "balance_loss_clip": 1.01240361, + "balance_loss_mlp": 1.01676428, + "epoch": 0.7961521118292499, + "flos": 39018546921600.0, + "grad_norm": 2.084570476609054, + "language_loss": 0.64882237, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66968876, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 13242, + "time_per_iteration": 2.5433218479156494 + }, + { + "auxiliary_loss_clip": 0.0105121, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0130055, + "balance_loss_mlp": 1.01565385, + "epoch": 0.796212235081918, + "flos": 21761746992000.0, + "grad_norm": 1.9075277797537118, + "language_loss": 0.78138423, + "learning_rate": 4.199454226296526e-07, + "loss": 0.80227041, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 13243, + "time_per_iteration": 2.356863498687744 + }, + { + "auxiliary_loss_clip": 0.0105382, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.01362324, + "balance_loss_mlp": 1.01684189, + "epoch": 0.7962723583345859, + "flos": 21177789724800.0, + "grad_norm": 1.7073804368590668, + "language_loss": 0.80208415, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.82299888, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 13244, + "time_per_iteration": 2.415593147277832 + }, + { + "auxiliary_loss_clip": 0.01054201, + "auxiliary_loss_mlp": 0.01041082, + "balance_loss_clip": 1.01627517, + "balance_loss_mlp": 1.01645625, + "epoch": 0.7963324815872539, + "flos": 17127690255360.0, + "grad_norm": 2.2756100199076017, + "language_loss": 0.69407392, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.71502668, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 13245, + "time_per_iteration": 3.916099786758423 + }, + { + "auxiliary_loss_clip": 0.01052525, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.01269782, + "balance_loss_mlp": 1.01676321, + "epoch": 0.7963926048399218, + "flos": 21396415858560.0, + "grad_norm": 1.4482659719424709, + "language_loss": 0.79917824, + "learning_rate": 4.192293885111549e-07, + "loss": 0.82006979, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35742188, + "step": 13246, + "time_per_iteration": 2.381047487258911 + }, + { + "auxiliary_loss_clip": 0.01053174, + "auxiliary_loss_mlp": 0.01039505, + "balance_loss_clip": 1.01484084, + "balance_loss_mlp": 1.01553106, + "epoch": 0.7964527280925898, + "flos": 25183235698560.0, + "grad_norm": 1.8442923674323632, + "language_loss": 0.67287505, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.69380182, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 13247, + "time_per_iteration": 2.385470390319824 + }, + { + "auxiliary_loss_clip": 0.01050456, + "auxiliary_loss_mlp": 0.01035524, + "balance_loss_clip": 1.01460266, + "balance_loss_mlp": 1.01580858, + "epoch": 0.7965128513452577, + "flos": 27014674222080.0, + "grad_norm": 1.993119670895604, + "language_loss": 0.7277658, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.74862564, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34570312, + "step": 13248, + "time_per_iteration": 2.424541473388672 + }, + { + "auxiliary_loss_clip": 0.01052451, + "auxiliary_loss_mlp": 0.01038201, + "balance_loss_clip": 1.01423955, + "balance_loss_mlp": 1.01648939, + "epoch": 0.7965729745979258, + "flos": 24418602812160.0, + "grad_norm": 2.3794005788624912, + "language_loss": 0.76910281, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.79000938, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 13249, + "time_per_iteration": 2.3677263259887695 + }, + { + "auxiliary_loss_clip": 0.01051595, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.01305091, + "balance_loss_mlp": 1.01646578, + "epoch": 0.7966330978505937, + "flos": 18839481949440.0, + "grad_norm": 1.829123814736729, + "language_loss": 0.62970346, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.65055931, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.3515625, + "step": 13250, + "time_per_iteration": 2.3596856594085693 + }, + { + "auxiliary_loss_clip": 0.01051862, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.01234436, + "balance_loss_mlp": 1.01613712, + "epoch": 0.7966932211032617, + "flos": 13151466956160.0, + "grad_norm": 2.2159276915535013, + "language_loss": 0.74390173, + "learning_rate": 4.180371972938206e-07, + "loss": 0.76478302, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 13251, + "time_per_iteration": 2.3301806449890137 + }, + { + "auxiliary_loss_clip": 0.01055583, + "auxiliary_loss_mlp": 0.010404, + "balance_loss_clip": 1.01367378, + "balance_loss_mlp": 1.01763511, + "epoch": 0.7967533443559297, + "flos": 23948671645440.0, + "grad_norm": 1.9036812125298026, + "language_loss": 0.74564517, + "learning_rate": 4.177989389787624e-07, + "loss": 0.76660502, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38085938, + "step": 13252, + "time_per_iteration": 2.397923231124878 + }, + { + "auxiliary_loss_clip": 0.01050022, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.01140893, + "balance_loss_mlp": 1.01560175, + "epoch": 0.7968134676085976, + "flos": 30367593285120.0, + "grad_norm": 1.9475907562746544, + "language_loss": 0.67526472, + "learning_rate": 4.175607406609278e-07, + "loss": 0.69609851, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 13253, + "time_per_iteration": 2.4117307662963867 + }, + { + "auxiliary_loss_clip": 0.01053451, + "auxiliary_loss_mlp": 0.01043411, + "balance_loss_clip": 1.01906896, + "balance_loss_mlp": 1.01594114, + "epoch": 0.7968735908612656, + "flos": 23073957642240.0, + "grad_norm": 1.5177408422808698, + "language_loss": 0.68037808, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.70134676, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.375, + "step": 13254, + "time_per_iteration": 2.385810613632202 + }, + { + "auxiliary_loss_clip": 0.01051503, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.0151937, + "balance_loss_mlp": 1.01648712, + "epoch": 0.7969337141139335, + "flos": 23581245830400.0, + "grad_norm": 1.937319370982777, + "language_loss": 0.70927429, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.73015559, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 13255, + "time_per_iteration": 2.3797271251678467 + }, + { + "auxiliary_loss_clip": 0.01050171, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.01417959, + "balance_loss_mlp": 1.01515269, + "epoch": 0.7969938373666016, + "flos": 19754834641920.0, + "grad_norm": 2.06828992935669, + "language_loss": 0.80050921, + "learning_rate": 4.168465057810733e-07, + "loss": 0.82137996, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 13256, + "time_per_iteration": 2.397908926010132 + }, + { + "auxiliary_loss_clip": 0.01052595, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.01147652, + "balance_loss_mlp": 1.01687825, + "epoch": 0.7970539606192695, + "flos": 24132943134720.0, + "grad_norm": 2.319243762144068, + "language_loss": 0.66957754, + "learning_rate": 4.166085475424315e-07, + "loss": 0.69043916, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 13257, + "time_per_iteration": 2.3795344829559326 + }, + { + "auxiliary_loss_clip": 0.01055608, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.01473117, + "balance_loss_mlp": 1.01801133, + "epoch": 0.7971140838719375, + "flos": 17967630677760.0, + "grad_norm": 2.242274490301799, + "language_loss": 0.73383546, + "learning_rate": 4.163706493461523e-07, + "loss": 0.75478685, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 13258, + "time_per_iteration": 2.370718479156494 + }, + { + "auxiliary_loss_clip": 0.01053589, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.01438439, + "balance_loss_mlp": 1.01610327, + "epoch": 0.7971742071246054, + "flos": 19168608136320.0, + "grad_norm": 1.7743104184605019, + "language_loss": 0.69913208, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.7200641, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 13259, + "time_per_iteration": 2.3417551517486572 + }, + { + "auxiliary_loss_clip": 0.0105048, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.01356316, + "balance_loss_mlp": 1.01665258, + "epoch": 0.7972343303772734, + "flos": 27124720427520.0, + "grad_norm": 1.7400792396879277, + "language_loss": 0.74243873, + "learning_rate": 4.158950331167641e-07, + "loss": 0.76329184, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33789062, + "step": 13260, + "time_per_iteration": 2.442868709564209 + }, + { + "auxiliary_loss_clip": 0.01049739, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.01174474, + "balance_loss_mlp": 1.0157187, + "epoch": 0.7972944536299413, + "flos": 20995578005760.0, + "grad_norm": 1.7357628757408043, + "language_loss": 0.78967923, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.81051195, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 13261, + "time_per_iteration": 2.3762500286102295 + }, + { + "auxiliary_loss_clip": 0.01046924, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.01294112, + "balance_loss_mlp": 1.01516879, + "epoch": 0.7973545768826094, + "flos": 21578941779840.0, + "grad_norm": 1.4381669705601923, + "language_loss": 0.76426309, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78505248, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.31640625, + "step": 13262, + "time_per_iteration": 2.3652896881103516 + }, + { + "auxiliary_loss_clip": 0.01056695, + "auxiliary_loss_mlp": 0.01040039, + "balance_loss_clip": 1.01415884, + "balance_loss_mlp": 1.0175333, + "epoch": 0.7974147001352773, + "flos": 20557487865600.0, + "grad_norm": 7.693261206777027, + "language_loss": 0.72272789, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7436952, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.390625, + "step": 13263, + "time_per_iteration": 2.379946708679199 + }, + { + "auxiliary_loss_clip": 0.0105609, + "auxiliary_loss_mlp": 0.01041191, + "balance_loss_clip": 1.01291478, + "balance_loss_mlp": 1.01672125, + "epoch": 0.7974748233879453, + "flos": 20995717651200.0, + "grad_norm": 2.0376970207944103, + "language_loss": 0.72482866, + "learning_rate": 4.149445215631153e-07, + "loss": 0.74580151, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.39453125, + "step": 13264, + "time_per_iteration": 3.600358247756958 + }, + { + "auxiliary_loss_clip": 0.01051916, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.01658618, + "balance_loss_mlp": 1.01738918, + "epoch": 0.7975349466406133, + "flos": 22564086013440.0, + "grad_norm": 4.822821879876, + "language_loss": 0.78583938, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.80674732, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 13265, + "time_per_iteration": 2.360214948654175 + }, + { + "auxiliary_loss_clip": 0.01052124, + "auxiliary_loss_mlp": 0.0103601, + "balance_loss_clip": 1.01188171, + "balance_loss_mlp": 1.01561534, + "epoch": 0.7975950698932812, + "flos": 21688464314880.0, + "grad_norm": 1.9750045779861756, + "language_loss": 0.7646758, + "learning_rate": 4.144696263830285e-07, + "loss": 0.78555715, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 13266, + "time_per_iteration": 2.3817176818847656 + }, + { + "auxiliary_loss_clip": 0.01049679, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.01316357, + "balance_loss_mlp": 1.01501071, + "epoch": 0.7976551931459492, + "flos": 19603695899520.0, + "grad_norm": 2.0083429520661586, + "language_loss": 0.85068834, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.87152481, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34765625, + "step": 13267, + "time_per_iteration": 2.3383820056915283 + }, + { + "auxiliary_loss_clip": 0.01050191, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.01515198, + "epoch": 0.7977153163986171, + "flos": 21686579101440.0, + "grad_norm": 1.5750003407604996, + "language_loss": 0.77433693, + "learning_rate": 4.139949716968223e-07, + "loss": 0.79525405, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 13268, + "time_per_iteration": 2.38385009765625 + }, + { + "auxiliary_loss_clip": 0.01052311, + "auxiliary_loss_mlp": 0.01040321, + "balance_loss_clip": 1.01657414, + "balance_loss_mlp": 1.01680326, + "epoch": 0.7977754396512852, + "flos": 23475668279040.0, + "grad_norm": 1.5670755254388924, + "language_loss": 0.78494412, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80587041, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 13269, + "time_per_iteration": 2.4109268188476562 + }, + { + "auxiliary_loss_clip": 0.01048661, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.01166916, + "balance_loss_mlp": 1.01456594, + "epoch": 0.7978355629039531, + "flos": 22381141155840.0, + "grad_norm": 1.7371468787981643, + "language_loss": 0.82729828, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84811544, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 13270, + "time_per_iteration": 3.7910163402557373 + }, + { + "auxiliary_loss_clip": 0.01052374, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.0197283, + "balance_loss_mlp": 1.01690185, + "epoch": 0.7978956861566211, + "flos": 20265299763840.0, + "grad_norm": 1.859571550011915, + "language_loss": 0.61421633, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.63516861, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 13271, + "time_per_iteration": 3.622788906097412 + }, + { + "auxiliary_loss_clip": 0.01054921, + "auxiliary_loss_mlp": 0.01041155, + "balance_loss_clip": 1.017838, + "balance_loss_mlp": 1.01727295, + "epoch": 0.797955809409289, + "flos": 28111121470080.0, + "grad_norm": 1.404414393111034, + "language_loss": 0.74601805, + "learning_rate": 4.130463840939975e-07, + "loss": 0.7669788, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37695312, + "step": 13272, + "time_per_iteration": 2.4462101459503174 + }, + { + "auxiliary_loss_clip": 0.01051877, + "auxiliary_loss_mlp": 0.01040411, + "balance_loss_clip": 1.0176419, + "balance_loss_mlp": 1.01660323, + "epoch": 0.798015932661957, + "flos": 15558693488640.0, + "grad_norm": 1.7901144621597667, + "language_loss": 0.72731823, + "learning_rate": 4.128093876144161e-07, + "loss": 0.74824107, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 13273, + "time_per_iteration": 2.3188564777374268 + }, + { + "auxiliary_loss_clip": 0.0105446, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.01331675, + "balance_loss_mlp": 1.01675427, + "epoch": 0.7980760559146249, + "flos": 23950068099840.0, + "grad_norm": 1.7629296304887276, + "language_loss": 0.76840639, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.78933197, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37695312, + "step": 13274, + "time_per_iteration": 2.3892621994018555 + }, + { + "auxiliary_loss_clip": 0.01047634, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.01412272, + "balance_loss_mlp": 1.01506269, + "epoch": 0.798136179167293, + "flos": 28036826363520.0, + "grad_norm": 1.4557828108372064, + "language_loss": 0.78684211, + "learning_rate": 4.12335575223518e-07, + "loss": 0.80764806, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.32617188, + "step": 13275, + "time_per_iteration": 2.403825521469116 + }, + { + "auxiliary_loss_clip": 0.01053662, + "auxiliary_loss_mlp": 0.01043817, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.01696301, + "epoch": 0.7981963024199609, + "flos": 35983268208000.0, + "grad_norm": 1.9114617806166778, + "language_loss": 0.65069616, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.67167091, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 13276, + "time_per_iteration": 2.4594767093658447 + }, + { + "auxiliary_loss_clip": 0.01049911, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.0144577, + "balance_loss_mlp": 1.01547539, + "epoch": 0.7982564256726289, + "flos": 25883732684160.0, + "grad_norm": 1.6715999353810436, + "language_loss": 0.61900806, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63986969, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34375, + "step": 13277, + "time_per_iteration": 2.388949394226074 + }, + { + "auxiliary_loss_clip": 0.01054508, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.01812208, + "balance_loss_mlp": 1.01830757, + "epoch": 0.7983165489252969, + "flos": 25737970291200.0, + "grad_norm": 2.2707400897593617, + "language_loss": 0.80056548, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.8215186, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 13278, + "time_per_iteration": 2.3857619762420654 + }, + { + "auxiliary_loss_clip": 0.01051724, + "auxiliary_loss_mlp": 0.01040648, + "balance_loss_clip": 1.01694918, + "balance_loss_mlp": 1.01522076, + "epoch": 0.7983766721779648, + "flos": 21907125360000.0, + "grad_norm": 1.9860509955635113, + "language_loss": 0.65209228, + "learning_rate": 4.113886729662768e-07, + "loss": 0.67301595, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 13279, + "time_per_iteration": 2.3512537479400635 + }, + { + "auxiliary_loss_clip": 0.01048036, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.01124048, + "balance_loss_mlp": 1.01568198, + "epoch": 0.7984367954306328, + "flos": 29346244104960.0, + "grad_norm": 5.057200327146317, + "language_loss": 0.71716905, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73795187, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.32421875, + "step": 13280, + "time_per_iteration": 2.4374735355377197 + }, + { + "auxiliary_loss_clip": 0.01054556, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.01896262, + "balance_loss_mlp": 1.01780105, + "epoch": 0.7984969186833007, + "flos": 31356438122880.0, + "grad_norm": 1.6677192533140794, + "language_loss": 0.64004934, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.66103613, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 13281, + "time_per_iteration": 2.450786590576172 + }, + { + "auxiliary_loss_clip": 0.01055588, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.01405668, + "balance_loss_mlp": 1.01741457, + "epoch": 0.7985570419359688, + "flos": 24311873185920.0, + "grad_norm": 2.9835402957080004, + "language_loss": 0.80979908, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.83073395, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 13282, + "time_per_iteration": 2.4314935207366943 + }, + { + "auxiliary_loss_clip": 0.01052454, + "auxiliary_loss_mlp": 0.01041744, + "balance_loss_clip": 1.015661, + "balance_loss_mlp": 1.0157373, + "epoch": 0.7986171651886367, + "flos": 15741324144000.0, + "grad_norm": 1.8793573951121325, + "language_loss": 0.72700977, + "learning_rate": 4.10442734553802e-07, + "loss": 0.74795175, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 13283, + "time_per_iteration": 2.33414626121521 + }, + { + "auxiliary_loss_clip": 0.01049564, + "auxiliary_loss_mlp": 0.01034913, + "balance_loss_clip": 1.01239443, + "balance_loss_mlp": 1.01467037, + "epoch": 0.7986772884413047, + "flos": 11618605313280.0, + "grad_norm": 2.0159860053140597, + "language_loss": 0.7481488, + "learning_rate": 4.102064006186967e-07, + "loss": 0.7689935, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13284, + "time_per_iteration": 3.8019633293151855 + }, + { + "auxiliary_loss_clip": 0.0105144, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.01499093, + "balance_loss_mlp": 1.01638794, + "epoch": 0.7987374116939726, + "flos": 22089965483520.0, + "grad_norm": 1.51894464273185, + "language_loss": 0.7127853, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.73365986, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34960938, + "step": 13285, + "time_per_iteration": 2.3694911003112793 + }, + { + "auxiliary_loss_clip": 0.01051972, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.01552165, + "balance_loss_mlp": 1.01673126, + "epoch": 0.7987975349466406, + "flos": 17889844435200.0, + "grad_norm": 1.740550284959867, + "language_loss": 0.74463177, + "learning_rate": 4.097339136128437e-07, + "loss": 0.76551986, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 13286, + "time_per_iteration": 2.3713886737823486 + }, + { + "auxiliary_loss_clip": 0.01052331, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.01540589, + "balance_loss_mlp": 1.01634514, + "epoch": 0.7988576581993085, + "flos": 19718210759040.0, + "grad_norm": 1.679771072439159, + "language_loss": 0.76276016, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.78367442, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 13287, + "time_per_iteration": 2.3512861728668213 + }, + { + "auxiliary_loss_clip": 0.01050252, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.01629722, + "balance_loss_mlp": 1.01576567, + "epoch": 0.7989177814519766, + "flos": 28035150618240.0, + "grad_norm": 1.694206096404595, + "language_loss": 0.63562685, + "learning_rate": 4.092616678191863e-07, + "loss": 0.65652305, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34570312, + "step": 13288, + "time_per_iteration": 2.431684970855713 + }, + { + "auxiliary_loss_clip": 0.01051295, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.01293075, + "balance_loss_mlp": 1.01669765, + "epoch": 0.7989779047046445, + "flos": 28869924159360.0, + "grad_norm": 1.9305931523380229, + "language_loss": 0.71624994, + "learning_rate": 4.090256353993169e-07, + "loss": 0.73710132, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34570312, + "step": 13289, + "time_per_iteration": 2.4135990142822266 + }, + { + "auxiliary_loss_clip": 0.01052324, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.01391423, + "balance_loss_mlp": 1.0170536, + "epoch": 0.7990380279573125, + "flos": 18185907697920.0, + "grad_norm": 2.161247359392258, + "language_loss": 0.64189857, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.66279054, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 13290, + "time_per_iteration": 2.369813919067383 + }, + { + "auxiliary_loss_clip": 0.01053756, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.01479936, + "balance_loss_mlp": 1.01738274, + "epoch": 0.7990981512099805, + "flos": 20879073198720.0, + "grad_norm": 1.811899500995961, + "language_loss": 0.72307897, + "learning_rate": 4.08553751558248e-07, + "loss": 0.74400711, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 13291, + "time_per_iteration": 2.398078203201294 + }, + { + "auxiliary_loss_clip": 0.01049861, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01332355, + "balance_loss_mlp": 1.01503563, + "epoch": 0.7991582744626484, + "flos": 26098832770560.0, + "grad_norm": 1.517377807024678, + "language_loss": 0.64520466, + "learning_rate": 4.083179001549422e-07, + "loss": 0.66605902, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 13292, + "time_per_iteration": 2.4021806716918945 + }, + { + "auxiliary_loss_clip": 0.01051096, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.01613295, + "balance_loss_mlp": 1.01540554, + "epoch": 0.7992183977153164, + "flos": 35294466528000.0, + "grad_norm": 1.6831265157801256, + "language_loss": 0.57058632, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.59148765, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35742188, + "step": 13293, + "time_per_iteration": 2.467203140258789 + }, + { + "auxiliary_loss_clip": 0.01052685, + "auxiliary_loss_mlp": 0.01038319, + "balance_loss_clip": 1.01566958, + "balance_loss_mlp": 1.01672387, + "epoch": 0.7992785209679844, + "flos": 51851781901440.0, + "grad_norm": 2.4893545735043725, + "language_loss": 0.73294914, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.75385916, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 13294, + "time_per_iteration": 2.6070194244384766 + }, + { + "auxiliary_loss_clip": 0.01052739, + "auxiliary_loss_mlp": 0.01037512, + "balance_loss_clip": 1.01394391, + "balance_loss_mlp": 1.01699102, + "epoch": 0.7993386442206524, + "flos": 22564016190720.0, + "grad_norm": 1.9323523451731863, + "language_loss": 0.74189967, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.76280218, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 13295, + "time_per_iteration": 2.349834442138672 + }, + { + "auxiliary_loss_clip": 0.01050577, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.01365972, + "balance_loss_mlp": 1.01651311, + "epoch": 0.7993987674733203, + "flos": 18799471664640.0, + "grad_norm": 1.853084231220753, + "language_loss": 0.77595979, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.79681897, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 13296, + "time_per_iteration": 2.348269462585449 + }, + { + "auxiliary_loss_clip": 0.01008434, + "auxiliary_loss_mlp": 0.01003446, + "balance_loss_clip": 1.00113297, + "balance_loss_mlp": 1.00147092, + "epoch": 0.7994588907259883, + "flos": 69418049189760.0, + "grad_norm": 0.6969778992458188, + "language_loss": 0.60854387, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62866271, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.06933594, + "step": 13297, + "time_per_iteration": 3.06969952583313 + }, + { + "auxiliary_loss_clip": 0.01050747, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.01033151, + "balance_loss_mlp": 1.01556528, + "epoch": 0.7995190139786562, + "flos": 13479475979520.0, + "grad_norm": 1.796971138702444, + "language_loss": 0.71694171, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.73778582, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3515625, + "step": 13298, + "time_per_iteration": 2.3446595668792725 + }, + { + "auxiliary_loss_clip": 0.01053751, + "auxiliary_loss_mlp": 0.010393, + "balance_loss_clip": 1.01407528, + "balance_loss_mlp": 1.01595545, + "epoch": 0.7995791372313242, + "flos": 21651770609280.0, + "grad_norm": 2.173547047635548, + "language_loss": 0.77314067, + "learning_rate": 4.066686308212037e-07, + "loss": 0.7940712, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 13299, + "time_per_iteration": 2.3445301055908203 + }, + { + "auxiliary_loss_clip": 0.01049429, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.01336741, + "balance_loss_mlp": 1.0148387, + "epoch": 0.7996392604839921, + "flos": 26066921921280.0, + "grad_norm": 1.8671734474411203, + "language_loss": 0.78713655, + "learning_rate": 4.064332625220828e-07, + "loss": 0.80798662, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34570312, + "step": 13300, + "time_per_iteration": 2.395376205444336 + }, + { + "auxiliary_loss_clip": 0.01053124, + "auxiliary_loss_mlp": 0.01034817, + "balance_loss_clip": 1.01065338, + "balance_loss_mlp": 1.0164963, + "epoch": 0.7996993837366602, + "flos": 24605771944320.0, + "grad_norm": 1.8614662200580108, + "language_loss": 0.64657557, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.66745496, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 13301, + "time_per_iteration": 2.377619981765747 + }, + { + "auxiliary_loss_clip": 0.01051142, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.01499057, + "balance_loss_mlp": 1.01628506, + "epoch": 0.7997595069893281, + "flos": 20991109351680.0, + "grad_norm": 1.6204099228793187, + "language_loss": 0.72755659, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74844599, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 13302, + "time_per_iteration": 2.426759719848633 + }, + { + "auxiliary_loss_clip": 0.01053556, + "auxiliary_loss_mlp": 0.01039178, + "balance_loss_clip": 1.01414394, + "balance_loss_mlp": 1.01588821, + "epoch": 0.7998196302419961, + "flos": 24425340704640.0, + "grad_norm": 1.9949205832254464, + "language_loss": 0.8462621, + "learning_rate": 4.057275202296684e-07, + "loss": 0.86718941, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 13303, + "time_per_iteration": 3.7507333755493164 + }, + { + "auxiliary_loss_clip": 0.01049892, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.01327634, + "balance_loss_mlp": 1.01560068, + "epoch": 0.7998797534946641, + "flos": 30263307454080.0, + "grad_norm": 1.8718222049139581, + "language_loss": 0.60119116, + "learning_rate": 4.054923936969166e-07, + "loss": 0.62202621, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34375, + "step": 13304, + "time_per_iteration": 2.4264776706695557 + }, + { + "auxiliary_loss_clip": 0.0105383, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.01016462, + "balance_loss_mlp": 1.01644993, + "epoch": 0.799939876747332, + "flos": 23512850743680.0, + "grad_norm": 1.5642649892440927, + "language_loss": 0.7007395, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.72161871, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37304688, + "step": 13305, + "time_per_iteration": 2.4059605598449707 + }, + { + "auxiliary_loss_clip": 0.01050226, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.01124299, + "balance_loss_mlp": 1.01606202, + "epoch": 0.8, + "flos": 19317093615360.0, + "grad_norm": 1.6194933232609776, + "language_loss": 0.70251852, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.72334182, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34179688, + "step": 13306, + "time_per_iteration": 2.404358386993408 + }, + { + "auxiliary_loss_clip": 0.01052726, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.0119288, + "balance_loss_mlp": 1.01646233, + "epoch": 0.800060123252668, + "flos": 32411164429440.0, + "grad_norm": 1.7356377387484745, + "language_loss": 0.70373166, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.72461188, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 13307, + "time_per_iteration": 2.4474194049835205 + }, + { + "auxiliary_loss_clip": 0.01052686, + "auxiliary_loss_mlp": 0.01041141, + "balance_loss_clip": 1.01925457, + "balance_loss_mlp": 1.01661849, + "epoch": 0.800120246505336, + "flos": 20009595899520.0, + "grad_norm": 1.823540997590929, + "language_loss": 0.78222638, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.80316466, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 13308, + "time_per_iteration": 2.371959924697876 + }, + { + "auxiliary_loss_clip": 0.01054298, + "auxiliary_loss_mlp": 0.01040126, + "balance_loss_clip": 1.01447177, + "balance_loss_mlp": 1.01617599, + "epoch": 0.8001803697580039, + "flos": 31866938156160.0, + "grad_norm": 1.4712778817623702, + "language_loss": 0.79427356, + "learning_rate": 4.0431766816972e-07, + "loss": 0.81521773, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 13309, + "time_per_iteration": 3.8319997787475586 + }, + { + "auxiliary_loss_clip": 0.01008013, + "auxiliary_loss_mlp": 0.01007251, + "balance_loss_clip": 1.00468755, + "balance_loss_mlp": 1.00122499, + "epoch": 0.8002404930106719, + "flos": 63388828679040.0, + "grad_norm": 0.9329854659128911, + "language_loss": 0.64786059, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66801322, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.06835938, + "step": 13310, + "time_per_iteration": 4.4200804233551025 + }, + { + "auxiliary_loss_clip": 0.0105208, + "auxiliary_loss_mlp": 0.01040693, + "balance_loss_clip": 1.01773334, + "balance_loss_mlp": 1.01611304, + "epoch": 0.8003006162633398, + "flos": 27854230619520.0, + "grad_norm": 2.0076774504859523, + "language_loss": 0.84582543, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.86675316, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 13311, + "time_per_iteration": 2.3890202045440674 + }, + { + "auxiliary_loss_clip": 0.0105134, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.01468933, + "balance_loss_mlp": 1.01616049, + "epoch": 0.8003607395160078, + "flos": 18222357024000.0, + "grad_norm": 2.5040984923492777, + "language_loss": 0.67877114, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.6996758, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3515625, + "step": 13312, + "time_per_iteration": 2.3646042346954346 + }, + { + "auxiliary_loss_clip": 0.01056058, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.01246858, + "balance_loss_mlp": 1.0185473, + "epoch": 0.8004208627686757, + "flos": 20885915825280.0, + "grad_norm": 1.7459612539790001, + "language_loss": 0.7636776, + "learning_rate": 4.033789768462843e-07, + "loss": 0.78462338, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 13313, + "time_per_iteration": 2.509648084640503 + }, + { + "auxiliary_loss_clip": 0.01052269, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.01504469, + "balance_loss_mlp": 1.01616359, + "epoch": 0.8004809860213438, + "flos": 26434836495360.0, + "grad_norm": 1.320888692004062, + "language_loss": 0.76254028, + "learning_rate": 4.031444553532575e-07, + "loss": 0.78343934, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36132812, + "step": 13314, + "time_per_iteration": 2.4302194118499756 + }, + { + "auxiliary_loss_clip": 0.01008051, + "auxiliary_loss_mlp": 0.01002174, + "balance_loss_clip": 0.99994493, + "balance_loss_mlp": 1.00098968, + "epoch": 0.8005411092740117, + "flos": 63645335504640.0, + "grad_norm": 0.7849755740285583, + "language_loss": 0.53750026, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55760252, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.07080078, + "step": 13315, + "time_per_iteration": 2.9406778812408447 + }, + { + "auxiliary_loss_clip": 0.01052694, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.01309288, + "balance_loss_mlp": 1.01690245, + "epoch": 0.8006012325266797, + "flos": 36136571454720.0, + "grad_norm": 1.5026080924211764, + "language_loss": 0.72322857, + "learning_rate": 4.026755940348603e-07, + "loss": 0.74411052, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 13316, + "time_per_iteration": 2.5139594078063965 + }, + { + "auxiliary_loss_clip": 0.01053962, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.01922214, + "balance_loss_mlp": 1.01621783, + "epoch": 0.8006613557793477, + "flos": 33837540825600.0, + "grad_norm": 2.107985108984309, + "language_loss": 0.65915465, + "learning_rate": 4.024412542272706e-07, + "loss": 0.68013823, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 13317, + "time_per_iteration": 2.5097193717956543 + }, + { + "auxiliary_loss_clip": 0.01008046, + "auxiliary_loss_mlp": 0.01002888, + "balance_loss_clip": 1.00069451, + "balance_loss_mlp": 1.00115693, + "epoch": 0.8007214790320156, + "flos": 67344522232320.0, + "grad_norm": 0.7709447474508325, + "language_loss": 0.59140134, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61151075, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06884766, + "step": 13318, + "time_per_iteration": 3.1183090209960938 + }, + { + "auxiliary_loss_clip": 0.01051068, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.01318741, + "balance_loss_mlp": 1.01603055, + "epoch": 0.8007816022846836, + "flos": 23184527518080.0, + "grad_norm": 1.5696038789644844, + "language_loss": 0.66907722, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68994582, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 13319, + "time_per_iteration": 2.3894925117492676 + }, + { + "auxiliary_loss_clip": 0.01054559, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.01377392, + "balance_loss_mlp": 1.01694345, + "epoch": 0.8008417255373516, + "flos": 21980303303040.0, + "grad_norm": 1.89950748796875, + "language_loss": 0.74772179, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76867402, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.375, + "step": 13320, + "time_per_iteration": 2.401658535003662 + }, + { + "auxiliary_loss_clip": 0.01054125, + "auxiliary_loss_mlp": 0.01039708, + "balance_loss_clip": 1.01635551, + "balance_loss_mlp": 1.017084, + "epoch": 0.8009018487900196, + "flos": 16726398554880.0, + "grad_norm": 1.9267544552564826, + "language_loss": 0.81560904, + "learning_rate": 4.015045008816138e-07, + "loss": 0.83654737, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 13321, + "time_per_iteration": 2.359140396118164 + }, + { + "auxiliary_loss_clip": 0.01048825, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.01104164, + "balance_loss_mlp": 1.01433873, + "epoch": 0.8009619720426875, + "flos": 20812563325440.0, + "grad_norm": 1.860669395848053, + "language_loss": 0.66980684, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.69061661, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34375, + "step": 13322, + "time_per_iteration": 2.348242998123169 + }, + { + "auxiliary_loss_clip": 0.01051492, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.01432085, + "balance_loss_mlp": 1.01611876, + "epoch": 0.8010220952953555, + "flos": 17930134010880.0, + "grad_norm": 1.7501099757362812, + "language_loss": 0.78971565, + "learning_rate": 4.010364878639265e-07, + "loss": 0.81060314, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 13323, + "time_per_iteration": 2.360288381576538 + }, + { + "auxiliary_loss_clip": 0.01053798, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.01173639, + "balance_loss_mlp": 1.01602066, + "epoch": 0.8010822185480234, + "flos": 24571068186240.0, + "grad_norm": 2.2340522059976675, + "language_loss": 0.73352575, + "learning_rate": 4.00802572299932e-07, + "loss": 0.75443053, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 13324, + "time_per_iteration": 3.8230392932891846 + }, + { + "auxiliary_loss_clip": 0.01053816, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.01235318, + "balance_loss_mlp": 1.0159018, + "epoch": 0.8011423418006914, + "flos": 21829059826560.0, + "grad_norm": 1.7224440933386287, + "language_loss": 0.7705043, + "learning_rate": 4.005687173776635e-07, + "loss": 0.79142261, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 13325, + "time_per_iteration": 2.3857614994049072 + }, + { + "auxiliary_loss_clip": 0.01047502, + "auxiliary_loss_mlp": 0.01034817, + "balance_loss_clip": 1.01285911, + "balance_loss_mlp": 1.01406908, + "epoch": 0.8012024650533593, + "flos": 23914037710080.0, + "grad_norm": 1.6374828178326157, + "language_loss": 0.80544239, + "learning_rate": 4.003349231059898e-07, + "loss": 0.82626557, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33398438, + "step": 13326, + "time_per_iteration": 2.379748821258545 + }, + { + "auxiliary_loss_clip": 0.01050569, + "auxiliary_loss_mlp": 0.01041937, + "balance_loss_clip": 1.02002633, + "balance_loss_mlp": 1.01590276, + "epoch": 0.8012625883060274, + "flos": 23585923952640.0, + "grad_norm": 2.120157775548809, + "language_loss": 0.67181134, + "learning_rate": 4.001011894937765e-07, + "loss": 0.69273639, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 13327, + "time_per_iteration": 2.3907110691070557 + }, + { + "auxiliary_loss_clip": 0.01049639, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.01423478, + "balance_loss_mlp": 1.01561117, + "epoch": 0.8013227115586953, + "flos": 20812877527680.0, + "grad_norm": 1.5469651349580793, + "language_loss": 0.74252659, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.7633909, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.33984375, + "step": 13328, + "time_per_iteration": 2.3585877418518066 + }, + { + "auxiliary_loss_clip": 0.01052366, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_clip": 1.0178895, + "balance_loss_mlp": 1.01599586, + "epoch": 0.8013828348113633, + "flos": 15887400739200.0, + "grad_norm": 1.9683315520131375, + "language_loss": 0.75541466, + "learning_rate": 3.996339042831798e-07, + "loss": 0.77636796, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 13329, + "time_per_iteration": 2.3579304218292236 + }, + { + "auxiliary_loss_clip": 0.01007896, + "auxiliary_loss_mlp": 0.01003801, + "balance_loss_clip": 1.00158405, + "balance_loss_mlp": 1.00105822, + "epoch": 0.8014429580640313, + "flos": 71059281292800.0, + "grad_norm": 0.6947462736174085, + "language_loss": 0.53083634, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.55095327, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.06835938, + "step": 13330, + "time_per_iteration": 3.0835137367248535 + }, + { + "auxiliary_loss_clip": 0.01053917, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.01415849, + "balance_loss_mlp": 1.01613927, + "epoch": 0.8015030813166992, + "flos": 23075214451200.0, + "grad_norm": 1.7232341271587923, + "language_loss": 0.74295926, + "learning_rate": 3.991668618167519e-07, + "loss": 0.76391751, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.37890625, + "step": 13331, + "time_per_iteration": 2.3858482837677 + }, + { + "auxiliary_loss_clip": 0.01050671, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.01424146, + "balance_loss_mlp": 1.01630259, + "epoch": 0.8015632045693672, + "flos": 21871234615680.0, + "grad_norm": 1.8245883531802316, + "language_loss": 0.77988845, + "learning_rate": 3.989334316347401e-07, + "loss": 0.80074883, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34375, + "step": 13332, + "time_per_iteration": 2.412506580352783 + }, + { + "auxiliary_loss_clip": 0.01052378, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.0137589, + "balance_loss_mlp": 1.01654899, + "epoch": 0.8016233278220352, + "flos": 23655680582400.0, + "grad_norm": 1.8287657830796051, + "language_loss": 0.84158325, + "learning_rate": 3.987000621653338e-07, + "loss": 0.86247063, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 13333, + "time_per_iteration": 2.3736159801483154 + }, + { + "auxiliary_loss_clip": 0.01052545, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.0167923, + "balance_loss_mlp": 1.01549244, + "epoch": 0.8016834510747032, + "flos": 16252243113600.0, + "grad_norm": 1.6290612763086185, + "language_loss": 0.74924469, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.77018911, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 13334, + "time_per_iteration": 2.3352394104003906 + }, + { + "auxiliary_loss_clip": 0.01051079, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.00970936, + "balance_loss_mlp": 1.01619148, + "epoch": 0.8017435743273711, + "flos": 12275216853120.0, + "grad_norm": 2.2138216747296457, + "language_loss": 0.7603662, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.78120083, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 13335, + "time_per_iteration": 2.3888309001922607 + }, + { + "auxiliary_loss_clip": 0.01051595, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.01144052, + "balance_loss_mlp": 1.01578081, + "epoch": 0.8018036975800391, + "flos": 17195317292160.0, + "grad_norm": 1.8837422668642683, + "language_loss": 0.77163488, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.79249954, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 13336, + "time_per_iteration": 2.334930181503296 + }, + { + "auxiliary_loss_clip": 0.01057078, + "auxiliary_loss_mlp": 0.01040056, + "balance_loss_clip": 1.01411593, + "balance_loss_mlp": 1.01797438, + "epoch": 0.801863820832707, + "flos": 20630805454080.0, + "grad_norm": 2.0479402315508883, + "language_loss": 0.76641911, + "learning_rate": 3.977671915907068e-07, + "loss": 0.78739047, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.390625, + "step": 13337, + "time_per_iteration": 2.3789114952087402 + }, + { + "auxiliary_loss_clip": 0.0105402, + "auxiliary_loss_mlp": 0.01042037, + "balance_loss_clip": 1.01826656, + "balance_loss_mlp": 1.01695466, + "epoch": 0.801923944085375, + "flos": 30444262364160.0, + "grad_norm": 1.7224452421461776, + "language_loss": 0.81142694, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.83238745, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 13338, + "time_per_iteration": 2.41096830368042 + }, + { + "auxiliary_loss_clip": 0.01052113, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.01365519, + "balance_loss_mlp": 1.0153352, + "epoch": 0.801984067338043, + "flos": 20009560988160.0, + "grad_norm": 1.952040912500146, + "language_loss": 0.75693399, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.77783418, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 13339, + "time_per_iteration": 2.3934712409973145 + }, + { + "auxiliary_loss_clip": 0.01050728, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.01246023, + "balance_loss_mlp": 1.01689601, + "epoch": 0.802044190590711, + "flos": 22782921615360.0, + "grad_norm": 1.620327394438716, + "language_loss": 0.79667127, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81752205, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 13340, + "time_per_iteration": 2.39589262008667 + }, + { + "auxiliary_loss_clip": 0.01051442, + "auxiliary_loss_mlp": 0.01039098, + "balance_loss_clip": 1.01734281, + "balance_loss_mlp": 1.01558757, + "epoch": 0.8021043138433789, + "flos": 27598875868800.0, + "grad_norm": 1.831597165258107, + "language_loss": 0.68651378, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70741922, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.359375, + "step": 13341, + "time_per_iteration": 2.434870958328247 + }, + { + "auxiliary_loss_clip": 0.01007948, + "auxiliary_loss_mlp": 0.01007703, + "balance_loss_clip": 1.00539064, + "balance_loss_mlp": 1.00099778, + "epoch": 0.8021644370960469, + "flos": 62060942828160.0, + "grad_norm": 0.8205224505314526, + "language_loss": 0.61909592, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63925248, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.06933594, + "step": 13342, + "time_per_iteration": 4.291916370391846 + }, + { + "auxiliary_loss_clip": 0.01053948, + "auxiliary_loss_mlp": 0.01040342, + "balance_loss_clip": 1.01398492, + "balance_loss_mlp": 1.01713276, + "epoch": 0.8022245603487148, + "flos": 23360839217280.0, + "grad_norm": 1.8345905560279367, + "language_loss": 0.64450687, + "learning_rate": 3.963697086102522e-07, + "loss": 0.6654498, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.3671875, + "step": 13343, + "time_per_iteration": 2.386303663253784 + }, + { + "auxiliary_loss_clip": 0.01048874, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.01161766, + "balance_loss_mlp": 1.01547432, + "epoch": 0.8022846836013828, + "flos": 10852575972480.0, + "grad_norm": 1.8874222884025036, + "language_loss": 0.69124103, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71206462, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33398438, + "step": 13344, + "time_per_iteration": 2.3325257301330566 + }, + { + "auxiliary_loss_clip": 0.01053036, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.01379538, + "balance_loss_mlp": 1.01685929, + "epoch": 0.8023448068540509, + "flos": 29239200276480.0, + "grad_norm": 1.7017643353638807, + "language_loss": 0.71682376, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.73772049, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 13345, + "time_per_iteration": 2.4552292823791504 + }, + { + "auxiliary_loss_clip": 0.01007927, + "auxiliary_loss_mlp": 0.01002321, + "balance_loss_clip": 1.00000882, + "balance_loss_mlp": 1.0008738, + "epoch": 0.8024049301067188, + "flos": 64150459188480.0, + "grad_norm": 0.9235124125767383, + "language_loss": 0.63053, + "learning_rate": 3.956717879334059e-07, + "loss": 0.6506325, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.0703125, + "step": 13346, + "time_per_iteration": 3.0966389179229736 + }, + { + "auxiliary_loss_clip": 0.01050293, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.01287758, + "balance_loss_mlp": 1.01593232, + "epoch": 0.8024650533593868, + "flos": 28584089925120.0, + "grad_norm": 1.4661475628143572, + "language_loss": 0.73177934, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.75262266, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 13347, + "time_per_iteration": 2.431648015975952 + }, + { + "auxiliary_loss_clip": 0.01051577, + "auxiliary_loss_mlp": 0.01035846, + "balance_loss_clip": 1.01341116, + "balance_loss_mlp": 1.01549089, + "epoch": 0.8025251766120547, + "flos": 16981334369280.0, + "grad_norm": 1.7848041742851792, + "language_loss": 0.74173111, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.76260531, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 13348, + "time_per_iteration": 3.7682647705078125 + }, + { + "auxiliary_loss_clip": 0.01051815, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.01342189, + "balance_loss_mlp": 1.01616299, + "epoch": 0.8025852998647227, + "flos": 22162584844800.0, + "grad_norm": 2.633565589423174, + "language_loss": 0.7729876, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.79387027, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 13349, + "time_per_iteration": 2.353545904159546 + }, + { + "auxiliary_loss_clip": 0.01051609, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.01727271, + "balance_loss_mlp": 1.01680374, + "epoch": 0.8026454231173906, + "flos": 22015949667840.0, + "grad_norm": 2.114923139987294, + "language_loss": 0.84553611, + "learning_rate": 3.947420787800755e-07, + "loss": 0.86643988, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 13350, + "time_per_iteration": 3.6777758598327637 + }, + { + "auxiliary_loss_clip": 0.01052193, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.01422381, + "balance_loss_mlp": 1.0167346, + "epoch": 0.8027055463700586, + "flos": 22490244754560.0, + "grad_norm": 1.6401560057988513, + "language_loss": 0.7245332, + "learning_rate": 3.945098036485679e-07, + "loss": 0.74541897, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 13351, + "time_per_iteration": 2.361637830734253 + }, + { + "auxiliary_loss_clip": 0.01050812, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.01307666, + "balance_loss_mlp": 1.01674318, + "epoch": 0.8027656696227266, + "flos": 28911645100800.0, + "grad_norm": 4.160200651377108, + "language_loss": 0.62426066, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.64511138, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34179688, + "step": 13352, + "time_per_iteration": 2.4500269889831543 + }, + { + "auxiliary_loss_clip": 0.01051646, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.01125801, + "balance_loss_mlp": 1.01628006, + "epoch": 0.8028257928753946, + "flos": 18588351473280.0, + "grad_norm": 1.9956389786070885, + "language_loss": 0.78019881, + "learning_rate": 3.940454360354046e-07, + "loss": 0.80105692, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 13353, + "time_per_iteration": 2.3454771041870117 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01041053, + "balance_loss_clip": 1.01476741, + "balance_loss_mlp": 1.01707458, + "epoch": 0.8028859161280625, + "flos": 19129156433280.0, + "grad_norm": 2.2114956149866685, + "language_loss": 0.74152648, + "learning_rate": 3.938133435713582e-07, + "loss": 0.76249444, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 13354, + "time_per_iteration": 2.3706772327423096 + }, + { + "auxiliary_loss_clip": 0.01052977, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.01691699, + "balance_loss_mlp": 1.01592541, + "epoch": 0.8029460393807305, + "flos": 20228850437760.0, + "grad_norm": 2.7240161130778486, + "language_loss": 0.66969424, + "learning_rate": 3.935813120140714e-07, + "loss": 0.69060946, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.37109375, + "step": 13355, + "time_per_iteration": 2.3424570560455322 + }, + { + "auxiliary_loss_clip": 0.01054902, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.01421583, + "balance_loss_mlp": 1.01692808, + "epoch": 0.8030061626333984, + "flos": 49781466789120.0, + "grad_norm": 2.9230096449408243, + "language_loss": 0.70407653, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.7250219, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 13356, + "time_per_iteration": 2.609100103378296 + }, + { + "auxiliary_loss_clip": 0.01053576, + "auxiliary_loss_mlp": 0.01036834, + "balance_loss_clip": 1.01512587, + "balance_loss_mlp": 1.01732349, + "epoch": 0.8030662858860664, + "flos": 21614204119680.0, + "grad_norm": 1.5703299637043893, + "language_loss": 0.77969623, + "learning_rate": 3.931174316549666e-07, + "loss": 0.80060035, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36328125, + "step": 13357, + "time_per_iteration": 2.347810983657837 + }, + { + "auxiliary_loss_clip": 0.01054253, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.01498961, + "balance_loss_mlp": 1.01680017, + "epoch": 0.8031264091387345, + "flos": 25628866692480.0, + "grad_norm": 1.5413428871589125, + "language_loss": 0.78116179, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.80208886, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 13358, + "time_per_iteration": 2.416837215423584 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.01192582, + "balance_loss_mlp": 1.01536763, + "epoch": 0.8031865323914024, + "flos": 19645207372800.0, + "grad_norm": 1.7178316680600967, + "language_loss": 0.85859776, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87944329, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 13359, + "time_per_iteration": 2.3387808799743652 + }, + { + "auxiliary_loss_clip": 0.01050743, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.01719451, + "balance_loss_mlp": 1.01611495, + "epoch": 0.8032466556440704, + "flos": 26168135552640.0, + "grad_norm": 2.2326316017315104, + "language_loss": 0.74481964, + "learning_rate": 3.924220681368928e-07, + "loss": 0.7657187, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 13360, + "time_per_iteration": 2.3980977535247803 + }, + { + "auxiliary_loss_clip": 0.0105231, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.01410758, + "balance_loss_mlp": 1.01612628, + "epoch": 0.8033067788967383, + "flos": 25518855398400.0, + "grad_norm": 1.7692255626222524, + "language_loss": 0.70806348, + "learning_rate": 3.921904022048512e-07, + "loss": 0.72895181, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 13361, + "time_per_iteration": 2.3794262409210205 + }, + { + "auxiliary_loss_clip": 0.01054874, + "auxiliary_loss_mlp": 0.01043849, + "balance_loss_clip": 1.01911294, + "balance_loss_mlp": 1.01690078, + "epoch": 0.8033669021494063, + "flos": 24023141308800.0, + "grad_norm": 1.5344241936815886, + "language_loss": 0.71472299, + "learning_rate": 3.919587972411098e-07, + "loss": 0.73571026, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 13362, + "time_per_iteration": 2.4027817249298096 + }, + { + "auxiliary_loss_clip": 0.0105661, + "auxiliary_loss_mlp": 0.01046482, + "balance_loss_clip": 1.01884937, + "balance_loss_mlp": 1.01694763, + "epoch": 0.8034270254020742, + "flos": 13587252946560.0, + "grad_norm": 2.4743359371358133, + "language_loss": 0.81032646, + "learning_rate": 3.91727253254452e-07, + "loss": 0.83135736, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.3984375, + "step": 13363, + "time_per_iteration": 3.9008572101593018 + }, + { + "auxiliary_loss_clip": 0.01052716, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.01239765, + "balance_loss_mlp": 1.01650953, + "epoch": 0.8034871486547422, + "flos": 27411567091200.0, + "grad_norm": 2.0974019649309845, + "language_loss": 0.76328588, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.78417957, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 13364, + "time_per_iteration": 2.411773920059204 + }, + { + "auxiliary_loss_clip": 0.01052496, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.01008391, + "balance_loss_mlp": 1.01742244, + "epoch": 0.8035472719074102, + "flos": 32597216398080.0, + "grad_norm": 2.352804596033289, + "language_loss": 0.6291002, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.64995694, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 13365, + "time_per_iteration": 2.4736979007720947 + }, + { + "auxiliary_loss_clip": 0.01053668, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.01790404, + "balance_loss_mlp": 1.01674676, + "epoch": 0.8036073951600782, + "flos": 21286928234880.0, + "grad_norm": 1.7492410949288033, + "language_loss": 0.67413259, + "learning_rate": 3.910329872447706e-07, + "loss": 0.69508427, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 13366, + "time_per_iteration": 2.3595855236053467 + }, + { + "auxiliary_loss_clip": 0.01052224, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.01115751, + "balance_loss_mlp": 1.01682925, + "epoch": 0.8036675184127461, + "flos": 18112869400320.0, + "grad_norm": 1.91564155329551, + "language_loss": 0.74958402, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77045351, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 13367, + "time_per_iteration": 2.340679407119751 + }, + { + "auxiliary_loss_clip": 0.01050659, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.00990009, + "balance_loss_mlp": 1.01583803, + "epoch": 0.8037276416654141, + "flos": 26029111052160.0, + "grad_norm": 1.5102241490083312, + "language_loss": 0.75102967, + "learning_rate": 3.905704482846428e-07, + "loss": 0.77185571, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 13368, + "time_per_iteration": 2.4152610301971436 + }, + { + "auxiliary_loss_clip": 0.01054877, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.01588082, + "balance_loss_mlp": 1.01681185, + "epoch": 0.803787764918082, + "flos": 18801321966720.0, + "grad_norm": 2.0659227123862047, + "language_loss": 0.71523106, + "learning_rate": 3.90339270344789e-07, + "loss": 0.73618346, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38085938, + "step": 13369, + "time_per_iteration": 2.346855401992798 + }, + { + "auxiliary_loss_clip": 0.01050276, + "auxiliary_loss_mlp": 0.01038915, + "balance_loss_clip": 1.01644444, + "balance_loss_mlp": 1.01567674, + "epoch": 0.80384788817075, + "flos": 20224870542720.0, + "grad_norm": 2.332070694829283, + "language_loss": 0.74709988, + "learning_rate": 3.901081534434312e-07, + "loss": 0.76799178, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 13370, + "time_per_iteration": 2.3632099628448486 + }, + { + "auxiliary_loss_clip": 0.01054533, + "auxiliary_loss_mlp": 0.01044536, + "balance_loss_clip": 1.01745129, + "balance_loss_mlp": 1.01686764, + "epoch": 0.8039080114234181, + "flos": 18514300746240.0, + "grad_norm": 2.4378821026155664, + "language_loss": 0.87594378, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89693451, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.37695312, + "step": 13371, + "time_per_iteration": 2.333332061767578 + }, + { + "auxiliary_loss_clip": 0.01055475, + "auxiliary_loss_mlp": 0.01037821, + "balance_loss_clip": 1.01145196, + "balance_loss_mlp": 1.01699102, + "epoch": 0.803968134676086, + "flos": 22381420446720.0, + "grad_norm": 1.927583345598428, + "language_loss": 0.75550526, + "learning_rate": 3.89646102791259e-07, + "loss": 0.77643824, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38476562, + "step": 13372, + "time_per_iteration": 2.3718769550323486 + }, + { + "auxiliary_loss_clip": 0.01051343, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.01502633, + "balance_loss_mlp": 1.01518607, + "epoch": 0.804028257928754, + "flos": 23841418348800.0, + "grad_norm": 2.052047545042118, + "language_loss": 0.79891813, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81983232, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36132812, + "step": 13373, + "time_per_iteration": 2.392277717590332 + }, + { + "auxiliary_loss_clip": 0.01049686, + "auxiliary_loss_mlp": 0.01041061, + "balance_loss_clip": 1.01880431, + "balance_loss_mlp": 1.01562333, + "epoch": 0.8040883811814219, + "flos": 23549579360640.0, + "grad_norm": 1.7688629261445346, + "language_loss": 0.756464, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.77737153, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 13374, + "time_per_iteration": 2.3956637382507324 + }, + { + "auxiliary_loss_clip": 0.01053063, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.01174021, + "balance_loss_mlp": 1.01568484, + "epoch": 0.8041485044340899, + "flos": 19025254627200.0, + "grad_norm": 1.9928352070748598, + "language_loss": 0.70245135, + "learning_rate": 3.889534848207452e-07, + "loss": 0.723355, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 13375, + "time_per_iteration": 2.3643784523010254 + }, + { + "auxiliary_loss_clip": 0.01007722, + "auxiliary_loss_mlp": 0.01003464, + "balance_loss_clip": 1.0010438, + "balance_loss_mlp": 1.00080132, + "epoch": 0.8042086276867578, + "flos": 70001971545600.0, + "grad_norm": 0.7263777541205627, + "language_loss": 0.55774832, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57786024, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.06933594, + "step": 13376, + "time_per_iteration": 3.098686456680298 + }, + { + "auxiliary_loss_clip": 0.0105414, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.01233625, + "balance_loss_mlp": 1.01734209, + "epoch": 0.8042687509394258, + "flos": 21871339349760.0, + "grad_norm": 1.5031579436964206, + "language_loss": 0.73897696, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75986779, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 13377, + "time_per_iteration": 2.3809173107147217 + }, + { + "auxiliary_loss_clip": 0.01051863, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.01559401, + "balance_loss_mlp": 1.01578736, + "epoch": 0.8043288741920938, + "flos": 26613661812480.0, + "grad_norm": 1.8159647882692662, + "language_loss": 0.7215395, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.74245101, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 13378, + "time_per_iteration": 2.4194066524505615 + }, + { + "auxiliary_loss_clip": 0.0105445, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.00888896, + "balance_loss_mlp": 1.01740837, + "epoch": 0.8043889974447618, + "flos": 33401929392000.0, + "grad_norm": 1.396960069121706, + "language_loss": 0.69803643, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71891689, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 13379, + "time_per_iteration": 2.4932446479797363 + }, + { + "auxiliary_loss_clip": 0.01054826, + "auxiliary_loss_mlp": 0.01039483, + "balance_loss_clip": 1.01492631, + "balance_loss_mlp": 1.01732111, + "epoch": 0.8044491206974297, + "flos": 20374927032960.0, + "grad_norm": 1.6944671937747975, + "language_loss": 0.77746379, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.7984069, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 13380, + "time_per_iteration": 2.481358528137207 + }, + { + "auxiliary_loss_clip": 0.01050191, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.0114336, + "balance_loss_mlp": 1.01452518, + "epoch": 0.8045092439500977, + "flos": 23402699804160.0, + "grad_norm": 1.9414360163556912, + "language_loss": 0.69967115, + "learning_rate": 3.875698985740887e-07, + "loss": 0.72050858, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35742188, + "step": 13381, + "time_per_iteration": 2.3922390937805176 + }, + { + "auxiliary_loss_clip": 0.01054063, + "auxiliary_loss_mlp": 0.01039188, + "balance_loss_clip": 1.01490545, + "balance_loss_mlp": 1.01741099, + "epoch": 0.8045693672027656, + "flos": 24096109783680.0, + "grad_norm": 2.0034411651396673, + "language_loss": 0.65305787, + "learning_rate": 3.873395148176135e-07, + "loss": 0.67399037, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 13382, + "time_per_iteration": 3.6145522594451904 + }, + { + "auxiliary_loss_clip": 0.01051722, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.01489925, + "balance_loss_mlp": 1.01606131, + "epoch": 0.8046294904554336, + "flos": 27705989520000.0, + "grad_norm": 1.8127103148146533, + "language_loss": 0.77267605, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.79358733, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.35546875, + "step": 13383, + "time_per_iteration": 2.4675276279449463 + }, + { + "auxiliary_loss_clip": 0.01053159, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.01625454, + "balance_loss_mlp": 1.01638174, + "epoch": 0.8046896137081017, + "flos": 24971836216320.0, + "grad_norm": 1.8775215459795844, + "language_loss": 0.70712167, + "learning_rate": 3.868789307701381e-07, + "loss": 0.72806758, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 13384, + "time_per_iteration": 2.3875222206115723 + }, + { + "auxiliary_loss_clip": 0.01052398, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.01397705, + "balance_loss_mlp": 1.01527214, + "epoch": 0.8047497369607696, + "flos": 17674534880640.0, + "grad_norm": 2.060524913877213, + "language_loss": 0.8104583, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.8313657, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 13385, + "time_per_iteration": 2.3387656211853027 + }, + { + "auxiliary_loss_clip": 0.01052889, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.00952482, + "balance_loss_mlp": 1.01622629, + "epoch": 0.8048098602134376, + "flos": 22381001510400.0, + "grad_norm": 1.8774025184774346, + "language_loss": 0.73294789, + "learning_rate": 3.864185914015108e-07, + "loss": 0.7538088, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 13386, + "time_per_iteration": 2.379587411880493 + }, + { + "auxiliary_loss_clip": 0.01007939, + "auxiliary_loss_mlp": 0.01002985, + "balance_loss_clip": 1.00093508, + "balance_loss_mlp": 1.0010066, + "epoch": 0.8048699834661055, + "flos": 71197467920640.0, + "grad_norm": 0.6643578532361207, + "language_loss": 0.51362604, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53373528, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06933594, + "step": 13387, + "time_per_iteration": 3.0725021362304688 + }, + { + "auxiliary_loss_clip": 0.01052946, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.01310849, + "balance_loss_mlp": 1.01607084, + "epoch": 0.8049301067187735, + "flos": 23659171718400.0, + "grad_norm": 1.8147618151520588, + "language_loss": 0.74790478, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76882422, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 13388, + "time_per_iteration": 3.846576452255249 + }, + { + "auxiliary_loss_clip": 0.01050355, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.01137781, + "balance_loss_mlp": 1.01571655, + "epoch": 0.8049902299714414, + "flos": 24425166147840.0, + "grad_norm": 1.4706113700411905, + "language_loss": 0.72493446, + "learning_rate": 3.857285412741411e-07, + "loss": 0.74578834, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34570312, + "step": 13389, + "time_per_iteration": 3.799210786819458 + }, + { + "auxiliary_loss_clip": 0.01054071, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.01445282, + "balance_loss_mlp": 1.01705313, + "epoch": 0.8050503532241094, + "flos": 17491694757120.0, + "grad_norm": 1.9054734320014055, + "language_loss": 0.83945084, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.86038536, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 13390, + "time_per_iteration": 2.350457191467285 + }, + { + "auxiliary_loss_clip": 0.01007573, + "auxiliary_loss_mlp": 0.01002285, + "balance_loss_clip": 1.00010335, + "balance_loss_mlp": 1.00078273, + "epoch": 0.8051104764767774, + "flos": 57655112849280.0, + "grad_norm": 0.7873959840666273, + "language_loss": 0.55531448, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57541305, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.06787109, + "step": 13391, + "time_per_iteration": 3.0432040691375732 + }, + { + "auxiliary_loss_clip": 0.01050749, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.01170683, + "balance_loss_mlp": 1.01603794, + "epoch": 0.8051705997294454, + "flos": 18002508992640.0, + "grad_norm": 1.5424382181754943, + "language_loss": 0.85650045, + "learning_rate": 3.850390420667762e-07, + "loss": 0.87734532, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 13392, + "time_per_iteration": 2.380176305770874 + }, + { + "auxiliary_loss_clip": 0.0105232, + "auxiliary_loss_mlp": 0.01038247, + "balance_loss_clip": 1.01549029, + "balance_loss_mlp": 1.0162921, + "epoch": 0.8052307229821133, + "flos": 26396501955840.0, + "grad_norm": 1.3886930886977327, + "language_loss": 0.70785856, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72876418, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 13393, + "time_per_iteration": 2.4559824466705322 + }, + { + "auxiliary_loss_clip": 0.01053645, + "auxiliary_loss_mlp": 0.01040649, + "balance_loss_clip": 1.01726055, + "balance_loss_mlp": 1.01697659, + "epoch": 0.8052908462347813, + "flos": 21756091351680.0, + "grad_norm": 2.088333915745371, + "language_loss": 0.76889896, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78984189, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36523438, + "step": 13394, + "time_per_iteration": 2.402182102203369 + }, + { + "auxiliary_loss_clip": 0.01052257, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.01289499, + "balance_loss_mlp": 1.01652479, + "epoch": 0.8053509694874492, + "flos": 25441243712640.0, + "grad_norm": 1.572886581653276, + "language_loss": 0.65252447, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67342103, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35546875, + "step": 13395, + "time_per_iteration": 2.4021615982055664 + }, + { + "auxiliary_loss_clip": 0.01008093, + "auxiliary_loss_mlp": 0.01004344, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.00133109, + "epoch": 0.8054110927401172, + "flos": 57664922941440.0, + "grad_norm": 0.7543522517932097, + "language_loss": 0.57589275, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59601712, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.06738281, + "step": 13396, + "time_per_iteration": 3.1906328201293945 + }, + { + "auxiliary_loss_clip": 0.01051695, + "auxiliary_loss_mlp": 0.01046145, + "balance_loss_clip": 1.02208853, + "balance_loss_mlp": 1.016078, + "epoch": 0.8054712159927853, + "flos": 19275337762560.0, + "grad_norm": 1.7290534082398585, + "language_loss": 0.78522277, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.80620116, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 13397, + "time_per_iteration": 2.383577346801758 + }, + { + "auxiliary_loss_clip": 0.01053221, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.0103476, + "balance_loss_mlp": 1.01682353, + "epoch": 0.8055313392454532, + "flos": 17966653159680.0, + "grad_norm": 1.5463885250522846, + "language_loss": 0.71287692, + "learning_rate": 3.836616973531266e-07, + "loss": 0.73374599, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 13398, + "time_per_iteration": 2.369751453399658 + }, + { + "auxiliary_loss_clip": 0.0105158, + "auxiliary_loss_mlp": 0.01036939, + "balance_loss_clip": 1.01585102, + "balance_loss_mlp": 1.01602089, + "epoch": 0.8055914624981212, + "flos": 13477555854720.0, + "grad_norm": 2.0701267560785066, + "language_loss": 0.70478308, + "learning_rate": 3.834323543710805e-07, + "loss": 0.72566831, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 13399, + "time_per_iteration": 2.3394076824188232 + }, + { + "auxiliary_loss_clip": 0.01052943, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.01346362, + "balance_loss_mlp": 1.01643705, + "epoch": 0.8056515857507891, + "flos": 13223946672000.0, + "grad_norm": 2.261179531703943, + "language_loss": 0.73212171, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.75301665, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 13400, + "time_per_iteration": 2.360828161239624 + }, + { + "auxiliary_loss_clip": 0.01050417, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.01432228, + "balance_loss_mlp": 1.01488662, + "epoch": 0.8057117090034571, + "flos": 23877064713600.0, + "grad_norm": 1.903386264412526, + "language_loss": 0.65134025, + "learning_rate": 3.829738523169037e-07, + "loss": 0.67221832, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 13401, + "time_per_iteration": 2.374128580093384 + }, + { + "auxiliary_loss_clip": 0.01053868, + "auxiliary_loss_mlp": 0.01038785, + "balance_loss_clip": 1.01456201, + "balance_loss_mlp": 1.01669204, + "epoch": 0.805771832256125, + "flos": 21213261532800.0, + "grad_norm": 2.3779464637656584, + "language_loss": 0.8587656, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.8796922, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 13402, + "time_per_iteration": 2.390835762023926 + }, + { + "auxiliary_loss_clip": 0.01051935, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.01466656, + "balance_loss_mlp": 1.01600766, + "epoch": 0.805831955508793, + "flos": 17565850218240.0, + "grad_norm": 2.8536328263173383, + "language_loss": 0.6956023, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.71651578, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 13403, + "time_per_iteration": 3.88546085357666 + }, + { + "auxiliary_loss_clip": 0.01051982, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.01328063, + "balance_loss_mlp": 1.01689351, + "epoch": 0.805892078761461, + "flos": 26906303761920.0, + "grad_norm": 1.5781766394764625, + "language_loss": 0.86019218, + "learning_rate": 3.822865591408084e-07, + "loss": 0.88106465, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 13404, + "time_per_iteration": 2.41072416305542 + }, + { + "auxiliary_loss_clip": 0.010486, + "auxiliary_loss_mlp": 0.01037564, + "balance_loss_clip": 1.01644027, + "balance_loss_mlp": 1.01473927, + "epoch": 0.805952202014129, + "flos": 31505028336000.0, + "grad_norm": 1.6165632436593849, + "language_loss": 0.70553911, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72640073, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33789062, + "step": 13405, + "time_per_iteration": 2.4690568447113037 + }, + { + "auxiliary_loss_clip": 0.01051705, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.01404929, + "balance_loss_mlp": 1.01652431, + "epoch": 0.8060123252667969, + "flos": 24388786644480.0, + "grad_norm": 2.778448600905664, + "language_loss": 0.77020901, + "learning_rate": 3.818286703948788e-07, + "loss": 0.79108906, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 13406, + "time_per_iteration": 2.4174351692199707 + }, + { + "auxiliary_loss_clip": 0.01053341, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.01407588, + "balance_loss_mlp": 1.01660216, + "epoch": 0.8060724485194649, + "flos": 23478740478720.0, + "grad_norm": 1.786210236832377, + "language_loss": 0.76273715, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78365135, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 13407, + "time_per_iteration": 2.412099599838257 + }, + { + "auxiliary_loss_clip": 0.01051489, + "auxiliary_loss_mlp": 0.01037769, + "balance_loss_clip": 1.0136764, + "balance_loss_mlp": 1.01570451, + "epoch": 0.8061325717721328, + "flos": 18623509079040.0, + "grad_norm": 1.7269267979454246, + "language_loss": 0.75258756, + "learning_rate": 3.81371027093822e-07, + "loss": 0.77348012, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35742188, + "step": 13408, + "time_per_iteration": 2.369699478149414 + }, + { + "auxiliary_loss_clip": 0.01053029, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.01440334, + "balance_loss_mlp": 1.01713753, + "epoch": 0.8061926950248008, + "flos": 23581734589440.0, + "grad_norm": 1.8914670654570915, + "language_loss": 0.71046233, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.73138577, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.359375, + "step": 13409, + "time_per_iteration": 2.4111976623535156 + }, + { + "auxiliary_loss_clip": 0.01051322, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.01027453, + "balance_loss_mlp": 1.01525402, + "epoch": 0.8062528182774689, + "flos": 11142599569920.0, + "grad_norm": 2.274251987702002, + "language_loss": 0.78541946, + "learning_rate": 3.809136293070545e-07, + "loss": 0.80628765, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.359375, + "step": 13410, + "time_per_iteration": 2.346719264984131 + }, + { + "auxiliary_loss_clip": 0.01052047, + "auxiliary_loss_mlp": 0.01041218, + "balance_loss_clip": 1.01735258, + "balance_loss_mlp": 1.01646018, + "epoch": 0.8063129415301368, + "flos": 22345704259200.0, + "grad_norm": 2.262038431030081, + "language_loss": 0.70072436, + "learning_rate": 3.806850225032117e-07, + "loss": 0.72165704, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 13411, + "time_per_iteration": 2.356912612915039 + }, + { + "auxiliary_loss_clip": 0.01050363, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.01373911, + "balance_loss_mlp": 1.01574874, + "epoch": 0.8063730647828048, + "flos": 23987250564480.0, + "grad_norm": 1.9311076418588045, + "language_loss": 0.69143856, + "learning_rate": 3.804564771039551e-07, + "loss": 0.71230829, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34570312, + "step": 13412, + "time_per_iteration": 2.3882107734680176 + }, + { + "auxiliary_loss_clip": 0.01055072, + "auxiliary_loss_mlp": 0.01040078, + "balance_loss_clip": 1.01347065, + "balance_loss_mlp": 1.01764512, + "epoch": 0.8064331880354727, + "flos": 21320514829440.0, + "grad_norm": 5.67426528533234, + "language_loss": 0.82153034, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.84248185, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.375, + "step": 13413, + "time_per_iteration": 2.359790086746216 + }, + { + "auxiliary_loss_clip": 0.01051822, + "auxiliary_loss_mlp": 0.01038728, + "balance_loss_clip": 1.0161376, + "balance_loss_mlp": 1.01684749, + "epoch": 0.8064933112881407, + "flos": 19681866167040.0, + "grad_norm": 1.8733685655180479, + "language_loss": 0.85800618, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87891167, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 13414, + "time_per_iteration": 2.3614144325256348 + }, + { + "auxiliary_loss_clip": 0.01050314, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.01361799, + "balance_loss_mlp": 1.01504827, + "epoch": 0.8065534345408086, + "flos": 19278759075840.0, + "grad_norm": 1.9165217691272494, + "language_loss": 0.68507355, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.70592636, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 13415, + "time_per_iteration": 2.344886064529419 + }, + { + "auxiliary_loss_clip": 0.01049914, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.01371741, + "balance_loss_mlp": 1.01615536, + "epoch": 0.8066135577934767, + "flos": 19676838931200.0, + "grad_norm": 1.6110398097091148, + "language_loss": 0.77341998, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.79426539, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33789062, + "step": 13416, + "time_per_iteration": 2.3797390460968018 + }, + { + "auxiliary_loss_clip": 0.01053928, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.01297879, + "balance_loss_mlp": 1.01648784, + "epoch": 0.8066736810461446, + "flos": 21142492473600.0, + "grad_norm": 1.7851227184126095, + "language_loss": 0.65994352, + "learning_rate": 3.793146714797086e-07, + "loss": 0.68084693, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 13417, + "time_per_iteration": 2.366173028945923 + }, + { + "auxiliary_loss_clip": 0.01053894, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.02115536, + "balance_loss_mlp": 1.01782215, + "epoch": 0.8067338042988126, + "flos": 22597253671680.0, + "grad_norm": 1.7009651041422313, + "language_loss": 0.81892478, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.83990049, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 13418, + "time_per_iteration": 2.414052724838257 + }, + { + "auxiliary_loss_clip": 0.0105323, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.01229048, + "balance_loss_mlp": 1.01639307, + "epoch": 0.8067939275514805, + "flos": 16507493130240.0, + "grad_norm": 1.481544153823053, + "language_loss": 0.85721123, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87810212, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 13419, + "time_per_iteration": 2.3421061038970947 + }, + { + "auxiliary_loss_clip": 0.0105422, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.01608205, + "balance_loss_mlp": 1.01713061, + "epoch": 0.8068540508041485, + "flos": 28540763061120.0, + "grad_norm": 1.6447227824985062, + "language_loss": 0.7710138, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.79194283, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 13420, + "time_per_iteration": 2.4338624477386475 + }, + { + "auxiliary_loss_clip": 0.01050086, + "auxiliary_loss_mlp": 0.01032247, + "balance_loss_clip": 1.01201761, + "balance_loss_mlp": 1.01520944, + "epoch": 0.8069141740568164, + "flos": 21651421495680.0, + "grad_norm": 1.767170700296049, + "language_loss": 0.79414082, + "learning_rate": 3.784023331462207e-07, + "loss": 0.81496418, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34765625, + "step": 13421, + "time_per_iteration": 3.7605481147766113 + }, + { + "auxiliary_loss_clip": 0.01054041, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.01153994, + "balance_loss_mlp": 1.01706195, + "epoch": 0.8069742973094844, + "flos": 17528388462720.0, + "grad_norm": 1.9705695317516094, + "language_loss": 0.80515051, + "learning_rate": 3.78174402269098e-07, + "loss": 0.82604063, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 13422, + "time_per_iteration": 2.409130334854126 + }, + { + "auxiliary_loss_clip": 0.01051443, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.01285362, + "balance_loss_mlp": 1.01594055, + "epoch": 0.8070344205621525, + "flos": 23365936275840.0, + "grad_norm": 1.5932816533872878, + "language_loss": 0.69434214, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.71519583, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 13423, + "time_per_iteration": 2.3649544715881348 + }, + { + "auxiliary_loss_clip": 0.01053003, + "auxiliary_loss_mlp": 0.0103812, + "balance_loss_clip": 1.01330066, + "balance_loss_mlp": 1.01635456, + "epoch": 0.8070945438148204, + "flos": 22929068033280.0, + "grad_norm": 1.847940134715173, + "language_loss": 0.8111701, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.83208138, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 13424, + "time_per_iteration": 2.398940086364746 + }, + { + "auxiliary_loss_clip": 0.0105355, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.01292109, + "balance_loss_mlp": 1.01545405, + "epoch": 0.8071546670674884, + "flos": 25299531037440.0, + "grad_norm": 1.5238216838263303, + "language_loss": 0.79820281, + "learning_rate": 3.774909786710232e-07, + "loss": 0.8190968, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.38085938, + "step": 13425, + "time_per_iteration": 2.3807871341705322 + }, + { + "auxiliary_loss_clip": 0.01051259, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.01530159, + "balance_loss_mlp": 1.01590419, + "epoch": 0.8072147903201563, + "flos": 18112729754880.0, + "grad_norm": 3.283867708872226, + "language_loss": 0.78196514, + "learning_rate": 3.772632938448923e-07, + "loss": 0.8028546, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 13426, + "time_per_iteration": 2.3485589027404785 + }, + { + "auxiliary_loss_clip": 0.01051617, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.00922859, + "balance_loss_mlp": 1.01543391, + "epoch": 0.8072749135728243, + "flos": 26686944489600.0, + "grad_norm": 1.8424019134496252, + "language_loss": 0.73808229, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75892651, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 13427, + "time_per_iteration": 3.8178462982177734 + }, + { + "auxiliary_loss_clip": 0.0105185, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.02140975, + "balance_loss_mlp": 1.01654732, + "epoch": 0.8073350368254922, + "flos": 19239412106880.0, + "grad_norm": 1.827176644457688, + "language_loss": 0.70866156, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72963452, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 13428, + "time_per_iteration": 2.375622034072876 + }, + { + "auxiliary_loss_clip": 0.01051919, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.01326561, + "balance_loss_mlp": 1.01558208, + "epoch": 0.8073951600781603, + "flos": 13333678675200.0, + "grad_norm": 2.0540338257865773, + "language_loss": 0.76249719, + "learning_rate": 3.765806086070544e-07, + "loss": 0.78336895, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 13429, + "time_per_iteration": 3.7719759941101074 + }, + { + "auxiliary_loss_clip": 0.01050056, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.01676929, + "balance_loss_mlp": 1.01578832, + "epoch": 0.8074552833308282, + "flos": 22852189486080.0, + "grad_norm": 2.128931171493489, + "language_loss": 0.68129158, + "learning_rate": 3.763531699700568e-07, + "loss": 0.70218837, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 13430, + "time_per_iteration": 2.4939937591552734 + }, + { + "auxiliary_loss_clip": 0.01051083, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.01159644, + "balance_loss_mlp": 1.01632977, + "epoch": 0.8075154065834962, + "flos": 20338372972800.0, + "grad_norm": 1.7287073923273004, + "language_loss": 0.80708373, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82794374, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34765625, + "step": 13431, + "time_per_iteration": 2.3655526638031006 + }, + { + "auxiliary_loss_clip": 0.01050484, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.01320374, + "balance_loss_mlp": 1.01611698, + "epoch": 0.8075755298361641, + "flos": 21906985714560.0, + "grad_norm": 1.8875366249539414, + "language_loss": 0.81560194, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.83646584, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34375, + "step": 13432, + "time_per_iteration": 2.355210542678833 + }, + { + "auxiliary_loss_clip": 0.01057745, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.01358652, + "balance_loss_mlp": 1.01847219, + "epoch": 0.8076356530888321, + "flos": 15668390580480.0, + "grad_norm": 2.5332593497044993, + "language_loss": 0.71750236, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.73847258, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.39257812, + "step": 13433, + "time_per_iteration": 2.5151493549346924 + }, + { + "auxiliary_loss_clip": 0.01051245, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.0159893, + "balance_loss_mlp": 1.01573801, + "epoch": 0.8076957763415, + "flos": 37775953255680.0, + "grad_norm": 2.0090479797311884, + "language_loss": 0.73452079, + "learning_rate": 3.754440311967828e-07, + "loss": 0.75541651, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 13434, + "time_per_iteration": 2.508953809738159 + }, + { + "auxiliary_loss_clip": 0.0105295, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.01251626, + "balance_loss_mlp": 1.01700342, + "epoch": 0.807755899594168, + "flos": 19609735564800.0, + "grad_norm": 1.7348824420220699, + "language_loss": 0.69324529, + "learning_rate": 3.752169004902361e-07, + "loss": 0.71412587, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 13435, + "time_per_iteration": 2.3647587299346924 + }, + { + "auxiliary_loss_clip": 0.01054624, + "auxiliary_loss_mlp": 0.01044926, + "balance_loss_clip": 1.0168643, + "balance_loss_mlp": 1.01702309, + "epoch": 0.8078160228468361, + "flos": 23293770762240.0, + "grad_norm": 1.45012187079473, + "language_loss": 0.75856638, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77956194, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.37695312, + "step": 13436, + "time_per_iteration": 2.3801019191741943 + }, + { + "auxiliary_loss_clip": 0.01049002, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.01317668, + "balance_loss_mlp": 1.01467729, + "epoch": 0.807876146099504, + "flos": 27161414133120.0, + "grad_norm": 1.6770231779432119, + "language_loss": 0.71901429, + "learning_rate": 3.747628239215674e-07, + "loss": 0.73985791, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34375, + "step": 13437, + "time_per_iteration": 2.4290661811828613 + }, + { + "auxiliary_loss_clip": 0.01053065, + "auxiliary_loss_mlp": 0.01039467, + "balance_loss_clip": 1.01698422, + "balance_loss_mlp": 1.01805353, + "epoch": 0.807936269352172, + "flos": 27158865603840.0, + "grad_norm": 2.0961456721926814, + "language_loss": 0.73628068, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75720596, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13438, + "time_per_iteration": 2.4366037845611572 + }, + { + "auxiliary_loss_clip": 0.0105045, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.01540661, + "balance_loss_mlp": 1.01548314, + "epoch": 0.8079963926048399, + "flos": 20739385382400.0, + "grad_norm": 2.3165735096225664, + "language_loss": 0.78450239, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.80540061, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.34960938, + "step": 13439, + "time_per_iteration": 2.371670722961426 + }, + { + "auxiliary_loss_clip": 0.01051602, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.01527882, + "balance_loss_mlp": 1.01565337, + "epoch": 0.8080565158575079, + "flos": 25008495010560.0, + "grad_norm": 1.473915016231069, + "language_loss": 0.7925241, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.81341177, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 13440, + "time_per_iteration": 2.435093879699707 + }, + { + "auxiliary_loss_clip": 0.01053355, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.00826871, + "balance_loss_mlp": 1.01628852, + "epoch": 0.8081166391101758, + "flos": 18697071047040.0, + "grad_norm": 1.8491130069484432, + "language_loss": 0.60486245, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.62571329, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 13441, + "time_per_iteration": 2.3419432640075684 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.01157355, + "balance_loss_mlp": 1.01577163, + "epoch": 0.8081767623628439, + "flos": 19827628560000.0, + "grad_norm": 1.9026548207241158, + "language_loss": 0.77337962, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.79423851, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34960938, + "step": 13442, + "time_per_iteration": 2.390512228012085 + }, + { + "auxiliary_loss_clip": 0.01052495, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.01475286, + "balance_loss_mlp": 1.01675606, + "epoch": 0.8082368856155118, + "flos": 35771484700800.0, + "grad_norm": 1.5731403435637927, + "language_loss": 0.71661943, + "learning_rate": 3.734020735906169e-07, + "loss": 0.73752344, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35742188, + "step": 13443, + "time_per_iteration": 3.9548356533050537 + }, + { + "auxiliary_loss_clip": 0.01050838, + "auxiliary_loss_mlp": 0.01041757, + "balance_loss_clip": 1.01964402, + "balance_loss_mlp": 1.01659334, + "epoch": 0.8082970088681798, + "flos": 17197167594240.0, + "grad_norm": 1.9672518992512014, + "language_loss": 0.83458567, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.85551161, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34179688, + "step": 13444, + "time_per_iteration": 2.320800542831421 + }, + { + "auxiliary_loss_clip": 0.01008032, + "auxiliary_loss_mlp": 0.01002526, + "balance_loss_clip": 1.0002017, + "balance_loss_mlp": 1.00142503, + "epoch": 0.8083571321208477, + "flos": 63552502465920.0, + "grad_norm": 0.818557242727839, + "language_loss": 0.53649545, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55660105, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.06640625, + "step": 13445, + "time_per_iteration": 2.913663864135742 + }, + { + "auxiliary_loss_clip": 0.01050864, + "auxiliary_loss_mlp": 0.01039235, + "balance_loss_clip": 1.01570332, + "balance_loss_mlp": 1.01576197, + "epoch": 0.8084172553735157, + "flos": 17929749985920.0, + "grad_norm": 2.0194740649784237, + "language_loss": 0.73146737, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.75236839, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 13446, + "time_per_iteration": 2.33137845993042 + }, + { + "auxiliary_loss_clip": 0.01053506, + "auxiliary_loss_mlp": 0.0103762, + "balance_loss_clip": 1.0136348, + "balance_loss_mlp": 1.01690662, + "epoch": 0.8084773786261836, + "flos": 24096842922240.0, + "grad_norm": 1.6718605100323016, + "language_loss": 0.72282398, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.74373525, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 13447, + "time_per_iteration": 2.4201507568359375 + }, + { + "auxiliary_loss_clip": 0.01053789, + "auxiliary_loss_mlp": 0.01040188, + "balance_loss_clip": 1.01677501, + "balance_loss_mlp": 1.01630545, + "epoch": 0.8085375018788516, + "flos": 15587532138240.0, + "grad_norm": 2.269326596072333, + "language_loss": 0.75950468, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.78044444, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 13448, + "time_per_iteration": 2.31272554397583 + }, + { + "auxiliary_loss_clip": 0.01007884, + "auxiliary_loss_mlp": 0.01002318, + "balance_loss_clip": 0.99982673, + "balance_loss_mlp": 1.00116611, + "epoch": 0.8085976251315197, + "flos": 67557667282560.0, + "grad_norm": 0.7380583066120563, + "language_loss": 0.63960838, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65971041, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.06738281, + "step": 13449, + "time_per_iteration": 3.0928773880004883 + }, + { + "auxiliary_loss_clip": 0.01054691, + "auxiliary_loss_mlp": 0.01035674, + "balance_loss_clip": 1.01124847, + "balance_loss_mlp": 1.01750958, + "epoch": 0.8086577483841876, + "flos": 22560455232000.0, + "grad_norm": 1.9634590941012067, + "language_loss": 0.75485468, + "learning_rate": 3.718173381422105e-07, + "loss": 0.77575833, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 13450, + "time_per_iteration": 2.3544061183929443 + }, + { + "auxiliary_loss_clip": 0.0105084, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.01294231, + "balance_loss_mlp": 1.01562858, + "epoch": 0.8087178716368556, + "flos": 17967107007360.0, + "grad_norm": 1.8640227758140813, + "language_loss": 0.74621117, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.76707274, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 13451, + "time_per_iteration": 2.3836257457733154 + }, + { + "auxiliary_loss_clip": 0.01054465, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.01416898, + "balance_loss_mlp": 1.0154438, + "epoch": 0.8087779948895235, + "flos": 21718629596160.0, + "grad_norm": 1.66352841104271, + "language_loss": 0.80956572, + "learning_rate": 3.713651121244543e-07, + "loss": 0.83051431, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.390625, + "step": 13452, + "time_per_iteration": 2.355262279510498 + }, + { + "auxiliary_loss_clip": 0.01053232, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.01492798, + "balance_loss_mlp": 1.01688564, + "epoch": 0.8088381181421915, + "flos": 29091692315520.0, + "grad_norm": 1.666224452431801, + "language_loss": 0.79475772, + "learning_rate": 3.711390917482875e-07, + "loss": 0.81565827, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 13453, + "time_per_iteration": 2.516028881072998 + }, + { + "auxiliary_loss_clip": 0.01052682, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.01167405, + "balance_loss_mlp": 1.01623583, + "epoch": 0.8088982413948594, + "flos": 22197393336960.0, + "grad_norm": 2.373523065528196, + "language_loss": 0.78647554, + "learning_rate": 3.709131331386892e-07, + "loss": 0.80736303, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 13454, + "time_per_iteration": 2.343926191329956 + }, + { + "auxiliary_loss_clip": 0.01051979, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.01676178, + "balance_loss_mlp": 1.01649499, + "epoch": 0.8089583646475275, + "flos": 28035499731840.0, + "grad_norm": 1.83512587741484, + "language_loss": 0.78372794, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.80464232, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13455, + "time_per_iteration": 2.443310499191284 + }, + { + "auxiliary_loss_clip": 0.01051054, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.01239491, + "balance_loss_mlp": 1.01576197, + "epoch": 0.8090184879001954, + "flos": 16616806197120.0, + "grad_norm": 1.9617370196375727, + "language_loss": 0.80017108, + "learning_rate": 3.70461401253471e-07, + "loss": 0.82103205, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 13456, + "time_per_iteration": 2.3204727172851562 + }, + { + "auxiliary_loss_clip": 0.01052036, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.01555133, + "balance_loss_mlp": 1.01724839, + "epoch": 0.8090786111528634, + "flos": 27339680868480.0, + "grad_norm": 2.1415195756234797, + "language_loss": 0.72925925, + "learning_rate": 3.702356279949801e-07, + "loss": 0.75016189, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 13457, + "time_per_iteration": 2.4659838676452637 + }, + { + "auxiliary_loss_clip": 0.01051489, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.01332486, + "balance_loss_mlp": 1.01610494, + "epoch": 0.8091387344055313, + "flos": 21104681604480.0, + "grad_norm": 2.0741949312442913, + "language_loss": 0.73757219, + "learning_rate": 3.700099165373176e-07, + "loss": 0.75844562, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 13458, + "time_per_iteration": 2.3565473556518555 + }, + { + "auxiliary_loss_clip": 0.01052263, + "auxiliary_loss_mlp": 0.0103867, + "balance_loss_clip": 1.01622319, + "balance_loss_mlp": 1.01652861, + "epoch": 0.8091988576581993, + "flos": 11654286589440.0, + "grad_norm": 2.1891455450077655, + "language_loss": 0.80917591, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.83008528, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 13459, + "time_per_iteration": 2.379330635070801 + }, + { + "auxiliary_loss_clip": 0.01051843, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.01307297, + "balance_loss_mlp": 1.01583242, + "epoch": 0.8092589809108672, + "flos": 22962305514240.0, + "grad_norm": 2.0510997858079216, + "language_loss": 0.8136006, + "learning_rate": 3.695586790587113e-07, + "loss": 0.83449358, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 13460, + "time_per_iteration": 2.3499274253845215 + }, + { + "auxiliary_loss_clip": 0.01052426, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.01279044, + "balance_loss_mlp": 1.01566994, + "epoch": 0.8093191041635353, + "flos": 13260151618560.0, + "grad_norm": 2.0535902385791447, + "language_loss": 0.85104781, + "learning_rate": 3.693331530548789e-07, + "loss": 0.8719418, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 13461, + "time_per_iteration": 3.5869734287261963 + }, + { + "auxiliary_loss_clip": 0.01054426, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.01435328, + "balance_loss_mlp": 1.01744437, + "epoch": 0.8093792274162032, + "flos": 25514945326080.0, + "grad_norm": 1.8283591985176675, + "language_loss": 0.77877307, + "learning_rate": 3.69107688886096e-07, + "loss": 0.79969811, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 13462, + "time_per_iteration": 2.3893134593963623 + }, + { + "auxiliary_loss_clip": 0.01052647, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.0133791, + "balance_loss_mlp": 1.01669121, + "epoch": 0.8094393506688712, + "flos": 23545459820160.0, + "grad_norm": 1.5854310543937062, + "language_loss": 0.83947957, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.86037594, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 13463, + "time_per_iteration": 2.4299380779266357 + }, + { + "auxiliary_loss_clip": 0.01051043, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.01769483, + "balance_loss_mlp": 1.01640463, + "epoch": 0.8094994739215392, + "flos": 17054966160000.0, + "grad_norm": 1.8202108728101964, + "language_loss": 0.63531476, + "learning_rate": 3.686569460878779e-07, + "loss": 0.65621293, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 13464, + "time_per_iteration": 2.3412845134735107 + }, + { + "auxiliary_loss_clip": 0.01050785, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.0138309, + "balance_loss_mlp": 1.01558614, + "epoch": 0.8095595971742071, + "flos": 23550068119680.0, + "grad_norm": 1.599761510553912, + "language_loss": 0.62940919, + "learning_rate": 3.684316674755341e-07, + "loss": 0.6502763, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13465, + "time_per_iteration": 2.444350481033325 + }, + { + "auxiliary_loss_clip": 0.01053208, + "auxiliary_loss_mlp": 0.01039095, + "balance_loss_clip": 1.01625419, + "balance_loss_mlp": 1.01793396, + "epoch": 0.8096197204268751, + "flos": 20372238858240.0, + "grad_norm": 3.95783447622993, + "language_loss": 0.83310199, + "learning_rate": 3.682064507324256e-07, + "loss": 0.85402501, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 13466, + "time_per_iteration": 2.348133087158203 + }, + { + "auxiliary_loss_clip": 0.01053623, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.01861787, + "balance_loss_mlp": 1.01698732, + "epoch": 0.809679843679543, + "flos": 27817536913920.0, + "grad_norm": 1.823235149185082, + "language_loss": 0.77583516, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.79679692, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 13467, + "time_per_iteration": 3.793680191040039 + }, + { + "auxiliary_loss_clip": 0.01051318, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.01171017, + "balance_loss_mlp": 1.01505053, + "epoch": 0.8097399669322111, + "flos": 22013121847680.0, + "grad_norm": 1.7904436798361845, + "language_loss": 0.796363, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81722248, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 13468, + "time_per_iteration": 2.3501980304718018 + }, + { + "auxiliary_loss_clip": 0.01048339, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01441503, + "balance_loss_mlp": 1.01386046, + "epoch": 0.809800090184879, + "flos": 18988002339840.0, + "grad_norm": 1.8069876621597911, + "language_loss": 0.69087934, + "learning_rate": 3.675311718038978e-07, + "loss": 0.71171606, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 13469, + "time_per_iteration": 3.7157602310180664 + }, + { + "auxiliary_loss_clip": 0.01007499, + "auxiliary_loss_mlp": 0.0100289, + "balance_loss_clip": 1.00048161, + "balance_loss_mlp": 1.00082314, + "epoch": 0.809860213437547, + "flos": 66095993635200.0, + "grad_norm": 0.6921169878327617, + "language_loss": 0.547786, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56788987, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.06640625, + "step": 13470, + "time_per_iteration": 3.093693971633911 + }, + { + "auxiliary_loss_clip": 0.01050852, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.01561236, + "balance_loss_mlp": 1.01558042, + "epoch": 0.8099203366902149, + "flos": 20881551905280.0, + "grad_norm": 1.8097297454159427, + "language_loss": 0.70574093, + "learning_rate": 3.670812953542279e-07, + "loss": 0.72662383, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 13471, + "time_per_iteration": 2.3706915378570557 + }, + { + "auxiliary_loss_clip": 0.01051414, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.01428771, + "balance_loss_mlp": 1.01573825, + "epoch": 0.8099804599428829, + "flos": 26029739456640.0, + "grad_norm": 1.6724812220380318, + "language_loss": 0.81317949, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.83405405, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35742188, + "step": 13472, + "time_per_iteration": 2.422808885574341 + }, + { + "auxiliary_loss_clip": 0.0100768, + "auxiliary_loss_mlp": 0.01004119, + "balance_loss_clip": 1.00188982, + "balance_loss_mlp": 1.00076342, + "epoch": 0.8100405831955508, + "flos": 69300147928320.0, + "grad_norm": 0.7509216571401784, + "language_loss": 0.57882953, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59894753, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.06933594, + "step": 13473, + "time_per_iteration": 2.9940404891967773 + }, + { + "auxiliary_loss_clip": 0.01052256, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.01181769, + "balance_loss_mlp": 1.01565754, + "epoch": 0.8101007064482189, + "flos": 15011604483840.0, + "grad_norm": 2.486543532130282, + "language_loss": 0.75355697, + "learning_rate": 3.664069451043399e-07, + "loss": 0.77443129, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 13474, + "time_per_iteration": 2.3868792057037354 + }, + { + "auxiliary_loss_clip": 0.01055264, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.01957238, + "balance_loss_mlp": 1.01791835, + "epoch": 0.8101608297008868, + "flos": 21066207419520.0, + "grad_norm": 1.6582642377009045, + "language_loss": 0.79815245, + "learning_rate": 3.661822855683723e-07, + "loss": 0.81913626, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37304688, + "step": 13475, + "time_per_iteration": 2.3599956035614014 + }, + { + "auxiliary_loss_clip": 0.01051253, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.01609814, + "balance_loss_mlp": 1.01595402, + "epoch": 0.8102209529535548, + "flos": 23730185157120.0, + "grad_norm": 1.5651317865772785, + "language_loss": 0.76163375, + "learning_rate": 3.659576879869364e-07, + "loss": 0.78253055, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 13476, + "time_per_iteration": 2.424445867538452 + }, + { + "auxiliary_loss_clip": 0.0105414, + "auxiliary_loss_mlp": 0.01039947, + "balance_loss_clip": 1.0143888, + "balance_loss_mlp": 1.01651227, + "epoch": 0.8102810762062228, + "flos": 10955290792320.0, + "grad_norm": 2.4009112248604865, + "language_loss": 0.7545836, + "learning_rate": 3.657331523685485e-07, + "loss": 0.7755245, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 13477, + "time_per_iteration": 2.3566675186157227 + }, + { + "auxiliary_loss_clip": 0.01053276, + "auxiliary_loss_mlp": 0.01039881, + "balance_loss_clip": 1.01521611, + "balance_loss_mlp": 1.01684058, + "epoch": 0.8103411994588907, + "flos": 14647914184320.0, + "grad_norm": 2.1724278491283626, + "language_loss": 0.71089494, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.73182654, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 13478, + "time_per_iteration": 2.4069743156433105 + }, + { + "auxiliary_loss_clip": 0.01007383, + "auxiliary_loss_mlp": 0.0100206, + "balance_loss_clip": 0.99986672, + "balance_loss_mlp": 1.0006249, + "epoch": 0.8104013227115587, + "flos": 59150373091200.0, + "grad_norm": 0.6824996788942684, + "language_loss": 0.52202493, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54211938, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06738281, + "step": 13479, + "time_per_iteration": 2.962573289871216 + }, + { + "auxiliary_loss_clip": 0.01051322, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.01572311, + "balance_loss_mlp": 1.01641846, + "epoch": 0.8104614459642266, + "flos": 19827663471360.0, + "grad_norm": 1.5278416475389829, + "language_loss": 0.72396559, + "learning_rate": 3.650599173768072e-07, + "loss": 0.74485707, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 13480, + "time_per_iteration": 2.4261791706085205 + }, + { + "auxiliary_loss_clip": 0.0105236, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.01065814, + "balance_loss_mlp": 1.01662302, + "epoch": 0.8105215692168947, + "flos": 25373093005440.0, + "grad_norm": 1.6432636483005112, + "language_loss": 0.80783296, + "learning_rate": 3.648356296957327e-07, + "loss": 0.82869577, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35742188, + "step": 13481, + "time_per_iteration": 2.461930751800537 + }, + { + "auxiliary_loss_clip": 0.01050998, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.01348042, + "balance_loss_mlp": 1.01554048, + "epoch": 0.8105816924695626, + "flos": 20480783875200.0, + "grad_norm": 1.7500613318569513, + "language_loss": 0.73616648, + "learning_rate": 3.646114040202548e-07, + "loss": 0.75704294, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 13482, + "time_per_iteration": 2.3791632652282715 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.01144898, + "balance_loss_mlp": 1.01388383, + "epoch": 0.8106418157222306, + "flos": 14537798156160.0, + "grad_norm": 1.8647961783016305, + "language_loss": 0.66465789, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.68553418, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 13483, + "time_per_iteration": 3.8133251667022705 + }, + { + "auxiliary_loss_clip": 0.0105166, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.01099539, + "balance_loss_mlp": 1.01600957, + "epoch": 0.8107019389748985, + "flos": 22563387786240.0, + "grad_norm": 1.6513257929728962, + "language_loss": 0.77261209, + "learning_rate": 3.641631387200992e-07, + "loss": 0.79346299, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 13484, + "time_per_iteration": 2.409259557723999 + }, + { + "auxiliary_loss_clip": 0.01052932, + "auxiliary_loss_mlp": 0.01046121, + "balance_loss_clip": 1.02117109, + "balance_loss_mlp": 1.01534081, + "epoch": 0.8107620622275665, + "flos": 19608548578560.0, + "grad_norm": 1.4566329147103592, + "language_loss": 0.72447902, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74546957, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 13485, + "time_per_iteration": 2.395188808441162 + }, + { + "auxiliary_loss_clip": 0.01049214, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.01061213, + "balance_loss_mlp": 1.01538193, + "epoch": 0.8108221854802344, + "flos": 16142580933120.0, + "grad_norm": 1.844981401577872, + "language_loss": 0.76913261, + "learning_rate": 3.637151215443308e-07, + "loss": 0.78993565, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 13486, + "time_per_iteration": 2.3798279762268066 + }, + { + "auxiliary_loss_clip": 0.01054075, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.01151609, + "balance_loss_mlp": 1.01713586, + "epoch": 0.8108823087329025, + "flos": 21105135452160.0, + "grad_norm": 2.075121872883423, + "language_loss": 0.72762674, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74852943, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 13487, + "time_per_iteration": 2.3722336292266846 + }, + { + "auxiliary_loss_clip": 0.01050512, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.01396143, + "balance_loss_mlp": 1.01639688, + "epoch": 0.8109424319855704, + "flos": 29198526675840.0, + "grad_norm": 1.9598983542899595, + "language_loss": 0.84612966, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86699426, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33984375, + "step": 13488, + "time_per_iteration": 2.455432176589966 + }, + { + "auxiliary_loss_clip": 0.01053456, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.00931644, + "balance_loss_mlp": 1.01705015, + "epoch": 0.8110025552382384, + "flos": 23110756081920.0, + "grad_norm": 1.9099899920037717, + "language_loss": 0.7501657, + "learning_rate": 3.630435611625502e-07, + "loss": 0.77103537, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 13489, + "time_per_iteration": 2.388122797012329 + }, + { + "auxiliary_loss_clip": 0.01049403, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.01223505, + "balance_loss_mlp": 1.01506925, + "epoch": 0.8110626784909064, + "flos": 22378941740160.0, + "grad_norm": 2.019206930527245, + "language_loss": 0.72326833, + "learning_rate": 3.628198318377453e-07, + "loss": 0.74410146, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 13490, + "time_per_iteration": 2.408686876296997 + }, + { + "auxiliary_loss_clip": 0.01054578, + "auxiliary_loss_mlp": 0.01045179, + "balance_loss_clip": 1.02035987, + "balance_loss_mlp": 1.01738286, + "epoch": 0.8111228017435743, + "flos": 23366913793920.0, + "grad_norm": 2.4761056101267958, + "language_loss": 0.73051226, + "learning_rate": 3.625961645949762e-07, + "loss": 0.75150979, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 13491, + "time_per_iteration": 2.385023593902588 + }, + { + "auxiliary_loss_clip": 0.01051652, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.01518679, + "balance_loss_mlp": 1.01536965, + "epoch": 0.8111829249962423, + "flos": 21285531780480.0, + "grad_norm": 3.1744406885480383, + "language_loss": 0.68619561, + "learning_rate": 3.623725594427245e-07, + "loss": 0.70711946, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 13492, + "time_per_iteration": 2.4424562454223633 + }, + { + "auxiliary_loss_clip": 0.01053808, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.00885153, + "balance_loss_mlp": 1.01744628, + "epoch": 0.8112430482489102, + "flos": 22344482361600.0, + "grad_norm": 1.5670282002172735, + "language_loss": 0.72893906, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74983007, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.36328125, + "step": 13493, + "time_per_iteration": 2.3798625469207764 + }, + { + "auxiliary_loss_clip": 0.01051509, + "auxiliary_loss_mlp": 0.0105075, + "balance_loss_clip": 1.02585864, + "balance_loss_mlp": 1.01563668, + "epoch": 0.8113031715015783, + "flos": 31137009027840.0, + "grad_norm": 2.3978676558188203, + "language_loss": 0.72281736, + "learning_rate": 3.619255354436885e-07, + "loss": 0.74383992, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.359375, + "step": 13494, + "time_per_iteration": 2.451765537261963 + }, + { + "auxiliary_loss_clip": 0.01053459, + "auxiliary_loss_mlp": 0.01040944, + "balance_loss_clip": 1.01511121, + "balance_loss_mlp": 1.01627874, + "epoch": 0.8113632947542462, + "flos": 25334339529600.0, + "grad_norm": 1.9822628315179835, + "language_loss": 0.77722144, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.79816544, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 13495, + "time_per_iteration": 2.3901774883270264 + }, + { + "auxiliary_loss_clip": 0.01051894, + "auxiliary_loss_mlp": 0.01038724, + "balance_loss_clip": 1.01520419, + "balance_loss_mlp": 1.01639342, + "epoch": 0.8114234180069142, + "flos": 28437908595840.0, + "grad_norm": 1.8296677739920801, + "language_loss": 0.80696929, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82787549, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35546875, + "step": 13496, + "time_per_iteration": 2.454524040222168 + }, + { + "auxiliary_loss_clip": 0.01050514, + "auxiliary_loss_mlp": 0.01042353, + "balance_loss_clip": 1.01977468, + "balance_loss_mlp": 1.01531005, + "epoch": 0.8114835412595821, + "flos": 20337849302400.0, + "grad_norm": 1.8939580372360536, + "language_loss": 0.7199719, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.74090052, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 13497, + "time_per_iteration": 2.367025375366211 + }, + { + "auxiliary_loss_clip": 0.0105388, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.01498663, + "balance_loss_mlp": 1.01755798, + "epoch": 0.8115436645122501, + "flos": 22489825818240.0, + "grad_norm": 1.5605294786677866, + "language_loss": 0.77590692, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79681587, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 13498, + "time_per_iteration": 2.4254791736602783 + }, + { + "auxiliary_loss_clip": 0.01052127, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.0144254, + "balance_loss_mlp": 1.01575541, + "epoch": 0.811603787764918, + "flos": 13844423088000.0, + "grad_norm": 2.2844471554442496, + "language_loss": 0.85605907, + "learning_rate": 3.608090626234055e-07, + "loss": 0.87694728, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 13499, + "time_per_iteration": 2.369807004928589 + }, + { + "auxiliary_loss_clip": 0.01052664, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.00962245, + "balance_loss_mlp": 1.01732314, + "epoch": 0.8116639110175861, + "flos": 21613610626560.0, + "grad_norm": 1.6585851560856377, + "language_loss": 0.77154529, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.79240733, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35351562, + "step": 13500, + "time_per_iteration": 3.6528820991516113 + }, + { + "auxiliary_loss_clip": 0.01007753, + "auxiliary_loss_mlp": 0.01004026, + "balance_loss_clip": 1.00141549, + "balance_loss_mlp": 1.00112391, + "epoch": 0.811724034270254, + "flos": 64456262720640.0, + "grad_norm": 0.8293201544946872, + "language_loss": 0.60053706, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62065488, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.02612305, + "router_z_loss_mlp": 0.06640625, + "step": 13501, + "time_per_iteration": 3.0851199626922607 + }, + { + "auxiliary_loss_clip": 0.0105045, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.01614618, + "balance_loss_mlp": 1.01567364, + "epoch": 0.811784157522922, + "flos": 24752965703040.0, + "grad_norm": 1.5677583484510333, + "language_loss": 0.80235982, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.82324159, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34765625, + "step": 13502, + "time_per_iteration": 2.397235870361328 + }, + { + "auxiliary_loss_clip": 0.01052106, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.01795053, + "balance_loss_mlp": 1.01640379, + "epoch": 0.81184428077559, + "flos": 12166322722560.0, + "grad_norm": 2.1812457371275906, + "language_loss": 0.71959537, + "learning_rate": 3.599170031654635e-07, + "loss": 0.74051762, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35742188, + "step": 13503, + "time_per_iteration": 2.3779139518737793 + }, + { + "auxiliary_loss_clip": 0.01052121, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.01044869, + "balance_loss_mlp": 1.01552618, + "epoch": 0.8119044040282579, + "flos": 44420273832960.0, + "grad_norm": 1.6002346354477133, + "language_loss": 0.67999053, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70086932, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 13504, + "time_per_iteration": 2.59869647026062 + }, + { + "auxiliary_loss_clip": 0.01052913, + "auxiliary_loss_mlp": 0.0103819, + "balance_loss_clip": 1.01328683, + "balance_loss_mlp": 1.01615763, + "epoch": 0.8119645272809259, + "flos": 52153570627200.0, + "grad_norm": 2.001128685107157, + "language_loss": 0.75889254, + "learning_rate": 3.594713465553403e-07, + "loss": 0.77980357, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.3671875, + "step": 13505, + "time_per_iteration": 2.6889119148254395 + }, + { + "auxiliary_loss_clip": 0.0105299, + "auxiliary_loss_mlp": 0.01040595, + "balance_loss_clip": 1.01590681, + "balance_loss_mlp": 1.01649666, + "epoch": 0.8120246505335939, + "flos": 30231501338880.0, + "grad_norm": 1.929583342232816, + "language_loss": 0.73554623, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.75648212, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 13506, + "time_per_iteration": 3.9361581802368164 + }, + { + "auxiliary_loss_clip": 0.01055247, + "auxiliary_loss_mlp": 0.01037508, + "balance_loss_clip": 1.01442862, + "balance_loss_mlp": 1.01661158, + "epoch": 0.8120847737862619, + "flos": 22126554455040.0, + "grad_norm": 2.1825343954717056, + "language_loss": 0.77496439, + "learning_rate": 3.590259387812593e-07, + "loss": 0.79589194, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.38671875, + "step": 13507, + "time_per_iteration": 2.4447824954986572 + }, + { + "auxiliary_loss_clip": 0.0105416, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.01141167, + "balance_loss_mlp": 1.01676154, + "epoch": 0.8121448970389298, + "flos": 23294050053120.0, + "grad_norm": 1.6301429739898425, + "language_loss": 0.71110338, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.73199588, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37304688, + "step": 13508, + "time_per_iteration": 2.358341932296753 + }, + { + "auxiliary_loss_clip": 0.01051589, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.01445389, + "balance_loss_mlp": 1.01599896, + "epoch": 0.8122050202915978, + "flos": 22163038692480.0, + "grad_norm": 1.6293695425107122, + "language_loss": 0.76950395, + "learning_rate": 3.585807799107785e-07, + "loss": 0.7903899, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 13509, + "time_per_iteration": 3.7622995376586914 + }, + { + "auxiliary_loss_clip": 0.01052682, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.01541471, + "balance_loss_mlp": 1.01665974, + "epoch": 0.8122651435442657, + "flos": 23257810195200.0, + "grad_norm": 1.7752687898820978, + "language_loss": 0.77776998, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79870021, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 13510, + "time_per_iteration": 2.4313602447509766 + }, + { + "auxiliary_loss_clip": 0.01054483, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.01793587, + "balance_loss_mlp": 1.01659822, + "epoch": 0.8123252667969337, + "flos": 16139194531200.0, + "grad_norm": 2.0818175883858787, + "language_loss": 0.71315646, + "learning_rate": 3.581358700114212e-07, + "loss": 0.73412752, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37890625, + "step": 13511, + "time_per_iteration": 2.334489107131958 + }, + { + "auxiliary_loss_clip": 0.01054688, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.01669574, + "balance_loss_mlp": 1.01826835, + "epoch": 0.8123853900496016, + "flos": 21244509066240.0, + "grad_norm": 1.5952553666239175, + "language_loss": 0.80115497, + "learning_rate": 3.57913508447004e-07, + "loss": 0.82210457, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 13512, + "time_per_iteration": 2.3634531497955322 + }, + { + "auxiliary_loss_clip": 0.01049888, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.01252508, + "balance_loss_mlp": 1.01506376, + "epoch": 0.8124455133022697, + "flos": 64375336321920.0, + "grad_norm": 3.8053443541740926, + "language_loss": 0.64675748, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.66759431, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34765625, + "step": 13513, + "time_per_iteration": 2.7906720638275146 + }, + { + "auxiliary_loss_clip": 0.01054325, + "auxiliary_loss_mlp": 0.01043798, + "balance_loss_clip": 1.0177865, + "balance_loss_mlp": 1.01669586, + "epoch": 0.8125056365549376, + "flos": 23841069235200.0, + "grad_norm": 1.7657388120435031, + "language_loss": 0.73030961, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.7512908, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37695312, + "step": 13514, + "time_per_iteration": 2.387388229370117 + }, + { + "auxiliary_loss_clip": 0.01049874, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.01403868, + "balance_loss_mlp": 1.01527429, + "epoch": 0.8125657598076056, + "flos": 23549195335680.0, + "grad_norm": 1.5825781166789274, + "language_loss": 0.63775265, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65861022, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 13515, + "time_per_iteration": 2.3835203647613525 + }, + { + "auxiliary_loss_clip": 0.01048463, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.01076329, + "balance_loss_mlp": 1.01541793, + "epoch": 0.8126258830602736, + "flos": 20703180435840.0, + "grad_norm": 1.5616580773025324, + "language_loss": 0.76888514, + "learning_rate": 3.570246849544616e-07, + "loss": 0.7896862, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33007812, + "step": 13516, + "time_per_iteration": 2.3521318435668945 + }, + { + "auxiliary_loss_clip": 0.01052653, + "auxiliary_loss_mlp": 0.01037934, + "balance_loss_clip": 1.01574922, + "balance_loss_mlp": 1.01661611, + "epoch": 0.8126860063129415, + "flos": 23617171486080.0, + "grad_norm": 1.541194857436985, + "language_loss": 0.92223287, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.94313872, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 13517, + "time_per_iteration": 2.421895980834961 + }, + { + "auxiliary_loss_clip": 0.01054977, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.01544559, + "balance_loss_mlp": 1.01841378, + "epoch": 0.8127461295656095, + "flos": 25006051215360.0, + "grad_norm": 1.5596552306167573, + "language_loss": 0.79886985, + "learning_rate": 3.565806469852244e-07, + "loss": 0.81981671, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 13518, + "time_per_iteration": 2.398237466812134 + }, + { + "auxiliary_loss_clip": 0.01053162, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.01715076, + "balance_loss_mlp": 1.01744449, + "epoch": 0.8128062528182775, + "flos": 27341007500160.0, + "grad_norm": 1.7013068763810537, + "language_loss": 0.79995811, + "learning_rate": 3.56358721474336e-07, + "loss": 0.82086957, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.35742188, + "step": 13519, + "time_per_iteration": 2.4675660133361816 + }, + { + "auxiliary_loss_clip": 0.01051979, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02183306, + "balance_loss_mlp": 1.01563084, + "epoch": 0.8128663760709455, + "flos": 26505081884160.0, + "grad_norm": 2.59480442654366, + "language_loss": 0.71344197, + "learning_rate": 3.561368582904905e-07, + "loss": 0.73442411, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 13520, + "time_per_iteration": 2.4032235145568848 + }, + { + "auxiliary_loss_clip": 0.01051843, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.01623404, + "balance_loss_mlp": 1.01573122, + "epoch": 0.8129264993236134, + "flos": 17930273656320.0, + "grad_norm": 1.4658530317155931, + "language_loss": 0.73707551, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.75799763, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 13521, + "time_per_iteration": 2.381408452987671 + }, + { + "auxiliary_loss_clip": 0.01052482, + "auxiliary_loss_mlp": 0.01040545, + "balance_loss_clip": 1.01708508, + "balance_loss_mlp": 1.0157547, + "epoch": 0.8129866225762814, + "flos": 26176479367680.0, + "grad_norm": 1.8682915482212052, + "language_loss": 0.70856267, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.72949296, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 13522, + "time_per_iteration": 2.4156973361968994 + }, + { + "auxiliary_loss_clip": 0.01050648, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.01183534, + "balance_loss_mlp": 1.01644754, + "epoch": 0.8130467458289493, + "flos": 21031154547840.0, + "grad_norm": 1.6531894352189063, + "language_loss": 0.71691978, + "learning_rate": 3.554716427853233e-07, + "loss": 0.73776215, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34179688, + "step": 13523, + "time_per_iteration": 3.8518593311309814 + }, + { + "auxiliary_loss_clip": 0.01050618, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.01327634, + "balance_loss_mlp": 1.01498199, + "epoch": 0.8131068690816173, + "flos": 15486143950080.0, + "grad_norm": 2.369689808823012, + "language_loss": 0.72749823, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.74835795, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 13524, + "time_per_iteration": 2.3644704818725586 + }, + { + "auxiliary_loss_clip": 0.01051777, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.01354623, + "balance_loss_mlp": 1.01623631, + "epoch": 0.8131669923342852, + "flos": 29349944709120.0, + "grad_norm": 1.817712042755706, + "language_loss": 0.63446099, + "learning_rate": 3.550284775712653e-07, + "loss": 0.65534467, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 13525, + "time_per_iteration": 2.407771587371826 + }, + { + "auxiliary_loss_clip": 0.01051875, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.01649117, + "epoch": 0.8132271155869533, + "flos": 35254875179520.0, + "grad_norm": 1.4887728766590236, + "language_loss": 0.66296411, + "learning_rate": 3.548069885262628e-07, + "loss": 0.68388462, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 13526, + "time_per_iteration": 2.501500368118286 + }, + { + "auxiliary_loss_clip": 0.01050901, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.01516998, + "balance_loss_mlp": 1.01539016, + "epoch": 0.8132872388396212, + "flos": 27780668651520.0, + "grad_norm": 1.5567901967433475, + "language_loss": 0.76312745, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.78401399, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 13527, + "time_per_iteration": 2.4133453369140625 + }, + { + "auxiliary_loss_clip": 0.01051226, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.01196265, + "balance_loss_mlp": 1.01596248, + "epoch": 0.8133473620922892, + "flos": 27818339875200.0, + "grad_norm": 1.63907167142735, + "language_loss": 0.71497017, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.73582089, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 13528, + "time_per_iteration": 2.4191091060638428 + }, + { + "auxiliary_loss_clip": 0.01052595, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.01045465, + "balance_loss_mlp": 1.01626956, + "epoch": 0.8134074853449572, + "flos": 18988526010240.0, + "grad_norm": 1.8837506848037462, + "language_loss": 0.70316994, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.72402215, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36328125, + "step": 13529, + "time_per_iteration": 2.3479716777801514 + }, + { + "auxiliary_loss_clip": 0.01050403, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.01191139, + "balance_loss_mlp": 1.01492441, + "epoch": 0.8134676085976251, + "flos": 24241732531200.0, + "grad_norm": 4.008560124843101, + "language_loss": 0.78802991, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.8088854, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 13530, + "time_per_iteration": 2.404996871948242 + }, + { + "auxiliary_loss_clip": 0.01050288, + "auxiliary_loss_mlp": 0.01036797, + "balance_loss_clip": 1.01462436, + "balance_loss_mlp": 1.01559186, + "epoch": 0.8135277318502931, + "flos": 19061389751040.0, + "grad_norm": 1.8469040292830101, + "language_loss": 0.83156568, + "learning_rate": 3.537004792574052e-07, + "loss": 0.85243654, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 13531, + "time_per_iteration": 2.3804564476013184 + }, + { + "auxiliary_loss_clip": 0.01052424, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.01236713, + "balance_loss_mlp": 1.01580632, + "epoch": 0.813587855102961, + "flos": 17268914171520.0, + "grad_norm": 2.1064489572198237, + "language_loss": 0.72816312, + "learning_rate": 3.534793646536065e-07, + "loss": 0.74905205, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 13532, + "time_per_iteration": 2.3814139366149902 + }, + { + "auxiliary_loss_clip": 0.01050945, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.01186872, + "balance_loss_mlp": 1.0155195, + "epoch": 0.8136479783556291, + "flos": 20156545278720.0, + "grad_norm": 1.7697029046039667, + "language_loss": 0.77514768, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.79598695, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 13533, + "time_per_iteration": 2.3758068084716797 + }, + { + "auxiliary_loss_clip": 0.01054076, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.01526487, + "balance_loss_mlp": 1.01646471, + "epoch": 0.813708101608297, + "flos": 22051316741760.0, + "grad_norm": 1.9513024546803832, + "language_loss": 0.77537799, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.79630697, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.375, + "step": 13534, + "time_per_iteration": 2.366466760635376 + }, + { + "auxiliary_loss_clip": 0.01050572, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.01285481, + "balance_loss_mlp": 1.01569533, + "epoch": 0.813768224860965, + "flos": 16172676391680.0, + "grad_norm": 2.153063723209549, + "language_loss": 0.94472957, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.96557176, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34960938, + "step": 13535, + "time_per_iteration": 2.381228446960449 + }, + { + "auxiliary_loss_clip": 0.01050253, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.01264024, + "balance_loss_mlp": 1.01647806, + "epoch": 0.8138283481136329, + "flos": 24351185243520.0, + "grad_norm": 1.5690707448072008, + "language_loss": 0.71253937, + "learning_rate": 3.52595530684499e-07, + "loss": 0.73338032, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33789062, + "step": 13536, + "time_per_iteration": 2.3988044261932373 + }, + { + "auxiliary_loss_clip": 0.01051575, + "auxiliary_loss_mlp": 0.01035673, + "balance_loss_clip": 1.01298761, + "balance_loss_mlp": 1.01651192, + "epoch": 0.8138884713663009, + "flos": 25515294439680.0, + "grad_norm": 1.510979866330716, + "language_loss": 0.76390481, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.78477728, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 13537, + "time_per_iteration": 2.406149387359619 + }, + { + "auxiliary_loss_clip": 0.01049398, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01317108, + "balance_loss_mlp": 1.0154655, + "epoch": 0.8139485946189688, + "flos": 22453306669440.0, + "grad_norm": 1.419799020829966, + "language_loss": 0.7696321, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.79047465, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 13538, + "time_per_iteration": 2.3804585933685303 + }, + { + "auxiliary_loss_clip": 0.0104924, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.01578641, + "balance_loss_mlp": 1.01406169, + "epoch": 0.8140087178716369, + "flos": 21249361745280.0, + "grad_norm": 1.9953764546150417, + "language_loss": 0.78219187, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80306464, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 13539, + "time_per_iteration": 2.418666124343872 + }, + { + "auxiliary_loss_clip": 0.0105005, + "auxiliary_loss_mlp": 0.01036653, + "balance_loss_clip": 1.01604152, + "balance_loss_mlp": 1.01600194, + "epoch": 0.8140688411243048, + "flos": 39414322627200.0, + "grad_norm": 2.3175401776949442, + "language_loss": 0.66691983, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68778682, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 13540, + "time_per_iteration": 3.7718703746795654 + }, + { + "auxiliary_loss_clip": 0.01052065, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.01619351, + "balance_loss_mlp": 1.01664329, + "epoch": 0.8141289643769728, + "flos": 25414569567360.0, + "grad_norm": 1.4650682655010114, + "language_loss": 0.68241799, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.70332032, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 13541, + "time_per_iteration": 2.492431402206421 + }, + { + "auxiliary_loss_clip": 0.01051192, + "auxiliary_loss_mlp": 0.01037792, + "balance_loss_clip": 1.01436687, + "balance_loss_mlp": 1.01588547, + "epoch": 0.8141890876296408, + "flos": 12567230398080.0, + "grad_norm": 1.9250763809513327, + "language_loss": 0.7021842, + "learning_rate": 3.512716539904355e-07, + "loss": 0.72307402, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 13542, + "time_per_iteration": 2.366791248321533 + }, + { + "auxiliary_loss_clip": 0.01053346, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.01346982, + "balance_loss_mlp": 1.01541853, + "epoch": 0.8142492108823087, + "flos": 14966532051840.0, + "grad_norm": 3.2273664054725786, + "language_loss": 0.82325196, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.84415716, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37890625, + "step": 13543, + "time_per_iteration": 2.3668298721313477 + }, + { + "auxiliary_loss_clip": 0.01054961, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.01719856, + "balance_loss_mlp": 1.0171299, + "epoch": 0.8143093341349767, + "flos": 12421188714240.0, + "grad_norm": 2.4103747844110464, + "language_loss": 0.79253727, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.81349909, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 13544, + "time_per_iteration": 2.333771228790283 + }, + { + "auxiliary_loss_clip": 0.01057144, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.01844501, + "balance_loss_mlp": 1.01747894, + "epoch": 0.8143694573876447, + "flos": 11909780985600.0, + "grad_norm": 3.305300743797227, + "language_loss": 0.75446594, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.77548349, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.39648438, + "step": 13545, + "time_per_iteration": 3.824995279312134 + }, + { + "auxiliary_loss_clip": 0.01050637, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.01593518, + "balance_loss_mlp": 1.01652038, + "epoch": 0.8144295806403127, + "flos": 21211899989760.0, + "grad_norm": 1.6136383837846529, + "language_loss": 0.77209616, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.79297471, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34179688, + "step": 13546, + "time_per_iteration": 2.4031004905700684 + }, + { + "auxiliary_loss_clip": 0.01053159, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.01296031, + "balance_loss_mlp": 1.01747024, + "epoch": 0.8144897038929806, + "flos": 19864252442880.0, + "grad_norm": 2.7624588036965667, + "language_loss": 0.71705019, + "learning_rate": 3.501701426337178e-07, + "loss": 0.73792922, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 13547, + "time_per_iteration": 2.3999722003936768 + }, + { + "auxiliary_loss_clip": 0.01052641, + "auxiliary_loss_mlp": 0.01042577, + "balance_loss_clip": 1.01768589, + "balance_loss_mlp": 1.01543725, + "epoch": 0.8145498271456486, + "flos": 24570579427200.0, + "grad_norm": 2.138067396654817, + "language_loss": 0.71958071, + "learning_rate": 3.49950028014111e-07, + "loss": 0.74053288, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37304688, + "step": 13548, + "time_per_iteration": 2.4348106384277344 + }, + { + "auxiliary_loss_clip": 0.01054282, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.00921774, + "balance_loss_mlp": 1.01784909, + "epoch": 0.8146099503983165, + "flos": 20192017086720.0, + "grad_norm": 2.1651073749746015, + "language_loss": 0.77705657, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79792249, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 13549, + "time_per_iteration": 3.7097134590148926 + }, + { + "auxiliary_loss_clip": 0.01052258, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.0110122, + "balance_loss_mlp": 1.01736259, + "epoch": 0.8146700736509845, + "flos": 19535929217280.0, + "grad_norm": 2.188480958689104, + "language_loss": 0.72328448, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.74413651, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 13550, + "time_per_iteration": 2.3845138549804688 + }, + { + "auxiliary_loss_clip": 0.01048342, + "auxiliary_loss_mlp": 0.01033315, + "balance_loss_clip": 1.01223826, + "balance_loss_mlp": 1.01515746, + "epoch": 0.8147301969036524, + "flos": 18040354773120.0, + "grad_norm": 1.8130579905223816, + "language_loss": 0.72570181, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74651837, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33203125, + "step": 13551, + "time_per_iteration": 2.329204559326172 + }, + { + "auxiliary_loss_clip": 0.01055672, + "auxiliary_loss_mlp": 0.0104118, + "balance_loss_clip": 1.01576471, + "balance_loss_mlp": 1.01696575, + "epoch": 0.8147903201563205, + "flos": 18003730890240.0, + "grad_norm": 1.841978339294827, + "language_loss": 0.70359874, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.72456723, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 13552, + "time_per_iteration": 2.366551399230957 + }, + { + "auxiliary_loss_clip": 0.01050897, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.01540327, + "balance_loss_mlp": 1.0158757, + "epoch": 0.8148504434089884, + "flos": 20258492048640.0, + "grad_norm": 1.761135494890966, + "language_loss": 0.83697844, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.85785139, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.3515625, + "step": 13553, + "time_per_iteration": 2.3371598720550537 + }, + { + "auxiliary_loss_clip": 0.0105206, + "auxiliary_loss_mlp": 0.01037078, + "balance_loss_clip": 1.01532197, + "balance_loss_mlp": 1.01627445, + "epoch": 0.8149105666616564, + "flos": 12493912809600.0, + "grad_norm": 1.8611218206453928, + "language_loss": 0.69753301, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.71842432, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35742188, + "step": 13554, + "time_per_iteration": 2.341367721557617 + }, + { + "auxiliary_loss_clip": 0.01054716, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.01574969, + "balance_loss_mlp": 1.01786709, + "epoch": 0.8149706899143244, + "flos": 32522362709760.0, + "grad_norm": 1.7722381470650175, + "language_loss": 0.66824615, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68919301, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 13555, + "time_per_iteration": 2.447854995727539 + }, + { + "auxiliary_loss_clip": 0.01053553, + "auxiliary_loss_mlp": 0.01038459, + "balance_loss_clip": 1.01483202, + "balance_loss_mlp": 1.0167191, + "epoch": 0.8150308131669923, + "flos": 19385209411200.0, + "grad_norm": 2.10813950931355, + "language_loss": 0.74726385, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.76818395, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 13556, + "time_per_iteration": 2.3482964038848877 + }, + { + "auxiliary_loss_clip": 0.01051499, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.00911689, + "balance_loss_mlp": 1.01680207, + "epoch": 0.8150909364196604, + "flos": 17420402027520.0, + "grad_norm": 1.7266485040492248, + "language_loss": 0.81536818, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.83618748, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34570312, + "step": 13557, + "time_per_iteration": 2.3261830806732178 + }, + { + "auxiliary_loss_clip": 0.01055064, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.01560783, + "balance_loss_mlp": 1.01751971, + "epoch": 0.8151510596723283, + "flos": 27161553778560.0, + "grad_norm": 1.7627503519466825, + "language_loss": 0.66273719, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68366766, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.375, + "step": 13558, + "time_per_iteration": 2.441406488418579 + }, + { + "auxiliary_loss_clip": 0.01007438, + "auxiliary_loss_mlp": 0.01003979, + "balance_loss_clip": 1.00155902, + "balance_loss_mlp": 1.00081003, + "epoch": 0.8152111829249963, + "flos": 64216131189120.0, + "grad_norm": 0.7972983853127706, + "language_loss": 0.56947351, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58958769, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.06640625, + "step": 13559, + "time_per_iteration": 2.9274091720581055 + }, + { + "auxiliary_loss_clip": 0.01007528, + "auxiliary_loss_mlp": 0.01002694, + "balance_loss_clip": 1.00024986, + "balance_loss_mlp": 1.00099587, + "epoch": 0.8152713061776642, + "flos": 67068814158720.0, + "grad_norm": 0.6810392485807253, + "language_loss": 0.55292958, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57303178, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.06542969, + "step": 13560, + "time_per_iteration": 2.9027628898620605 + }, + { + "auxiliary_loss_clip": 0.01051265, + "auxiliary_loss_mlp": 0.01039672, + "balance_loss_clip": 1.01796412, + "balance_loss_mlp": 1.01650715, + "epoch": 0.8153314294303322, + "flos": 14390290195200.0, + "grad_norm": 2.015411938969973, + "language_loss": 0.68192828, + "learning_rate": 3.470942348696948e-07, + "loss": 0.70283759, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 13561, + "time_per_iteration": 2.3333899974823 + }, + { + "auxiliary_loss_clip": 0.01054149, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.01691103, + "balance_loss_mlp": 1.01724339, + "epoch": 0.8153915526830001, + "flos": 25622512824960.0, + "grad_norm": 1.6694564893323505, + "language_loss": 0.827191, + "learning_rate": 3.468749969894085e-07, + "loss": 0.84813565, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 13562, + "time_per_iteration": 3.9167189598083496 + }, + { + "auxiliary_loss_clip": 0.01052465, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.01522946, + "balance_loss_mlp": 1.01637352, + "epoch": 0.8154516759356681, + "flos": 23367996046080.0, + "grad_norm": 1.6037662751293285, + "language_loss": 0.72856116, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74946308, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 13563, + "time_per_iteration": 2.4224705696105957 + }, + { + "auxiliary_loss_clip": 0.01051443, + "auxiliary_loss_mlp": 0.01038853, + "balance_loss_clip": 1.01281738, + "balance_loss_mlp": 1.01510966, + "epoch": 0.815511799188336, + "flos": 28147884998400.0, + "grad_norm": 1.5513606898108148, + "language_loss": 0.71345663, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.73435956, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 13564, + "time_per_iteration": 2.4412033557891846 + }, + { + "auxiliary_loss_clip": 0.01050871, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.0116024, + "balance_loss_mlp": 1.01570749, + "epoch": 0.8155719224410041, + "flos": 16982451532800.0, + "grad_norm": 1.8965648677071556, + "language_loss": 0.71329212, + "learning_rate": 3.462176595017854e-07, + "loss": 0.73415387, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 13565, + "time_per_iteration": 2.3872175216674805 + }, + { + "auxiliary_loss_clip": 0.01050633, + "auxiliary_loss_mlp": 0.01038148, + "balance_loss_clip": 1.01509273, + "balance_loss_mlp": 1.01558781, + "epoch": 0.815632045693672, + "flos": 24680555809920.0, + "grad_norm": 2.166154139237371, + "language_loss": 0.79519135, + "learning_rate": 3.459986724180188e-07, + "loss": 0.81607914, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 13566, + "time_per_iteration": 2.4251151084899902 + }, + { + "auxiliary_loss_clip": 0.01050376, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.01331675, + "balance_loss_mlp": 1.01568747, + "epoch": 0.81569216894634, + "flos": 19937290740480.0, + "grad_norm": 1.5902230688323156, + "language_loss": 0.83354002, + "learning_rate": 3.457797480541491e-07, + "loss": 0.85438675, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34570312, + "step": 13567, + "time_per_iteration": 2.4290928840637207 + }, + { + "auxiliary_loss_clip": 0.01050496, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.01369548, + "balance_loss_mlp": 1.01610744, + "epoch": 0.8157522921990079, + "flos": 21798301052160.0, + "grad_norm": 1.8466935633714183, + "language_loss": 0.81168956, + "learning_rate": 3.455608864184771e-07, + "loss": 0.83252811, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.34375, + "step": 13568, + "time_per_iteration": 2.3617868423461914 + }, + { + "auxiliary_loss_clip": 0.01050071, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.01707006, + "balance_loss_mlp": 1.0162003, + "epoch": 0.8158124154516759, + "flos": 18507527942400.0, + "grad_norm": 1.8908761617352146, + "language_loss": 0.78155303, + "learning_rate": 3.453420875193016e-07, + "loss": 0.80243552, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 13569, + "time_per_iteration": 2.3679440021514893 + }, + { + "auxiliary_loss_clip": 0.01051159, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.01360631, + "balance_loss_mlp": 1.01613665, + "epoch": 0.815872538704344, + "flos": 26829669594240.0, + "grad_norm": 3.4690948255190364, + "language_loss": 0.60956621, + "learning_rate": 3.451233513649199e-07, + "loss": 0.63042057, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3515625, + "step": 13570, + "time_per_iteration": 2.4402389526367188 + }, + { + "auxiliary_loss_clip": 0.01055031, + "auxiliary_loss_mlp": 0.01043783, + "balance_loss_clip": 1.02004838, + "balance_loss_mlp": 1.01731706, + "epoch": 0.8159326619570119, + "flos": 21724634350080.0, + "grad_norm": 1.9568364540703478, + "language_loss": 0.83227783, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.853266, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37695312, + "step": 13571, + "time_per_iteration": 2.4104504585266113 + }, + { + "auxiliary_loss_clip": 0.0105187, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.0194242, + "balance_loss_mlp": 1.01699352, + "epoch": 0.8159927852096799, + "flos": 13839989345280.0, + "grad_norm": 2.666068385840137, + "language_loss": 0.79993945, + "learning_rate": 3.446860673237142e-07, + "loss": 0.82088763, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.34765625, + "step": 13572, + "time_per_iteration": 2.340604305267334 + }, + { + "auxiliary_loss_clip": 0.01052109, + "auxiliary_loss_mlp": 0.0103896, + "balance_loss_clip": 1.0162034, + "balance_loss_mlp": 1.01561487, + "epoch": 0.8160529084623478, + "flos": 24498344090880.0, + "grad_norm": 1.4352363991449348, + "language_loss": 0.66042233, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.68133307, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36523438, + "step": 13573, + "time_per_iteration": 2.400564670562744 + }, + { + "auxiliary_loss_clip": 0.01051566, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.01496899, + "balance_loss_mlp": 1.01624191, + "epoch": 0.8161130317150158, + "flos": 24825201039360.0, + "grad_norm": 1.5685274942666432, + "language_loss": 0.76394951, + "learning_rate": 3.442490343611868e-07, + "loss": 0.78484523, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 13574, + "time_per_iteration": 2.4113190174102783 + }, + { + "auxiliary_loss_clip": 0.01054113, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.01662719, + "balance_loss_mlp": 1.01744914, + "epoch": 0.8161731549676837, + "flos": 30955216245120.0, + "grad_norm": 1.9952365591395962, + "language_loss": 0.60713041, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.62806797, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 13575, + "time_per_iteration": 2.4556126594543457 + }, + { + "auxiliary_loss_clip": 0.01051854, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.01298642, + "balance_loss_mlp": 1.01526785, + "epoch": 0.8162332782203517, + "flos": 18550994451840.0, + "grad_norm": 1.8271538169213215, + "language_loss": 0.75476044, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.77565217, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 13576, + "time_per_iteration": 2.357179880142212 + }, + { + "auxiliary_loss_clip": 0.01007508, + "auxiliary_loss_mlp": 0.01002397, + "balance_loss_clip": 1.00006068, + "balance_loss_mlp": 1.00079572, + "epoch": 0.8162934014730197, + "flos": 70383712884480.0, + "grad_norm": 0.81990881981954, + "language_loss": 0.58724636, + "learning_rate": 3.435939558349155e-07, + "loss": 0.6073454, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.06738281, + "step": 13577, + "time_per_iteration": 2.992936372756958 + }, + { + "auxiliary_loss_clip": 0.01049776, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.01476979, + "balance_loss_mlp": 1.01596355, + "epoch": 0.8163535247256877, + "flos": 21213785203200.0, + "grad_norm": 1.7503215275339672, + "language_loss": 0.71985441, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.74070519, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33789062, + "step": 13578, + "time_per_iteration": 2.3833415508270264 + }, + { + "auxiliary_loss_clip": 0.01051367, + "auxiliary_loss_mlp": 0.01039307, + "balance_loss_clip": 1.01587033, + "balance_loss_mlp": 1.01593661, + "epoch": 0.8164136479783556, + "flos": 21097978623360.0, + "grad_norm": 2.1133151107034207, + "language_loss": 0.74602854, + "learning_rate": 3.431575508590172e-07, + "loss": 0.76693535, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 13579, + "time_per_iteration": 2.371922254562378 + }, + { + "auxiliary_loss_clip": 0.01052346, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.01508737, + "balance_loss_mlp": 1.01567245, + "epoch": 0.8164737712310236, + "flos": 21719711848320.0, + "grad_norm": 1.7385024794369723, + "language_loss": 0.7986455, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.819543, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3671875, + "step": 13580, + "time_per_iteration": 2.362642765045166 + }, + { + "auxiliary_loss_clip": 0.01051261, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_clip": 1.01713169, + "balance_loss_mlp": 1.0161227, + "epoch": 0.8165338944836915, + "flos": 19535789571840.0, + "grad_norm": 1.7889952510031795, + "language_loss": 0.70002759, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.72095549, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3515625, + "step": 13581, + "time_per_iteration": 3.7475593090057373 + }, + { + "auxiliary_loss_clip": 0.01052591, + "auxiliary_loss_mlp": 0.01037685, + "balance_loss_clip": 1.01628757, + "balance_loss_mlp": 1.01685905, + "epoch": 0.8165940177363595, + "flos": 22927497022080.0, + "grad_norm": 1.5818595699036935, + "language_loss": 0.61098534, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.63188815, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35742188, + "step": 13582, + "time_per_iteration": 2.3591346740722656 + }, + { + "auxiliary_loss_clip": 0.01049185, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.01365769, + "balance_loss_mlp": 1.01579428, + "epoch": 0.8166541409890276, + "flos": 23369183032320.0, + "grad_norm": 1.4164408708332663, + "language_loss": 0.82710385, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84794319, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33203125, + "step": 13583, + "time_per_iteration": 2.5053625106811523 + }, + { + "auxiliary_loss_clip": 0.01052327, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.01701283, + "balance_loss_mlp": 1.01648581, + "epoch": 0.8167142642416955, + "flos": 18441018069120.0, + "grad_norm": 1.6575511828714975, + "language_loss": 0.75341833, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.77433574, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 13584, + "time_per_iteration": 2.3637771606445312 + }, + { + "auxiliary_loss_clip": 0.01053834, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.01202488, + "balance_loss_mlp": 1.0175395, + "epoch": 0.8167743874943635, + "flos": 21213924848640.0, + "grad_norm": 1.6763142196510445, + "language_loss": 0.76075673, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.78165483, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 13585, + "time_per_iteration": 3.786125421524048 + }, + { + "auxiliary_loss_clip": 0.01052172, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.01349711, + "balance_loss_mlp": 1.01632822, + "epoch": 0.8168345107470314, + "flos": 18696687022080.0, + "grad_norm": 1.4952209622197266, + "language_loss": 0.70830315, + "learning_rate": 3.416321129478068e-07, + "loss": 0.72920203, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 13586, + "time_per_iteration": 2.3586063385009766 + }, + { + "auxiliary_loss_clip": 0.01051277, + "auxiliary_loss_mlp": 0.01037774, + "balance_loss_clip": 1.01688862, + "balance_loss_mlp": 1.01647627, + "epoch": 0.8168946339996994, + "flos": 16252173290880.0, + "grad_norm": 1.533375632817559, + "language_loss": 0.62017381, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.64106429, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 13587, + "time_per_iteration": 2.4120731353759766 + }, + { + "auxiliary_loss_clip": 0.01053448, + "auxiliary_loss_mlp": 0.01042651, + "balance_loss_clip": 1.01895201, + "balance_loss_mlp": 1.01659369, + "epoch": 0.8169547572523673, + "flos": 26940414026880.0, + "grad_norm": 2.4038260781074134, + "language_loss": 0.71933842, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.7402994, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 13588, + "time_per_iteration": 3.7814080715179443 + }, + { + "auxiliary_loss_clip": 0.01053603, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.01402068, + "balance_loss_mlp": 1.01632857, + "epoch": 0.8170148805050353, + "flos": 18951343545600.0, + "grad_norm": 1.6545328170050868, + "language_loss": 0.73360538, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.7545352, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 13589, + "time_per_iteration": 2.3737802505493164 + }, + { + "auxiliary_loss_clip": 0.01049563, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.0123601, + "balance_loss_mlp": 1.01557326, + "epoch": 0.8170750037577033, + "flos": 21833842682880.0, + "grad_norm": 1.6547593022799678, + "language_loss": 0.74307901, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.76392186, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.33984375, + "step": 13590, + "time_per_iteration": 2.385528326034546 + }, + { + "auxiliary_loss_clip": 0.01054914, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.01348591, + "balance_loss_mlp": 1.01667237, + "epoch": 0.8171351270103713, + "flos": 33505866109440.0, + "grad_norm": 2.1130855391711756, + "language_loss": 0.66480505, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.68573689, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3828125, + "step": 13591, + "time_per_iteration": 2.5209076404571533 + }, + { + "auxiliary_loss_clip": 0.01053126, + "auxiliary_loss_mlp": 0.01040052, + "balance_loss_clip": 1.016747, + "balance_loss_mlp": 1.01548862, + "epoch": 0.8171952502630392, + "flos": 22707160231680.0, + "grad_norm": 2.362921484514171, + "language_loss": 0.69628417, + "learning_rate": 3.403270471641373e-07, + "loss": 0.71721601, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37695312, + "step": 13592, + "time_per_iteration": 2.398068428039551 + }, + { + "auxiliary_loss_clip": 0.01051832, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.01085567, + "balance_loss_mlp": 1.01628125, + "epoch": 0.8172553735157072, + "flos": 26722521031680.0, + "grad_norm": 1.7439637730372297, + "language_loss": 0.67617238, + "learning_rate": 3.401097564244759e-07, + "loss": 0.69702899, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 13593, + "time_per_iteration": 2.4385716915130615 + }, + { + "auxiliary_loss_clip": 0.01050312, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.01738882, + "balance_loss_mlp": 1.01545954, + "epoch": 0.8173154967683751, + "flos": 15960159745920.0, + "grad_norm": 1.9964396917297764, + "language_loss": 0.70399493, + "learning_rate": 3.398925286280188e-07, + "loss": 0.72489303, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 13594, + "time_per_iteration": 2.3737223148345947 + }, + { + "auxiliary_loss_clip": 0.01053208, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.01308513, + "balance_loss_mlp": 1.01609039, + "epoch": 0.8173756200210431, + "flos": 25985749276800.0, + "grad_norm": 1.7206889258545275, + "language_loss": 0.67469984, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.69559455, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 13595, + "time_per_iteration": 2.408904790878296 + }, + { + "auxiliary_loss_clip": 0.01054055, + "auxiliary_loss_mlp": 0.01039327, + "balance_loss_clip": 1.01459122, + "balance_loss_mlp": 1.01595783, + "epoch": 0.8174357432737112, + "flos": 25663291159680.0, + "grad_norm": 1.5045288856974015, + "language_loss": 0.7915405, + "learning_rate": 3.394582618976658e-07, + "loss": 0.81247431, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 13596, + "time_per_iteration": 2.412522554397583 + }, + { + "auxiliary_loss_clip": 0.01050146, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.01165783, + "balance_loss_mlp": 1.01540613, + "epoch": 0.8174958665263791, + "flos": 21834017239680.0, + "grad_norm": 2.290887761898957, + "language_loss": 0.60057598, + "learning_rate": 3.392412229802362e-07, + "loss": 0.62143636, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.34765625, + "step": 13597, + "time_per_iteration": 2.3650574684143066 + }, + { + "auxiliary_loss_clip": 0.01051284, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.01348066, + "balance_loss_mlp": 1.01634288, + "epoch": 0.8175559897790471, + "flos": 22454423832960.0, + "grad_norm": 1.6007137291326945, + "language_loss": 0.8332001, + "learning_rate": 3.390242470389462e-07, + "loss": 0.85407412, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 13598, + "time_per_iteration": 2.3521604537963867 + }, + { + "auxiliary_loss_clip": 0.01053349, + "auxiliary_loss_mlp": 0.01039555, + "balance_loss_clip": 1.01531959, + "balance_loss_mlp": 1.01627183, + "epoch": 0.817616113031715, + "flos": 23614867336320.0, + "grad_norm": 1.8565326052887317, + "language_loss": 0.83389139, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.85482037, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 13599, + "time_per_iteration": 2.4214560985565186 + }, + { + "auxiliary_loss_clip": 0.01052585, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.01435447, + "balance_loss_mlp": 1.01721275, + "epoch": 0.817676236284383, + "flos": 27671041382400.0, + "grad_norm": 1.8640402980228714, + "language_loss": 0.8468678, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86776805, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 13600, + "time_per_iteration": 2.39587664604187 + }, + { + "auxiliary_loss_clip": 0.01051927, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.01550889, + "balance_loss_mlp": 1.01525164, + "epoch": 0.8177363595370509, + "flos": 24679857582720.0, + "grad_norm": 1.6635421744101178, + "language_loss": 0.75699025, + "learning_rate": 3.383736971541766e-07, + "loss": 0.77791655, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 13601, + "time_per_iteration": 3.7969820499420166 + }, + { + "auxiliary_loss_clip": 0.01053923, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.01400721, + "balance_loss_mlp": 1.01632774, + "epoch": 0.817796482789719, + "flos": 17345408693760.0, + "grad_norm": 1.962285562400752, + "language_loss": 0.69738245, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.71831423, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 13602, + "time_per_iteration": 2.3606979846954346 + }, + { + "auxiliary_loss_clip": 0.01049966, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.01461363, + "balance_loss_mlp": 1.01502156, + "epoch": 0.8178566060423869, + "flos": 17777703548160.0, + "grad_norm": 1.971335101617719, + "language_loss": 0.84880936, + "learning_rate": 3.379403122624718e-07, + "loss": 0.86968118, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 13603, + "time_per_iteration": 2.316683292388916 + }, + { + "auxiliary_loss_clip": 0.0105062, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.01194715, + "balance_loss_mlp": 1.01556301, + "epoch": 0.8179167292950549, + "flos": 24972080595840.0, + "grad_norm": 1.7155010049840889, + "language_loss": 0.70624733, + "learning_rate": 3.377237143507159e-07, + "loss": 0.72709537, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 13604, + "time_per_iteration": 2.4230597019195557 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.01241791, + "balance_loss_mlp": 1.01741362, + "epoch": 0.8179768525477228, + "flos": 22855680622080.0, + "grad_norm": 1.5862534360134182, + "language_loss": 0.75336683, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.77425086, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34960938, + "step": 13605, + "time_per_iteration": 2.36867618560791 + }, + { + "auxiliary_loss_clip": 0.0105202, + "auxiliary_loss_mlp": 0.01042532, + "balance_loss_clip": 1.01956034, + "balance_loss_mlp": 1.01737714, + "epoch": 0.8180369758003908, + "flos": 18514161100800.0, + "grad_norm": 1.8667053482499978, + "language_loss": 0.75461507, + "learning_rate": 3.372907076364666e-07, + "loss": 0.77556062, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34765625, + "step": 13606, + "time_per_iteration": 2.3294615745544434 + }, + { + "auxiliary_loss_clip": 0.01051636, + "auxiliary_loss_mlp": 0.01040624, + "balance_loss_clip": 1.01785553, + "balance_loss_mlp": 1.01607656, + "epoch": 0.8180970990530587, + "flos": 33180719817600.0, + "grad_norm": 1.967820453200513, + "language_loss": 0.66809225, + "learning_rate": 3.370742988503916e-07, + "loss": 0.68901485, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 13607, + "time_per_iteration": 2.480757236480713 + }, + { + "auxiliary_loss_clip": 0.01052571, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.01484811, + "balance_loss_mlp": 1.01644397, + "epoch": 0.8181572223057267, + "flos": 25008844124160.0, + "grad_norm": 1.8040400678898556, + "language_loss": 0.71274543, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.73364967, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 13608, + "time_per_iteration": 2.387209177017212 + }, + { + "auxiliary_loss_clip": 0.01050846, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.01969838, + "balance_loss_mlp": 1.01575804, + "epoch": 0.8182173455583948, + "flos": 28547466042240.0, + "grad_norm": 1.8247789316187035, + "language_loss": 0.81087816, + "learning_rate": 3.366416704613735e-07, + "loss": 0.83181441, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34960938, + "step": 13609, + "time_per_iteration": 2.464195489883423 + }, + { + "auxiliary_loss_clip": 0.01007352, + "auxiliary_loss_mlp": 0.0100225, + "balance_loss_clip": 0.9999972, + "balance_loss_mlp": 1.00083733, + "epoch": 0.8182774688110627, + "flos": 72024875164800.0, + "grad_norm": 0.749750175017467, + "language_loss": 0.56006736, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.58016336, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.06542969, + "step": 13610, + "time_per_iteration": 3.1334807872772217 + }, + { + "auxiliary_loss_clip": 0.01048207, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.01144087, + "balance_loss_mlp": 1.01508594, + "epoch": 0.8183375920637307, + "flos": 19754345882880.0, + "grad_norm": 2.142040947001422, + "language_loss": 0.79348654, + "learning_rate": 3.362092943712107e-07, + "loss": 0.81428933, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.33203125, + "step": 13611, + "time_per_iteration": 2.343662738800049 + }, + { + "auxiliary_loss_clip": 0.01054179, + "auxiliary_loss_mlp": 0.01044465, + "balance_loss_clip": 1.0164752, + "balance_loss_mlp": 1.01597667, + "epoch": 0.8183977153163986, + "flos": 22340921402880.0, + "grad_norm": 1.93234131009409, + "language_loss": 0.78712237, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.80810887, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.38085938, + "step": 13612, + "time_per_iteration": 2.4059505462646484 + }, + { + "auxiliary_loss_clip": 0.0104914, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.01057124, + "balance_loss_mlp": 1.01523638, + "epoch": 0.8184578385690666, + "flos": 17711507877120.0, + "grad_norm": 2.398121193626671, + "language_loss": 0.87617505, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.89699268, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33984375, + "step": 13613, + "time_per_iteration": 2.344506025314331 + }, + { + "auxiliary_loss_clip": 0.01053358, + "auxiliary_loss_mlp": 0.01042751, + "balance_loss_clip": 1.02043533, + "balance_loss_mlp": 1.01776814, + "epoch": 0.8185179618217345, + "flos": 25700019776640.0, + "grad_norm": 1.379317113646301, + "language_loss": 0.73609698, + "learning_rate": 3.355612034397746e-07, + "loss": 0.75705802, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 13614, + "time_per_iteration": 2.4300644397735596 + }, + { + "auxiliary_loss_clip": 0.01053099, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.01521921, + "balance_loss_mlp": 1.01671529, + "epoch": 0.8185780850744026, + "flos": 25959075131520.0, + "grad_norm": 1.6645383539122192, + "language_loss": 0.82181466, + "learning_rate": 3.353452993497479e-07, + "loss": 0.84272605, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 13615, + "time_per_iteration": 2.4384942054748535 + }, + { + "auxiliary_loss_clip": 0.01051903, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.01720798, + "balance_loss_mlp": 1.01595986, + "epoch": 0.8186382083270705, + "flos": 25227260789760.0, + "grad_norm": 1.8894138733150982, + "language_loss": 0.7658788, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.78679925, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 13616, + "time_per_iteration": 2.37628436088562 + }, + { + "auxiliary_loss_clip": 0.01050294, + "auxiliary_loss_mlp": 0.01040687, + "balance_loss_clip": 1.01608253, + "balance_loss_mlp": 1.01537776, + "epoch": 0.8186983315797385, + "flos": 22414029523200.0, + "grad_norm": 1.66345882841564, + "language_loss": 0.75921977, + "learning_rate": 3.349136805494979e-07, + "loss": 0.78012955, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.34960938, + "step": 13617, + "time_per_iteration": 2.379288911819458 + }, + { + "auxiliary_loss_clip": 0.01048335, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.013309, + "balance_loss_mlp": 1.01457334, + "epoch": 0.8187584548324064, + "flos": 22016927185920.0, + "grad_norm": 1.9284174328306187, + "language_loss": 0.69024992, + "learning_rate": 3.346979658556415e-07, + "loss": 0.71106613, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33789062, + "step": 13618, + "time_per_iteration": 2.3553390502929688 + }, + { + "auxiliary_loss_clip": 0.01054741, + "auxiliary_loss_mlp": 0.01041905, + "balance_loss_clip": 1.0179081, + "balance_loss_mlp": 1.01693916, + "epoch": 0.8188185780850744, + "flos": 29240387262720.0, + "grad_norm": 1.9606441973917461, + "language_loss": 0.70442343, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72538996, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37890625, + "step": 13619, + "time_per_iteration": 3.83598256111145 + }, + { + "auxiliary_loss_clip": 0.01053371, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.01395535, + "balance_loss_mlp": 1.01682043, + "epoch": 0.8188787013377423, + "flos": 20695674493440.0, + "grad_norm": 1.7356634050670037, + "language_loss": 0.744587, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76549292, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 13620, + "time_per_iteration": 2.382858991622925 + }, + { + "auxiliary_loss_clip": 0.01049403, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.00945365, + "balance_loss_mlp": 1.01544619, + "epoch": 0.8189388245904103, + "flos": 23731825991040.0, + "grad_norm": 1.5478853590725143, + "language_loss": 0.76712209, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78792644, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33789062, + "step": 13621, + "time_per_iteration": 2.380847454071045 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.00908875, + "balance_loss_mlp": 1.01507783, + "epoch": 0.8189989478430784, + "flos": 28253881486080.0, + "grad_norm": 2.3474034889523687, + "language_loss": 0.67362976, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.69444942, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34960938, + "step": 13622, + "time_per_iteration": 2.468069314956665 + }, + { + "auxiliary_loss_clip": 0.01053237, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.01030326, + "balance_loss_mlp": 1.01713836, + "epoch": 0.8190590710957463, + "flos": 21396625326720.0, + "grad_norm": 1.8823720170211955, + "language_loss": 0.76386696, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.78475434, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36132812, + "step": 13623, + "time_per_iteration": 2.3773505687713623 + }, + { + "auxiliary_loss_clip": 0.0105218, + "auxiliary_loss_mlp": 0.01036659, + "balance_loss_clip": 1.01349628, + "balance_loss_mlp": 1.01598954, + "epoch": 0.8191191943484143, + "flos": 38795033197440.0, + "grad_norm": 2.0686613226596524, + "language_loss": 0.64567065, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.66655904, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36328125, + "step": 13624, + "time_per_iteration": 2.5161044597625732 + }, + { + "auxiliary_loss_clip": 0.01050145, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.01331997, + "balance_loss_mlp": 1.01517975, + "epoch": 0.8191793176010822, + "flos": 25445363253120.0, + "grad_norm": 2.3529825890341476, + "language_loss": 0.79795986, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.81881815, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 13625, + "time_per_iteration": 3.837476968765259 + }, + { + "auxiliary_loss_clip": 0.01055289, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_clip": 1.01818275, + "balance_loss_mlp": 1.01657236, + "epoch": 0.8192394408537502, + "flos": 25081847510400.0, + "grad_norm": 2.320522951694003, + "language_loss": 0.77455622, + "learning_rate": 3.329745223345244e-07, + "loss": 0.79553092, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.38671875, + "step": 13626, + "time_per_iteration": 2.4145047664642334 + }, + { + "auxiliary_loss_clip": 0.01051399, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.01685917, + "balance_loss_mlp": 1.01689982, + "epoch": 0.8192995641064181, + "flos": 27672472748160.0, + "grad_norm": 1.4979403470115598, + "language_loss": 0.74739105, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.76828879, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 13627, + "time_per_iteration": 2.454164743423462 + }, + { + "auxiliary_loss_clip": 0.0105286, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.0117116, + "balance_loss_mlp": 1.01701128, + "epoch": 0.8193596873590862, + "flos": 21287416993920.0, + "grad_norm": 1.5057113259201131, + "language_loss": 0.70063066, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.72150946, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 13628, + "time_per_iteration": 3.6954305171966553 + }, + { + "auxiliary_loss_clip": 0.01055868, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.01058984, + "balance_loss_mlp": 1.01797557, + "epoch": 0.8194198106117541, + "flos": 17491694757120.0, + "grad_norm": 1.6090719499399269, + "language_loss": 0.8593697, + "learning_rate": 3.323292738168171e-07, + "loss": 0.8802852, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 13629, + "time_per_iteration": 2.3951497077941895 + }, + { + "auxiliary_loss_clip": 0.01051668, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.01587474, + "balance_loss_mlp": 1.01550519, + "epoch": 0.8194799338644221, + "flos": 15267029057280.0, + "grad_norm": 2.306338351042696, + "language_loss": 0.74949354, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.77038783, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 13630, + "time_per_iteration": 2.3223626613616943 + }, + { + "auxiliary_loss_clip": 0.01054298, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.01525331, + "balance_loss_mlp": 1.01742971, + "epoch": 0.81954005711709, + "flos": 14717985016320.0, + "grad_norm": 1.7963438164423489, + "language_loss": 0.72829461, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74923897, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 13631, + "time_per_iteration": 2.3344781398773193 + }, + { + "auxiliary_loss_clip": 0.01052668, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.01452839, + "balance_loss_mlp": 1.01622975, + "epoch": 0.819600180369758, + "flos": 23072980124160.0, + "grad_norm": 1.5060742121739281, + "language_loss": 0.77223766, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.79314542, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 13632, + "time_per_iteration": 2.391266345977783 + }, + { + "auxiliary_loss_clip": 0.01050495, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.01060867, + "balance_loss_mlp": 1.01526058, + "epoch": 0.8196603036224259, + "flos": 27598561666560.0, + "grad_norm": 1.5940225972081272, + "language_loss": 0.67072642, + "learning_rate": 3.314698278332588e-07, + "loss": 0.69155604, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 13633, + "time_per_iteration": 2.399674654006958 + }, + { + "auxiliary_loss_clip": 0.0104908, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.01631808, + "balance_loss_mlp": 1.01519191, + "epoch": 0.8197204268750939, + "flos": 28580843168640.0, + "grad_norm": 1.6283420262251564, + "language_loss": 0.76385939, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.78473216, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 13634, + "time_per_iteration": 2.4749715328216553 + }, + { + "auxiliary_loss_clip": 0.01049413, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.01228166, + "balance_loss_mlp": 1.01543999, + "epoch": 0.819780550127762, + "flos": 23257181790720.0, + "grad_norm": 1.8999218493955068, + "language_loss": 0.82405239, + "learning_rate": 3.310404844338841e-07, + "loss": 0.84489286, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.33984375, + "step": 13635, + "time_per_iteration": 2.359858751296997 + }, + { + "auxiliary_loss_clip": 0.01051191, + "auxiliary_loss_mlp": 0.01039724, + "balance_loss_clip": 1.01532161, + "balance_loss_mlp": 1.01514769, + "epoch": 0.8198406733804299, + "flos": 26684116669440.0, + "grad_norm": 1.6374307372761028, + "language_loss": 0.76832461, + "learning_rate": 3.308259076607949e-07, + "loss": 0.7892338, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 13636, + "time_per_iteration": 2.398360252380371 + }, + { + "auxiliary_loss_clip": 0.01049484, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.0142312, + "balance_loss_mlp": 1.01524186, + "epoch": 0.8199007966330979, + "flos": 20083053133440.0, + "grad_norm": 1.9014296813170228, + "language_loss": 0.82381427, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.84467149, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34179688, + "step": 13637, + "time_per_iteration": 2.354078769683838 + }, + { + "auxiliary_loss_clip": 0.01050614, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.01143324, + "balance_loss_mlp": 1.01533985, + "epoch": 0.8199609198857658, + "flos": 31901502268800.0, + "grad_norm": 2.216425319537988, + "language_loss": 0.72927469, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.75011593, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 13638, + "time_per_iteration": 2.4448435306549072 + }, + { + "auxiliary_loss_clip": 0.01053208, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.01043582, + "balance_loss_mlp": 1.01566124, + "epoch": 0.8200210431384338, + "flos": 26468911848960.0, + "grad_norm": 2.006578319183566, + "language_loss": 0.80338019, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.82427245, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 13639, + "time_per_iteration": 2.386749505996704 + }, + { + "auxiliary_loss_clip": 0.01050869, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.01109695, + "balance_loss_mlp": 1.01579821, + "epoch": 0.8200811663911017, + "flos": 22090349508480.0, + "grad_norm": 1.5767737129119914, + "language_loss": 0.80111724, + "learning_rate": 3.299682336022589e-07, + "loss": 0.82196575, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 13640, + "time_per_iteration": 2.3856165409088135 + }, + { + "auxiliary_loss_clip": 0.0105509, + "auxiliary_loss_mlp": 0.01041786, + "balance_loss_clip": 1.01560783, + "balance_loss_mlp": 1.01623034, + "epoch": 0.8201412896437698, + "flos": 37592240348160.0, + "grad_norm": 1.7378667689231904, + "language_loss": 0.6367203, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65768909, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 13641, + "time_per_iteration": 3.9470717906951904 + }, + { + "auxiliary_loss_clip": 0.01050683, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.0124824, + "balance_loss_mlp": 1.01564419, + "epoch": 0.8202014128964377, + "flos": 19645312106880.0, + "grad_norm": 1.7802339919079493, + "language_loss": 0.74411994, + "learning_rate": 3.295397765071055e-07, + "loss": 0.76497757, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 13642, + "time_per_iteration": 2.3357226848602295 + }, + { + "auxiliary_loss_clip": 0.01052997, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.01347041, + "balance_loss_mlp": 1.0174439, + "epoch": 0.8202615361491057, + "flos": 31465995569280.0, + "grad_norm": 1.7405154711992856, + "language_loss": 0.71722579, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.73812538, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 13643, + "time_per_iteration": 2.4540491104125977 + }, + { + "auxiliary_loss_clip": 0.01051301, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.0127306, + "balance_loss_mlp": 1.01639354, + "epoch": 0.8203216594017736, + "flos": 24714456606720.0, + "grad_norm": 2.109123865090378, + "language_loss": 0.66278529, + "learning_rate": 3.291115727880256e-07, + "loss": 0.68364936, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13644, + "time_per_iteration": 2.3942558765411377 + }, + { + "auxiliary_loss_clip": 0.01053266, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.01229787, + "balance_loss_mlp": 1.01650798, + "epoch": 0.8203817826544416, + "flos": 26030612240640.0, + "grad_norm": 1.4050123235852237, + "language_loss": 0.71522141, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.73610824, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 13645, + "time_per_iteration": 2.399430513381958 + }, + { + "auxiliary_loss_clip": 0.01050033, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.01527524, + "balance_loss_mlp": 1.01503658, + "epoch": 0.8204419059071095, + "flos": 25953454402560.0, + "grad_norm": 1.7367781586935402, + "language_loss": 0.71933597, + "learning_rate": 3.286836225099707e-07, + "loss": 0.74020684, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34960938, + "step": 13646, + "time_per_iteration": 2.4114530086517334 + }, + { + "auxiliary_loss_clip": 0.01051869, + "auxiliary_loss_mlp": 0.0103983, + "balance_loss_clip": 1.01617849, + "balance_loss_mlp": 1.01589632, + "epoch": 0.8205020291597775, + "flos": 23579116237440.0, + "grad_norm": 2.432418979718719, + "language_loss": 0.8047933, + "learning_rate": 3.284697424316132e-07, + "loss": 0.8257103, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 13647, + "time_per_iteration": 2.3573670387268066 + }, + { + "auxiliary_loss_clip": 0.01050381, + "auxiliary_loss_mlp": 0.01038128, + "balance_loss_clip": 1.015836, + "balance_loss_mlp": 1.01613474, + "epoch": 0.8205621524124456, + "flos": 26797898390400.0, + "grad_norm": 1.4463165037124226, + "language_loss": 0.69187403, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.71275914, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34179688, + "step": 13648, + "time_per_iteration": 2.447502851486206 + }, + { + "auxiliary_loss_clip": 0.01050857, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.01293778, + "balance_loss_mlp": 1.01526821, + "epoch": 0.8206222756651135, + "flos": 27526605621120.0, + "grad_norm": 1.7552021762172534, + "language_loss": 0.80988908, + "learning_rate": 3.28042172436791e-07, + "loss": 0.83075011, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 13649, + "time_per_iteration": 2.4110682010650635 + }, + { + "auxiliary_loss_clip": 0.01052699, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.01552248, + "balance_loss_mlp": 1.01712382, + "epoch": 0.8206823989177815, + "flos": 21177545345280.0, + "grad_norm": 1.7849611286931002, + "language_loss": 0.69653559, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71746016, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 13650, + "time_per_iteration": 2.3758177757263184 + }, + { + "auxiliary_loss_clip": 0.01054068, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.00969422, + "balance_loss_mlp": 1.01670206, + "epoch": 0.8207425221704494, + "flos": 11508838398720.0, + "grad_norm": 2.4164552698358093, + "language_loss": 0.6226548, + "learning_rate": 3.276148560452001e-07, + "loss": 0.64352965, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.375, + "step": 13651, + "time_per_iteration": 2.3309831619262695 + }, + { + "auxiliary_loss_clip": 0.01053787, + "auxiliary_loss_mlp": 0.01041017, + "balance_loss_clip": 1.01783085, + "balance_loss_mlp": 1.01708078, + "epoch": 0.8208026454231174, + "flos": 19790969765760.0, + "grad_norm": 1.9620790485356434, + "language_loss": 0.73194975, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.7528978, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 13652, + "time_per_iteration": 2.419726610183716 + }, + { + "auxiliary_loss_clip": 0.01048927, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.01000059, + "balance_loss_mlp": 1.01581001, + "epoch": 0.8208627686757853, + "flos": 15666679923840.0, + "grad_norm": 1.879729385048439, + "language_loss": 0.73662359, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75743532, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33203125, + "step": 13653, + "time_per_iteration": 2.325052261352539 + }, + { + "auxiliary_loss_clip": 0.01054125, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.01432359, + "balance_loss_mlp": 1.01675344, + "epoch": 0.8209228919284534, + "flos": 37481286447360.0, + "grad_norm": 2.4210461574398323, + "language_loss": 0.64397931, + "learning_rate": 3.269743571056451e-07, + "loss": 0.66492069, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37304688, + "step": 13654, + "time_per_iteration": 2.518131971359253 + }, + { + "auxiliary_loss_clip": 0.01053519, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.01451957, + "balance_loss_mlp": 1.01555622, + "epoch": 0.8209830151811213, + "flos": 23111838334080.0, + "grad_norm": 1.573878586221211, + "language_loss": 0.7097019, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.73060519, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.37890625, + "step": 13655, + "time_per_iteration": 2.374218463897705 + }, + { + "auxiliary_loss_clip": 0.01051606, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.01632094, + "epoch": 0.8210431384337893, + "flos": 21287102791680.0, + "grad_norm": 2.1167436207498036, + "language_loss": 0.83530742, + "learning_rate": 3.265476750056162e-07, + "loss": 0.85620886, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 13656, + "time_per_iteration": 2.3978655338287354 + }, + { + "auxiliary_loss_clip": 0.01048442, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.01348901, + "balance_loss_mlp": 1.01607013, + "epoch": 0.8211032616864572, + "flos": 11501821215360.0, + "grad_norm": 3.101973590447265, + "language_loss": 0.7497052, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.7705276, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.32421875, + "step": 13657, + "time_per_iteration": 2.346237897872925 + }, + { + "auxiliary_loss_clip": 0.01050945, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.01377368, + "balance_loss_mlp": 1.01568151, + "epoch": 0.8211633849391252, + "flos": 29820294812160.0, + "grad_norm": 1.7592275728503664, + "language_loss": 0.569278, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.59016013, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 13658, + "time_per_iteration": 2.432565450668335 + }, + { + "auxiliary_loss_clip": 0.01052458, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.01329422, + "balance_loss_mlp": 1.01639915, + "epoch": 0.8212235081917931, + "flos": 13114598693760.0, + "grad_norm": 2.2456863124925484, + "language_loss": 0.80314744, + "learning_rate": 3.259081278068805e-07, + "loss": 0.82403314, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 13659, + "time_per_iteration": 3.616368532180786 + }, + { + "auxiliary_loss_clip": 0.01046958, + "auxiliary_loss_mlp": 0.0102825, + "balance_loss_clip": 1.00841379, + "balance_loss_mlp": 1.01454568, + "epoch": 0.8212836314444611, + "flos": 40513772252160.0, + "grad_norm": 1.6977335711673756, + "language_loss": 0.60716242, + "learning_rate": 3.256950723599887e-07, + "loss": 0.62791455, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.32421875, + "step": 13660, + "time_per_iteration": 2.532332181930542 + }, + { + "auxiliary_loss_clip": 0.01052209, + "auxiliary_loss_mlp": 0.01038461, + "balance_loss_clip": 1.01504779, + "balance_loss_mlp": 1.01577711, + "epoch": 0.8213437546971292, + "flos": 18769550762880.0, + "grad_norm": 1.8607194847785282, + "language_loss": 0.73910785, + "learning_rate": 3.254820804029075e-07, + "loss": 0.76001453, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 13661, + "time_per_iteration": 2.3380308151245117 + }, + { + "auxiliary_loss_clip": 0.01051333, + "auxiliary_loss_mlp": 0.01037316, + "balance_loss_clip": 1.01401043, + "balance_loss_mlp": 1.01492476, + "epoch": 0.8214038779497971, + "flos": 19681272673920.0, + "grad_norm": 2.189305543722619, + "language_loss": 0.76009095, + "learning_rate": 3.252691519437143e-07, + "loss": 0.78097749, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 13662, + "time_per_iteration": 2.375488519668579 + }, + { + "auxiliary_loss_clip": 0.01007668, + "auxiliary_loss_mlp": 0.01002759, + "balance_loss_clip": 1.00045872, + "balance_loss_mlp": 1.00100398, + "epoch": 0.8214640012024651, + "flos": 71599457848320.0, + "grad_norm": 0.7420737585933828, + "language_loss": 0.54107761, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56118184, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.06640625, + "step": 13663, + "time_per_iteration": 3.1538243293762207 + }, + { + "auxiliary_loss_clip": 0.01051303, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.01682162, + "balance_loss_mlp": 1.01489711, + "epoch": 0.821524124455133, + "flos": 14756319555840.0, + "grad_norm": 2.0252667493238854, + "language_loss": 0.67170167, + "learning_rate": 3.248434855512838e-07, + "loss": 0.69261765, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 13664, + "time_per_iteration": 3.8017237186431885 + }, + { + "auxiliary_loss_clip": 0.01049642, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.01324737, + "balance_loss_mlp": 1.01655626, + "epoch": 0.821584247707801, + "flos": 25081114371840.0, + "grad_norm": 1.430025292777419, + "language_loss": 0.75514597, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77598214, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33203125, + "step": 13665, + "time_per_iteration": 2.3990750312805176 + }, + { + "auxiliary_loss_clip": 0.01050885, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.0124433, + "balance_loss_mlp": 1.01570559, + "epoch": 0.8216443709604689, + "flos": 36829213384320.0, + "grad_norm": 2.1342566014921975, + "language_loss": 0.66423064, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.68509936, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 13666, + "time_per_iteration": 2.4688801765441895 + }, + { + "auxiliary_loss_clip": 0.01051038, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.01521039, + "balance_loss_mlp": 1.01682162, + "epoch": 0.821704494213137, + "flos": 25080660524160.0, + "grad_norm": 1.7857118734320716, + "language_loss": 0.77399957, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79488158, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34179688, + "step": 13667, + "time_per_iteration": 2.401698112487793 + }, + { + "auxiliary_loss_clip": 0.01052756, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.01315045, + "balance_loss_mlp": 1.01629221, + "epoch": 0.8217646174658049, + "flos": 14355237323520.0, + "grad_norm": 1.8006380230745018, + "language_loss": 0.78316867, + "learning_rate": 3.239929150961773e-07, + "loss": 0.80406594, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 13668, + "time_per_iteration": 3.638336420059204 + }, + { + "auxiliary_loss_clip": 0.01051074, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.01525223, + "balance_loss_mlp": 1.01564789, + "epoch": 0.8218247407184729, + "flos": 22089476724480.0, + "grad_norm": 2.0100748046801606, + "language_loss": 0.75601101, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.77688688, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35546875, + "step": 13669, + "time_per_iteration": 2.387239456176758 + }, + { + "auxiliary_loss_clip": 0.01050515, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.01219463, + "balance_loss_mlp": 1.01562881, + "epoch": 0.8218848639711408, + "flos": 16763092260480.0, + "grad_norm": 2.4468980491540977, + "language_loss": 0.79291874, + "learning_rate": 3.235680111625161e-07, + "loss": 0.81376886, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 13670, + "time_per_iteration": 2.3346989154815674 + }, + { + "auxiliary_loss_clip": 0.01055168, + "auxiliary_loss_mlp": 0.01037544, + "balance_loss_clip": 1.01240265, + "balance_loss_mlp": 1.01705599, + "epoch": 0.8219449872238088, + "flos": 25993604332800.0, + "grad_norm": 1.8258222579839392, + "language_loss": 0.76102912, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.78195626, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 13671, + "time_per_iteration": 2.4013445377349854 + }, + { + "auxiliary_loss_clip": 0.01054108, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.01213324, + "balance_loss_mlp": 1.0159483, + "epoch": 0.8220051104764767, + "flos": 20777021694720.0, + "grad_norm": 1.7824943752643938, + "language_loss": 0.78080714, + "learning_rate": 3.23143361510728e-07, + "loss": 0.80172253, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 13672, + "time_per_iteration": 2.3632864952087402 + }, + { + "auxiliary_loss_clip": 0.01051913, + "auxiliary_loss_mlp": 0.01040347, + "balance_loss_clip": 1.01548052, + "balance_loss_mlp": 1.0159359, + "epoch": 0.8220652337291448, + "flos": 14573968191360.0, + "grad_norm": 2.073093528543402, + "language_loss": 0.7617296, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.78265226, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.359375, + "step": 13673, + "time_per_iteration": 2.349086284637451 + }, + { + "auxiliary_loss_clip": 0.01053444, + "auxiliary_loss_mlp": 0.01038652, + "balance_loss_clip": 1.01417875, + "balance_loss_mlp": 1.01659727, + "epoch": 0.8221253569818128, + "flos": 23804724643200.0, + "grad_norm": 1.615873367473832, + "language_loss": 0.80120802, + "learning_rate": 3.227189662052254e-07, + "loss": 0.82212907, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 13674, + "time_per_iteration": 2.379401683807373 + }, + { + "auxiliary_loss_clip": 0.01050769, + "auxiliary_loss_mlp": 0.01037696, + "balance_loss_clip": 1.01343679, + "balance_loss_mlp": 1.0157249, + "epoch": 0.8221854802344807, + "flos": 21287172614400.0, + "grad_norm": 1.7303943918478601, + "language_loss": 0.71977067, + "learning_rate": 3.225068639524484e-07, + "loss": 0.7406553, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.34960938, + "step": 13675, + "time_per_iteration": 2.363065004348755 + }, + { + "auxiliary_loss_clip": 0.01050633, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.013901, + "balance_loss_mlp": 1.01612639, + "epoch": 0.8222456034871487, + "flos": 20955812100480.0, + "grad_norm": 2.6098853065842924, + "language_loss": 0.75069153, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.77154952, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34570312, + "step": 13676, + "time_per_iteration": 2.3545026779174805 + }, + { + "auxiliary_loss_clip": 0.01052071, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.01237726, + "balance_loss_mlp": 1.01603508, + "epoch": 0.8223057267398166, + "flos": 21396450769920.0, + "grad_norm": 1.809011174321349, + "language_loss": 0.8122431, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.833112, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 13677, + "time_per_iteration": 2.4093449115753174 + }, + { + "auxiliary_loss_clip": 0.01052746, + "auxiliary_loss_mlp": 0.01038322, + "balance_loss_clip": 1.01579142, + "balance_loss_mlp": 1.0159483, + "epoch": 0.8223658499924846, + "flos": 15267308348160.0, + "grad_norm": 1.7549310263364226, + "language_loss": 0.7133922, + "learning_rate": 3.218709388905245e-07, + "loss": 0.73430282, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3671875, + "step": 13678, + "time_per_iteration": 2.328145980834961 + }, + { + "auxiliary_loss_clip": 0.01050454, + "auxiliary_loss_mlp": 0.01038285, + "balance_loss_clip": 1.01602888, + "balance_loss_mlp": 1.01560354, + "epoch": 0.8224259732451525, + "flos": 31248172396800.0, + "grad_norm": 1.4069972995959763, + "language_loss": 0.72585535, + "learning_rate": 3.216590911288133e-07, + "loss": 0.74674273, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 13679, + "time_per_iteration": 2.472179412841797 + }, + { + "auxiliary_loss_clip": 0.01050499, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.00834274, + "balance_loss_mlp": 1.01498365, + "epoch": 0.8224860964978206, + "flos": 21573705075840.0, + "grad_norm": 2.02662258294978, + "language_loss": 0.71132404, + "learning_rate": 3.214473070099564e-07, + "loss": 0.73213869, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13680, + "time_per_iteration": 2.34512996673584 + }, + { + "auxiliary_loss_clip": 0.01051273, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.01201606, + "balance_loss_mlp": 1.01563263, + "epoch": 0.8225462197504885, + "flos": 25482056958720.0, + "grad_norm": 1.9510840540928875, + "language_loss": 0.60647523, + "learning_rate": 3.21235586541986e-07, + "loss": 0.62733579, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13681, + "time_per_iteration": 3.8736627101898193 + }, + { + "auxiliary_loss_clip": 0.01053923, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.01165557, + "balance_loss_mlp": 1.01605809, + "epoch": 0.8226063430031565, + "flos": 39383878055040.0, + "grad_norm": 1.5680507605801115, + "language_loss": 0.70608044, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.72697532, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37890625, + "step": 13682, + "time_per_iteration": 2.5191221237182617 + }, + { + "auxiliary_loss_clip": 0.01053147, + "auxiliary_loss_mlp": 0.01043877, + "balance_loss_clip": 1.01784194, + "balance_loss_mlp": 1.01644659, + "epoch": 0.8226664662558244, + "flos": 22814308794240.0, + "grad_norm": 1.8693623397348311, + "language_loss": 0.80388629, + "learning_rate": 3.20812336590816e-07, + "loss": 0.82485664, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 13683, + "time_per_iteration": 2.4139251708984375 + }, + { + "auxiliary_loss_clip": 0.01048606, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.01546884, + "balance_loss_mlp": 1.0151968, + "epoch": 0.8227265895084924, + "flos": 25664443234560.0, + "grad_norm": 1.9077755716809752, + "language_loss": 0.87738049, + "learning_rate": 3.206008071236661e-07, + "loss": 0.89823306, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33398438, + "step": 13684, + "time_per_iteration": 2.378774642944336 + }, + { + "auxiliary_loss_clip": 0.01049373, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.01389074, + "balance_loss_mlp": 1.01505518, + "epoch": 0.8227867127611603, + "flos": 26178015467520.0, + "grad_norm": 1.4507026702346044, + "language_loss": 0.80641437, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82726991, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 13685, + "time_per_iteration": 2.4716949462890625 + }, + { + "auxiliary_loss_clip": 0.01052829, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.0160706, + "epoch": 0.8228468360138284, + "flos": 22016962097280.0, + "grad_norm": 1.5525981825681079, + "language_loss": 0.69723439, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.718171, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3671875, + "step": 13686, + "time_per_iteration": 2.3744308948516846 + }, + { + "auxiliary_loss_clip": 0.01052174, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.0151248, + "balance_loss_mlp": 1.01495135, + "epoch": 0.8229069592664963, + "flos": 14902465973760.0, + "grad_norm": 1.930919656493472, + "language_loss": 0.78772295, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80864024, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 13687, + "time_per_iteration": 2.3539745807647705 + }, + { + "auxiliary_loss_clip": 0.0105235, + "auxiliary_loss_mlp": 0.01038446, + "balance_loss_clip": 1.01430631, + "balance_loss_mlp": 1.01625586, + "epoch": 0.8229670825191643, + "flos": 15668565137280.0, + "grad_norm": 1.7758782421818677, + "language_loss": 0.73762667, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.75853467, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 13688, + "time_per_iteration": 2.313310146331787 + }, + { + "auxiliary_loss_clip": 0.01051268, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.01481509, + "balance_loss_mlp": 1.01615644, + "epoch": 0.8230272057718323, + "flos": 23182432836480.0, + "grad_norm": 1.6903391610442347, + "language_loss": 0.74015427, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.76103687, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13689, + "time_per_iteration": 2.4368338584899902 + }, + { + "auxiliary_loss_clip": 0.01051561, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.01475191, + "balance_loss_mlp": 1.01543069, + "epoch": 0.8230873290245002, + "flos": 21031364016000.0, + "grad_norm": 1.8818249213994498, + "language_loss": 0.69594038, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71683574, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 13690, + "time_per_iteration": 2.362656354904175 + }, + { + "auxiliary_loss_clip": 0.01052087, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.0155673, + "balance_loss_mlp": 1.01586604, + "epoch": 0.8231474522771682, + "flos": 21249117365760.0, + "grad_norm": 1.6793630829494646, + "language_loss": 0.86230582, + "learning_rate": 3.191218844260988e-07, + "loss": 0.88321221, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 13691, + "time_per_iteration": 2.4108457565307617 + }, + { + "auxiliary_loss_clip": 0.01051606, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.01021302, + "balance_loss_mlp": 1.01633859, + "epoch": 0.8232075755298361, + "flos": 23840894678400.0, + "grad_norm": 2.559678180374444, + "language_loss": 0.78346282, + "learning_rate": 3.189108646472252e-07, + "loss": 0.80430126, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13692, + "time_per_iteration": 2.374525308609009 + }, + { + "auxiliary_loss_clip": 0.01050421, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.00974655, + "balance_loss_mlp": 1.01560473, + "epoch": 0.8232676987825042, + "flos": 21652852861440.0, + "grad_norm": 1.4796838820272846, + "language_loss": 0.72551727, + "learning_rate": 3.186999086154205e-07, + "loss": 0.74634123, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 13693, + "time_per_iteration": 2.4065725803375244 + }, + { + "auxiliary_loss_clip": 0.0104909, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.01364207, + "balance_loss_mlp": 1.0145061, + "epoch": 0.8233278220351721, + "flos": 26321508622080.0, + "grad_norm": 1.3351774240298224, + "language_loss": 0.844064, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.86490089, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34570312, + "step": 13694, + "time_per_iteration": 2.398897409439087 + }, + { + "auxiliary_loss_clip": 0.01052029, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.01283145, + "balance_loss_mlp": 1.01610231, + "epoch": 0.8233879452878401, + "flos": 21724739084160.0, + "grad_norm": 2.0761203967320543, + "language_loss": 0.78034443, + "learning_rate": 3.182781878250118e-07, + "loss": 0.8012237, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 13695, + "time_per_iteration": 2.4076693058013916 + }, + { + "auxiliary_loss_clip": 0.01052164, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.01241493, + "balance_loss_mlp": 1.01712561, + "epoch": 0.823448068540508, + "flos": 20556719815680.0, + "grad_norm": 1.9007416577707212, + "language_loss": 0.82522988, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.84609699, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13696, + "time_per_iteration": 2.348827362060547 + }, + { + "auxiliary_loss_clip": 0.01007741, + "auxiliary_loss_mlp": 0.01003216, + "balance_loss_clip": 1.00107014, + "balance_loss_mlp": 1.0009315, + "epoch": 0.823508191793176, + "flos": 67269947834880.0, + "grad_norm": 0.7456124811532253, + "language_loss": 0.63992459, + "learning_rate": 3.178567221188393e-07, + "loss": 0.66003418, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06835938, + "step": 13697, + "time_per_iteration": 3.1126952171325684 + }, + { + "auxiliary_loss_clip": 0.01048768, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.01265693, + "balance_loss_mlp": 1.01534414, + "epoch": 0.8235683150458439, + "flos": 17927515658880.0, + "grad_norm": 1.6683153295939304, + "language_loss": 0.73709458, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75791895, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.3359375, + "step": 13698, + "time_per_iteration": 3.6471986770629883 + }, + { + "auxiliary_loss_clip": 0.01051791, + "auxiliary_loss_mlp": 0.01034473, + "balance_loss_clip": 1.00973666, + "balance_loss_mlp": 1.01578915, + "epoch": 0.823628438298512, + "flos": 18915103687680.0, + "grad_norm": 2.083305990538955, + "language_loss": 0.73110628, + "learning_rate": 3.174355115608305e-07, + "loss": 0.75196898, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 13699, + "time_per_iteration": 2.3676772117614746 + }, + { + "auxiliary_loss_clip": 0.01050724, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.01172388, + "balance_loss_mlp": 1.01573539, + "epoch": 0.8236885615511799, + "flos": 18695500035840.0, + "grad_norm": 2.2577930345676887, + "language_loss": 0.82995224, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.85079175, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 13700, + "time_per_iteration": 2.327467679977417 + }, + { + "auxiliary_loss_clip": 0.01051805, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.0131526, + "balance_loss_mlp": 1.01564407, + "epoch": 0.8237486848038479, + "flos": 23693910387840.0, + "grad_norm": 1.8268670295827365, + "language_loss": 0.73855007, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75942791, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 13701, + "time_per_iteration": 2.4037322998046875 + }, + { + "auxiliary_loss_clip": 0.01053038, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.0098182, + "balance_loss_mlp": 1.01631689, + "epoch": 0.8238088080565159, + "flos": 23440161559680.0, + "grad_norm": 2.00875775776149, + "language_loss": 0.70594144, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.72682142, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 13702, + "time_per_iteration": 2.368945837020874 + }, + { + "auxiliary_loss_clip": 0.01052834, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.01155365, + "balance_loss_mlp": 1.0167563, + "epoch": 0.8238689313091838, + "flos": 22745459859840.0, + "grad_norm": 1.7779821416780852, + "language_loss": 0.7594527, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.78034103, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 13703, + "time_per_iteration": 2.393434762954712 + }, + { + "auxiliary_loss_clip": 0.01053456, + "auxiliary_loss_mlp": 0.01040066, + "balance_loss_clip": 1.0155921, + "balance_loss_mlp": 1.01590419, + "epoch": 0.8239290545618518, + "flos": 25628901603840.0, + "grad_norm": 2.266470965689381, + "language_loss": 0.71195197, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.73288715, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 13704, + "time_per_iteration": 2.3892483711242676 + }, + { + "auxiliary_loss_clip": 0.010495, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.01515639, + "balance_loss_mlp": 1.01472795, + "epoch": 0.8239891778145197, + "flos": 26025410448000.0, + "grad_norm": 1.9029960182633554, + "language_loss": 0.65494573, + "learning_rate": 3.161734114144916e-07, + "loss": 0.67582887, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34765625, + "step": 13705, + "time_per_iteration": 3.8251585960388184 + }, + { + "auxiliary_loss_clip": 0.01052628, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.01119828, + "balance_loss_mlp": 1.01596391, + "epoch": 0.8240493010671878, + "flos": 21832236760320.0, + "grad_norm": 2.067869870464641, + "language_loss": 0.70667964, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.72755706, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 13706, + "time_per_iteration": 2.3849499225616455 + }, + { + "auxiliary_loss_clip": 0.01051745, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.013641, + "balance_loss_mlp": 1.01678371, + "epoch": 0.8241094243198557, + "flos": 18550924629120.0, + "grad_norm": 1.6907490182608942, + "language_loss": 0.70994508, + "learning_rate": 3.157532220876475e-07, + "loss": 0.73082322, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13707, + "time_per_iteration": 3.726036548614502 + }, + { + "auxiliary_loss_clip": 0.0105212, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.01244688, + "balance_loss_mlp": 1.01534212, + "epoch": 0.8241695475725237, + "flos": 25445991657600.0, + "grad_norm": 1.8775499190859868, + "language_loss": 0.80200076, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.82288575, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 13708, + "time_per_iteration": 2.3908145427703857 + }, + { + "auxiliary_loss_clip": 0.0105183, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.01318884, + "balance_loss_mlp": 1.01571441, + "epoch": 0.8242296708251916, + "flos": 18988665655680.0, + "grad_norm": 2.094011680408907, + "language_loss": 0.69728827, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.71816814, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36132812, + "step": 13709, + "time_per_iteration": 2.349421739578247 + }, + { + "auxiliary_loss_clip": 0.01052518, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.01273739, + "balance_loss_mlp": 1.01680636, + "epoch": 0.8242897940778596, + "flos": 22599802200960.0, + "grad_norm": 2.043366930458977, + "language_loss": 0.83966368, + "learning_rate": 3.151234171183319e-07, + "loss": 0.86054784, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35742188, + "step": 13710, + "time_per_iteration": 2.384967803955078 + }, + { + "auxiliary_loss_clip": 0.01050417, + "auxiliary_loss_mlp": 0.01038897, + "balance_loss_clip": 1.01618779, + "balance_loss_mlp": 1.01473761, + "epoch": 0.8243499173305275, + "flos": 21467150006400.0, + "grad_norm": 1.854981875320383, + "language_loss": 0.79263496, + "learning_rate": 3.149136098993257e-07, + "loss": 0.81352806, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35742188, + "step": 13711, + "time_per_iteration": 2.3450136184692383 + }, + { + "auxiliary_loss_clip": 0.0105149, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.0132246, + "balance_loss_mlp": 1.0157392, + "epoch": 0.8244100405831956, + "flos": 20009351520000.0, + "grad_norm": 1.8966336712692795, + "language_loss": 0.67359197, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.69446874, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35742188, + "step": 13712, + "time_per_iteration": 2.3466148376464844 + }, + { + "auxiliary_loss_clip": 0.01052535, + "auxiliary_loss_mlp": 0.01040053, + "balance_loss_clip": 1.01517439, + "balance_loss_mlp": 1.0165453, + "epoch": 0.8244701638358635, + "flos": 26429529968640.0, + "grad_norm": 1.7683742884443596, + "language_loss": 0.7514528, + "learning_rate": 3.14494187165202e-07, + "loss": 0.77237868, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 13713, + "time_per_iteration": 2.3883330821990967 + }, + { + "auxiliary_loss_clip": 0.01051023, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.00772572, + "balance_loss_mlp": 1.01480937, + "epoch": 0.8245302870885315, + "flos": 17639028161280.0, + "grad_norm": 1.9502625444840858, + "language_loss": 0.82376063, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.84459436, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 13714, + "time_per_iteration": 2.3304924964904785 + }, + { + "auxiliary_loss_clip": 0.01052617, + "auxiliary_loss_mlp": 0.01040467, + "balance_loss_clip": 1.01670885, + "balance_loss_mlp": 1.01726341, + "epoch": 0.8245904103411995, + "flos": 26208425128320.0, + "grad_norm": 1.6623445712396068, + "language_loss": 0.67541242, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.69634324, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 13715, + "time_per_iteration": 2.431386947631836 + }, + { + "auxiliary_loss_clip": 0.0105384, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.01191115, + "balance_loss_mlp": 1.01662421, + "epoch": 0.8246505335938674, + "flos": 24203991484800.0, + "grad_norm": 2.3056388832187213, + "language_loss": 0.76429844, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.78518689, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37109375, + "step": 13716, + "time_per_iteration": 2.3897364139556885 + }, + { + "auxiliary_loss_clip": 0.01007645, + "auxiliary_loss_mlp": 0.0100343, + "balance_loss_clip": 1.00128424, + "balance_loss_mlp": 1.00077212, + "epoch": 0.8247106568465354, + "flos": 67088434343040.0, + "grad_norm": 0.7201942139237212, + "language_loss": 0.59022033, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61033112, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06884766, + "step": 13717, + "time_per_iteration": 3.1317803859710693 + }, + { + "auxiliary_loss_clip": 0.01050625, + "auxiliary_loss_mlp": 0.01036374, + "balance_loss_clip": 1.01489198, + "balance_loss_mlp": 1.01497698, + "epoch": 0.8247707800992033, + "flos": 12567404954880.0, + "grad_norm": 2.2083422575513496, + "language_loss": 0.81297016, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.83384007, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35546875, + "step": 13718, + "time_per_iteration": 2.352323532104492 + }, + { + "auxiliary_loss_clip": 0.01049063, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.01844597, + "balance_loss_mlp": 1.01490891, + "epoch": 0.8248309033518714, + "flos": 15922732901760.0, + "grad_norm": 1.9954337661688828, + "language_loss": 0.69619828, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71708977, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 13719, + "time_per_iteration": 2.3277177810668945 + }, + { + "auxiliary_loss_clip": 0.01052147, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.01382399, + "balance_loss_mlp": 1.01532471, + "epoch": 0.8248910266045393, + "flos": 17563825359360.0, + "grad_norm": 2.242367341032914, + "language_loss": 0.70832318, + "learning_rate": 3.13028221321197e-07, + "loss": 0.72924328, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 13720, + "time_per_iteration": 2.3641624450683594 + }, + { + "auxiliary_loss_clip": 0.01053947, + "auxiliary_loss_mlp": 0.01038022, + "balance_loss_clip": 1.01304746, + "balance_loss_mlp": 1.01688361, + "epoch": 0.8249511498572073, + "flos": 28618444569600.0, + "grad_norm": 1.6906866637176536, + "language_loss": 0.76969123, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.79061091, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 13721, + "time_per_iteration": 3.868593454360962 + }, + { + "auxiliary_loss_clip": 0.01050118, + "auxiliary_loss_mlp": 0.0103269, + "balance_loss_clip": 1.01255548, + "balance_loss_mlp": 1.0152241, + "epoch": 0.8250112731098752, + "flos": 25555409458560.0, + "grad_norm": 1.8044308012485728, + "language_loss": 0.78912723, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80995524, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.34765625, + "step": 13722, + "time_per_iteration": 2.396998167037964 + }, + { + "auxiliary_loss_clip": 0.01050425, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.01130223, + "balance_loss_mlp": 1.01596189, + "epoch": 0.8250713963625432, + "flos": 27744917552640.0, + "grad_norm": 1.7520334927874042, + "language_loss": 0.64098966, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.66182053, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 13723, + "time_per_iteration": 2.427154064178467 + }, + { + "auxiliary_loss_clip": 0.01052913, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.01373231, + "balance_loss_mlp": 1.0165993, + "epoch": 0.8251315196152111, + "flos": 21609700554240.0, + "grad_norm": 1.4859289116893657, + "language_loss": 0.75809765, + "learning_rate": 3.121919337215666e-07, + "loss": 0.77900505, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 13724, + "time_per_iteration": 2.371713876724243 + }, + { + "auxiliary_loss_clip": 0.01052832, + "auxiliary_loss_mlp": 0.01041425, + "balance_loss_clip": 1.01659405, + "balance_loss_mlp": 1.01658201, + "epoch": 0.8251916428678792, + "flos": 28578259728000.0, + "grad_norm": 2.3848005751551757, + "language_loss": 0.64839172, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66933435, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 13725, + "time_per_iteration": 2.416520118713379 + }, + { + "auxiliary_loss_clip": 0.01050748, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.01833391, + "balance_loss_mlp": 1.01587284, + "epoch": 0.8252517661205471, + "flos": 23074097287680.0, + "grad_norm": 1.5642246146217766, + "language_loss": 0.82790619, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.8488254, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 13726, + "time_per_iteration": 2.3837780952453613 + }, + { + "auxiliary_loss_clip": 0.01047529, + "auxiliary_loss_mlp": 0.0103571, + "balance_loss_clip": 1.01565921, + "balance_loss_mlp": 1.0140605, + "epoch": 0.8253118893732151, + "flos": 31758218582400.0, + "grad_norm": 1.69345792485503, + "language_loss": 0.71831417, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.73914659, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33398438, + "step": 13727, + "time_per_iteration": 2.457618474960327 + }, + { + "auxiliary_loss_clip": 0.0105435, + "auxiliary_loss_mlp": 0.01039851, + "balance_loss_clip": 1.0138638, + "balance_loss_mlp": 1.01714301, + "epoch": 0.8253720126258831, + "flos": 18295430232960.0, + "grad_norm": 1.7039133327494973, + "language_loss": 0.63871086, + "learning_rate": 3.113566701515036e-07, + "loss": 0.65965283, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37304688, + "step": 13728, + "time_per_iteration": 2.373444080352783 + }, + { + "auxiliary_loss_clip": 0.01054819, + "auxiliary_loss_mlp": 0.0103844, + "balance_loss_clip": 1.01403749, + "balance_loss_mlp": 1.01728225, + "epoch": 0.825432135878551, + "flos": 26796117911040.0, + "grad_norm": 1.8318426646296377, + "language_loss": 0.72683394, + "learning_rate": 3.111480143230092e-07, + "loss": 0.74776649, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 13729, + "time_per_iteration": 2.390575647354126 + }, + { + "auxiliary_loss_clip": 0.01007064, + "auxiliary_loss_mlp": 0.01002672, + "balance_loss_clip": 1.0003233, + "balance_loss_mlp": 1.00046849, + "epoch": 0.825492259131219, + "flos": 54216552487680.0, + "grad_norm": 0.8471337922207444, + "language_loss": 0.62750763, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64760494, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.06591797, + "step": 13730, + "time_per_iteration": 2.8276736736297607 + }, + { + "auxiliary_loss_clip": 0.01051951, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.01548135, + "balance_loss_mlp": 1.01659226, + "epoch": 0.825552382383887, + "flos": 43754655162240.0, + "grad_norm": 2.002134561137746, + "language_loss": 0.65245491, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.6733492, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 13731, + "time_per_iteration": 2.546267509460449 + }, + { + "auxiliary_loss_clip": 0.01053281, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.01453674, + "balance_loss_mlp": 1.0155654, + "epoch": 0.825612505636555, + "flos": 12602038890240.0, + "grad_norm": 2.0595488610429675, + "language_loss": 0.70732892, + "learning_rate": 3.105224311177812e-07, + "loss": 0.72823465, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.37695312, + "step": 13732, + "time_per_iteration": 2.3264107704162598 + }, + { + "auxiliary_loss_clip": 0.01053072, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.0168829, + "balance_loss_mlp": 1.01567078, + "epoch": 0.8256726288892229, + "flos": 17594863424640.0, + "grad_norm": 3.241746381902063, + "language_loss": 0.7422685, + "learning_rate": 3.103140315024817e-07, + "loss": 0.7632215, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 13733, + "time_per_iteration": 2.3161416053771973 + }, + { + "auxiliary_loss_clip": 0.01048847, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.01110864, + "balance_loss_mlp": 1.01472259, + "epoch": 0.8257327521418909, + "flos": 23804654820480.0, + "grad_norm": 1.4962230508326781, + "language_loss": 0.82774031, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84855068, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 13734, + "time_per_iteration": 2.4225738048553467 + }, + { + "auxiliary_loss_clip": 0.01050733, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.00889063, + "balance_loss_mlp": 1.01661229, + "epoch": 0.8257928753945588, + "flos": 19280120618880.0, + "grad_norm": 1.8422566488163872, + "language_loss": 0.84302306, + "learning_rate": 3.098974244989676e-07, + "loss": 0.8638351, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33984375, + "step": 13735, + "time_per_iteration": 2.333651304244995 + }, + { + "auxiliary_loss_clip": 0.01051862, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.01177704, + "balance_loss_mlp": 1.01654196, + "epoch": 0.8258529986472268, + "flos": 18477851420160.0, + "grad_norm": 1.9368549063382565, + "language_loss": 0.7217797, + "learning_rate": 3.096892171265497e-07, + "loss": 0.74262047, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.35351562, + "step": 13736, + "time_per_iteration": 2.381186008453369 + }, + { + "auxiliary_loss_clip": 0.01007357, + "auxiliary_loss_mlp": 0.01003346, + "balance_loss_clip": 1.00123656, + "balance_loss_mlp": 1.00075221, + "epoch": 0.8259131218998947, + "flos": 62135133200640.0, + "grad_norm": 0.8596043439707867, + "language_loss": 0.68088019, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70098722, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06591797, + "step": 13737, + "time_per_iteration": 3.0227253437042236 + }, + { + "auxiliary_loss_clip": 0.01052281, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.01192832, + "balance_loss_mlp": 1.01600826, + "epoch": 0.8259732451525628, + "flos": 22158081279360.0, + "grad_norm": 1.8930576818336164, + "language_loss": 0.70642078, + "learning_rate": 3.0927299467987e-07, + "loss": 0.72728372, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 13738, + "time_per_iteration": 3.616687536239624 + }, + { + "auxiliary_loss_clip": 0.01055874, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.01733685, + "balance_loss_mlp": 1.01789188, + "epoch": 0.8260333684052307, + "flos": 38360154902400.0, + "grad_norm": 2.247222552464759, + "language_loss": 0.64482176, + "learning_rate": 3.090649796213911e-07, + "loss": 0.66582274, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 13739, + "time_per_iteration": 2.507190704345703 + }, + { + "auxiliary_loss_clip": 0.01007507, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 0.99967909, + "balance_loss_mlp": 1.00071645, + "epoch": 0.8260934916578987, + "flos": 62182474871040.0, + "grad_norm": 0.8194267873820062, + "language_loss": 0.59452707, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61462063, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.06787109, + "step": 13740, + "time_per_iteration": 3.0841991901397705 + }, + { + "auxiliary_loss_clip": 0.01054657, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.0150969, + "balance_loss_mlp": 1.01659513, + "epoch": 0.8261536149105667, + "flos": 22564365304320.0, + "grad_norm": 1.9198958775916841, + "language_loss": 0.76910233, + "learning_rate": 3.086491418735959e-07, + "loss": 0.79005855, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 13741, + "time_per_iteration": 2.42486572265625 + }, + { + "auxiliary_loss_clip": 0.01052388, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_clip": 1.01780307, + "balance_loss_mlp": 1.01635325, + "epoch": 0.8262137381632346, + "flos": 32524108277760.0, + "grad_norm": 2.2226064025118664, + "language_loss": 0.63181102, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.65275359, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 13742, + "time_per_iteration": 2.4346420764923096 + }, + { + "auxiliary_loss_clip": 0.01054467, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.01561832, + "balance_loss_mlp": 1.0166986, + "epoch": 0.8262738614159026, + "flos": 14135598760320.0, + "grad_norm": 3.079206878354677, + "language_loss": 0.69156039, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.7125361, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.37695312, + "step": 13743, + "time_per_iteration": 2.3485946655273438 + }, + { + "auxiliary_loss_clip": 0.01052704, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.02092826, + "balance_loss_mlp": 1.01695454, + "epoch": 0.8263339846685706, + "flos": 19824416714880.0, + "grad_norm": 1.887763168095159, + "language_loss": 0.67104936, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.69200909, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 13744, + "time_per_iteration": 3.775764226913452 + }, + { + "auxiliary_loss_clip": 0.01052515, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.01308846, + "balance_loss_mlp": 1.01660144, + "epoch": 0.8263941079212386, + "flos": 22744901278080.0, + "grad_norm": 1.8519567134814667, + "language_loss": 0.76621699, + "learning_rate": 3.078182360753612e-07, + "loss": 0.78710234, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 13745, + "time_per_iteration": 2.3882153034210205 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.01518714, + "balance_loss_mlp": 1.01446176, + "epoch": 0.8264542311739065, + "flos": 20119607193600.0, + "grad_norm": 1.980059763992647, + "language_loss": 0.80219698, + "learning_rate": 3.076106700253709e-07, + "loss": 0.82307625, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 13746, + "time_per_iteration": 2.335047721862793 + }, + { + "auxiliary_loss_clip": 0.01056986, + "auxiliary_loss_mlp": 0.01044792, + "balance_loss_clip": 1.01892424, + "balance_loss_mlp": 1.01900578, + "epoch": 0.8265143544265745, + "flos": 16836200380800.0, + "grad_norm": 2.1482314666378293, + "language_loss": 0.70225751, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.72327536, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 13747, + "time_per_iteration": 3.7638440132141113 + }, + { + "auxiliary_loss_clip": 0.01053447, + "auxiliary_loss_mlp": 0.01037844, + "balance_loss_clip": 1.01396644, + "balance_loss_mlp": 1.01627421, + "epoch": 0.8265744776792424, + "flos": 22017485767680.0, + "grad_norm": 1.9391881632777639, + "language_loss": 0.76672113, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.78763402, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 13748, + "time_per_iteration": 2.364699125289917 + }, + { + "auxiliary_loss_clip": 0.01049228, + "auxiliary_loss_mlp": 0.01036504, + "balance_loss_clip": 1.01622629, + "balance_loss_mlp": 1.01575422, + "epoch": 0.8266346009319104, + "flos": 19243845849600.0, + "grad_norm": 2.664302121373129, + "language_loss": 0.64361084, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66446817, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33398438, + "step": 13749, + "time_per_iteration": 2.3361499309539795 + }, + { + "auxiliary_loss_clip": 0.01049959, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.01420939, + "balance_loss_mlp": 1.01536822, + "epoch": 0.8266947241845783, + "flos": 24165726768000.0, + "grad_norm": 1.5699244249134474, + "language_loss": 0.74999297, + "learning_rate": 3.067810476598132e-07, + "loss": 0.77085233, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34570312, + "step": 13750, + "time_per_iteration": 2.3706769943237305 + }, + { + "auxiliary_loss_clip": 0.01051969, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.01786137, + "balance_loss_mlp": 1.01605988, + "epoch": 0.8267548474372464, + "flos": 21104751427200.0, + "grad_norm": 2.3474379514526564, + "language_loss": 0.66507316, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68601149, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 13751, + "time_per_iteration": 2.3911654949188232 + }, + { + "auxiliary_loss_clip": 0.01048849, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.01484108, + "balance_loss_mlp": 1.01453328, + "epoch": 0.8268149706899143, + "flos": 39966718158720.0, + "grad_norm": 1.51298149150531, + "language_loss": 0.6147306, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.63558602, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 13752, + "time_per_iteration": 2.5097508430480957 + }, + { + "auxiliary_loss_clip": 0.01007159, + "auxiliary_loss_mlp": 0.01002187, + "balance_loss_clip": 1.00005269, + "balance_loss_mlp": 1.00031424, + "epoch": 0.8268750939425823, + "flos": 65779611960960.0, + "grad_norm": 0.7765846021345417, + "language_loss": 0.57579887, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59589231, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.06835938, + "step": 13753, + "time_per_iteration": 3.0536856651306152 + }, + { + "auxiliary_loss_clip": 0.01007313, + "auxiliary_loss_mlp": 0.01007027, + "balance_loss_clip": 1.00489342, + "balance_loss_mlp": 1.00056815, + "epoch": 0.8269352171952503, + "flos": 52978846412160.0, + "grad_norm": 0.7069682616430156, + "language_loss": 0.5501436, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.57028699, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.06738281, + "step": 13754, + "time_per_iteration": 3.107558250427246 + }, + { + "auxiliary_loss_clip": 0.01049186, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.01640403, + "balance_loss_mlp": 1.0145998, + "epoch": 0.8269953404479182, + "flos": 23075004983040.0, + "grad_norm": 1.8921019108325405, + "language_loss": 0.70836234, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.7292257, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34570312, + "step": 13755, + "time_per_iteration": 2.3663318157196045 + }, + { + "auxiliary_loss_clip": 0.01051061, + "auxiliary_loss_mlp": 0.01039404, + "balance_loss_clip": 1.01788652, + "balance_loss_mlp": 1.01684356, + "epoch": 0.8270554637005862, + "flos": 14209125816960.0, + "grad_norm": 2.043156823382538, + "language_loss": 0.71967268, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.74057728, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 13756, + "time_per_iteration": 2.376335859298706 + }, + { + "auxiliary_loss_clip": 0.01053294, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.01517296, + "balance_loss_mlp": 1.01719117, + "epoch": 0.8271155869532542, + "flos": 21760978942080.0, + "grad_norm": 1.7339633548618025, + "language_loss": 0.73984134, + "learning_rate": 3.053316807931623e-07, + "loss": 0.76076186, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36132812, + "step": 13757, + "time_per_iteration": 2.3743927478790283 + }, + { + "auxiliary_loss_clip": 0.01053738, + "auxiliary_loss_mlp": 0.01041493, + "balance_loss_clip": 1.01526666, + "balance_loss_mlp": 1.01643682, + "epoch": 0.8271757102059222, + "flos": 15119625830400.0, + "grad_norm": 2.2490857398343986, + "language_loss": 0.70114172, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.72209406, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 13758, + "time_per_iteration": 2.3407645225524902 + }, + { + "auxiliary_loss_clip": 0.01049417, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.0121063, + "balance_loss_mlp": 1.01472569, + "epoch": 0.8272358334585901, + "flos": 24132594021120.0, + "grad_norm": 1.499913236473441, + "language_loss": 0.70495522, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72579527, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 13759, + "time_per_iteration": 2.3859896659851074 + }, + { + "auxiliary_loss_clip": 0.01051509, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.0128634, + "balance_loss_mlp": 1.01598716, + "epoch": 0.8272959567112581, + "flos": 18989678085120.0, + "grad_norm": 1.6237615617864072, + "language_loss": 0.71326077, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73415029, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35546875, + "step": 13760, + "time_per_iteration": 3.735513210296631 + }, + { + "auxiliary_loss_clip": 0.01050736, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.01408446, + "balance_loss_mlp": 1.01649499, + "epoch": 0.827356079963926, + "flos": 20630561074560.0, + "grad_norm": 1.5853894668209771, + "language_loss": 0.78532076, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.80618179, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 13761, + "time_per_iteration": 2.35186505317688 + }, + { + "auxiliary_loss_clip": 0.01049634, + "auxiliary_loss_mlp": 0.01040045, + "balance_loss_clip": 1.01842046, + "balance_loss_mlp": 1.01634729, + "epoch": 0.827416203216594, + "flos": 22415600534400.0, + "grad_norm": 1.636737354445705, + "language_loss": 0.71192634, + "learning_rate": 3.042983464482387e-07, + "loss": 0.73282313, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33203125, + "step": 13762, + "time_per_iteration": 2.391179323196411 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.01218283, + "balance_loss_mlp": 1.01551795, + "epoch": 0.827476326469262, + "flos": 19025184804480.0, + "grad_norm": 1.895140834418085, + "language_loss": 0.71432304, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.73516947, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 13763, + "time_per_iteration": 2.3426053524017334 + }, + { + "auxiliary_loss_clip": 0.01006985, + "auxiliary_loss_mlp": 0.0100321, + "balance_loss_clip": 1.00089765, + "balance_loss_mlp": 1.00036263, + "epoch": 0.82753644972193, + "flos": 68497180502400.0, + "grad_norm": 0.8528146917432381, + "language_loss": 0.65251702, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67261899, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.06640625, + "step": 13764, + "time_per_iteration": 3.079747200012207 + }, + { + "auxiliary_loss_clip": 0.01053139, + "auxiliary_loss_mlp": 0.01039773, + "balance_loss_clip": 1.01497769, + "balance_loss_mlp": 1.01700807, + "epoch": 0.8275965729745979, + "flos": 18404429097600.0, + "grad_norm": 2.083606920641129, + "language_loss": 0.79720962, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.81813872, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 13765, + "time_per_iteration": 2.3299560546875 + }, + { + "auxiliary_loss_clip": 0.01052633, + "auxiliary_loss_mlp": 0.01042159, + "balance_loss_clip": 1.01830554, + "balance_loss_mlp": 1.01561797, + "epoch": 0.8276566962272659, + "flos": 28510807248000.0, + "grad_norm": 1.6456613241945417, + "language_loss": 0.63294125, + "learning_rate": 3.034728363464214e-07, + "loss": 0.65388918, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 13766, + "time_per_iteration": 2.4275379180908203 + }, + { + "auxiliary_loss_clip": 0.01051572, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.01347017, + "balance_loss_mlp": 1.01591229, + "epoch": 0.8277168194799339, + "flos": 20229199551360.0, + "grad_norm": 1.6075299287805156, + "language_loss": 0.83589661, + "learning_rate": 3.03266619632609e-07, + "loss": 0.85678536, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35742188, + "step": 13767, + "time_per_iteration": 2.350020170211792 + }, + { + "auxiliary_loss_clip": 0.01052686, + "auxiliary_loss_mlp": 0.01042005, + "balance_loss_clip": 1.01771057, + "balance_loss_mlp": 1.01605737, + "epoch": 0.8277769427326018, + "flos": 28475335440000.0, + "grad_norm": 2.1493476042476316, + "language_loss": 0.69444752, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71539438, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 13768, + "time_per_iteration": 2.441054582595825 + }, + { + "auxiliary_loss_clip": 0.01049897, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.01337028, + "balance_loss_mlp": 1.01531804, + "epoch": 0.8278370659852698, + "flos": 27196432093440.0, + "grad_norm": 1.7636803453809866, + "language_loss": 0.75820464, + "learning_rate": 3.028543792337006e-07, + "loss": 0.77906346, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34570312, + "step": 13769, + "time_per_iteration": 2.3991925716400146 + }, + { + "auxiliary_loss_clip": 0.01052602, + "auxiliary_loss_mlp": 0.01041078, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.01604247, + "epoch": 0.8278971892379378, + "flos": 37814601997440.0, + "grad_norm": 1.8224599423572245, + "language_loss": 0.75199759, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.77293444, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 13770, + "time_per_iteration": 2.509031295776367 + }, + { + "auxiliary_loss_clip": 0.01052885, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.01595819, + "balance_loss_mlp": 1.01700926, + "epoch": 0.8279573124906058, + "flos": 22559198423040.0, + "grad_norm": 1.610761780167085, + "language_loss": 0.76836324, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.7893002, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 13771, + "time_per_iteration": 2.358564615249634 + }, + { + "auxiliary_loss_clip": 0.01051689, + "auxiliary_loss_mlp": 0.0103943, + "balance_loss_clip": 1.01765084, + "balance_loss_mlp": 1.01590943, + "epoch": 0.8280174357432737, + "flos": 36063149132160.0, + "grad_norm": 1.4946167505200783, + "language_loss": 0.73615336, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.75706458, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.359375, + "step": 13772, + "time_per_iteration": 2.4893031120300293 + }, + { + "auxiliary_loss_clip": 0.01050463, + "auxiliary_loss_mlp": 0.01037726, + "balance_loss_clip": 1.01251292, + "balance_loss_mlp": 1.01585746, + "epoch": 0.8280775589959417, + "flos": 22960106098560.0, + "grad_norm": 2.3518812610166706, + "language_loss": 0.75801677, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.77889872, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.34570312, + "step": 13773, + "time_per_iteration": 2.3506124019622803 + }, + { + "auxiliary_loss_clip": 0.01052566, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.00860691, + "balance_loss_mlp": 1.01683044, + "epoch": 0.8281376822486096, + "flos": 26062208887680.0, + "grad_norm": 1.85577387804443, + "language_loss": 0.76618958, + "learning_rate": 3.01824904601915e-07, + "loss": 0.78703737, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35742188, + "step": 13774, + "time_per_iteration": 2.411689281463623 + }, + { + "auxiliary_loss_clip": 0.01054427, + "auxiliary_loss_mlp": 0.01037829, + "balance_loss_clip": 1.01319993, + "balance_loss_mlp": 1.01660085, + "epoch": 0.8281978055012776, + "flos": 20666731109760.0, + "grad_norm": 2.508567694578778, + "language_loss": 0.76004612, + "learning_rate": 3.01619202829249e-07, + "loss": 0.78096873, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 13775, + "time_per_iteration": 2.3425917625427246 + }, + { + "auxiliary_loss_clip": 0.01054133, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.01475048, + "balance_loss_mlp": 1.01612878, + "epoch": 0.8282579287539455, + "flos": 29313984142080.0, + "grad_norm": 2.103369488104231, + "language_loss": 0.74504805, + "learning_rate": 3.01413565459353e-07, + "loss": 0.76600754, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 13776, + "time_per_iteration": 2.447099208831787 + }, + { + "auxiliary_loss_clip": 0.01052396, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.01158619, + "balance_loss_mlp": 1.01567721, + "epoch": 0.8283180520066136, + "flos": 15705258842880.0, + "grad_norm": 2.040335963238092, + "language_loss": 0.78148699, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.80237287, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 13777, + "time_per_iteration": 3.5787482261657715 + }, + { + "auxiliary_loss_clip": 0.01050825, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.0078392, + "balance_loss_mlp": 1.01695979, + "epoch": 0.8283781752592815, + "flos": 24790287813120.0, + "grad_norm": 1.5960846157181359, + "language_loss": 0.83586824, + "learning_rate": 3.010024839590604e-07, + "loss": 0.85666823, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33789062, + "step": 13778, + "time_per_iteration": 2.4183297157287598 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.01203406, + "balance_loss_mlp": 1.01517332, + "epoch": 0.8284382985119495, + "flos": 18981997585920.0, + "grad_norm": 1.7673606845096157, + "language_loss": 0.75397754, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.77481824, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.33789062, + "step": 13779, + "time_per_iteration": 2.339005708694458 + }, + { + "auxiliary_loss_clip": 0.01007046, + "auxiliary_loss_mlp": 0.01004806, + "balance_loss_clip": 1.00256479, + "balance_loss_mlp": 1.00059843, + "epoch": 0.8284984217646175, + "flos": 61030898853120.0, + "grad_norm": 0.9106768976356092, + "language_loss": 0.56792045, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58803892, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06445312, + "step": 13780, + "time_per_iteration": 3.031635046005249 + }, + { + "auxiliary_loss_clip": 0.01051691, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.00748956, + "balance_loss_mlp": 1.01591408, + "epoch": 0.8285585450172854, + "flos": 19713742104960.0, + "grad_norm": 1.7676867696948724, + "language_loss": 0.80314624, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82397455, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 13781, + "time_per_iteration": 2.3589601516723633 + }, + { + "auxiliary_loss_clip": 0.01053482, + "auxiliary_loss_mlp": 0.01038521, + "balance_loss_clip": 1.01246142, + "balance_loss_mlp": 1.01680911, + "epoch": 0.8286186682699535, + "flos": 21687835910400.0, + "grad_norm": 1.9901426666413942, + "language_loss": 0.77369487, + "learning_rate": 3.001810941346543e-07, + "loss": 0.79461491, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 13782, + "time_per_iteration": 2.3807458877563477 + }, + { + "auxiliary_loss_clip": 0.01051252, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.01702642, + "balance_loss_mlp": 1.01548147, + "epoch": 0.8286787915226214, + "flos": 25774384705920.0, + "grad_norm": 1.8358969054842813, + "language_loss": 0.77083892, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.79175818, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 13783, + "time_per_iteration": 3.881993055343628 + }, + { + "auxiliary_loss_clip": 0.01051276, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.01194227, + "balance_loss_mlp": 1.01525295, + "epoch": 0.8287389147752894, + "flos": 21287277348480.0, + "grad_norm": 1.6725097106097901, + "language_loss": 0.74693674, + "learning_rate": 2.997707859351304e-07, + "loss": 0.76780856, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 13784, + "time_per_iteration": 2.3460171222686768 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.01640248, + "balance_loss_mlp": 1.01538658, + "epoch": 0.8287990380279573, + "flos": 33543537333120.0, + "grad_norm": 1.4760948762816442, + "language_loss": 0.70681024, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72775894, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38085938, + "step": 13785, + "time_per_iteration": 2.469475269317627 + }, + { + "auxiliary_loss_clip": 0.01053004, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.00933588, + "balance_loss_mlp": 1.01755333, + "epoch": 0.8288591612806253, + "flos": 22965238068480.0, + "grad_norm": 1.3986330166977154, + "language_loss": 0.69214553, + "learning_rate": 2.993607356270516e-07, + "loss": 0.71300286, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 13786, + "time_per_iteration": 3.7534193992614746 + }, + { + "auxiliary_loss_clip": 0.01054253, + "auxiliary_loss_mlp": 0.01040675, + "balance_loss_clip": 1.01497364, + "balance_loss_mlp": 1.01677942, + "epoch": 0.8289192845332932, + "flos": 18587967448320.0, + "grad_norm": 1.7469830628853085, + "language_loss": 0.77954209, + "learning_rate": 2.991558072017426e-07, + "loss": 0.80049133, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 13787, + "time_per_iteration": 2.36616587638855 + }, + { + "auxiliary_loss_clip": 0.01051251, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.01318002, + "balance_loss_mlp": 1.0162673, + "epoch": 0.8289794077859612, + "flos": 15449520067200.0, + "grad_norm": 1.9229834919189646, + "language_loss": 0.81121737, + "learning_rate": 2.989509432726163e-07, + "loss": 0.83208847, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 13788, + "time_per_iteration": 2.3398399353027344 + }, + { + "auxiliary_loss_clip": 0.0105177, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.01393235, + "balance_loss_mlp": 1.01649249, + "epoch": 0.8290395310386292, + "flos": 28876557317760.0, + "grad_norm": 1.5498830095731873, + "language_loss": 0.72400296, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.744892, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 13789, + "time_per_iteration": 2.442476272583008 + }, + { + "auxiliary_loss_clip": 0.01053047, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.01651359, + "balance_loss_mlp": 1.01553667, + "epoch": 0.8290996542912972, + "flos": 36574766328960.0, + "grad_norm": 1.7885158809985646, + "language_loss": 0.69249332, + "learning_rate": 2.985414089339813e-07, + "loss": 0.71343714, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 13790, + "time_per_iteration": 2.493319272994995 + }, + { + "auxiliary_loss_clip": 0.0105293, + "auxiliary_loss_mlp": 0.01039518, + "balance_loss_clip": 1.01291037, + "balance_loss_mlp": 1.01577473, + "epoch": 0.8291597775439651, + "flos": 23621884519680.0, + "grad_norm": 1.6634848212219613, + "language_loss": 0.78915727, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.81008178, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37109375, + "step": 13791, + "time_per_iteration": 2.3961334228515625 + }, + { + "auxiliary_loss_clip": 0.01051638, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.01421475, + "balance_loss_mlp": 1.01665175, + "epoch": 0.8292199007966331, + "flos": 21396415858560.0, + "grad_norm": 1.369797902189044, + "language_loss": 0.70294362, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72383839, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34960938, + "step": 13792, + "time_per_iteration": 2.3537349700927734 + }, + { + "auxiliary_loss_clip": 0.01051992, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.01626265, + "balance_loss_mlp": 1.0152595, + "epoch": 0.829280024049301, + "flos": 28766336555520.0, + "grad_norm": 1.4751845445129725, + "language_loss": 0.65817314, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67910171, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 13793, + "time_per_iteration": 2.4260761737823486 + }, + { + "auxiliary_loss_clip": 0.01054919, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.01248026, + "balance_loss_mlp": 1.0163691, + "epoch": 0.829340147301969, + "flos": 19937046360960.0, + "grad_norm": 2.0165776275288594, + "language_loss": 0.67431343, + "learning_rate": 2.977231145525461e-07, + "loss": 0.6952374, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.38476562, + "step": 13794, + "time_per_iteration": 2.3247358798980713 + }, + { + "auxiliary_loss_clip": 0.01052529, + "auxiliary_loss_mlp": 0.01042538, + "balance_loss_clip": 1.0185051, + "balance_loss_mlp": 1.01605558, + "epoch": 0.829400270554637, + "flos": 25227400435200.0, + "grad_norm": 1.959689437603246, + "language_loss": 0.67641664, + "learning_rate": 2.975187023140757e-07, + "loss": 0.69736737, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 13795, + "time_per_iteration": 2.400827407836914 + }, + { + "auxiliary_loss_clip": 0.01051073, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.01478744, + "balance_loss_mlp": 1.01651311, + "epoch": 0.829460393807305, + "flos": 24462383523840.0, + "grad_norm": 1.6723176521333354, + "language_loss": 0.67424846, + "learning_rate": 2.973143546338661e-07, + "loss": 0.69511926, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34570312, + "step": 13796, + "time_per_iteration": 2.387488842010498 + }, + { + "auxiliary_loss_clip": 0.01049898, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.01278341, + "balance_loss_mlp": 1.01523995, + "epoch": 0.829520517059973, + "flos": 15121580866560.0, + "grad_norm": 1.761670827209745, + "language_loss": 0.72874415, + "learning_rate": 2.971100715196666e-07, + "loss": 0.74959326, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 13797, + "time_per_iteration": 2.4074742794036865 + }, + { + "auxiliary_loss_clip": 0.01052442, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.01114583, + "balance_loss_mlp": 1.0159719, + "epoch": 0.8295806403126409, + "flos": 21578906868480.0, + "grad_norm": 2.155284808437615, + "language_loss": 0.73711842, + "learning_rate": 2.969058529792243e-07, + "loss": 0.75798762, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 13798, + "time_per_iteration": 2.375195026397705 + }, + { + "auxiliary_loss_clip": 0.01049378, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.01377344, + "balance_loss_mlp": 1.01542974, + "epoch": 0.8296407635653089, + "flos": 21725472222720.0, + "grad_norm": 1.6734794937195392, + "language_loss": 0.77761555, + "learning_rate": 2.967016990202822e-07, + "loss": 0.79846567, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 13799, + "time_per_iteration": 2.3752589225769043 + }, + { + "auxiliary_loss_clip": 0.01051448, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.01659465, + "balance_loss_mlp": 1.01627743, + "epoch": 0.8297008868179768, + "flos": 11180375527680.0, + "grad_norm": 1.9447547702520052, + "language_loss": 0.68761981, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.70853627, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 13800, + "time_per_iteration": 3.7445592880249023 + }, + { + "auxiliary_loss_clip": 0.01055478, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.01402044, + "balance_loss_mlp": 1.01746249, + "epoch": 0.8297610100706448, + "flos": 20663100328320.0, + "grad_norm": 2.4376241082715464, + "language_loss": 0.75459301, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.77556372, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.37890625, + "step": 13801, + "time_per_iteration": 2.400974750518799 + }, + { + "auxiliary_loss_clip": 0.01052327, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.01113629, + "balance_loss_mlp": 1.01645184, + "epoch": 0.8298211333233128, + "flos": 20375276146560.0, + "grad_norm": 1.690723584642869, + "language_loss": 0.74254447, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.76340163, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 13802, + "time_per_iteration": 2.342258930206299 + }, + { + "auxiliary_loss_clip": 0.01052282, + "auxiliary_loss_mlp": 0.01037321, + "balance_loss_clip": 1.01537442, + "balance_loss_mlp": 1.01589513, + "epoch": 0.8298812565759808, + "flos": 21507579227520.0, + "grad_norm": 1.5904066130398413, + "language_loss": 0.75603104, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77692705, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 13803, + "time_per_iteration": 2.401033878326416 + }, + { + "auxiliary_loss_clip": 0.01053479, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.01641965, + "balance_loss_mlp": 1.01751614, + "epoch": 0.8299413798286487, + "flos": 22817625373440.0, + "grad_norm": 1.7379508259199545, + "language_loss": 0.79947293, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.82039821, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 13804, + "time_per_iteration": 2.3658573627471924 + }, + { + "auxiliary_loss_clip": 0.01050076, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.01337147, + "balance_loss_mlp": 1.01557219, + "epoch": 0.8300015030813167, + "flos": 29677918821120.0, + "grad_norm": 2.2819174987018718, + "language_loss": 0.74462098, + "learning_rate": 2.954781319115016e-07, + "loss": 0.76547003, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 13805, + "time_per_iteration": 2.468914031982422 + }, + { + "auxiliary_loss_clip": 0.0105313, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.01155579, + "balance_loss_mlp": 1.01642728, + "epoch": 0.8300616263339846, + "flos": 19718455138560.0, + "grad_norm": 2.161691789369759, + "language_loss": 0.78895795, + "learning_rate": 2.952744302396906e-07, + "loss": 0.80984503, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 13806, + "time_per_iteration": 2.3333864212036133 + }, + { + "auxiliary_loss_clip": 0.01054055, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.01494598, + "balance_loss_mlp": 1.01684356, + "epoch": 0.8301217495866526, + "flos": 19900911237120.0, + "grad_norm": 1.7041629118466792, + "language_loss": 0.65038013, + "learning_rate": 2.950707932112444e-07, + "loss": 0.67131543, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37304688, + "step": 13807, + "time_per_iteration": 2.3705432415008545 + }, + { + "auxiliary_loss_clip": 0.01052349, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.01195908, + "balance_loss_mlp": 1.01644373, + "epoch": 0.8301818728393207, + "flos": 19714859268480.0, + "grad_norm": 1.9915210660419373, + "language_loss": 0.73760939, + "learning_rate": 2.948672208338847e-07, + "loss": 0.7584933, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 13808, + "time_per_iteration": 2.333934783935547 + }, + { + "auxiliary_loss_clip": 0.01056791, + "auxiliary_loss_mlp": 0.01051107, + "balance_loss_clip": 1.02429652, + "balance_loss_mlp": 1.01840341, + "epoch": 0.8302419960919886, + "flos": 28292390582400.0, + "grad_norm": 2.0746180464506936, + "language_loss": 0.68752772, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.70860672, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3828125, + "step": 13809, + "time_per_iteration": 2.421452045440674 + }, + { + "auxiliary_loss_clip": 0.01052968, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.0113852, + "balance_loss_mlp": 1.01603651, + "epoch": 0.8303021193446566, + "flos": 18222461758080.0, + "grad_norm": 7.044162913114388, + "language_loss": 0.74933815, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.77020699, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36914062, + "step": 13810, + "time_per_iteration": 2.3284552097320557 + }, + { + "auxiliary_loss_clip": 0.01050156, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.01600695, + "balance_loss_mlp": 1.01583242, + "epoch": 0.8303622425973245, + "flos": 23110337145600.0, + "grad_norm": 1.6683660835991407, + "language_loss": 0.82136279, + "learning_rate": 2.94256891685505e-07, + "loss": 0.84223056, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.34375, + "step": 13811, + "time_per_iteration": 2.427122116088867 + }, + { + "auxiliary_loss_clip": 0.01052627, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.0209825, + "balance_loss_mlp": 1.01676893, + "epoch": 0.8304223658499925, + "flos": 19571854872960.0, + "grad_norm": 2.7283289138936193, + "language_loss": 0.74288625, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.76384109, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 13812, + "time_per_iteration": 2.3442749977111816 + }, + { + "auxiliary_loss_clip": 0.01051352, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01197445, + "balance_loss_mlp": 1.01614738, + "epoch": 0.8304824891026604, + "flos": 24424956679680.0, + "grad_norm": 1.5794534687098756, + "language_loss": 0.79124367, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.81209236, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 13813, + "time_per_iteration": 2.3870816230773926 + }, + { + "auxiliary_loss_clip": 0.01051797, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.01181436, + "balance_loss_mlp": 1.01487148, + "epoch": 0.8305426123553284, + "flos": 22380722219520.0, + "grad_norm": 1.9436167787435077, + "language_loss": 0.72395039, + "learning_rate": 2.93647144674658e-07, + "loss": 0.74483776, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36914062, + "step": 13814, + "time_per_iteration": 2.3860552310943604 + }, + { + "auxiliary_loss_clip": 0.01056453, + "auxiliary_loss_mlp": 0.01050354, + "balance_loss_clip": 1.02193475, + "balance_loss_mlp": 1.01701093, + "epoch": 0.8306027356079964, + "flos": 14902675441920.0, + "grad_norm": 2.364331436263779, + "language_loss": 0.69425642, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.71532452, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.39453125, + "step": 13815, + "time_per_iteration": 2.33354115486145 + }, + { + "auxiliary_loss_clip": 0.01053235, + "auxiliary_loss_mlp": 0.0104153, + "balance_loss_clip": 1.01657987, + "balance_loss_mlp": 1.01715803, + "epoch": 0.8306628588606644, + "flos": 19643601450240.0, + "grad_norm": 1.9617324547094606, + "language_loss": 0.77129471, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.79224241, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.359375, + "step": 13816, + "time_per_iteration": 2.375720500946045 + }, + { + "auxiliary_loss_clip": 0.01050975, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.01540756, + "balance_loss_mlp": 1.01607001, + "epoch": 0.8307229821133323, + "flos": 24388577176320.0, + "grad_norm": 1.6779692252605694, + "language_loss": 0.82161421, + "learning_rate": 2.930379800094371e-07, + "loss": 0.84249794, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 13817, + "time_per_iteration": 3.618485450744629 + }, + { + "auxiliary_loss_clip": 0.01054679, + "auxiliary_loss_mlp": 0.01040402, + "balance_loss_clip": 1.01696527, + "balance_loss_mlp": 1.01779377, + "epoch": 0.8307831053660003, + "flos": 20995857296640.0, + "grad_norm": 2.5983065912203784, + "language_loss": 0.78762937, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80858022, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36914062, + "step": 13818, + "time_per_iteration": 2.3574862480163574 + }, + { + "auxiliary_loss_clip": 0.01053033, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.01771569, + "balance_loss_mlp": 1.01673877, + "epoch": 0.8308432286186682, + "flos": 21396241301760.0, + "grad_norm": 2.0308241695081133, + "language_loss": 0.8286863, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84964734, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 13819, + "time_per_iteration": 2.358039379119873 + }, + { + "auxiliary_loss_clip": 0.01007601, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 1.00065732, + "balance_loss_mlp": 1.00097549, + "epoch": 0.8309033518713362, + "flos": 62530978728960.0, + "grad_norm": 0.7675731671404494, + "language_loss": 0.56303877, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58314323, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06640625, + "step": 13820, + "time_per_iteration": 3.0220210552215576 + }, + { + "auxiliary_loss_clip": 0.01050728, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.01075411, + "balance_loss_mlp": 1.01657045, + "epoch": 0.8309634751240043, + "flos": 16978262169600.0, + "grad_norm": 2.0579865976482496, + "language_loss": 0.68949676, + "learning_rate": 2.922266666860831e-07, + "loss": 0.7103337, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 13821, + "time_per_iteration": 2.3411002159118652 + }, + { + "auxiliary_loss_clip": 0.0105212, + "auxiliary_loss_mlp": 0.01043232, + "balance_loss_clip": 1.01978326, + "balance_loss_mlp": 1.01539207, + "epoch": 0.8310235983766722, + "flos": 22673364168960.0, + "grad_norm": 1.835189032306545, + "language_loss": 0.70818734, + "learning_rate": 2.920240002333625e-07, + "loss": 0.72914088, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 13822, + "time_per_iteration": 2.3627798557281494 + }, + { + "auxiliary_loss_clip": 0.0105088, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.013798, + "balance_loss_mlp": 1.01651478, + "epoch": 0.8310837216293402, + "flos": 30810117168000.0, + "grad_norm": 1.9505218808494906, + "language_loss": 0.62915409, + "learning_rate": 2.918213985472631e-07, + "loss": 0.65002638, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34375, + "step": 13823, + "time_per_iteration": 3.830002546310425 + }, + { + "auxiliary_loss_clip": 0.0100734, + "auxiliary_loss_mlp": 0.01003842, + "balance_loss_clip": 1.00142229, + "balance_loss_mlp": 1.00080514, + "epoch": 0.8311438448820081, + "flos": 71272531077120.0, + "grad_norm": 0.8885623155215208, + "language_loss": 0.6205194, + "learning_rate": 2.916188616354669e-07, + "loss": 0.64063126, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.06542969, + "step": 13824, + "time_per_iteration": 3.1623013019561768 + }, + { + "auxiliary_loss_clip": 0.01051144, + "auxiliary_loss_mlp": 0.01037439, + "balance_loss_clip": 1.01558781, + "balance_loss_mlp": 1.01580739, + "epoch": 0.8312039681346761, + "flos": 20886020559360.0, + "grad_norm": 1.7853334377204213, + "language_loss": 0.74869823, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76958412, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 13825, + "time_per_iteration": 3.6964385509490967 + }, + { + "auxiliary_loss_clip": 0.01052382, + "auxiliary_loss_mlp": 0.01039103, + "balance_loss_clip": 1.01477265, + "balance_loss_mlp": 1.01544952, + "epoch": 0.831264091387344, + "flos": 17016631620480.0, + "grad_norm": 3.4781028329438524, + "language_loss": 0.81076765, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.83168244, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36914062, + "step": 13826, + "time_per_iteration": 2.370760440826416 + }, + { + "auxiliary_loss_clip": 0.01050847, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01247573, + "balance_loss_mlp": 1.01509225, + "epoch": 0.831324214640012, + "flos": 24418602812160.0, + "grad_norm": 1.7764532414796457, + "language_loss": 0.68957734, + "learning_rate": 2.910116396226914e-07, + "loss": 0.71043593, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 13827, + "time_per_iteration": 2.381718397140503 + }, + { + "auxiliary_loss_clip": 0.01050679, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.01093602, + "balance_loss_mlp": 1.01552093, + "epoch": 0.83138433789268, + "flos": 13544938512000.0, + "grad_norm": 1.7334520836180642, + "language_loss": 0.75202692, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.77286386, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 13828, + "time_per_iteration": 2.3453621864318848 + }, + { + "auxiliary_loss_clip": 0.01050173, + "auxiliary_loss_mlp": 0.01038468, + "balance_loss_clip": 1.01625967, + "balance_loss_mlp": 1.01453066, + "epoch": 0.831444461145348, + "flos": 44490693778560.0, + "grad_norm": 1.6423490393980427, + "language_loss": 0.68561882, + "learning_rate": 2.906071489597657e-07, + "loss": 0.70650518, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 13829, + "time_per_iteration": 2.545858860015869 + }, + { + "auxiliary_loss_clip": 0.01052757, + "auxiliary_loss_mlp": 0.01038189, + "balance_loss_clip": 1.01475263, + "balance_loss_mlp": 1.01605916, + "epoch": 0.8315045843980159, + "flos": 22704088032000.0, + "grad_norm": 1.565011421612002, + "language_loss": 0.83773082, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.85864031, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 13830, + "time_per_iteration": 2.3741729259490967 + }, + { + "auxiliary_loss_clip": 0.01053022, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.01688671, + "balance_loss_mlp": 1.01638186, + "epoch": 0.8315647076506839, + "flos": 16872544972800.0, + "grad_norm": 2.3280243599180563, + "language_loss": 0.75826573, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.77919966, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 13831, + "time_per_iteration": 2.3168656826019287 + }, + { + "auxiliary_loss_clip": 0.01052326, + "auxiliary_loss_mlp": 0.0104303, + "balance_loss_clip": 1.01956987, + "balance_loss_mlp": 1.01620054, + "epoch": 0.8316248309033518, + "flos": 13807869027840.0, + "grad_norm": 1.6354801103887562, + "language_loss": 0.72577822, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.74673176, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 13832, + "time_per_iteration": 2.3292856216430664 + }, + { + "auxiliary_loss_clip": 0.01050285, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.01532292, + "balance_loss_mlp": 1.01483798, + "epoch": 0.8316849541560198, + "flos": 23511419377920.0, + "grad_norm": 1.6040728004629479, + "language_loss": 0.85757154, + "learning_rate": 2.897989455393979e-07, + "loss": 0.87846142, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 13833, + "time_per_iteration": 2.3700947761535645 + }, + { + "auxiliary_loss_clip": 0.01053338, + "auxiliary_loss_mlp": 0.01040427, + "balance_loss_clip": 1.0164299, + "balance_loss_mlp": 1.01593113, + "epoch": 0.8317450774086879, + "flos": 23770160530560.0, + "grad_norm": 1.6929412126889882, + "language_loss": 0.7674824, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78842002, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 13834, + "time_per_iteration": 2.419008493423462 + }, + { + "auxiliary_loss_clip": 0.01048838, + "auxiliary_loss_mlp": 0.01037864, + "balance_loss_clip": 1.01514244, + "balance_loss_mlp": 1.01421428, + "epoch": 0.8318052006613558, + "flos": 16214641712640.0, + "grad_norm": 1.8474667138693, + "language_loss": 0.80643141, + "learning_rate": 2.893952329045459e-07, + "loss": 0.82729846, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34570312, + "step": 13835, + "time_per_iteration": 2.323345184326172 + }, + { + "auxiliary_loss_clip": 0.01054952, + "auxiliary_loss_mlp": 0.01044186, + "balance_loss_clip": 1.01816273, + "balance_loss_mlp": 1.01688337, + "epoch": 0.8318653239140238, + "flos": 19973530598400.0, + "grad_norm": 1.9494426247551537, + "language_loss": 0.81866753, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.83965892, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38085938, + "step": 13836, + "time_per_iteration": 2.341207981109619 + }, + { + "auxiliary_loss_clip": 0.01050405, + "auxiliary_loss_mlp": 0.01035872, + "balance_loss_clip": 1.01354456, + "balance_loss_mlp": 1.01550746, + "epoch": 0.8319254471666917, + "flos": 17703967023360.0, + "grad_norm": 2.038987795361162, + "language_loss": 0.78562623, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.80648905, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34960938, + "step": 13837, + "time_per_iteration": 2.351989984512329 + }, + { + "auxiliary_loss_clip": 0.01053746, + "auxiliary_loss_mlp": 0.01039346, + "balance_loss_clip": 1.01332319, + "balance_loss_mlp": 1.01655102, + "epoch": 0.8319855704193597, + "flos": 19535545192320.0, + "grad_norm": 1.8343042227337962, + "language_loss": 0.84872752, + "learning_rate": 2.887901504686685e-07, + "loss": 0.86965847, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37109375, + "step": 13838, + "time_per_iteration": 2.4359796047210693 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.01545429, + "balance_loss_mlp": 1.01649654, + "epoch": 0.8320456936720276, + "flos": 21177021674880.0, + "grad_norm": 3.764797506738201, + "language_loss": 0.76096821, + "learning_rate": 2.885885860916795e-07, + "loss": 0.78187239, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34570312, + "step": 13839, + "time_per_iteration": 2.434671640396118 + }, + { + "auxiliary_loss_clip": 0.0105339, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.01560366, + "balance_loss_mlp": 1.01674056, + "epoch": 0.8321058169246957, + "flos": 33249603663360.0, + "grad_norm": 1.649856715504996, + "language_loss": 0.68742806, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70835882, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 13840, + "time_per_iteration": 3.8739395141601562 + }, + { + "auxiliary_loss_clip": 0.01051178, + "auxiliary_loss_mlp": 0.01039927, + "balance_loss_clip": 1.01639557, + "balance_loss_mlp": 1.01516032, + "epoch": 0.8321659401773636, + "flos": 14207310426240.0, + "grad_norm": 2.067789430489718, + "language_loss": 0.80408007, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.82499111, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 13841, + "time_per_iteration": 2.3843681812286377 + }, + { + "auxiliary_loss_clip": 0.01050799, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.01246619, + "balance_loss_mlp": 1.01606655, + "epoch": 0.8322260634300316, + "flos": 15157366876800.0, + "grad_norm": 1.92226869039995, + "language_loss": 0.69694698, + "learning_rate": 2.879842823726262e-07, + "loss": 0.71781993, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.34765625, + "step": 13842, + "time_per_iteration": 2.352890968322754 + }, + { + "auxiliary_loss_clip": 0.01052225, + "auxiliary_loss_mlp": 0.01039895, + "balance_loss_clip": 1.01587391, + "balance_loss_mlp": 1.01698661, + "epoch": 0.8322861866826995, + "flos": 25299670682880.0, + "grad_norm": 1.7047367426841706, + "language_loss": 0.73605102, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75697219, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 13843, + "time_per_iteration": 2.397564649581909 + }, + { + "auxiliary_loss_clip": 0.01053669, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.01461959, + "balance_loss_mlp": 1.01831079, + "epoch": 0.8323463099353675, + "flos": 17018412099840.0, + "grad_norm": 1.9723552537437776, + "language_loss": 0.79233873, + "learning_rate": 2.875817378128975e-07, + "loss": 0.81324244, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 13844, + "time_per_iteration": 2.37977933883667 + }, + { + "auxiliary_loss_clip": 0.01007312, + "auxiliary_loss_mlp": 0.01002205, + "balance_loss_clip": 0.99988037, + "balance_loss_mlp": 1.00063097, + "epoch": 0.8324064331880354, + "flos": 55605222748800.0, + "grad_norm": 0.7795706560298629, + "language_loss": 0.55293953, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57303464, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.06689453, + "step": 13845, + "time_per_iteration": 2.9196736812591553 + }, + { + "auxiliary_loss_clip": 0.01053505, + "auxiliary_loss_mlp": 0.01044647, + "balance_loss_clip": 1.01957726, + "balance_loss_mlp": 1.0168097, + "epoch": 0.8324665564407034, + "flos": 26137481512320.0, + "grad_norm": 1.6168982165684447, + "language_loss": 0.76270741, + "learning_rate": 2.871794529934555e-07, + "loss": 0.7836889, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 13846, + "time_per_iteration": 2.431917190551758 + }, + { + "auxiliary_loss_clip": 0.01052726, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.01529646, + "balance_loss_mlp": 1.01525569, + "epoch": 0.8325266796933715, + "flos": 22048244542080.0, + "grad_norm": 1.6893832241671234, + "language_loss": 0.79995537, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.82088214, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 13847, + "time_per_iteration": 2.378840446472168 + }, + { + "auxiliary_loss_clip": 0.01052283, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.01390624, + "balance_loss_mlp": 1.01725411, + "epoch": 0.8325868029460394, + "flos": 22815635425920.0, + "grad_norm": 1.6532108236265493, + "language_loss": 0.75626749, + "learning_rate": 2.867774279753175e-07, + "loss": 0.77715278, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 13848, + "time_per_iteration": 2.3682680130004883 + }, + { + "auxiliary_loss_clip": 0.01051893, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0096699, + "balance_loss_mlp": 1.01648879, + "epoch": 0.8326469261987074, + "flos": 14756563935360.0, + "grad_norm": 1.9362789311221424, + "language_loss": 0.65080839, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.67165875, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 13849, + "time_per_iteration": 2.3233935832977295 + }, + { + "auxiliary_loss_clip": 0.01052941, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.0120213, + "balance_loss_mlp": 1.01582527, + "epoch": 0.8327070494513753, + "flos": 22925123049600.0, + "grad_norm": 2.078194133531837, + "language_loss": 0.80933607, + "learning_rate": 2.863756628194638e-07, + "loss": 0.83022225, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 13850, + "time_per_iteration": 2.376472234725952 + }, + { + "auxiliary_loss_clip": 0.01048691, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.01755655, + "balance_loss_mlp": 1.01545763, + "epoch": 0.8327671727040433, + "flos": 20664357137280.0, + "grad_norm": 1.5449075535510972, + "language_loss": 0.79077542, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.81165564, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.33203125, + "step": 13851, + "time_per_iteration": 2.362128973007202 + }, + { + "auxiliary_loss_clip": 0.01007315, + "auxiliary_loss_mlp": 0.01002828, + "balance_loss_clip": 1.00083768, + "balance_loss_mlp": 1.00072193, + "epoch": 0.8328272959567112, + "flos": 56057661457920.0, + "grad_norm": 0.7632058527128924, + "language_loss": 0.55857456, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57867599, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06591797, + "step": 13852, + "time_per_iteration": 2.9974281787872314 + }, + { + "auxiliary_loss_clip": 0.01051257, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.01389313, + "balance_loss_mlp": 1.0168339, + "epoch": 0.8328874192093793, + "flos": 32301816451200.0, + "grad_norm": 1.616951403917932, + "language_loss": 0.68711853, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.7079947, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 13853, + "time_per_iteration": 2.475623369216919 + }, + { + "auxiliary_loss_clip": 0.01051995, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.01881659, + "balance_loss_mlp": 1.01660192, + "epoch": 0.8329475424620472, + "flos": 23511593934720.0, + "grad_norm": 1.607626158603418, + "language_loss": 0.79193658, + "learning_rate": 2.855729123383286e-07, + "loss": 0.81287181, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 13854, + "time_per_iteration": 2.450927495956421 + }, + { + "auxiliary_loss_clip": 0.01007551, + "auxiliary_loss_mlp": 0.01002352, + "balance_loss_clip": 1.0003854, + "balance_loss_mlp": 1.00098419, + "epoch": 0.8330076657147152, + "flos": 67837392028800.0, + "grad_norm": 0.7695029478139919, + "language_loss": 0.58825493, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60835397, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.06542969, + "step": 13855, + "time_per_iteration": 2.8962531089782715 + }, + { + "auxiliary_loss_clip": 0.01050309, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.01236701, + "balance_loss_mlp": 1.01550174, + "epoch": 0.8330677889673831, + "flos": 22891711011840.0, + "grad_norm": 1.718937411762079, + "language_loss": 0.7298038, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.75065696, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 13856, + "time_per_iteration": 2.3992316722869873 + }, + { + "auxiliary_loss_clip": 0.01052218, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.01415598, + "balance_loss_mlp": 1.01574838, + "epoch": 0.8331279122200511, + "flos": 27343800408960.0, + "grad_norm": 2.2507125941325574, + "language_loss": 0.75970894, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.7806074, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 13857, + "time_per_iteration": 3.6044180393218994 + }, + { + "auxiliary_loss_clip": 0.01048771, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.00970781, + "balance_loss_mlp": 1.01580274, + "epoch": 0.833188035472719, + "flos": 19937151095040.0, + "grad_norm": 1.518262910820934, + "language_loss": 0.7417469, + "learning_rate": 2.847712020370958e-07, + "loss": 0.7625289, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.328125, + "step": 13858, + "time_per_iteration": 2.363327741622925 + }, + { + "auxiliary_loss_clip": 0.01053647, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.01323485, + "balance_loss_mlp": 1.01567364, + "epoch": 0.833248158725387, + "flos": 15231696894720.0, + "grad_norm": 1.7705479286423251, + "language_loss": 0.74186277, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.76279652, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 13859, + "time_per_iteration": 2.3303451538085938 + }, + { + "auxiliary_loss_clip": 0.01048607, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.01156259, + "balance_loss_mlp": 1.01520658, + "epoch": 0.8333082819780551, + "flos": 24534374480640.0, + "grad_norm": 2.036425015766899, + "language_loss": 0.80098069, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.82178652, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33398438, + "step": 13860, + "time_per_iteration": 2.4286410808563232 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.01527762, + "balance_loss_mlp": 1.0159533, + "epoch": 0.833368405230723, + "flos": 31466065392000.0, + "grad_norm": 1.331054852985571, + "language_loss": 0.83022416, + "learning_rate": 2.841706022218644e-07, + "loss": 0.85112011, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.359375, + "step": 13861, + "time_per_iteration": 2.46478533744812 + }, + { + "auxiliary_loss_clip": 0.01052528, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.01485777, + "balance_loss_mlp": 1.01648378, + "epoch": 0.833428528483391, + "flos": 14901837569280.0, + "grad_norm": 1.7670574805354997, + "language_loss": 0.80256557, + "learning_rate": 2.839705324021806e-07, + "loss": 0.82347023, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 13862, + "time_per_iteration": 3.7615230083465576 + }, + { + "auxiliary_loss_clip": 0.01051769, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.01413846, + "balance_loss_mlp": 1.01547027, + "epoch": 0.8334886517360589, + "flos": 22198754880000.0, + "grad_norm": 1.9675933090824378, + "language_loss": 0.76526237, + "learning_rate": 2.83770527654505e-07, + "loss": 0.78617662, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 13863, + "time_per_iteration": 2.371427536010742 + }, + { + "auxiliary_loss_clip": 0.0105048, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.01437664, + "balance_loss_mlp": 1.0165329, + "epoch": 0.8335487749887269, + "flos": 30371258977920.0, + "grad_norm": 2.137240418573223, + "language_loss": 0.76087868, + "learning_rate": 2.835705879864232e-07, + "loss": 0.78175527, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.33984375, + "step": 13864, + "time_per_iteration": 2.4306232929229736 + }, + { + "auxiliary_loss_clip": 0.01053348, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.01793516, + "balance_loss_mlp": 1.01683247, + "epoch": 0.8336088982413948, + "flos": 24679997228160.0, + "grad_norm": 2.1045293964084024, + "language_loss": 0.70262861, + "learning_rate": 2.833707134055168e-07, + "loss": 0.72358477, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 13865, + "time_per_iteration": 3.7431747913360596 + }, + { + "auxiliary_loss_clip": 0.01053279, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01255369, + "balance_loss_mlp": 1.01772404, + "epoch": 0.8336690214940629, + "flos": 38175778679040.0, + "grad_norm": 1.6168183426959257, + "language_loss": 0.76737183, + "learning_rate": 2.831709039193653e-07, + "loss": 0.78826416, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 13866, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.01007416, + "auxiliary_loss_mlp": 0.01002361, + "balance_loss_clip": 1.00021529, + "balance_loss_mlp": 1.00087428, + "epoch": 0.8337291447467308, + "flos": 55562629023360.0, + "grad_norm": 0.8749185653629723, + "language_loss": 0.63279253, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65289026, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06542969, + "step": 13867, + "time_per_iteration": 2.9033100605010986 + }, + { + "auxiliary_loss_clip": 0.01050187, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.02086353, + "balance_loss_mlp": 1.01578975, + "epoch": 0.8337892679993988, + "flos": 24132419464320.0, + "grad_norm": 1.6520518313967918, + "language_loss": 0.73296803, + "learning_rate": 2.827714802616301e-07, + "loss": 0.75389779, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 13868, + "time_per_iteration": 2.3808765411376953 + }, + { + "auxiliary_loss_clip": 0.01054169, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.0141505, + "balance_loss_mlp": 1.0176332, + "epoch": 0.8338493912520667, + "flos": 28182658579200.0, + "grad_norm": 1.4795469785062092, + "language_loss": 0.81028068, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.83121598, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 13869, + "time_per_iteration": 2.4742603302001953 + }, + { + "auxiliary_loss_clip": 0.0105124, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.01483464, + "balance_loss_mlp": 1.01496696, + "epoch": 0.8339095145047347, + "flos": 22157417963520.0, + "grad_norm": 1.5557233387619653, + "language_loss": 0.83645785, + "learning_rate": 2.823723170738028e-07, + "loss": 0.85735673, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 13870, + "time_per_iteration": 2.3889806270599365 + }, + { + "auxiliary_loss_clip": 0.01052185, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.01084054, + "balance_loss_mlp": 1.01573837, + "epoch": 0.8339696377574026, + "flos": 17306271192960.0, + "grad_norm": 2.414769402610643, + "language_loss": 0.72599822, + "learning_rate": 2.821728331750264e-07, + "loss": 0.74685746, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 13871, + "time_per_iteration": 2.3129358291625977 + }, + { + "auxiliary_loss_clip": 0.01051389, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.01559842, + "balance_loss_mlp": 1.01644945, + "epoch": 0.8340297610100706, + "flos": 20667289691520.0, + "grad_norm": 1.6748976927576198, + "language_loss": 0.69694996, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.71783388, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34960938, + "step": 13872, + "time_per_iteration": 2.3589656352996826 + }, + { + "auxiliary_loss_clip": 0.01051273, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01190734, + "balance_loss_mlp": 1.01558411, + "epoch": 0.8340898842627387, + "flos": 20514579937920.0, + "grad_norm": 2.0414003822902194, + "language_loss": 0.74736977, + "learning_rate": 2.817740608055712e-07, + "loss": 0.76823103, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 13873, + "time_per_iteration": 2.4185218811035156 + }, + { + "auxiliary_loss_clip": 0.01051929, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.01515222, + "balance_loss_mlp": 1.01560402, + "epoch": 0.8341500075154066, + "flos": 21425010128640.0, + "grad_norm": 2.9835600588287754, + "language_loss": 0.75932515, + "learning_rate": 2.81574772350013e-07, + "loss": 0.78025693, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 13874, + "time_per_iteration": 2.3684730529785156 + }, + { + "auxiliary_loss_clip": 0.01049946, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.00992894, + "balance_loss_mlp": 1.01582074, + "epoch": 0.8342101307680746, + "flos": 22089895660800.0, + "grad_norm": 2.0597759068351715, + "language_loss": 0.67370868, + "learning_rate": 2.813755490573118e-07, + "loss": 0.6945262, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 13875, + "time_per_iteration": 2.385450839996338 + }, + { + "auxiliary_loss_clip": 0.01051796, + "auxiliary_loss_mlp": 0.01039265, + "balance_loss_clip": 1.01667428, + "balance_loss_mlp": 1.0168221, + "epoch": 0.8342702540207425, + "flos": 21870396743040.0, + "grad_norm": 1.6217109127490692, + "language_loss": 0.80670393, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.82761455, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 13876, + "time_per_iteration": 2.364272356033325 + }, + { + "auxiliary_loss_clip": 0.01051406, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.01331711, + "balance_loss_mlp": 1.01632881, + "epoch": 0.8343303772734105, + "flos": 22527392307840.0, + "grad_norm": 1.9022099504369743, + "language_loss": 0.88698155, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.90785277, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 13877, + "time_per_iteration": 2.461705207824707 + }, + { + "auxiliary_loss_clip": 0.01050939, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.0138973, + "balance_loss_mlp": 1.01583731, + "epoch": 0.8343905005260784, + "flos": 14938880388480.0, + "grad_norm": 2.0013063118620558, + "language_loss": 0.70331621, + "learning_rate": 2.807782702318828e-07, + "loss": 0.72417212, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.3515625, + "step": 13878, + "time_per_iteration": 2.3225207328796387 + }, + { + "auxiliary_loss_clip": 0.01049561, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.01093817, + "balance_loss_mlp": 1.01554716, + "epoch": 0.8344506237787465, + "flos": 15011569572480.0, + "grad_norm": 2.400869835659916, + "language_loss": 0.80274403, + "learning_rate": 2.805793076661309e-07, + "loss": 0.82356864, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33984375, + "step": 13879, + "time_per_iteration": 3.6919219493865967 + }, + { + "auxiliary_loss_clip": 0.01050877, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.01448274, + "balance_loss_mlp": 1.01529372, + "epoch": 0.8345107470314144, + "flos": 17559601084800.0, + "grad_norm": 2.4256216190546738, + "language_loss": 0.84652543, + "learning_rate": 2.803804103009828e-07, + "loss": 0.86739898, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 13880, + "time_per_iteration": 2.3245058059692383 + }, + { + "auxiliary_loss_clip": 0.01053597, + "auxiliary_loss_mlp": 0.01039953, + "balance_loss_clip": 1.0172677, + "balance_loss_mlp": 1.01646733, + "epoch": 0.8345708702840824, + "flos": 25186238075520.0, + "grad_norm": 1.5331472946042062, + "language_loss": 0.78905255, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80998802, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 13881, + "time_per_iteration": 2.4143354892730713 + }, + { + "auxiliary_loss_clip": 0.01048094, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.01524746, + "balance_loss_mlp": 1.01501179, + "epoch": 0.8346309935367503, + "flos": 15082722656640.0, + "grad_norm": 2.709410250725937, + "language_loss": 0.79915601, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.81998789, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.33007812, + "step": 13882, + "time_per_iteration": 2.3307859897613525 + }, + { + "auxiliary_loss_clip": 0.01051673, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.01712251, + "balance_loss_mlp": 1.01580751, + "epoch": 0.8346911167894183, + "flos": 22929486969600.0, + "grad_norm": 1.7886800399754492, + "language_loss": 0.81691611, + "learning_rate": 2.79784109484579e-07, + "loss": 0.83784068, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 13883, + "time_per_iteration": 2.3828089237213135 + }, + { + "auxiliary_loss_clip": 0.01051721, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.01228333, + "balance_loss_mlp": 1.01560628, + "epoch": 0.8347512400420862, + "flos": 20192017086720.0, + "grad_norm": 2.9994701895846125, + "language_loss": 0.7552464, + "learning_rate": 2.795854729972482e-07, + "loss": 0.77613461, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 13884, + "time_per_iteration": 2.3497042655944824 + }, + { + "auxiliary_loss_clip": 0.01056479, + "auxiliary_loss_mlp": 0.01048041, + "balance_loss_clip": 1.02101588, + "balance_loss_mlp": 1.01739883, + "epoch": 0.8348113632947542, + "flos": 25953733693440.0, + "grad_norm": 1.832345232783009, + "language_loss": 0.71166587, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.73271108, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.390625, + "step": 13885, + "time_per_iteration": 2.5710909366607666 + }, + { + "auxiliary_loss_clip": 0.01053321, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.01090932, + "balance_loss_mlp": 1.01655674, + "epoch": 0.8348714865474223, + "flos": 34203116338560.0, + "grad_norm": 1.6712098153287573, + "language_loss": 0.71574938, + "learning_rate": 2.791883957449912e-07, + "loss": 0.73661351, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3671875, + "step": 13886, + "time_per_iteration": 2.500521421432495 + }, + { + "auxiliary_loss_clip": 0.01050317, + "auxiliary_loss_mlp": 0.01035897, + "balance_loss_clip": 1.01198351, + "balance_loss_mlp": 1.01512194, + "epoch": 0.8349316098000902, + "flos": 24388961201280.0, + "grad_norm": 1.5188370331110352, + "language_loss": 0.7985267, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81938887, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3515625, + "step": 13887, + "time_per_iteration": 2.502025842666626 + }, + { + "auxiliary_loss_clip": 0.01054284, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.01218545, + "balance_loss_mlp": 1.01665366, + "epoch": 0.8349917330527582, + "flos": 23031817764480.0, + "grad_norm": 2.34324001260461, + "language_loss": 0.66526616, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.68617666, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 13888, + "time_per_iteration": 2.361043930053711 + }, + { + "auxiliary_loss_clip": 0.01053494, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.01340139, + "balance_loss_mlp": 1.01623702, + "epoch": 0.8350518563054261, + "flos": 13625028904320.0, + "grad_norm": 2.0966431601965523, + "language_loss": 0.69187278, + "learning_rate": 2.785932692855244e-07, + "loss": 0.71278167, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 13889, + "time_per_iteration": 2.4941892623901367 + }, + { + "auxiliary_loss_clip": 0.0105091, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01192319, + "balance_loss_mlp": 1.01571953, + "epoch": 0.8351119795580941, + "flos": 21578732311680.0, + "grad_norm": 1.8000607853089268, + "language_loss": 0.69868827, + "learning_rate": 2.783950243408399e-07, + "loss": 0.71954799, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 13890, + "time_per_iteration": 2.413825511932373 + }, + { + "auxiliary_loss_clip": 0.01052934, + "auxiliary_loss_mlp": 0.01038765, + "balance_loss_clip": 1.01468468, + "balance_loss_mlp": 1.01626945, + "epoch": 0.835172102810762, + "flos": 20037526853760.0, + "grad_norm": 2.301841993172882, + "language_loss": 0.60690153, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.62781852, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 13891, + "time_per_iteration": 2.4797375202178955 + }, + { + "auxiliary_loss_clip": 0.01052639, + "auxiliary_loss_mlp": 0.01037957, + "balance_loss_clip": 1.01578426, + "balance_loss_mlp": 1.01658893, + "epoch": 0.8352322260634301, + "flos": 25110616337280.0, + "grad_norm": 1.591332594816603, + "language_loss": 0.72557354, + "learning_rate": 2.779987303092846e-07, + "loss": 0.74647951, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36132812, + "step": 13892, + "time_per_iteration": 2.3765666484832764 + }, + { + "auxiliary_loss_clip": 0.01049991, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.01558685, + "balance_loss_mlp": 1.01572239, + "epoch": 0.835292349316098, + "flos": 24862592972160.0, + "grad_norm": 1.7579176608943043, + "language_loss": 0.67234182, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.69323164, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34179688, + "step": 13893, + "time_per_iteration": 2.429600954055786 + }, + { + "auxiliary_loss_clip": 0.01050239, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.01187074, + "balance_loss_mlp": 1.01516163, + "epoch": 0.835352472568766, + "flos": 19864531733760.0, + "grad_norm": 1.973268814451002, + "language_loss": 0.79320294, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.81403828, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 13894, + "time_per_iteration": 2.359849691390991 + }, + { + "auxiliary_loss_clip": 0.01051103, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.01051879, + "balance_loss_mlp": 1.01722026, + "epoch": 0.8354125958214339, + "flos": 22053655802880.0, + "grad_norm": 1.6854776551188797, + "language_loss": 0.73884588, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.75968313, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33789062, + "step": 13895, + "time_per_iteration": 2.3930459022521973 + }, + { + "auxiliary_loss_clip": 0.01051904, + "auxiliary_loss_mlp": 0.01041491, + "balance_loss_clip": 1.01382232, + "balance_loss_mlp": 1.01565838, + "epoch": 0.8354727190741019, + "flos": 21396730060800.0, + "grad_norm": 2.4505439661467463, + "language_loss": 0.72847283, + "learning_rate": 2.772069258877667e-07, + "loss": 0.7494067, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.36328125, + "step": 13896, + "time_per_iteration": 3.6465444564819336 + }, + { + "auxiliary_loss_clip": 0.01050907, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.00905585, + "balance_loss_mlp": 1.0159781, + "epoch": 0.8355328423267698, + "flos": 50839125649920.0, + "grad_norm": 2.2807018633487703, + "language_loss": 0.60111749, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6219359, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 13897, + "time_per_iteration": 2.63702130317688 + }, + { + "auxiliary_loss_clip": 0.01007418, + "auxiliary_loss_mlp": 0.01002538, + "balance_loss_clip": 1.00041604, + "balance_loss_mlp": 1.00089371, + "epoch": 0.8355929655794379, + "flos": 65547577998720.0, + "grad_norm": 0.6947303130168239, + "language_loss": 0.57659686, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59669644, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.06542969, + "step": 13898, + "time_per_iteration": 3.039914846420288 + }, + { + "auxiliary_loss_clip": 0.01053838, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.01659274, + "balance_loss_mlp": 1.01673758, + "epoch": 0.8356530888321058, + "flos": 19169550743040.0, + "grad_norm": 2.803886389906241, + "language_loss": 0.80977094, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.83071333, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 13899, + "time_per_iteration": 2.3612303733825684 + }, + { + "auxiliary_loss_clip": 0.01052108, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.01450694, + "balance_loss_mlp": 1.01624608, + "epoch": 0.8357132120847738, + "flos": 44125013531520.0, + "grad_norm": 2.027063614629441, + "language_loss": 0.69904768, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71994108, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 13900, + "time_per_iteration": 2.5547778606414795 + }, + { + "auxiliary_loss_clip": 0.01052547, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.0128262, + "balance_loss_mlp": 1.01708972, + "epoch": 0.8357733353374418, + "flos": 24388542264960.0, + "grad_norm": 1.399338992702125, + "language_loss": 0.72139406, + "learning_rate": 2.762186403079716e-07, + "loss": 0.74225342, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.35351562, + "step": 13901, + "time_per_iteration": 2.399590015411377 + }, + { + "auxiliary_loss_clip": 0.01054779, + "auxiliary_loss_mlp": 0.01044885, + "balance_loss_clip": 1.01985109, + "balance_loss_mlp": 1.01700246, + "epoch": 0.8358334585901097, + "flos": 20915452702080.0, + "grad_norm": 2.1904254446554736, + "language_loss": 0.80732137, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82831806, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 13902, + "time_per_iteration": 3.7852110862731934 + }, + { + "auxiliary_loss_clip": 0.01050087, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.01142097, + "balance_loss_mlp": 1.01582122, + "epoch": 0.8358935818427777, + "flos": 19243182533760.0, + "grad_norm": 1.4716145059181194, + "language_loss": 0.63317645, + "learning_rate": 2.758237835853379e-07, + "loss": 0.65401697, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34179688, + "step": 13903, + "time_per_iteration": 2.34626841545105 + }, + { + "auxiliary_loss_clip": 0.01050999, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.0122354, + "balance_loss_mlp": 1.01555622, + "epoch": 0.8359537050954456, + "flos": 24132908223360.0, + "grad_norm": 1.8278020579347583, + "language_loss": 0.75478101, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.77562517, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.35351562, + "step": 13904, + "time_per_iteration": 2.3883681297302246 + }, + { + "auxiliary_loss_clip": 0.01049572, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.0133332, + "balance_loss_mlp": 1.01484656, + "epoch": 0.8360138283481137, + "flos": 16179484106880.0, + "grad_norm": 1.6858745803138315, + "language_loss": 0.7345351, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.75540161, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34765625, + "step": 13905, + "time_per_iteration": 3.725339412689209 + }, + { + "auxiliary_loss_clip": 0.0105189, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.01382375, + "balance_loss_mlp": 1.01712859, + "epoch": 0.8360739516007816, + "flos": 22197847184640.0, + "grad_norm": 1.5090764797846925, + "language_loss": 0.67245924, + "learning_rate": 2.752319888771e-07, + "loss": 0.69333947, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 13906, + "time_per_iteration": 2.3862814903259277 + }, + { + "auxiliary_loss_clip": 0.01052412, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.0133698, + "balance_loss_mlp": 1.01675153, + "epoch": 0.8361340748534496, + "flos": 20922085860480.0, + "grad_norm": 1.459778482270787, + "language_loss": 0.74776208, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76863658, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35742188, + "step": 13907, + "time_per_iteration": 2.370638847351074 + }, + { + "auxiliary_loss_clip": 0.01051446, + "auxiliary_loss_mlp": 0.01045327, + "balance_loss_clip": 1.0208298, + "balance_loss_mlp": 1.0151664, + "epoch": 0.8361941981061175, + "flos": 26172499472640.0, + "grad_norm": 2.0680055206534247, + "language_loss": 0.76231468, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.7832824, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 13908, + "time_per_iteration": 2.5035171508789062 + }, + { + "auxiliary_loss_clip": 0.01053181, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.01565421, + "balance_loss_mlp": 1.01657963, + "epoch": 0.8362543213587855, + "flos": 24418393344000.0, + "grad_norm": 3.8963499257245107, + "language_loss": 0.72963989, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.75056738, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 13909, + "time_per_iteration": 2.404506206512451 + }, + { + "auxiliary_loss_clip": 0.0105262, + "auxiliary_loss_mlp": 0.01042904, + "balance_loss_clip": 1.01857328, + "balance_loss_mlp": 1.01569819, + "epoch": 0.8363144446114534, + "flos": 17201426780160.0, + "grad_norm": 2.0349596519876885, + "language_loss": 0.74864256, + "learning_rate": 2.744438449482338e-07, + "loss": 0.76959789, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 13910, + "time_per_iteration": 2.360766887664795 + }, + { + "auxiliary_loss_clip": 0.01052878, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.01135027, + "balance_loss_mlp": 1.01672399, + "epoch": 0.8363745678641215, + "flos": 19278444873600.0, + "grad_norm": 1.7741762943482722, + "language_loss": 0.73987514, + "learning_rate": 2.742469725305001e-07, + "loss": 0.76073378, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36132812, + "step": 13911, + "time_per_iteration": 2.353569507598877 + }, + { + "auxiliary_loss_clip": 0.01054392, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_clip": 1.01829064, + "balance_loss_mlp": 1.01763332, + "epoch": 0.8364346911167894, + "flos": 11874064798080.0, + "grad_norm": 1.9483510845309961, + "language_loss": 0.79688179, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81784928, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 13912, + "time_per_iteration": 2.360982656478882 + }, + { + "auxiliary_loss_clip": 0.01051668, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.01256561, + "balance_loss_mlp": 1.01559186, + "epoch": 0.8364948143694574, + "flos": 20224311960960.0, + "grad_norm": 1.6414312137101574, + "language_loss": 0.79737759, + "learning_rate": 2.738534240246797e-07, + "loss": 0.81823838, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 13913, + "time_per_iteration": 2.3668673038482666 + }, + { + "auxiliary_loss_clip": 0.01051433, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.01720071, + "balance_loss_mlp": 1.01536632, + "epoch": 0.8365549376221254, + "flos": 21611934881280.0, + "grad_norm": 1.8384597020111357, + "language_loss": 0.75360614, + "learning_rate": 2.736567479515153e-07, + "loss": 0.77453637, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 13914, + "time_per_iteration": 2.4084463119506836 + }, + { + "auxiliary_loss_clip": 0.0105122, + "auxiliary_loss_mlp": 0.01038352, + "balance_loss_clip": 1.01521349, + "balance_loss_mlp": 1.01606119, + "epoch": 0.8366150608747933, + "flos": 23293107446400.0, + "grad_norm": 1.54286108915256, + "language_loss": 0.72427046, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.74516612, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 13915, + "time_per_iteration": 2.3849143981933594 + }, + { + "auxiliary_loss_clip": 0.01052102, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.01519728, + "balance_loss_mlp": 1.01575112, + "epoch": 0.8366751841274613, + "flos": 15266784677760.0, + "grad_norm": 1.8348290888952778, + "language_loss": 0.73952836, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.76043016, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 13916, + "time_per_iteration": 2.36014461517334 + }, + { + "auxiliary_loss_clip": 0.01053341, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.01262581, + "balance_loss_mlp": 1.01666284, + "epoch": 0.8367353073801292, + "flos": 13224086317440.0, + "grad_norm": 1.8285629309249507, + "language_loss": 0.75942498, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.78032333, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 13917, + "time_per_iteration": 2.3376317024230957 + }, + { + "auxiliary_loss_clip": 0.01048256, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.01192617, + "balance_loss_mlp": 1.01576781, + "epoch": 0.8367954306327973, + "flos": 24203991484800.0, + "grad_norm": 1.4400010493707238, + "language_loss": 0.80111629, + "learning_rate": 2.728706983644933e-07, + "loss": 0.82192945, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.32421875, + "step": 13918, + "time_per_iteration": 2.3873038291931152 + }, + { + "auxiliary_loss_clip": 0.01051422, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.01453435, + "balance_loss_mlp": 1.01594412, + "epoch": 0.8368555538854652, + "flos": 24533606430720.0, + "grad_norm": 1.8376973891348236, + "language_loss": 0.69317234, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.71405768, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 13919, + "time_per_iteration": 3.846771001815796 + }, + { + "auxiliary_loss_clip": 0.01049931, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.01256537, + "balance_loss_mlp": 1.01512003, + "epoch": 0.8369156771381332, + "flos": 20258526960000.0, + "grad_norm": 1.7972019550509266, + "language_loss": 0.75026494, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.77110958, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 13920, + "time_per_iteration": 2.3838131427764893 + }, + { + "auxiliary_loss_clip": 0.01051708, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.01610303, + "balance_loss_mlp": 1.01517367, + "epoch": 0.8369758003908011, + "flos": 21834471087360.0, + "grad_norm": 1.6705689181276333, + "language_loss": 0.70048988, + "learning_rate": 2.722818488237566e-07, + "loss": 0.7214067, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 13921, + "time_per_iteration": 2.3906428813934326 + }, + { + "auxiliary_loss_clip": 0.01052856, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.01904511, + "balance_loss_mlp": 1.01623964, + "epoch": 0.8370359236434691, + "flos": 21718420128000.0, + "grad_norm": 2.088615072506136, + "language_loss": 0.86441517, + "learning_rate": 2.720856966640801e-07, + "loss": 0.88536042, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36523438, + "step": 13922, + "time_per_iteration": 2.3818106651306152 + }, + { + "auxiliary_loss_clip": 0.01050404, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.01381218, + "balance_loss_mlp": 1.01610684, + "epoch": 0.837096046896137, + "flos": 23147763989760.0, + "grad_norm": 1.6348824005361218, + "language_loss": 0.72524655, + "learning_rate": 2.71889610027088e-07, + "loss": 0.74609828, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34179688, + "step": 13923, + "time_per_iteration": 2.3598484992980957 + }, + { + "auxiliary_loss_clip": 0.01050152, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.01338243, + "balance_loss_mlp": 1.01580787, + "epoch": 0.8371561701488051, + "flos": 24491885489280.0, + "grad_norm": 1.842089625101855, + "language_loss": 0.77452481, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.79538023, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 13924, + "time_per_iteration": 2.404940366744995 + }, + { + "auxiliary_loss_clip": 0.01051247, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.01380813, + "balance_loss_mlp": 1.01670694, + "epoch": 0.837216293401473, + "flos": 29205404213760.0, + "grad_norm": 1.5831476758720693, + "language_loss": 0.66194391, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.68280661, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34570312, + "step": 13925, + "time_per_iteration": 2.419646978378296 + }, + { + "auxiliary_loss_clip": 0.01052655, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.01506472, + "balance_loss_mlp": 1.01616478, + "epoch": 0.837276416654141, + "flos": 25264094140800.0, + "grad_norm": 1.579363095954496, + "language_loss": 0.75717258, + "learning_rate": 2.713017433265543e-07, + "loss": 0.77809715, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36523438, + "step": 13926, + "time_per_iteration": 2.4070005416870117 + }, + { + "auxiliary_loss_clip": 0.01052404, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.01266646, + "balance_loss_mlp": 1.01707458, + "epoch": 0.837336539906809, + "flos": 13881151704960.0, + "grad_norm": 1.7330251614017704, + "language_loss": 0.72802353, + "learning_rate": 2.711059188546274e-07, + "loss": 0.74890035, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 13927, + "time_per_iteration": 2.358887195587158 + }, + { + "auxiliary_loss_clip": 0.01007112, + "auxiliary_loss_mlp": 0.01003964, + "balance_loss_clip": 1.00185394, + "balance_loss_mlp": 1.00055981, + "epoch": 0.8373966631594769, + "flos": 68867050112640.0, + "grad_norm": 0.7005500572806644, + "language_loss": 0.58968961, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60980034, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06542969, + "step": 13928, + "time_per_iteration": 3.165985584259033 + }, + { + "auxiliary_loss_clip": 0.01053132, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.01459813, + "balance_loss_mlp": 1.01753008, + "epoch": 0.8374567864121449, + "flos": 20447930419200.0, + "grad_norm": 1.6964071671264327, + "language_loss": 0.7000308, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72094744, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35546875, + "step": 13929, + "time_per_iteration": 2.3797028064727783 + }, + { + "auxiliary_loss_clip": 0.01053941, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.01186669, + "balance_loss_mlp": 1.01651883, + "epoch": 0.8375169096648128, + "flos": 41902512336000.0, + "grad_norm": 1.4712845365888085, + "language_loss": 0.68311983, + "learning_rate": 2.705188388275574e-07, + "loss": 0.70402157, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.375, + "step": 13930, + "time_per_iteration": 2.591395378112793 + }, + { + "auxiliary_loss_clip": 0.01052159, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.01116252, + "balance_loss_mlp": 1.01706719, + "epoch": 0.8375770329174809, + "flos": 20008374001920.0, + "grad_norm": 1.7228012173533487, + "language_loss": 0.72717243, + "learning_rate": 2.703232766395067e-07, + "loss": 0.748025, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 13931, + "time_per_iteration": 2.3748037815093994 + }, + { + "auxiliary_loss_clip": 0.01049732, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.01177812, + "balance_loss_mlp": 1.01494157, + "epoch": 0.8376371561701488, + "flos": 22782502679040.0, + "grad_norm": 2.073179889007315, + "language_loss": 0.72849226, + "learning_rate": 2.701277800409705e-07, + "loss": 0.74932826, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 13932, + "time_per_iteration": 2.393982172012329 + }, + { + "auxiliary_loss_clip": 0.01050707, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.01375103, + "balance_loss_mlp": 1.01548719, + "epoch": 0.8376972794228168, + "flos": 23913339482880.0, + "grad_norm": 2.6770667235520453, + "language_loss": 0.68100238, + "learning_rate": 2.699323490393628e-07, + "loss": 0.70185536, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.3515625, + "step": 13933, + "time_per_iteration": 2.4285342693328857 + }, + { + "auxiliary_loss_clip": 0.01051725, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.01839852, + "balance_loss_mlp": 1.01677752, + "epoch": 0.8377574026754847, + "flos": 13733888123520.0, + "grad_norm": 2.001974996568261, + "language_loss": 0.77769417, + "learning_rate": 2.697369836420933e-07, + "loss": 0.79860312, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34960938, + "step": 13934, + "time_per_iteration": 2.427338123321533 + }, + { + "auxiliary_loss_clip": 0.01053315, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.01700425, + "balance_loss_mlp": 1.01861596, + "epoch": 0.8378175259281527, + "flos": 21650304332160.0, + "grad_norm": 1.4839032088989714, + "language_loss": 0.78045487, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.8013835, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 13935, + "time_per_iteration": 2.3798108100891113 + }, + { + "auxiliary_loss_clip": 0.01050684, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.01160049, + "balance_loss_mlp": 1.01509666, + "epoch": 0.8378776491808206, + "flos": 15447949056000.0, + "grad_norm": 2.617736194582649, + "language_loss": 0.58006477, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.60091555, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 13936, + "time_per_iteration": 3.635401725769043 + }, + { + "auxiliary_loss_clip": 0.01050669, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.01590133, + "balance_loss_mlp": 1.01627624, + "epoch": 0.8379377724334887, + "flos": 14719521116160.0, + "grad_norm": 1.9219422532484516, + "language_loss": 0.90483451, + "learning_rate": 2.691512811503882e-07, + "loss": 0.92571694, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 13937, + "time_per_iteration": 2.3338940143585205 + }, + { + "auxiliary_loss_clip": 0.01052501, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.01147008, + "balance_loss_mlp": 1.01689625, + "epoch": 0.8379978956861566, + "flos": 24534095189760.0, + "grad_norm": 1.8980343801774786, + "language_loss": 0.82361597, + "learning_rate": 2.689561782445313e-07, + "loss": 0.84448385, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 13938, + "time_per_iteration": 2.3976593017578125 + }, + { + "auxiliary_loss_clip": 0.01053715, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.01395547, + "balance_loss_mlp": 1.01655865, + "epoch": 0.8380580189388246, + "flos": 18951622836480.0, + "grad_norm": 1.653898703961682, + "language_loss": 0.71325374, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.73417169, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37109375, + "step": 13939, + "time_per_iteration": 2.352163076400757 + }, + { + "auxiliary_loss_clip": 0.01054696, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.01569617, + "balance_loss_mlp": 1.01797056, + "epoch": 0.8381181421914926, + "flos": 26539122326400.0, + "grad_norm": 1.6429745656731911, + "language_loss": 0.7724036, + "learning_rate": 2.6856616936428e-07, + "loss": 0.79334563, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 13940, + "time_per_iteration": 2.466586112976074 + }, + { + "auxiliary_loss_clip": 0.01051471, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.01619411, + "balance_loss_mlp": 1.01607919, + "epoch": 0.8381782654441605, + "flos": 23290454183040.0, + "grad_norm": 1.5766175909662246, + "language_loss": 0.77832818, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.79923183, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 13941, + "time_per_iteration": 2.4094760417938232 + }, + { + "auxiliary_loss_clip": 0.01053959, + "auxiliary_loss_mlp": 0.01039437, + "balance_loss_clip": 1.01335359, + "balance_loss_mlp": 1.01619208, + "epoch": 0.8382383886968285, + "flos": 26757643726080.0, + "grad_norm": 2.012550227143333, + "language_loss": 0.73967361, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.7606076, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 13942, + "time_per_iteration": 3.877239942550659 + }, + { + "auxiliary_loss_clip": 0.01055951, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.01788116, + "balance_loss_mlp": 1.01710129, + "epoch": 0.8382985119494964, + "flos": 26103336336000.0, + "grad_norm": 1.482033853710732, + "language_loss": 0.80640155, + "learning_rate": 2.679816484834554e-07, + "loss": 0.82740331, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 13943, + "time_per_iteration": 2.4358839988708496 + }, + { + "auxiliary_loss_clip": 0.01050074, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.0132432, + "balance_loss_mlp": 1.01574028, + "epoch": 0.8383586352021645, + "flos": 16434210453120.0, + "grad_norm": 1.9023293530257637, + "language_loss": 0.86309409, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.88394672, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 13944, + "time_per_iteration": 2.346911668777466 + }, + { + "auxiliary_loss_clip": 0.01007305, + "auxiliary_loss_mlp": 0.01002841, + "balance_loss_clip": 1.0006237, + "balance_loss_mlp": 1.00067306, + "epoch": 0.8384187584548324, + "flos": 64192249952640.0, + "grad_norm": 0.6506471623353044, + "language_loss": 0.50248998, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52259147, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.06640625, + "step": 13945, + "time_per_iteration": 4.516470909118652 + }, + { + "auxiliary_loss_clip": 0.01050295, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.01425123, + "balance_loss_mlp": 1.01556253, + "epoch": 0.8384788817075004, + "flos": 22381804471680.0, + "grad_norm": 3.491207694801384, + "language_loss": 0.65808415, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67894524, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 13946, + "time_per_iteration": 2.367666006088257 + }, + { + "auxiliary_loss_clip": 0.01052298, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.01380789, + "balance_loss_mlp": 1.01575506, + "epoch": 0.8385390049601683, + "flos": 29495567456640.0, + "grad_norm": 1.5422470779827908, + "language_loss": 0.6819396, + "learning_rate": 2.672032068397829e-07, + "loss": 0.7028439, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 13947, + "time_per_iteration": 2.4149975776672363 + }, + { + "auxiliary_loss_clip": 0.01053567, + "auxiliary_loss_mlp": 0.01038705, + "balance_loss_clip": 1.01476812, + "balance_loss_mlp": 1.01676881, + "epoch": 0.8385991282128363, + "flos": 32706424730880.0, + "grad_norm": 1.6517484214611318, + "language_loss": 0.71017009, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.73109281, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36914062, + "step": 13948, + "time_per_iteration": 2.4547441005706787 + }, + { + "auxiliary_loss_clip": 0.01049324, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.01216257, + "balance_loss_mlp": 1.01540065, + "epoch": 0.8386592514655042, + "flos": 25440021815040.0, + "grad_norm": 2.476518356447475, + "language_loss": 0.85993695, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.88075513, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33789062, + "step": 13949, + "time_per_iteration": 2.526679277420044 + }, + { + "auxiliary_loss_clip": 0.01050791, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.01235962, + "balance_loss_mlp": 1.01647615, + "epoch": 0.8387193747181723, + "flos": 22014867415680.0, + "grad_norm": 1.8522828109691583, + "language_loss": 0.71584839, + "learning_rate": 2.66620065513385e-07, + "loss": 0.73670053, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 13950, + "time_per_iteration": 2.3749654293060303 + }, + { + "auxiliary_loss_clip": 0.01052396, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.01365066, + "balance_loss_mlp": 1.01644874, + "epoch": 0.8387794979708402, + "flos": 18149248903680.0, + "grad_norm": 2.397443794993565, + "language_loss": 0.65751255, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.67840666, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 13951, + "time_per_iteration": 2.3428561687469482 + }, + { + "auxiliary_loss_clip": 0.01051282, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.01205921, + "balance_loss_mlp": 1.01666784, + "epoch": 0.8388396212235082, + "flos": 25410031090560.0, + "grad_norm": 1.4372239803501226, + "language_loss": 0.71239537, + "learning_rate": 2.662316332665393e-07, + "loss": 0.73325276, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 13952, + "time_per_iteration": 2.4094398021698 + }, + { + "auxiliary_loss_clip": 0.01050466, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.01051128, + "balance_loss_mlp": 1.01586211, + "epoch": 0.8388997444761762, + "flos": 22271967734400.0, + "grad_norm": 1.8814199420152866, + "language_loss": 0.73660219, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.757424, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34570312, + "step": 13953, + "time_per_iteration": 2.3582210540771484 + }, + { + "auxiliary_loss_clip": 0.01049176, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.01343036, + "balance_loss_mlp": 1.01539075, + "epoch": 0.8389598677288441, + "flos": 19572203986560.0, + "grad_norm": 1.8235012722915565, + "language_loss": 0.69339204, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.71422958, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33789062, + "step": 13954, + "time_per_iteration": 2.344651937484741 + }, + { + "auxiliary_loss_clip": 0.01052522, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.01649737, + "balance_loss_mlp": 1.01758313, + "epoch": 0.8390199909815121, + "flos": 17383743233280.0, + "grad_norm": 1.8825576347124364, + "language_loss": 0.73999518, + "learning_rate": 2.656494779996932e-07, + "loss": 0.76089656, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34960938, + "step": 13955, + "time_per_iteration": 2.3476345539093018 + }, + { + "auxiliary_loss_clip": 0.01051564, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.01028657, + "balance_loss_mlp": 1.01589549, + "epoch": 0.83908011423418, + "flos": 24638625400320.0, + "grad_norm": 2.2772161076402666, + "language_loss": 0.67923588, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.70008355, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 13956, + "time_per_iteration": 2.3679006099700928 + }, + { + "auxiliary_loss_clip": 0.01052079, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.01227736, + "balance_loss_mlp": 1.0154798, + "epoch": 0.8391402374868481, + "flos": 24717179692800.0, + "grad_norm": 1.8896353761738685, + "language_loss": 0.81010771, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.83099425, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 13957, + "time_per_iteration": 2.427332639694214 + }, + { + "auxiliary_loss_clip": 0.01007397, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 0.99980515, + "balance_loss_mlp": 1.00079751, + "epoch": 0.839200360739516, + "flos": 56868344294400.0, + "grad_norm": 0.7558085797372138, + "language_loss": 0.53481519, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55490983, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.06591797, + "step": 13958, + "time_per_iteration": 4.4519431591033936 + }, + { + "auxiliary_loss_clip": 0.01051912, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.01182485, + "balance_loss_mlp": 1.01596808, + "epoch": 0.839260483992184, + "flos": 18331809736320.0, + "grad_norm": 1.8842618826754418, + "language_loss": 0.74766445, + "learning_rate": 2.648741917459574e-07, + "loss": 0.7685442, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 13959, + "time_per_iteration": 2.3306961059570312 + }, + { + "auxiliary_loss_clip": 0.01048855, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.01468432, + "balance_loss_mlp": 1.0150404, + "epoch": 0.8393206072448519, + "flos": 27086735001600.0, + "grad_norm": 2.240977232916885, + "language_loss": 0.56797683, + "learning_rate": 2.646805346545169e-07, + "loss": 0.58883148, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 13960, + "time_per_iteration": 2.401132345199585 + }, + { + "auxiliary_loss_clip": 0.01007225, + "auxiliary_loss_mlp": 0.01004853, + "balance_loss_clip": 1.00250459, + "balance_loss_mlp": 1.00059056, + "epoch": 0.8393807304975199, + "flos": 61518287566080.0, + "grad_norm": 0.7671256473689121, + "language_loss": 0.60729432, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62741512, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.0234375, + "router_z_loss_mlp": 0.06640625, + "step": 13961, + "time_per_iteration": 3.080437660217285 + }, + { + "auxiliary_loss_clip": 0.01049715, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.01287007, + "balance_loss_mlp": 1.01479673, + "epoch": 0.8394408537501878, + "flos": 14894191981440.0, + "grad_norm": 2.1806492419214982, + "language_loss": 0.68956649, + "learning_rate": 2.642934178894405e-07, + "loss": 0.71040004, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34960938, + "step": 13962, + "time_per_iteration": 2.315078020095825 + }, + { + "auxiliary_loss_clip": 0.01052607, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.01534271, + "balance_loss_mlp": 1.01585555, + "epoch": 0.8395009770028559, + "flos": 17411464719360.0, + "grad_norm": 2.0349300100565864, + "language_loss": 0.74746966, + "learning_rate": 2.640999582304841e-07, + "loss": 0.76838255, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 13963, + "time_per_iteration": 2.321296215057373 + }, + { + "auxiliary_loss_clip": 0.01051002, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.01272798, + "balance_loss_mlp": 1.01575625, + "epoch": 0.8395611002555238, + "flos": 27923603224320.0, + "grad_norm": 1.6057179317828205, + "language_loss": 0.76988971, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.79076278, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 13964, + "time_per_iteration": 2.4147701263427734 + }, + { + "auxiliary_loss_clip": 0.01054303, + "auxiliary_loss_mlp": 0.01038166, + "balance_loss_clip": 1.01298892, + "balance_loss_mlp": 1.01685345, + "epoch": 0.8396212235081918, + "flos": 11100354958080.0, + "grad_norm": 2.152414332348708, + "language_loss": 0.79776901, + "learning_rate": 2.637132363964161e-07, + "loss": 0.8186937, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.375, + "step": 13965, + "time_per_iteration": 2.3288767337799072 + }, + { + "auxiliary_loss_clip": 0.01049649, + "auxiliary_loss_mlp": 0.01034034, + "balance_loss_clip": 1.01271963, + "balance_loss_mlp": 1.01537395, + "epoch": 0.8396813467608598, + "flos": 35734197502080.0, + "grad_norm": 3.7693335838664157, + "language_loss": 0.67569804, + "learning_rate": 2.635199742359684e-07, + "loss": 0.69653487, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34179688, + "step": 13966, + "time_per_iteration": 2.511916399002075 + }, + { + "auxiliary_loss_clip": 0.01051077, + "auxiliary_loss_mlp": 0.01038773, + "balance_loss_clip": 1.01602745, + "balance_loss_mlp": 1.01564622, + "epoch": 0.8397414700135277, + "flos": 26175536760960.0, + "grad_norm": 1.607886525975985, + "language_loss": 0.76046568, + "learning_rate": 2.633267779230177e-07, + "loss": 0.7813642, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 13967, + "time_per_iteration": 2.3921566009521484 + }, + { + "auxiliary_loss_clip": 0.01051129, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.0126133, + "balance_loss_mlp": 1.0162226, + "epoch": 0.8398015932661957, + "flos": 18332123938560.0, + "grad_norm": 2.594141152868473, + "language_loss": 0.84317046, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.86403579, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34960938, + "step": 13968, + "time_per_iteration": 2.3446543216705322 + }, + { + "auxiliary_loss_clip": 0.01053555, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.00933754, + "balance_loss_mlp": 1.01778984, + "epoch": 0.8398617165188637, + "flos": 17378681086080.0, + "grad_norm": 2.1356129542614943, + "language_loss": 0.78725195, + "learning_rate": 2.629405828689075e-07, + "loss": 0.80809629, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35742188, + "step": 13969, + "time_per_iteration": 2.331275224685669 + }, + { + "auxiliary_loss_clip": 0.01052724, + "auxiliary_loss_mlp": 0.0104112, + "balance_loss_clip": 1.01589561, + "balance_loss_mlp": 1.01559711, + "epoch": 0.8399218397715317, + "flos": 22928579274240.0, + "grad_norm": 1.9807432893774355, + "language_loss": 0.78099203, + "learning_rate": 2.627475841423923e-07, + "loss": 0.80193049, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 13970, + "time_per_iteration": 2.3704679012298584 + }, + { + "auxiliary_loss_clip": 0.01052162, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.01795506, + "balance_loss_mlp": 1.01609945, + "epoch": 0.8399819630241996, + "flos": 23148427305600.0, + "grad_norm": 2.4319994938023246, + "language_loss": 0.73200881, + "learning_rate": 2.625546512926633e-07, + "loss": 0.75294018, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 13971, + "time_per_iteration": 2.3713085651397705 + }, + { + "auxiliary_loss_clip": 0.01050794, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.01101613, + "balance_loss_mlp": 1.01490045, + "epoch": 0.8400420862768676, + "flos": 16396539229440.0, + "grad_norm": 1.8808912792576304, + "language_loss": 0.78765976, + "learning_rate": 2.623617843270358e-07, + "loss": 0.80852473, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.359375, + "step": 13972, + "time_per_iteration": 2.326504707336426 + }, + { + "auxiliary_loss_clip": 0.01049432, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.01358938, + "balance_loss_mlp": 1.01558626, + "epoch": 0.8401022095295355, + "flos": 21286439475840.0, + "grad_norm": 1.345971531179824, + "language_loss": 0.69233406, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.71317631, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33789062, + "step": 13973, + "time_per_iteration": 2.3693132400512695 + }, + { + "auxiliary_loss_clip": 0.01051313, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.01008201, + "balance_loss_mlp": 1.01591873, + "epoch": 0.8401623327822035, + "flos": 17310355822080.0, + "grad_norm": 1.823149804393061, + "language_loss": 0.79006004, + "learning_rate": 2.619762480773382e-07, + "loss": 0.81089288, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 13974, + "time_per_iteration": 2.326477527618408 + }, + { + "auxiliary_loss_clip": 0.01051888, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.01789951, + "balance_loss_mlp": 1.01640081, + "epoch": 0.8402224560348714, + "flos": 22235588231040.0, + "grad_norm": 1.805281144647976, + "language_loss": 0.74015844, + "learning_rate": 2.617835788078868e-07, + "loss": 0.76108694, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 13975, + "time_per_iteration": 2.384267568588257 + }, + { + "auxiliary_loss_clip": 0.01050976, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.01265657, + "balance_loss_mlp": 1.01558626, + "epoch": 0.8402825792875395, + "flos": 20228920260480.0, + "grad_norm": 1.6725082261729158, + "language_loss": 0.73107451, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.75193405, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 13976, + "time_per_iteration": 3.619525194168091 + }, + { + "auxiliary_loss_clip": 0.01051005, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.01300633, + "balance_loss_mlp": 1.01600814, + "epoch": 0.8403427025402074, + "flos": 23288987905920.0, + "grad_norm": 1.7105858514969159, + "language_loss": 0.73449337, + "learning_rate": 2.61398438016311e-07, + "loss": 0.75535583, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 13977, + "time_per_iteration": 2.399258852005005 + }, + { + "auxiliary_loss_clip": 0.01050422, + "auxiliary_loss_mlp": 0.01039928, + "balance_loss_clip": 1.01634812, + "balance_loss_mlp": 1.01501095, + "epoch": 0.8404028257928754, + "flos": 32674094945280.0, + "grad_norm": 1.4571553691463306, + "language_loss": 0.69647002, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.71737349, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 13978, + "time_per_iteration": 2.525038003921509 + }, + { + "auxiliary_loss_clip": 0.01049508, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.01280904, + "balance_loss_mlp": 1.0165478, + "epoch": 0.8404629490455434, + "flos": 16179588840960.0, + "grad_norm": 2.4596481241122596, + "language_loss": 0.78435111, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80517989, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.328125, + "step": 13979, + "time_per_iteration": 2.325166702270508 + }, + { + "auxiliary_loss_clip": 0.01049565, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.01080906, + "balance_loss_mlp": 1.01543832, + "epoch": 0.8405230722982113, + "flos": 15193571823360.0, + "grad_norm": 1.9636038436655818, + "language_loss": 0.79833853, + "learning_rate": 2.60821221306778e-07, + "loss": 0.81916332, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34179688, + "step": 13980, + "time_per_iteration": 2.330409526824951 + }, + { + "auxiliary_loss_clip": 0.01051569, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.01496506, + "balance_loss_mlp": 1.0166409, + "epoch": 0.8405831955508793, + "flos": 27811357603200.0, + "grad_norm": 1.534735359624246, + "language_loss": 0.87362099, + "learning_rate": 2.606289476268757e-07, + "loss": 0.89449489, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34960938, + "step": 13981, + "time_per_iteration": 2.406942844390869 + }, + { + "auxiliary_loss_clip": 0.01051235, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.01346421, + "balance_loss_mlp": 1.01629519, + "epoch": 0.8406433188035473, + "flos": 23768310228480.0, + "grad_norm": 2.3849868163410153, + "language_loss": 0.69083053, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.71169579, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34960938, + "step": 13982, + "time_per_iteration": 2.3748786449432373 + }, + { + "auxiliary_loss_clip": 0.01050893, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.01572061, + "balance_loss_mlp": 1.01568687, + "epoch": 0.8407034420562153, + "flos": 29204391784320.0, + "grad_norm": 1.572498779580134, + "language_loss": 0.69188666, + "learning_rate": 2.602445981457324e-07, + "loss": 0.71279109, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3515625, + "step": 13983, + "time_per_iteration": 3.7749762535095215 + }, + { + "auxiliary_loss_clip": 0.0105103, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.01175332, + "balance_loss_mlp": 1.01506782, + "epoch": 0.8407635653088832, + "flos": 26358865643520.0, + "grad_norm": 1.8294312722190453, + "language_loss": 0.80307949, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.8239466, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 13984, + "time_per_iteration": 2.397649049758911 + }, + { + "auxiliary_loss_clip": 0.01049913, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.01188278, + "balance_loss_mlp": 1.01452982, + "epoch": 0.8408236885615512, + "flos": 21467778410880.0, + "grad_norm": 1.9313389362117013, + "language_loss": 0.61486447, + "learning_rate": 2.598605125513842e-07, + "loss": 0.63570261, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 13985, + "time_per_iteration": 3.6453235149383545 + }, + { + "auxiliary_loss_clip": 0.01053742, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.01331139, + "balance_loss_mlp": 1.01682591, + "epoch": 0.8408838118142191, + "flos": 22962689539200.0, + "grad_norm": 1.6924853364224823, + "language_loss": 0.83136284, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.85226882, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36914062, + "step": 13986, + "time_per_iteration": 2.378918409347534 + }, + { + "auxiliary_loss_clip": 0.01051533, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.01015306, + "balance_loss_mlp": 1.01675296, + "epoch": 0.8409439350668871, + "flos": 26798736263040.0, + "grad_norm": 1.4631967865728899, + "language_loss": 0.66784656, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.68868482, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 13987, + "time_per_iteration": 2.4241740703582764 + }, + { + "auxiliary_loss_clip": 0.0105131, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.01483214, + "balance_loss_mlp": 1.01559663, + "epoch": 0.841004058319555, + "flos": 26577456865920.0, + "grad_norm": 3.1035315059978816, + "language_loss": 0.68497467, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.70585746, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 13988, + "time_per_iteration": 2.402193546295166 + }, + { + "auxiliary_loss_clip": 0.01054166, + "auxiliary_loss_mlp": 0.01048398, + "balance_loss_clip": 1.02202868, + "balance_loss_mlp": 1.01680803, + "epoch": 0.8410641815722231, + "flos": 14500999716480.0, + "grad_norm": 2.8102135920438465, + "language_loss": 0.81864411, + "learning_rate": 2.590931332560622e-07, + "loss": 0.8396697, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37304688, + "step": 13989, + "time_per_iteration": 2.3401761054992676 + }, + { + "auxiliary_loss_clip": 0.01053146, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.00990963, + "balance_loss_mlp": 1.01626968, + "epoch": 0.841124304824891, + "flos": 29165463751680.0, + "grad_norm": 1.7442394364719818, + "language_loss": 0.76197797, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.7828449, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 13990, + "time_per_iteration": 2.4131994247436523 + }, + { + "auxiliary_loss_clip": 0.01047002, + "auxiliary_loss_mlp": 0.01038746, + "balance_loss_clip": 1.01787281, + "balance_loss_mlp": 1.01413083, + "epoch": 0.841184428077559, + "flos": 22411131880320.0, + "grad_norm": 1.6136812814108434, + "language_loss": 0.81506282, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.83592027, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.328125, + "step": 13991, + "time_per_iteration": 2.4583330154418945 + }, + { + "auxiliary_loss_clip": 0.01051099, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.01528716, + "balance_loss_mlp": 1.01581264, + "epoch": 0.841244551330227, + "flos": 22961781843840.0, + "grad_norm": 2.507398066768113, + "language_loss": 0.71970248, + "learning_rate": 2.585182919204105e-07, + "loss": 0.74059403, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 13992, + "time_per_iteration": 2.3855385780334473 + }, + { + "auxiliary_loss_clip": 0.01052422, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.01147151, + "balance_loss_mlp": 1.01629472, + "epoch": 0.8413046745828949, + "flos": 21031678218240.0, + "grad_norm": 1.6474027487389236, + "language_loss": 0.77998984, + "learning_rate": 2.583268102064959e-07, + "loss": 0.80084437, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36132812, + "step": 13993, + "time_per_iteration": 2.3763575553894043 + }, + { + "auxiliary_loss_clip": 0.01054165, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_clip": 1.01795411, + "balance_loss_mlp": 1.01608825, + "epoch": 0.841364797835563, + "flos": 27050879168640.0, + "grad_norm": 2.104335495844484, + "language_loss": 0.74802995, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76900876, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 13994, + "time_per_iteration": 2.4393832683563232 + }, + { + "auxiliary_loss_clip": 0.0104999, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.01509154, + "balance_loss_mlp": 1.01552773, + "epoch": 0.8414249210882309, + "flos": 17894697114240.0, + "grad_norm": 1.6449099273634091, + "language_loss": 0.60768735, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.62856579, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34375, + "step": 13995, + "time_per_iteration": 2.349722146987915 + }, + { + "auxiliary_loss_clip": 0.01050151, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.01416421, + "balance_loss_mlp": 1.01578021, + "epoch": 0.8414850443408989, + "flos": 25440196371840.0, + "grad_norm": 1.7817827831168362, + "language_loss": 0.72804511, + "learning_rate": 2.577527613603163e-07, + "loss": 0.74891436, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34375, + "step": 13996, + "time_per_iteration": 2.4049980640411377 + }, + { + "auxiliary_loss_clip": 0.0105152, + "auxiliary_loss_mlp": 0.01038968, + "balance_loss_clip": 1.01573443, + "balance_loss_mlp": 1.01564181, + "epoch": 0.8415451675935668, + "flos": 23218986896640.0, + "grad_norm": 1.9951342006771153, + "language_loss": 0.65454477, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.67544967, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 13997, + "time_per_iteration": 2.394443988800049 + }, + { + "auxiliary_loss_clip": 0.01053917, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_clip": 1.02170336, + "balance_loss_mlp": 1.0164175, + "epoch": 0.8416052908462348, + "flos": 18545653013760.0, + "grad_norm": 1.9918549352968093, + "language_loss": 0.82935274, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.85035658, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 13998, + "time_per_iteration": 3.8182199001312256 + }, + { + "auxiliary_loss_clip": 0.01052328, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.01550984, + "balance_loss_mlp": 1.0158031, + "epoch": 0.8416654140989027, + "flos": 26103964740480.0, + "grad_norm": 1.5062917570038723, + "language_loss": 0.81282365, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.83374679, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 13999, + "time_per_iteration": 2.4215519428253174 + }, + { + "auxiliary_loss_clip": 0.01053461, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.01397705, + "balance_loss_mlp": 1.01592994, + "epoch": 0.8417255373515707, + "flos": 26432043586560.0, + "grad_norm": 2.3347608760530476, + "language_loss": 0.68133831, + "learning_rate": 2.569882878592096e-07, + "loss": 0.70226777, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 14000, + "time_per_iteration": 2.4092214107513428 + }, + { + "auxiliary_loss_clip": 0.01054648, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.01144874, + "balance_loss_mlp": 1.01704526, + "epoch": 0.8417856606042387, + "flos": 24716586199680.0, + "grad_norm": 1.4101090972599177, + "language_loss": 0.80305713, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.82395512, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.375, + "step": 14001, + "time_per_iteration": 2.411808490753174 + }, + { + "auxiliary_loss_clip": 0.01051964, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.01344585, + "balance_loss_mlp": 1.01588631, + "epoch": 0.8418457838569067, + "flos": 20849780701440.0, + "grad_norm": 1.9134872990295089, + "language_loss": 0.79620141, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.81707925, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36132812, + "step": 14002, + "time_per_iteration": 2.3698573112487793 + }, + { + "auxiliary_loss_clip": 0.01051147, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.01534832, + "balance_loss_mlp": 1.01570463, + "epoch": 0.8419059071095746, + "flos": 28659292727040.0, + "grad_norm": 1.4352666282898348, + "language_loss": 0.78919876, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.8100965, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 14003, + "time_per_iteration": 2.434781789779663 + }, + { + "auxiliary_loss_clip": 0.01051244, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.01391733, + "balance_loss_mlp": 1.01618433, + "epoch": 0.8419660303622426, + "flos": 21652503747840.0, + "grad_norm": 1.5299627182967663, + "language_loss": 0.66478348, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.68566102, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 14004, + "time_per_iteration": 2.3747096061706543 + }, + { + "auxiliary_loss_clip": 0.01053005, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.01919484, + "balance_loss_mlp": 1.01623344, + "epoch": 0.8420261536149106, + "flos": 25299949973760.0, + "grad_norm": 1.9865701419292883, + "language_loss": 0.77247357, + "learning_rate": 2.560341831785724e-07, + "loss": 0.79344046, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.3671875, + "step": 14005, + "time_per_iteration": 2.3871724605560303 + }, + { + "auxiliary_loss_clip": 0.0105211, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.01204753, + "balance_loss_mlp": 1.01539683, + "epoch": 0.8420862768675785, + "flos": 18762603402240.0, + "grad_norm": 4.059525841825488, + "language_loss": 0.79214799, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.81304252, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 14006, + "time_per_iteration": 2.437835693359375 + }, + { + "auxiliary_loss_clip": 0.01051606, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.01424885, + "balance_loss_mlp": 1.01606596, + "epoch": 0.8421464001202466, + "flos": 18327201436800.0, + "grad_norm": 1.714009812045215, + "language_loss": 0.77780455, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79869306, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 14007, + "time_per_iteration": 2.3309378623962402 + }, + { + "auxiliary_loss_clip": 0.01052246, + "auxiliary_loss_mlp": 0.01038053, + "balance_loss_clip": 1.01397288, + "balance_loss_mlp": 1.01589251, + "epoch": 0.8422065233729145, + "flos": 31535926755840.0, + "grad_norm": 2.0856938504438665, + "language_loss": 0.6632266, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68412954, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 14008, + "time_per_iteration": 2.4392004013061523 + }, + { + "auxiliary_loss_clip": 0.01007489, + "auxiliary_loss_mlp": 0.01001778, + "balance_loss_clip": 0.99969143, + "balance_loss_mlp": 1.00076818, + "epoch": 0.8422666466255825, + "flos": 64295034595200.0, + "grad_norm": 0.7159798722931919, + "language_loss": 0.57031476, + "learning_rate": 2.552720897550631e-07, + "loss": 0.59040749, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06738281, + "step": 14009, + "time_per_iteration": 3.0698609352111816 + }, + { + "auxiliary_loss_clip": 0.01049571, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.01128721, + "balance_loss_mlp": 1.01533425, + "epoch": 0.8423267698782504, + "flos": 24315573790080.0, + "grad_norm": 1.2929447897587416, + "language_loss": 0.78449094, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80530488, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34179688, + "step": 14010, + "time_per_iteration": 2.4197700023651123 + }, + { + "auxiliary_loss_clip": 0.0105426, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.01413727, + "balance_loss_mlp": 1.01730275, + "epoch": 0.8423868931309184, + "flos": 18295116030720.0, + "grad_norm": 2.044519748142335, + "language_loss": 0.73314679, + "learning_rate": 2.548914399759592e-07, + "loss": 0.75408638, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36914062, + "step": 14011, + "time_per_iteration": 2.347945213317871 + }, + { + "auxiliary_loss_clip": 0.01050918, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.01618576, + "balance_loss_mlp": 1.01485538, + "epoch": 0.8424470163835863, + "flos": 23549090601600.0, + "grad_norm": 1.7641990752119325, + "language_loss": 0.85652083, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.87742162, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 14012, + "time_per_iteration": 2.383077383041382 + }, + { + "auxiliary_loss_clip": 0.01046507, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.01437962, + "balance_loss_mlp": 1.01442051, + "epoch": 0.8425071396362543, + "flos": 23768345139840.0, + "grad_norm": 1.6119150805098506, + "language_loss": 0.68904543, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70984089, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.3203125, + "step": 14013, + "time_per_iteration": 2.3757898807525635 + }, + { + "auxiliary_loss_clip": 0.01055385, + "auxiliary_loss_mlp": 0.01038854, + "balance_loss_clip": 1.01459551, + "balance_loss_mlp": 1.0167619, + "epoch": 0.8425672628889223, + "flos": 16178017829760.0, + "grad_norm": 2.390081421197911, + "language_loss": 0.79793096, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81887341, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.38671875, + "step": 14014, + "time_per_iteration": 2.3513364791870117 + }, + { + "auxiliary_loss_clip": 0.0104946, + "auxiliary_loss_mlp": 0.0104151, + "balance_loss_clip": 1.01958728, + "balance_loss_mlp": 1.01438344, + "epoch": 0.8426273861415903, + "flos": 23148008369280.0, + "grad_norm": 1.696297297601098, + "language_loss": 0.68580532, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.70671505, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 14015, + "time_per_iteration": 3.6770594120025635 + }, + { + "auxiliary_loss_clip": 0.01050611, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.01369333, + "balance_loss_mlp": 1.01578307, + "epoch": 0.8426875093942582, + "flos": 17456781530880.0, + "grad_norm": 2.167017928447435, + "language_loss": 0.77674979, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.79762423, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 14016, + "time_per_iteration": 2.365633249282837 + }, + { + "auxiliary_loss_clip": 0.01052472, + "auxiliary_loss_mlp": 0.01041577, + "balance_loss_clip": 1.01903415, + "balance_loss_mlp": 1.01703167, + "epoch": 0.8427476326469262, + "flos": 19639691377920.0, + "grad_norm": 1.7993442821109884, + "language_loss": 0.80175537, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.82269585, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 14017, + "time_per_iteration": 2.353987455368042 + }, + { + "auxiliary_loss_clip": 0.01051085, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.01571047, + "balance_loss_mlp": 1.01608622, + "epoch": 0.8428077558995941, + "flos": 11940539760000.0, + "grad_norm": 2.645239349595698, + "language_loss": 0.64054632, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.66144967, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 14018, + "time_per_iteration": 2.4242959022521973 + }, + { + "auxiliary_loss_clip": 0.01051363, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.01466513, + "balance_loss_mlp": 1.01621222, + "epoch": 0.8428678791522621, + "flos": 10450970069760.0, + "grad_norm": 1.9975851987268989, + "language_loss": 0.80325818, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.82414281, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14019, + "time_per_iteration": 2.333517074584961 + }, + { + "auxiliary_loss_clip": 0.01052855, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0157125, + "balance_loss_mlp": 1.01672208, + "epoch": 0.8429280024049302, + "flos": 28765987441920.0, + "grad_norm": 2.402750953628054, + "language_loss": 0.79159433, + "learning_rate": 2.531817924498265e-07, + "loss": 0.81251591, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 14020, + "time_per_iteration": 2.4432289600372314 + }, + { + "auxiliary_loss_clip": 0.01051611, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.01469803, + "balance_loss_mlp": 1.01545119, + "epoch": 0.8429881256575981, + "flos": 19536068862720.0, + "grad_norm": 1.6238611106838678, + "language_loss": 0.7269814, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.74789351, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36132812, + "step": 14021, + "time_per_iteration": 2.36151385307312 + }, + { + "auxiliary_loss_clip": 0.01052167, + "auxiliary_loss_mlp": 0.01041132, + "balance_loss_clip": 1.01724267, + "balance_loss_mlp": 1.01569629, + "epoch": 0.8430482489102661, + "flos": 24789764142720.0, + "grad_norm": 1.6281075840262558, + "language_loss": 0.70830679, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.72923976, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 14022, + "time_per_iteration": 3.812649726867676 + }, + { + "auxiliary_loss_clip": 0.01054641, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.01902199, + "balance_loss_mlp": 1.01758647, + "epoch": 0.843108372162934, + "flos": 21543155769600.0, + "grad_norm": 1.8067638791216571, + "language_loss": 0.73345995, + "learning_rate": 2.526131019933553e-07, + "loss": 0.75443631, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 14023, + "time_per_iteration": 2.366070508956909 + }, + { + "auxiliary_loss_clip": 0.01050839, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.01780224, + "balance_loss_mlp": 1.01552415, + "epoch": 0.843168495415602, + "flos": 24607622246400.0, + "grad_norm": 1.507049688553594, + "language_loss": 0.67486525, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69578791, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 14024, + "time_per_iteration": 3.7416255474090576 + }, + { + "auxiliary_loss_clip": 0.01050502, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.01589036, + "epoch": 0.8432286186682699, + "flos": 15121825246080.0, + "grad_norm": 1.8169778923759035, + "language_loss": 0.82014298, + "learning_rate": 2.522343063158261e-07, + "loss": 0.84104282, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34570312, + "step": 14025, + "time_per_iteration": 2.3460795879364014 + }, + { + "auxiliary_loss_clip": 0.01048841, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.01071274, + "balance_loss_mlp": 1.01498461, + "epoch": 0.843288741920938, + "flos": 20300876305920.0, + "grad_norm": 1.4186257156020363, + "language_loss": 0.78257287, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.80335921, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.33789062, + "step": 14026, + "time_per_iteration": 2.3605825901031494 + }, + { + "auxiliary_loss_clip": 0.01051832, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.01752114, + "balance_loss_mlp": 1.01694489, + "epoch": 0.8433488651736059, + "flos": 23330953226880.0, + "grad_norm": 1.8770422417133452, + "language_loss": 0.83049273, + "learning_rate": 2.518557757400945e-07, + "loss": 0.85141701, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34765625, + "step": 14027, + "time_per_iteration": 2.389634609222412 + }, + { + "auxiliary_loss_clip": 0.01051038, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.01526833, + "balance_loss_mlp": 1.01533043, + "epoch": 0.8434089884262739, + "flos": 39456532327680.0, + "grad_norm": 1.4109709585345187, + "language_loss": 0.57530349, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.59618771, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 14028, + "time_per_iteration": 2.516916036605835 + }, + { + "auxiliary_loss_clip": 0.01049714, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.01160848, + "balance_loss_mlp": 1.0154593, + "epoch": 0.8434691116789418, + "flos": 23767716735360.0, + "grad_norm": 1.8749214619115566, + "language_loss": 0.64570928, + "learning_rate": 2.51477510323578e-07, + "loss": 0.66653413, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 14029, + "time_per_iteration": 2.379504919052124 + }, + { + "auxiliary_loss_clip": 0.01047524, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.01380169, + "balance_loss_mlp": 1.01462054, + "epoch": 0.8435292349316098, + "flos": 22670396703360.0, + "grad_norm": 1.496541787549182, + "language_loss": 0.76205403, + "learning_rate": 2.51288477067956e-07, + "loss": 0.7828629, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.328125, + "step": 14030, + "time_per_iteration": 2.367100477218628 + }, + { + "auxiliary_loss_clip": 0.01050699, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.01278377, + "balance_loss_mlp": 1.01679754, + "epoch": 0.8435893581842777, + "flos": 18842623971840.0, + "grad_norm": 1.8096201469772781, + "language_loss": 0.84156358, + "learning_rate": 2.510995101236502e-07, + "loss": 0.86241567, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.33984375, + "step": 14031, + "time_per_iteration": 2.341137170791626 + }, + { + "auxiliary_loss_clip": 0.01050575, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.01067233, + "balance_loss_mlp": 1.01567948, + "epoch": 0.8436494814369457, + "flos": 20703180435840.0, + "grad_norm": 4.702719676775216, + "language_loss": 0.81579912, + "learning_rate": 2.509106094978266e-07, + "loss": 0.83662784, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34765625, + "step": 14032, + "time_per_iteration": 2.34971022605896 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.01040398, + "balance_loss_clip": 1.01507759, + "balance_loss_mlp": 1.01573789, + "epoch": 0.8437096046896138, + "flos": 22673084878080.0, + "grad_norm": 1.3871189266839887, + "language_loss": 0.76244426, + "learning_rate": 2.507217751976478e-07, + "loss": 0.78336543, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.359375, + "step": 14033, + "time_per_iteration": 2.3855786323547363 + }, + { + "auxiliary_loss_clip": 0.01051534, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.01630354, + "balance_loss_mlp": 1.01611876, + "epoch": 0.8437697279422817, + "flos": 16179204816000.0, + "grad_norm": 1.8285747422904859, + "language_loss": 0.84834003, + "learning_rate": 2.505330072302743e-07, + "loss": 0.86923021, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.35546875, + "step": 14034, + "time_per_iteration": 2.3158233165740967 + }, + { + "auxiliary_loss_clip": 0.01051711, + "auxiliary_loss_mlp": 0.01033497, + "balance_loss_clip": 1.0107398, + "balance_loss_mlp": 1.01579595, + "epoch": 0.8438298511949497, + "flos": 28764625898880.0, + "grad_norm": 1.4260534028255158, + "language_loss": 0.78802985, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80888188, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 14035, + "time_per_iteration": 2.424017906188965 + }, + { + "auxiliary_loss_clip": 0.01051052, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.01506948, + "balance_loss_mlp": 1.0166378, + "epoch": 0.8438899744476176, + "flos": 33723025966080.0, + "grad_norm": 1.352840024852956, + "language_loss": 0.7283566, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74923134, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 14036, + "time_per_iteration": 2.470640182495117 + }, + { + "auxiliary_loss_clip": 0.01047853, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.01249504, + "balance_loss_mlp": 1.01517355, + "epoch": 0.8439500977002856, + "flos": 25109848287360.0, + "grad_norm": 1.6437226045276623, + "language_loss": 0.71038139, + "learning_rate": 2.49967101396557e-07, + "loss": 0.73117316, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.32617188, + "step": 14037, + "time_per_iteration": 2.389752149581909 + }, + { + "auxiliary_loss_clip": 0.01050234, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.01531982, + "balance_loss_mlp": 1.01549339, + "epoch": 0.8440102209529535, + "flos": 32849080012800.0, + "grad_norm": 1.6435204291204595, + "language_loss": 0.69656575, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71744132, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14038, + "time_per_iteration": 3.898721933364868 + }, + { + "auxiliary_loss_clip": 0.01052736, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.01101184, + "balance_loss_mlp": 1.0167129, + "epoch": 0.8440703442056215, + "flos": 23729137816320.0, + "grad_norm": 1.6141012177492886, + "language_loss": 0.7723158, + "learning_rate": 2.49590162635938e-07, + "loss": 0.79319859, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.359375, + "step": 14039, + "time_per_iteration": 2.407320976257324 + }, + { + "auxiliary_loss_clip": 0.01054199, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.01477432, + "balance_loss_mlp": 1.016505, + "epoch": 0.8441304674582895, + "flos": 20192680402560.0, + "grad_norm": 2.0645968212891224, + "language_loss": 0.80195546, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.82288414, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37695312, + "step": 14040, + "time_per_iteration": 2.359320640563965 + }, + { + "auxiliary_loss_clip": 0.01051751, + "auxiliary_loss_mlp": 0.01043443, + "balance_loss_clip": 1.01774192, + "balance_loss_mlp": 1.01642156, + "epoch": 0.8441905907109575, + "flos": 20219145079680.0, + "grad_norm": 2.121384663365544, + "language_loss": 0.70167196, + "learning_rate": 2.492134893781821e-07, + "loss": 0.72262388, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.35351562, + "step": 14041, + "time_per_iteration": 2.3709168434143066 + }, + { + "auxiliary_loss_clip": 0.01052821, + "auxiliary_loss_mlp": 0.01041196, + "balance_loss_clip": 1.01801014, + "balance_loss_mlp": 1.01663399, + "epoch": 0.8442507139636254, + "flos": 13515611103360.0, + "grad_norm": 2.0010267224435525, + "language_loss": 0.71359539, + "learning_rate": 2.490252523307341e-07, + "loss": 0.73453557, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 14042, + "time_per_iteration": 2.382293939590454 + }, + { + "auxiliary_loss_clip": 0.01049336, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.00981736, + "balance_loss_mlp": 1.01549792, + "epoch": 0.8443108372162934, + "flos": 18219319735680.0, + "grad_norm": 1.687025571893789, + "language_loss": 0.76165164, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.78245288, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33789062, + "step": 14043, + "time_per_iteration": 2.409672498703003 + }, + { + "auxiliary_loss_clip": 0.01050292, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01010585, + "balance_loss_mlp": 1.01514864, + "epoch": 0.8443709604689613, + "flos": 16104246393600.0, + "grad_norm": 2.6261614097163446, + "language_loss": 0.7365073, + "learning_rate": 2.486489774343865e-07, + "loss": 0.75732446, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 14044, + "time_per_iteration": 2.348379135131836 + }, + { + "auxiliary_loss_clip": 0.01048713, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01375794, + "balance_loss_mlp": 1.01476657, + "epoch": 0.8444310837216293, + "flos": 18511228546560.0, + "grad_norm": 1.7724126077677926, + "language_loss": 0.75467604, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77551186, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 14045, + "time_per_iteration": 2.419152021408081 + }, + { + "auxiliary_loss_clip": 0.01049734, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.01551032, + "balance_loss_mlp": 1.01532507, + "epoch": 0.8444912069742974, + "flos": 14938950211200.0, + "grad_norm": 3.117943971129822, + "language_loss": 0.79594171, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.81681299, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 14046, + "time_per_iteration": 2.330622673034668 + }, + { + "auxiliary_loss_clip": 0.01052127, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.01398087, + "balance_loss_mlp": 1.01645887, + "epoch": 0.8445513302269653, + "flos": 20119292991360.0, + "grad_norm": 2.749696319041143, + "language_loss": 0.79782689, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.81871665, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 14047, + "time_per_iteration": 2.3647823333740234 + }, + { + "auxiliary_loss_clip": 0.01050564, + "auxiliary_loss_mlp": 0.01040188, + "balance_loss_clip": 1.01836109, + "balance_loss_mlp": 1.0164063, + "epoch": 0.8446114534796333, + "flos": 31169722838400.0, + "grad_norm": 1.741070088683045, + "language_loss": 0.72971392, + "learning_rate": 2.478972246355935e-07, + "loss": 0.75062144, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 14048, + "time_per_iteration": 2.431424617767334 + }, + { + "auxiliary_loss_clip": 0.01052916, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.0120554, + "balance_loss_mlp": 1.01669765, + "epoch": 0.8446715767323012, + "flos": 23947275191040.0, + "grad_norm": 1.3538520502561788, + "language_loss": 0.74179101, + "learning_rate": 2.477094525178667e-07, + "loss": 0.76267534, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 14049, + "time_per_iteration": 2.392207622528076 + }, + { + "auxiliary_loss_clip": 0.01007967, + "auxiliary_loss_mlp": 0.01005111, + "balance_loss_clip": 1.00272703, + "balance_loss_mlp": 1.00111318, + "epoch": 0.8447316999849692, + "flos": 67981653233280.0, + "grad_norm": 0.816100666537498, + "language_loss": 0.60741127, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62754214, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06835938, + "step": 14050, + "time_per_iteration": 2.97776198387146 + }, + { + "auxiliary_loss_clip": 0.01051677, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.01185036, + "balance_loss_mlp": 1.01603889, + "epoch": 0.8447918232376371, + "flos": 22417834861440.0, + "grad_norm": 2.0819184109739184, + "language_loss": 0.72993493, + "learning_rate": 2.473341076306303e-07, + "loss": 0.75081307, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 14051, + "time_per_iteration": 2.4369964599609375 + }, + { + "auxiliary_loss_clip": 0.01050087, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.01245236, + "balance_loss_mlp": 1.01570797, + "epoch": 0.8448519464903052, + "flos": 23693072515200.0, + "grad_norm": 1.8961678058458085, + "language_loss": 0.76257777, + "learning_rate": 2.471465348753547e-07, + "loss": 0.78342593, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 14052, + "time_per_iteration": 2.3618345260620117 + }, + { + "auxiliary_loss_clip": 0.01048262, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01273322, + "balance_loss_mlp": 1.01602387, + "epoch": 0.8449120697429731, + "flos": 13735040198400.0, + "grad_norm": 2.128196504283438, + "language_loss": 0.75999457, + "learning_rate": 2.469590285884575e-07, + "loss": 0.78080964, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.32226562, + "step": 14053, + "time_per_iteration": 2.3419103622436523 + }, + { + "auxiliary_loss_clip": 0.01050084, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.01090002, + "balance_loss_mlp": 1.01561642, + "epoch": 0.8449721929956411, + "flos": 20885741268480.0, + "grad_norm": 1.7447893641752743, + "language_loss": 0.75266963, + "learning_rate": 2.467715887770494e-07, + "loss": 0.77350748, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34570312, + "step": 14054, + "time_per_iteration": 3.615701198577881 + }, + { + "auxiliary_loss_clip": 0.01055428, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.01370144, + "balance_loss_mlp": 1.01825917, + "epoch": 0.845032316248309, + "flos": 33215598132480.0, + "grad_norm": 1.5127575897453718, + "language_loss": 0.79013205, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.81106198, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 14055, + "time_per_iteration": 2.4808404445648193 + }, + { + "auxiliary_loss_clip": 0.01048988, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.01051378, + "balance_loss_mlp": 1.0147748, + "epoch": 0.845092439500977, + "flos": 23584143473280.0, + "grad_norm": 1.615999552709702, + "language_loss": 0.73982286, + "learning_rate": 2.463969086091302e-07, + "loss": 0.76062703, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 14056, + "time_per_iteration": 2.38860821723938 + }, + { + "auxiliary_loss_clip": 0.01054955, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.01511621, + "balance_loss_mlp": 1.01750314, + "epoch": 0.8451525627536449, + "flos": 13333085182080.0, + "grad_norm": 2.4281441400675914, + "language_loss": 0.69095731, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.71190524, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 14057, + "time_per_iteration": 2.3328404426574707 + }, + { + "auxiliary_loss_clip": 0.01052563, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.01125097, + "balance_loss_mlp": 1.01605535, + "epoch": 0.8452126860063129, + "flos": 27816768864000.0, + "grad_norm": 1.6792598115877024, + "language_loss": 0.78910136, + "learning_rate": 2.460224944284284e-07, + "loss": 0.80997539, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36523438, + "step": 14058, + "time_per_iteration": 2.400944709777832 + }, + { + "auxiliary_loss_clip": 0.01053776, + "auxiliary_loss_mlp": 0.01039177, + "balance_loss_clip": 1.01628852, + "balance_loss_mlp": 1.01712632, + "epoch": 0.845272809258981, + "flos": 27123498529920.0, + "grad_norm": 1.7908678159045432, + "language_loss": 0.70265949, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72358906, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 14059, + "time_per_iteration": 2.400639533996582 + }, + { + "auxiliary_loss_clip": 0.01053856, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.01720881, + "balance_loss_mlp": 1.01679623, + "epoch": 0.8453329325116489, + "flos": 18331600268160.0, + "grad_norm": 3.1995526759682185, + "language_loss": 0.58886796, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.60982078, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 14060, + "time_per_iteration": 2.322502613067627 + }, + { + "auxiliary_loss_clip": 0.01053129, + "auxiliary_loss_mlp": 0.01036517, + "balance_loss_clip": 1.01144731, + "balance_loss_mlp": 1.01579356, + "epoch": 0.8453930557643169, + "flos": 22674132218880.0, + "grad_norm": 1.6670972418422754, + "language_loss": 0.76666665, + "learning_rate": 2.454613720076277e-07, + "loss": 0.78756315, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 14061, + "time_per_iteration": 2.3755078315734863 + }, + { + "auxiliary_loss_clip": 0.01053817, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.01148963, + "balance_loss_mlp": 1.01635253, + "epoch": 0.8454531790169848, + "flos": 22486299770880.0, + "grad_norm": 2.4084213862985204, + "language_loss": 0.72347617, + "learning_rate": 2.452744642558013e-07, + "loss": 0.74436414, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 14062, + "time_per_iteration": 3.788644313812256 + }, + { + "auxiliary_loss_clip": 0.01007939, + "auxiliary_loss_mlp": 0.0100111, + "balance_loss_clip": 0.99891669, + "balance_loss_mlp": 1.00098348, + "epoch": 0.8455133022696528, + "flos": 58274925949440.0, + "grad_norm": 0.6353977904721891, + "language_loss": 0.52700102, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54709154, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06933594, + "step": 14063, + "time_per_iteration": 3.0691773891448975 + }, + { + "auxiliary_loss_clip": 0.01048426, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.01399779, + "balance_loss_mlp": 1.01603615, + "epoch": 0.8455734255223207, + "flos": 21360210912000.0, + "grad_norm": 8.977404707849722, + "language_loss": 0.83472002, + "learning_rate": 2.449008483773378e-07, + "loss": 0.85553658, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.32421875, + "step": 14064, + "time_per_iteration": 3.687551975250244 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.0103877, + "balance_loss_clip": 1.01352179, + "balance_loss_mlp": 1.01597404, + "epoch": 0.8456335487749888, + "flos": 20448209710080.0, + "grad_norm": 1.9641555357005338, + "language_loss": 0.7329247, + "learning_rate": 2.447141402648685e-07, + "loss": 0.75384283, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 14065, + "time_per_iteration": 2.3486921787261963 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.0132339, + "balance_loss_mlp": 1.01629472, + "epoch": 0.8456936720276567, + "flos": 28839619232640.0, + "grad_norm": 1.463001037382905, + "language_loss": 0.77889204, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79973787, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33984375, + "step": 14066, + "time_per_iteration": 2.4267537593841553 + }, + { + "auxiliary_loss_clip": 0.01050946, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.01338613, + "balance_loss_mlp": 1.01628721, + "epoch": 0.8457537952803247, + "flos": 22671828069120.0, + "grad_norm": 1.719343216206393, + "language_loss": 0.70846236, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72933382, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 14067, + "time_per_iteration": 2.3682994842529297 + }, + { + "auxiliary_loss_clip": 0.01049972, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.01022351, + "balance_loss_mlp": 1.01481557, + "epoch": 0.8458139185329926, + "flos": 33801510435840.0, + "grad_norm": 1.7601486411776635, + "language_loss": 0.72642308, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.74724948, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14068, + "time_per_iteration": 2.4987051486968994 + }, + { + "auxiliary_loss_clip": 0.01007671, + "auxiliary_loss_mlp": 0.01002822, + "balance_loss_clip": 1.00073624, + "balance_loss_mlp": 1.00095153, + "epoch": 0.8458740417856606, + "flos": 70292274433920.0, + "grad_norm": 0.6955618834500206, + "language_loss": 0.60557467, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62567961, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06738281, + "step": 14069, + "time_per_iteration": 3.1007158756256104 + }, + { + "auxiliary_loss_clip": 0.01050997, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.01526475, + "balance_loss_mlp": 1.01600564, + "epoch": 0.8459341650383285, + "flos": 24169706663040.0, + "grad_norm": 1.7393564428343797, + "language_loss": 0.75202239, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.77289701, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34960938, + "step": 14070, + "time_per_iteration": 2.395197629928589 + }, + { + "auxiliary_loss_clip": 0.01052083, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.02045262, + "balance_loss_mlp": 1.01710296, + "epoch": 0.8459942882909965, + "flos": 38179618928640.0, + "grad_norm": 1.619984189947319, + "language_loss": 0.67794824, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69889843, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 14071, + "time_per_iteration": 2.5020875930786133 + }, + { + "auxiliary_loss_clip": 0.01007469, + "auxiliary_loss_mlp": 0.01004224, + "balance_loss_clip": 1.00197101, + "balance_loss_mlp": 1.00076425, + "epoch": 0.8460544115436646, + "flos": 64115092114560.0, + "grad_norm": 0.7356129684498104, + "language_loss": 0.61095899, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63107592, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.06738281, + "step": 14072, + "time_per_iteration": 2.8788344860076904 + }, + { + "auxiliary_loss_clip": 0.01052664, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_clip": 1.01628399, + "balance_loss_mlp": 1.01536918, + "epoch": 0.8461145347963325, + "flos": 24169671751680.0, + "grad_norm": 1.7463466683440252, + "language_loss": 0.73720169, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.7581507, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37304688, + "step": 14073, + "time_per_iteration": 2.378504991531372 + }, + { + "auxiliary_loss_clip": 0.01056524, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_clip": 1.01803005, + "balance_loss_mlp": 1.01730585, + "epoch": 0.8461746580490005, + "flos": 34892441688960.0, + "grad_norm": 2.8761451757139516, + "language_loss": 0.78925747, + "learning_rate": 2.430367633291155e-07, + "loss": 0.81026888, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.39257812, + "step": 14074, + "time_per_iteration": 2.4853456020355225 + }, + { + "auxiliary_loss_clip": 0.01051543, + "auxiliary_loss_mlp": 0.01039623, + "balance_loss_clip": 1.01735437, + "balance_loss_mlp": 1.01632857, + "epoch": 0.8462347813016684, + "flos": 25555828394880.0, + "grad_norm": 2.256001541958331, + "language_loss": 0.76470953, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.78562117, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 14075, + "time_per_iteration": 2.4008069038391113 + }, + { + "auxiliary_loss_clip": 0.01050385, + "auxiliary_loss_mlp": 0.01035436, + "balance_loss_clip": 1.01294076, + "balance_loss_mlp": 1.015522, + "epoch": 0.8462949045543364, + "flos": 21324250344960.0, + "grad_norm": 2.2340895076231457, + "language_loss": 0.74285448, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.76371264, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 14076, + "time_per_iteration": 2.3767597675323486 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_clip": 1.02438629, + "balance_loss_mlp": 1.01734567, + "epoch": 0.8463550278070043, + "flos": 22636356261120.0, + "grad_norm": 1.9605213231215752, + "language_loss": 0.78674448, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.80776763, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 14077, + "time_per_iteration": 3.848389148712158 + }, + { + "auxiliary_loss_clip": 0.01053461, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.01489425, + "balance_loss_mlp": 1.01686668, + "epoch": 0.8464151510596724, + "flos": 13004761956480.0, + "grad_norm": 2.0313029651281456, + "language_loss": 0.76352459, + "learning_rate": 2.422929943924643e-07, + "loss": 0.78443855, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 14078, + "time_per_iteration": 2.327629327774048 + }, + { + "auxiliary_loss_clip": 0.01050859, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.00924659, + "balance_loss_mlp": 1.01568925, + "epoch": 0.8464752743123403, + "flos": 15704036945280.0, + "grad_norm": 7.987137618837253, + "language_loss": 0.85962403, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.88045228, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 14079, + "time_per_iteration": 2.405475616455078 + }, + { + "auxiliary_loss_clip": 0.01056518, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.01772904, + "balance_loss_mlp": 1.01726556, + "epoch": 0.8465353975650083, + "flos": 21652852861440.0, + "grad_norm": 2.566866230367173, + "language_loss": 0.59778905, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61879563, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.39257812, + "step": 14080, + "time_per_iteration": 2.4580578804016113 + }, + { + "auxiliary_loss_clip": 0.01052471, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.01417255, + "balance_loss_mlp": 1.01544249, + "epoch": 0.8465955208176762, + "flos": 18514649859840.0, + "grad_norm": 2.0179232201050197, + "language_loss": 0.68227386, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.70317942, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 14081, + "time_per_iteration": 2.433373212814331 + }, + { + "auxiliary_loss_clip": 0.01054265, + "auxiliary_loss_mlp": 0.01040508, + "balance_loss_clip": 1.01827526, + "balance_loss_mlp": 1.01643491, + "epoch": 0.8466556440703442, + "flos": 24199592653440.0, + "grad_norm": 1.6998601142191438, + "language_loss": 0.74215543, + "learning_rate": 2.41550291894576e-07, + "loss": 0.76310313, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.37890625, + "step": 14082, + "time_per_iteration": 2.449704647064209 + }, + { + "auxiliary_loss_clip": 0.01051949, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.01378405, + "balance_loss_mlp": 1.01581597, + "epoch": 0.8467157673230121, + "flos": 20374857210240.0, + "grad_norm": 1.793017148134033, + "language_loss": 0.76784945, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78872991, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36132812, + "step": 14083, + "time_per_iteration": 2.433077096939087 + }, + { + "auxiliary_loss_clip": 0.0105485, + "auxiliary_loss_mlp": 0.01041272, + "balance_loss_clip": 1.01588023, + "balance_loss_mlp": 1.01666236, + "epoch": 0.8467758905756801, + "flos": 28472437797120.0, + "grad_norm": 2.023256167341085, + "language_loss": 0.67092478, + "learning_rate": 2.411793407010092e-07, + "loss": 0.69188595, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 14084, + "time_per_iteration": 2.4911766052246094 + }, + { + "auxiliary_loss_clip": 0.0105209, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.01103377, + "balance_loss_mlp": 1.01699305, + "epoch": 0.8468360138283482, + "flos": 11691748344960.0, + "grad_norm": 2.3257229671657718, + "language_loss": 0.71885228, + "learning_rate": 2.409939651426938e-07, + "loss": 0.73970294, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 14085, + "time_per_iteration": 2.400099754333496 + }, + { + "auxiliary_loss_clip": 0.01051333, + "auxiliary_loss_mlp": 0.01038307, + "balance_loss_clip": 1.01447678, + "balance_loss_mlp": 1.01523614, + "epoch": 0.8468961370810161, + "flos": 24606714551040.0, + "grad_norm": 1.7064022716174612, + "language_loss": 0.72022849, + "learning_rate": 2.408086562860634e-07, + "loss": 0.74112487, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 14086, + "time_per_iteration": 2.4531853199005127 + }, + { + "auxiliary_loss_clip": 0.01050348, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.01831043, + "balance_loss_mlp": 1.01570225, + "epoch": 0.8469562603336841, + "flos": 19608792958080.0, + "grad_norm": 2.5164785876138733, + "language_loss": 0.76179343, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.78269935, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14087, + "time_per_iteration": 2.3480405807495117 + }, + { + "auxiliary_loss_clip": 0.01052208, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.0105598, + "balance_loss_mlp": 1.01740801, + "epoch": 0.847016383586352, + "flos": 22637822538240.0, + "grad_norm": 1.3672241044346496, + "language_loss": 0.74484235, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76568699, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 14088, + "time_per_iteration": 2.3802154064178467 + }, + { + "auxiliary_loss_clip": 0.01053325, + "auxiliary_loss_mlp": 0.01040602, + "balance_loss_clip": 1.01685548, + "balance_loss_mlp": 1.01634634, + "epoch": 0.84707650683902, + "flos": 20959093768320.0, + "grad_norm": 2.3299484383038296, + "language_loss": 0.73592508, + "learning_rate": 2.402531299965387e-07, + "loss": 0.75686437, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 14089, + "time_per_iteration": 2.4030070304870605 + }, + { + "auxiliary_loss_clip": 0.01049911, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.01348436, + "balance_loss_mlp": 1.01650548, + "epoch": 0.8471366300916879, + "flos": 24091990243200.0, + "grad_norm": 1.4386294465822587, + "language_loss": 0.79864252, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81947505, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33398438, + "step": 14090, + "time_per_iteration": 2.42712140083313 + }, + { + "auxiliary_loss_clip": 0.01052641, + "auxiliary_loss_mlp": 0.01046825, + "balance_loss_clip": 1.02170718, + "balance_loss_mlp": 1.01569486, + "epoch": 0.847196753344356, + "flos": 18331914470400.0, + "grad_norm": 2.2745768792216436, + "language_loss": 0.78366488, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.80465961, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 14091, + "time_per_iteration": 2.3472237586975098 + }, + { + "auxiliary_loss_clip": 0.01007474, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 1.00253725, + "balance_loss_mlp": 1.00077438, + "epoch": 0.8472568765970239, + "flos": 49564584357120.0, + "grad_norm": 0.8193754219782357, + "language_loss": 0.59433031, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61445343, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.06738281, + "step": 14092, + "time_per_iteration": 3.070391893386841 + }, + { + "auxiliary_loss_clip": 0.01051227, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.01484764, + "balance_loss_mlp": 1.01568723, + "epoch": 0.8473169998496919, + "flos": 19278130671360.0, + "grad_norm": 1.6634163539024676, + "language_loss": 0.70982414, + "learning_rate": 2.395133625267756e-07, + "loss": 0.73072118, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 14093, + "time_per_iteration": 2.383718729019165 + }, + { + "auxiliary_loss_clip": 0.01049075, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.00806463, + "balance_loss_mlp": 1.01512742, + "epoch": 0.8473771231023598, + "flos": 17674604703360.0, + "grad_norm": 2.2366740438893022, + "language_loss": 0.8475976, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.86838037, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 14094, + "time_per_iteration": 3.6670734882354736 + }, + { + "auxiliary_loss_clip": 0.01049546, + "auxiliary_loss_mlp": 0.01039694, + "balance_loss_clip": 1.01798582, + "balance_loss_mlp": 1.01569068, + "epoch": 0.8474372463550278, + "flos": 26358551441280.0, + "grad_norm": 1.666315957630569, + "language_loss": 0.72060108, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.74149346, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33789062, + "step": 14095, + "time_per_iteration": 2.440690279006958 + }, + { + "auxiliary_loss_clip": 0.0104949, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.01735306, + "balance_loss_mlp": 1.01485598, + "epoch": 0.8474973696076957, + "flos": 23400989147520.0, + "grad_norm": 1.6613392594477725, + "language_loss": 0.81536281, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.83625835, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34570312, + "step": 14096, + "time_per_iteration": 2.374547004699707 + }, + { + "auxiliary_loss_clip": 0.01053673, + "auxiliary_loss_mlp": 0.01045106, + "balance_loss_clip": 1.02044177, + "balance_loss_mlp": 1.01619625, + "epoch": 0.8475574928603637, + "flos": 25074690681600.0, + "grad_norm": 1.8869473062754396, + "language_loss": 0.78710103, + "learning_rate": 2.387746631822374e-07, + "loss": 0.80808878, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 14097, + "time_per_iteration": 2.4155218601226807 + }, + { + "auxiliary_loss_clip": 0.01051318, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.0149368, + "balance_loss_mlp": 1.01655912, + "epoch": 0.8476176161130318, + "flos": 19965885010560.0, + "grad_norm": 1.6451445070223836, + "language_loss": 0.81299186, + "learning_rate": 2.385901552932048e-07, + "loss": 0.83387297, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 14098, + "time_per_iteration": 2.356947183609009 + }, + { + "auxiliary_loss_clip": 0.01051399, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.01462078, + "balance_loss_mlp": 1.01570296, + "epoch": 0.8476777393656997, + "flos": 21284833553280.0, + "grad_norm": 1.8061116992736108, + "language_loss": 0.72840947, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.74929011, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 14099, + "time_per_iteration": 2.3820905685424805 + }, + { + "auxiliary_loss_clip": 0.01052109, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.01487708, + "balance_loss_mlp": 1.01581228, + "epoch": 0.8477378626183677, + "flos": 29970176745600.0, + "grad_norm": 1.847100518505733, + "language_loss": 0.64791375, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.66884321, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.36328125, + "step": 14100, + "time_per_iteration": 2.4516537189483643 + }, + { + "auxiliary_loss_clip": 0.01053033, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.018803, + "balance_loss_mlp": 1.01615751, + "epoch": 0.8477979858710356, + "flos": 24236740206720.0, + "grad_norm": 1.9374589707803758, + "language_loss": 0.75035322, + "learning_rate": 2.380370324111085e-07, + "loss": 0.77130413, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 14101, + "time_per_iteration": 2.3584036827087402 + }, + { + "auxiliary_loss_clip": 0.01052469, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.01185274, + "balance_loss_mlp": 1.01690578, + "epoch": 0.8478581091237036, + "flos": 25592487189120.0, + "grad_norm": 1.7245593968836188, + "language_loss": 0.72540778, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.74626261, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 14102, + "time_per_iteration": 3.8572287559509277 + }, + { + "auxiliary_loss_clip": 0.01053259, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.01107156, + "balance_loss_mlp": 1.01675963, + "epoch": 0.8479182323763715, + "flos": 12056311428480.0, + "grad_norm": 2.334240603147291, + "language_loss": 0.83815849, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.85904735, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 14103, + "time_per_iteration": 3.780444383621216 + }, + { + "auxiliary_loss_clip": 0.01049872, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01289892, + "balance_loss_mlp": 1.01536334, + "epoch": 0.8479783556290396, + "flos": 21432341514240.0, + "grad_norm": 1.871872713532463, + "language_loss": 0.79645598, + "learning_rate": 2.374845108533079e-07, + "loss": 0.8173033, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 14104, + "time_per_iteration": 2.3724448680877686 + }, + { + "auxiliary_loss_clip": 0.01054972, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.01667976, + "balance_loss_mlp": 1.01738477, + "epoch": 0.8480384788817075, + "flos": 19641716236800.0, + "grad_norm": 1.7802614585535894, + "language_loss": 0.79658109, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.81755054, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 14105, + "time_per_iteration": 2.3625824451446533 + }, + { + "auxiliary_loss_clip": 0.01055676, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.01419544, + "balance_loss_mlp": 1.01697087, + "epoch": 0.8480986021343755, + "flos": 22488184984320.0, + "grad_norm": 3.3192270708078317, + "language_loss": 0.51168412, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.53264213, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.38671875, + "step": 14106, + "time_per_iteration": 2.3698086738586426 + }, + { + "auxiliary_loss_clip": 0.01052066, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.01804852, + "balance_loss_mlp": 1.01579762, + "epoch": 0.8481587253870434, + "flos": 22089476724480.0, + "grad_norm": 1.7920148056216658, + "language_loss": 0.7652117, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.78614819, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 14107, + "time_per_iteration": 2.366255521774292 + }, + { + "auxiliary_loss_clip": 0.010527, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.01307917, + "balance_loss_mlp": 1.01646996, + "epoch": 0.8482188486397114, + "flos": 33581313290880.0, + "grad_norm": 1.608095364799378, + "language_loss": 0.74699724, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.7678932, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 14108, + "time_per_iteration": 2.462435245513916 + }, + { + "auxiliary_loss_clip": 0.0105004, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.01386774, + "balance_loss_mlp": 1.0155077, + "epoch": 0.8482789718923793, + "flos": 20918455079040.0, + "grad_norm": 2.0371417899081963, + "language_loss": 0.74013305, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.76100898, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34570312, + "step": 14109, + "time_per_iteration": 2.3611695766448975 + }, + { + "auxiliary_loss_clip": 0.0105027, + "auxiliary_loss_mlp": 0.01036517, + "balance_loss_clip": 1.01379538, + "balance_loss_mlp": 1.01545501, + "epoch": 0.8483390951450474, + "flos": 12895379066880.0, + "grad_norm": 1.9604291675362098, + "language_loss": 0.75628936, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.77715719, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 14110, + "time_per_iteration": 2.3429877758026123 + }, + { + "auxiliary_loss_clip": 0.01052281, + "auxiliary_loss_mlp": 0.0104059, + "balance_loss_clip": 1.01866698, + "balance_loss_mlp": 1.01697755, + "epoch": 0.8483992183977154, + "flos": 25080485967360.0, + "grad_norm": 1.6446367706017544, + "language_loss": 0.77143693, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.79236567, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 14111, + "time_per_iteration": 2.4031736850738525 + }, + { + "auxiliary_loss_clip": 0.01051322, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.01160467, + "balance_loss_mlp": 1.01711273, + "epoch": 0.8484593416503833, + "flos": 25556247331200.0, + "grad_norm": 1.800503090833019, + "language_loss": 0.68415046, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.70499039, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 14112, + "time_per_iteration": 2.399285316467285 + }, + { + "auxiliary_loss_clip": 0.01052344, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.01295376, + "balance_loss_mlp": 1.01567137, + "epoch": 0.8485194649030513, + "flos": 27197235054720.0, + "grad_norm": 1.3492068549958418, + "language_loss": 0.74451977, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.76540446, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 14113, + "time_per_iteration": 2.422658681869507 + }, + { + "auxiliary_loss_clip": 0.01051824, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.01392198, + "balance_loss_mlp": 1.01642382, + "epoch": 0.8485795881557192, + "flos": 24204794446080.0, + "grad_norm": 2.479367587742233, + "language_loss": 0.66761798, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68851328, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 14114, + "time_per_iteration": 2.3939766883850098 + }, + { + "auxiliary_loss_clip": 0.01054277, + "auxiliary_loss_mlp": 0.01044877, + "balance_loss_clip": 1.02124965, + "balance_loss_mlp": 1.01760805, + "epoch": 0.8486397114083872, + "flos": 21140607260160.0, + "grad_norm": 1.6043460797270042, + "language_loss": 0.80354118, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.82453275, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 14115, + "time_per_iteration": 2.371312379837036 + }, + { + "auxiliary_loss_clip": 0.0105167, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.01323438, + "balance_loss_mlp": 1.01592851, + "epoch": 0.8486998346610551, + "flos": 19973740066560.0, + "grad_norm": 1.8042622614463173, + "language_loss": 0.79720098, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81807554, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 14116, + "time_per_iteration": 2.3490540981292725 + }, + { + "auxiliary_loss_clip": 0.01052641, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.01032281, + "balance_loss_mlp": 1.01612902, + "epoch": 0.8487599579137232, + "flos": 19791283968000.0, + "grad_norm": 1.747321330865072, + "language_loss": 0.69130528, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.71217442, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 14117, + "time_per_iteration": 3.7858188152313232 + }, + { + "auxiliary_loss_clip": 0.01052896, + "auxiliary_loss_mlp": 0.01036895, + "balance_loss_clip": 1.01418531, + "balance_loss_mlp": 1.01603711, + "epoch": 0.8488200811663911, + "flos": 26394826210560.0, + "grad_norm": 2.017031225200575, + "language_loss": 0.66186351, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.68276143, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 14118, + "time_per_iteration": 2.38550066947937 + }, + { + "auxiliary_loss_clip": 0.01049682, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.01325917, + "balance_loss_mlp": 1.01524353, + "epoch": 0.8488802044190591, + "flos": 16358449069440.0, + "grad_norm": 2.3065924994543128, + "language_loss": 0.74858397, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.76942724, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 14119, + "time_per_iteration": 2.376495838165283 + }, + { + "auxiliary_loss_clip": 0.01052891, + "auxiliary_loss_mlp": 0.01036625, + "balance_loss_clip": 1.00996971, + "balance_loss_mlp": 1.01603651, + "epoch": 0.848940327671727, + "flos": 19207850371200.0, + "grad_norm": 1.6296428787467565, + "language_loss": 0.79114741, + "learning_rate": 2.345478926864446e-07, + "loss": 0.81204259, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36914062, + "step": 14120, + "time_per_iteration": 2.3576154708862305 + }, + { + "auxiliary_loss_clip": 0.01053713, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.01627779, + "balance_loss_mlp": 1.0173595, + "epoch": 0.849000450924395, + "flos": 21870117452160.0, + "grad_norm": 2.531667868651732, + "language_loss": 0.77032983, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.79126781, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 14121, + "time_per_iteration": 2.4227097034454346 + }, + { + "auxiliary_loss_clip": 0.01007535, + "auxiliary_loss_mlp": 0.01001477, + "balance_loss_clip": 0.99929547, + "balance_loss_mlp": 1.00086248, + "epoch": 0.8490605741770629, + "flos": 71162938719360.0, + "grad_norm": 0.821426194701518, + "language_loss": 0.60232222, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62241232, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.06689453, + "step": 14122, + "time_per_iteration": 3.0033016204833984 + }, + { + "auxiliary_loss_clip": 0.01051925, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.01473927, + "balance_loss_mlp": 1.01640642, + "epoch": 0.849120697429731, + "flos": 24972185329920.0, + "grad_norm": 2.4193978744918083, + "language_loss": 0.81728429, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.83816195, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 14123, + "time_per_iteration": 2.405208110809326 + }, + { + "auxiliary_loss_clip": 0.0105011, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.01042533, + "balance_loss_mlp": 1.01540208, + "epoch": 0.8491808206823989, + "flos": 23031363916800.0, + "grad_norm": 2.117695700851805, + "language_loss": 0.84035283, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.86118752, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34765625, + "step": 14124, + "time_per_iteration": 2.363466262817383 + }, + { + "auxiliary_loss_clip": 0.01054703, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.01405811, + "balance_loss_mlp": 1.01811481, + "epoch": 0.8492409439350669, + "flos": 23877413827200.0, + "grad_norm": 1.9171578557979745, + "language_loss": 0.73050684, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.75144345, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 14125, + "time_per_iteration": 2.388522148132324 + }, + { + "auxiliary_loss_clip": 0.01054504, + "auxiliary_loss_mlp": 0.0104154, + "balance_loss_clip": 1.01540911, + "balance_loss_mlp": 1.0166564, + "epoch": 0.8493010671877349, + "flos": 22418777468160.0, + "grad_norm": 1.672999069299573, + "language_loss": 0.74589038, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.76685083, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 14126, + "time_per_iteration": 2.3957440853118896 + }, + { + "auxiliary_loss_clip": 0.01050841, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.0135417, + "balance_loss_mlp": 1.01577485, + "epoch": 0.8493611904404028, + "flos": 17528493196800.0, + "grad_norm": 1.5213625715771126, + "language_loss": 0.69004464, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.71090716, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34960938, + "step": 14127, + "time_per_iteration": 2.343822717666626 + }, + { + "auxiliary_loss_clip": 0.010532, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.01184702, + "balance_loss_mlp": 1.01646745, + "epoch": 0.8494213136930708, + "flos": 19461948312960.0, + "grad_norm": 1.7771054285299006, + "language_loss": 0.70207798, + "learning_rate": 2.330860086502211e-07, + "loss": 0.72297573, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 14128, + "time_per_iteration": 2.346179485321045 + }, + { + "auxiliary_loss_clip": 0.01051631, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.01458645, + "balance_loss_mlp": 1.01684284, + "epoch": 0.8494814369457387, + "flos": 18769306383360.0, + "grad_norm": 1.8346344772894638, + "language_loss": 0.78852397, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80942041, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 14129, + "time_per_iteration": 2.349085569381714 + }, + { + "auxiliary_loss_clip": 0.01053706, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02156711, + "balance_loss_mlp": 1.0173161, + "epoch": 0.8495415601984068, + "flos": 23330359733760.0, + "grad_norm": 1.6786662694153613, + "language_loss": 0.69216979, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.71316046, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 14130, + "time_per_iteration": 2.362420082092285 + }, + { + "auxiliary_loss_clip": 0.01050823, + "auxiliary_loss_mlp": 0.01038497, + "balance_loss_clip": 1.01519108, + "balance_loss_mlp": 1.01535738, + "epoch": 0.8496016834510747, + "flos": 26611706776320.0, + "grad_norm": 2.447856244249683, + "language_loss": 0.71736306, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73825628, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 14131, + "time_per_iteration": 2.414033889770508 + }, + { + "auxiliary_loss_clip": 0.0105253, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.01053405, + "balance_loss_mlp": 1.01595259, + "epoch": 0.8496618067037427, + "flos": 25479298961280.0, + "grad_norm": 1.8941064976203343, + "language_loss": 0.69420606, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.71505201, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.36523438, + "step": 14132, + "time_per_iteration": 2.399500846862793 + }, + { + "auxiliary_loss_clip": 0.01049471, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.01440787, + "balance_loss_mlp": 1.01480269, + "epoch": 0.8497219299564106, + "flos": 25373407207680.0, + "grad_norm": 1.6363964853809982, + "language_loss": 0.7070775, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72793531, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 14133, + "time_per_iteration": 3.622816801071167 + }, + { + "auxiliary_loss_clip": 0.01007711, + "auxiliary_loss_mlp": 0.01006901, + "balance_loss_clip": 1.00469553, + "balance_loss_mlp": 1.00099468, + "epoch": 0.8497820532090786, + "flos": 67776642529920.0, + "grad_norm": 0.729539990578915, + "language_loss": 0.57700938, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59715551, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.06738281, + "step": 14134, + "time_per_iteration": 3.098874807357788 + }, + { + "auxiliary_loss_clip": 0.01054562, + "auxiliary_loss_mlp": 0.01035694, + "balance_loss_clip": 1.01224542, + "balance_loss_mlp": 1.01648867, + "epoch": 0.8498421764617465, + "flos": 23439428421120.0, + "grad_norm": 1.9200511468499248, + "language_loss": 0.80576718, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.82666981, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.38085938, + "step": 14135, + "time_per_iteration": 2.3692755699157715 + }, + { + "auxiliary_loss_clip": 0.01054308, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.01669383, + "balance_loss_mlp": 1.01689923, + "epoch": 0.8499022997144146, + "flos": 17711647522560.0, + "grad_norm": 1.6950578932121916, + "language_loss": 0.64396143, + "learning_rate": 2.316284127127044e-07, + "loss": 0.66493511, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37304688, + "step": 14136, + "time_per_iteration": 2.3670544624328613 + }, + { + "auxiliary_loss_clip": 0.01054071, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.0132978, + "balance_loss_mlp": 1.01660371, + "epoch": 0.8499624229670825, + "flos": 18587513600640.0, + "grad_norm": 1.8205925686847422, + "language_loss": 0.85296953, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.87388992, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 14137, + "time_per_iteration": 2.335573434829712 + }, + { + "auxiliary_loss_clip": 0.01050997, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.01429081, + "balance_loss_mlp": 1.01666451, + "epoch": 0.8500225462197505, + "flos": 24344901198720.0, + "grad_norm": 1.9824955795008088, + "language_loss": 0.80070686, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.82157391, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34375, + "step": 14138, + "time_per_iteration": 2.413609027862549 + }, + { + "auxiliary_loss_clip": 0.01053003, + "auxiliary_loss_mlp": 0.01042411, + "balance_loss_clip": 1.01852167, + "balance_loss_mlp": 1.01648188, + "epoch": 0.8500826694724185, + "flos": 16544570860800.0, + "grad_norm": 1.635701728427246, + "language_loss": 0.65525252, + "learning_rate": 2.310829204839073e-07, + "loss": 0.67620671, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 14139, + "time_per_iteration": 2.3462941646575928 + }, + { + "auxiliary_loss_clip": 0.01052311, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.01333499, + "balance_loss_mlp": 1.01606393, + "epoch": 0.8501427927250864, + "flos": 16288482971520.0, + "grad_norm": 1.5247114762573557, + "language_loss": 0.71252441, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.73340392, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36328125, + "step": 14140, + "time_per_iteration": 2.326925039291382 + }, + { + "auxiliary_loss_clip": 0.01054429, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.0183388, + "balance_loss_mlp": 1.01626694, + "epoch": 0.8502029159777544, + "flos": 26686700110080.0, + "grad_norm": 2.0572548245846525, + "language_loss": 0.65554368, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.67650926, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 14141, + "time_per_iteration": 2.408808708190918 + }, + { + "auxiliary_loss_clip": 0.01052918, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.01709986, + "balance_loss_mlp": 1.01639485, + "epoch": 0.8502630392304223, + "flos": 35589307893120.0, + "grad_norm": 1.4648052860458578, + "language_loss": 0.72045428, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.74139154, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 14142, + "time_per_iteration": 3.8912203311920166 + }, + { + "auxiliary_loss_clip": 0.01051341, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.01337802, + "balance_loss_mlp": 1.01582778, + "epoch": 0.8503231624830904, + "flos": 21648488941440.0, + "grad_norm": 1.6105185862922722, + "language_loss": 0.6654985, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.68637329, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 14143, + "time_per_iteration": 3.8661394119262695 + }, + { + "auxiliary_loss_clip": 0.01053511, + "auxiliary_loss_mlp": 0.0104088, + "balance_loss_clip": 1.01594162, + "balance_loss_mlp": 1.01564789, + "epoch": 0.8503832857357583, + "flos": 22416403495680.0, + "grad_norm": 2.912128314137782, + "language_loss": 0.69358557, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.71452951, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 14144, + "time_per_iteration": 2.4366819858551025 + }, + { + "auxiliary_loss_clip": 0.01051363, + "auxiliary_loss_mlp": 0.01037564, + "balance_loss_clip": 1.01508117, + "balance_loss_mlp": 1.01607442, + "epoch": 0.8504434089884263, + "flos": 18696966312960.0, + "grad_norm": 2.2569784082259905, + "language_loss": 0.65932876, + "learning_rate": 2.299937473050777e-07, + "loss": 0.68021804, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14145, + "time_per_iteration": 2.344773292541504 + }, + { + "auxiliary_loss_clip": 0.01051212, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.01864517, + "balance_loss_mlp": 1.01641536, + "epoch": 0.8505035322410942, + "flos": 20007047370240.0, + "grad_norm": 1.7956610289071568, + "language_loss": 0.86403221, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.88495082, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14146, + "time_per_iteration": 2.3681747913360596 + }, + { + "auxiliary_loss_clip": 0.0105116, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.01255345, + "balance_loss_mlp": 1.01570892, + "epoch": 0.8505636554937622, + "flos": 20811166871040.0, + "grad_norm": 1.5793144901290441, + "language_loss": 0.85149777, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.8723563, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 14147, + "time_per_iteration": 2.442284107208252 + }, + { + "auxiliary_loss_clip": 0.0105441, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.01438546, + "balance_loss_mlp": 1.01696873, + "epoch": 0.8506237787464301, + "flos": 14173549274880.0, + "grad_norm": 2.576137685658371, + "language_loss": 0.87368214, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.89461571, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 14148, + "time_per_iteration": 2.333218574523926 + }, + { + "auxiliary_loss_clip": 0.01051841, + "auxiliary_loss_mlp": 0.01036872, + "balance_loss_clip": 1.01396012, + "balance_loss_mlp": 1.01682472, + "epoch": 0.8506839019990982, + "flos": 23257251613440.0, + "grad_norm": 1.6531174876610206, + "language_loss": 0.73258007, + "learning_rate": 2.292689741370204e-07, + "loss": 0.7534672, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34960938, + "step": 14149, + "time_per_iteration": 2.4016737937927246 + }, + { + "auxiliary_loss_clip": 0.01052481, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.01137936, + "balance_loss_mlp": 1.01597309, + "epoch": 0.8507440252517661, + "flos": 23658089466240.0, + "grad_norm": 1.6304762550003746, + "language_loss": 0.77982825, + "learning_rate": 2.290879486935804e-07, + "loss": 0.80068815, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36523438, + "step": 14150, + "time_per_iteration": 2.4086837768554688 + }, + { + "auxiliary_loss_clip": 0.01051868, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.01225519, + "balance_loss_mlp": 1.0171349, + "epoch": 0.8508041485044341, + "flos": 18660342430080.0, + "grad_norm": 1.6866242654458588, + "language_loss": 0.73500013, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.75585735, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34765625, + "step": 14151, + "time_per_iteration": 2.3537256717681885 + }, + { + "auxiliary_loss_clip": 0.01007602, + "auxiliary_loss_mlp": 0.01005494, + "balance_loss_clip": 1.00311029, + "balance_loss_mlp": 1.0009892, + "epoch": 0.8508642717571021, + "flos": 52508217018240.0, + "grad_norm": 0.8821073840116479, + "language_loss": 0.596753, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61688399, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06640625, + "step": 14152, + "time_per_iteration": 2.7637526988983154 + }, + { + "auxiliary_loss_clip": 0.0100739, + "auxiliary_loss_mlp": 0.0100524, + "balance_loss_clip": 1.00301063, + "balance_loss_mlp": 1.00079393, + "epoch": 0.85092439500977, + "flos": 69293898794880.0, + "grad_norm": 0.7006780134306205, + "language_loss": 0.61344445, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63357079, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.06591797, + "step": 14153, + "time_per_iteration": 3.0684869289398193 + }, + { + "auxiliary_loss_clip": 0.01053219, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.01574445, + "balance_loss_mlp": 1.0168221, + "epoch": 0.850984518262438, + "flos": 24388577176320.0, + "grad_norm": 1.6417033631873912, + "language_loss": 0.81422424, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.83515191, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 14154, + "time_per_iteration": 2.3942973613739014 + }, + { + "auxiliary_loss_clip": 0.01050208, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.01360703, + "balance_loss_mlp": 1.01615906, + "epoch": 0.851044641515106, + "flos": 23293700939520.0, + "grad_norm": 1.8020357223969257, + "language_loss": 0.80829883, + "learning_rate": 2.281838289110165e-07, + "loss": 0.82914281, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.33984375, + "step": 14155, + "time_per_iteration": 2.368971347808838 + }, + { + "auxiliary_loss_clip": 0.01052398, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.01239514, + "balance_loss_mlp": 1.01551819, + "epoch": 0.851104764767774, + "flos": 22049117326080.0, + "grad_norm": 1.6434159914313629, + "language_loss": 0.71089202, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.73178542, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36914062, + "step": 14156, + "time_per_iteration": 2.358189344406128 + }, + { + "auxiliary_loss_clip": 0.01049875, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.010023, + "balance_loss_mlp": 1.01609707, + "epoch": 0.8511648880204419, + "flos": 20703669194880.0, + "grad_norm": 1.9724311676902375, + "language_loss": 0.74881947, + "learning_rate": 2.278226512621386e-07, + "loss": 0.76963794, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33789062, + "step": 14157, + "time_per_iteration": 3.800255060195923 + }, + { + "auxiliary_loss_clip": 0.01049037, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.00849473, + "balance_loss_mlp": 1.01575637, + "epoch": 0.8512250112731099, + "flos": 24023525333760.0, + "grad_norm": 2.4215197620551954, + "language_loss": 0.80403388, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.82480818, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33203125, + "step": 14158, + "time_per_iteration": 2.360689163208008 + }, + { + "auxiliary_loss_clip": 0.01051781, + "auxiliary_loss_mlp": 0.0104078, + "balance_loss_clip": 1.01684332, + "balance_loss_mlp": 1.01567376, + "epoch": 0.8512851345257778, + "flos": 22014448479360.0, + "grad_norm": 2.19580141813171, + "language_loss": 0.79989564, + "learning_rate": 2.27461742417828e-07, + "loss": 0.82082123, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 14159, + "time_per_iteration": 2.364351272583008 + }, + { + "auxiliary_loss_clip": 0.01052732, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.01467443, + "balance_loss_mlp": 1.01644111, + "epoch": 0.8513452577784458, + "flos": 14829322942080.0, + "grad_norm": 1.7636351609607326, + "language_loss": 0.7301544, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.75106835, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 14160, + "time_per_iteration": 2.338268756866455 + }, + { + "auxiliary_loss_clip": 0.01055424, + "auxiliary_loss_mlp": 0.01038252, + "balance_loss_clip": 1.01340938, + "balance_loss_mlp": 1.01682448, + "epoch": 0.8514053810311137, + "flos": 33034294108800.0, + "grad_norm": 2.075687742517877, + "language_loss": 0.7114774, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.73241413, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 14161, + "time_per_iteration": 2.4808170795440674 + }, + { + "auxiliary_loss_clip": 0.01052045, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.01591825, + "balance_loss_mlp": 1.01469469, + "epoch": 0.8514655042837818, + "flos": 27563194592640.0, + "grad_norm": 2.533822471929798, + "language_loss": 0.79851621, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.81942087, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37304688, + "step": 14162, + "time_per_iteration": 2.429096221923828 + }, + { + "auxiliary_loss_clip": 0.01051912, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.01642072, + "balance_loss_mlp": 1.01623261, + "epoch": 0.8515256275364497, + "flos": 35554534312320.0, + "grad_norm": 2.1460387174767206, + "language_loss": 0.78203666, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.80294585, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 14163, + "time_per_iteration": 2.51216459274292 + }, + { + "auxiliary_loss_clip": 0.01007139, + "auxiliary_loss_mlp": 0.01002491, + "balance_loss_clip": 1.00033295, + "balance_loss_mlp": 1.00072277, + "epoch": 0.8515857507891177, + "flos": 70204154428800.0, + "grad_norm": 0.7102808817500432, + "language_loss": 0.55211878, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57221508, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06445312, + "step": 14164, + "time_per_iteration": 3.0411384105682373 + }, + { + "auxiliary_loss_clip": 0.01052082, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.01339793, + "balance_loss_mlp": 1.01610196, + "epoch": 0.8516458740417857, + "flos": 22674167130240.0, + "grad_norm": 1.7745609479401523, + "language_loss": 0.73648071, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.75737303, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.359375, + "step": 14165, + "time_per_iteration": 2.383466958999634 + }, + { + "auxiliary_loss_clip": 0.01049976, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01015532, + "balance_loss_mlp": 1.01491213, + "epoch": 0.8517059972944536, + "flos": 22746332643840.0, + "grad_norm": 1.788726058580675, + "language_loss": 0.68561596, + "learning_rate": 2.26200679088697e-07, + "loss": 0.70645368, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 14166, + "time_per_iteration": 2.3798439502716064 + }, + { + "auxiliary_loss_clip": 0.01051198, + "auxiliary_loss_mlp": 0.01033815, + "balance_loss_clip": 1.012012, + "balance_loss_mlp": 1.01580656, + "epoch": 0.8517661205471216, + "flos": 21688080289920.0, + "grad_norm": 1.6796441434707552, + "language_loss": 0.74233019, + "learning_rate": 2.260207961805125e-07, + "loss": 0.76318032, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35351562, + "step": 14167, + "time_per_iteration": 2.3874104022979736 + }, + { + "auxiliary_loss_clip": 0.01050995, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.01604879, + "balance_loss_mlp": 1.01572227, + "epoch": 0.8518262437997896, + "flos": 25373651587200.0, + "grad_norm": 1.7308488185765762, + "language_loss": 0.81850827, + "learning_rate": 2.258409805417969e-07, + "loss": 0.83939695, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.3515625, + "step": 14168, + "time_per_iteration": 2.3986122608184814 + }, + { + "auxiliary_loss_clip": 0.01050781, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.01129031, + "balance_loss_mlp": 1.01567149, + "epoch": 0.8518863670524576, + "flos": 27234173139840.0, + "grad_norm": 1.976545804441052, + "language_loss": 0.77593684, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.79677773, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 14169, + "time_per_iteration": 2.4107086658477783 + }, + { + "auxiliary_loss_clip": 0.0105446, + "auxiliary_loss_mlp": 0.01038121, + "balance_loss_clip": 1.0149709, + "balance_loss_mlp": 1.01685119, + "epoch": 0.8519464903051255, + "flos": 20958465363840.0, + "grad_norm": 1.703114247569637, + "language_loss": 0.65332794, + "learning_rate": 2.254815511000452e-07, + "loss": 0.67425376, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.375, + "step": 14170, + "time_per_iteration": 2.3651392459869385 + }, + { + "auxiliary_loss_clip": 0.01048708, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.0099647, + "balance_loss_mlp": 1.01390374, + "epoch": 0.8520066135577935, + "flos": 18440773689600.0, + "grad_norm": 2.309207034441405, + "language_loss": 0.88022101, + "learning_rate": 2.253019373106384e-07, + "loss": 0.90102857, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 14171, + "time_per_iteration": 2.327394485473633 + }, + { + "auxiliary_loss_clip": 0.01051547, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.01342618, + "balance_loss_mlp": 1.01633835, + "epoch": 0.8520667368104614, + "flos": 29129014425600.0, + "grad_norm": 1.7492771333054395, + "language_loss": 0.55889893, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.5797736, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14172, + "time_per_iteration": 2.426170587539673 + }, + { + "auxiliary_loss_clip": 0.01048841, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.01454616, + "balance_loss_mlp": 1.01508069, + "epoch": 0.8521268600631294, + "flos": 16033442423040.0, + "grad_norm": 2.059231718298724, + "language_loss": 0.70154548, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.72238195, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33789062, + "step": 14173, + "time_per_iteration": 3.573348045349121 + }, + { + "auxiliary_loss_clip": 0.01052634, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.01419461, + "balance_loss_mlp": 1.01636744, + "epoch": 0.8521869833157973, + "flos": 22453795428480.0, + "grad_norm": 3.761712718379248, + "language_loss": 0.78329974, + "learning_rate": 2.247634997500205e-07, + "loss": 0.80421889, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 14174, + "time_per_iteration": 2.359973430633545 + }, + { + "auxiliary_loss_clip": 0.01053416, + "auxiliary_loss_mlp": 0.010383, + "balance_loss_clip": 1.01512611, + "balance_loss_mlp": 1.01648259, + "epoch": 0.8522471065684654, + "flos": 24970893609600.0, + "grad_norm": 1.6858293800662034, + "language_loss": 0.83188844, + "learning_rate": 2.245841551883676e-07, + "loss": 0.8528055, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 14175, + "time_per_iteration": 2.396933078765869 + }, + { + "auxiliary_loss_clip": 0.01054853, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.0147177, + "balance_loss_mlp": 1.01778173, + "epoch": 0.8523072298211333, + "flos": 17709692486400.0, + "grad_norm": 2.2236129712727957, + "language_loss": 0.67388165, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.69481874, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37109375, + "step": 14176, + "time_per_iteration": 2.332918882369995 + }, + { + "auxiliary_loss_clip": 0.01050417, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.01195645, + "balance_loss_mlp": 1.01517475, + "epoch": 0.8523673530738013, + "flos": 25445049050880.0, + "grad_norm": 1.7588681422119954, + "language_loss": 0.79681885, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.81766677, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14177, + "time_per_iteration": 2.405050039291382 + }, + { + "auxiliary_loss_clip": 0.01052178, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.00931072, + "balance_loss_mlp": 1.01614928, + "epoch": 0.8524274763264693, + "flos": 31428289434240.0, + "grad_norm": 1.6597435101348865, + "language_loss": 0.7447561, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.76559317, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 14178, + "time_per_iteration": 2.45031476020813 + }, + { + "auxiliary_loss_clip": 0.01052666, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.01986694, + "balance_loss_mlp": 1.01684499, + "epoch": 0.8524875995791372, + "flos": 17711682433920.0, + "grad_norm": 1.8452012845144103, + "language_loss": 0.76106292, + "learning_rate": 2.238674502491935e-07, + "loss": 0.78201693, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 14179, + "time_per_iteration": 2.345099687576294 + }, + { + "auxiliary_loss_clip": 0.01050773, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.01144278, + "balance_loss_mlp": 1.01593208, + "epoch": 0.8525477228318052, + "flos": 21686299810560.0, + "grad_norm": 2.2073564606593123, + "language_loss": 0.83499324, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.85584164, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14180, + "time_per_iteration": 2.3743889331817627 + }, + { + "auxiliary_loss_clip": 0.01052194, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.0137527, + "balance_loss_mlp": 1.01651311, + "epoch": 0.8526078460844732, + "flos": 24825899266560.0, + "grad_norm": 6.209726723977838, + "language_loss": 0.62721407, + "learning_rate": 2.235095018591815e-07, + "loss": 0.64811063, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 14181, + "time_per_iteration": 3.8730671405792236 + }, + { + "auxiliary_loss_clip": 0.01051159, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.01569557, + "balance_loss_mlp": 1.01621437, + "epoch": 0.8526679693371412, + "flos": 13515576192000.0, + "grad_norm": 2.183304251135541, + "language_loss": 0.7315644, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.75245225, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 14182, + "time_per_iteration": 2.3320682048797607 + }, + { + "auxiliary_loss_clip": 0.01050865, + "auxiliary_loss_mlp": 0.0104099, + "balance_loss_clip": 1.01779175, + "balance_loss_mlp": 1.01598775, + "epoch": 0.8527280925898091, + "flos": 23512955477760.0, + "grad_norm": 1.4775660769841545, + "language_loss": 0.71953827, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.74045682, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 14183, + "time_per_iteration": 3.698413848876953 + }, + { + "auxiliary_loss_clip": 0.01051843, + "auxiliary_loss_mlp": 0.01036838, + "balance_loss_clip": 1.01545179, + "balance_loss_mlp": 1.01742125, + "epoch": 0.8527882158424771, + "flos": 20301993469440.0, + "grad_norm": 1.8396985258261307, + "language_loss": 0.73308444, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.75397122, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 14184, + "time_per_iteration": 2.359746217727661 + }, + { + "auxiliary_loss_clip": 0.01052098, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.01087117, + "balance_loss_mlp": 1.01633596, + "epoch": 0.852848339095145, + "flos": 17201531514240.0, + "grad_norm": 1.7483955149190165, + "language_loss": 0.77317274, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.79402745, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 14185, + "time_per_iteration": 2.380070447921753 + }, + { + "auxiliary_loss_clip": 0.01051634, + "auxiliary_loss_mlp": 0.01038554, + "balance_loss_clip": 1.01411653, + "balance_loss_mlp": 1.01509595, + "epoch": 0.852908462347813, + "flos": 18368014682880.0, + "grad_norm": 1.8519955994472161, + "language_loss": 0.80296504, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.82386696, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36523438, + "step": 14186, + "time_per_iteration": 2.3651018142700195 + }, + { + "auxiliary_loss_clip": 0.01051744, + "auxiliary_loss_mlp": 0.01040343, + "balance_loss_clip": 1.01464105, + "balance_loss_mlp": 1.01510739, + "epoch": 0.8529685856004809, + "flos": 18623893104000.0, + "grad_norm": 1.7753239710802815, + "language_loss": 0.63306183, + "learning_rate": 2.224372736588449e-07, + "loss": 0.65398264, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3671875, + "step": 14187, + "time_per_iteration": 2.3398501873016357 + }, + { + "auxiliary_loss_clip": 0.01054024, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.00964499, + "balance_loss_mlp": 1.01602697, + "epoch": 0.853028708853149, + "flos": 29606346800640.0, + "grad_norm": 1.563610760347071, + "language_loss": 0.777807, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.79869962, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 14188, + "time_per_iteration": 2.458784341812134 + }, + { + "auxiliary_loss_clip": 0.01054452, + "auxiliary_loss_mlp": 0.01042544, + "balance_loss_clip": 1.0160917, + "balance_loss_mlp": 1.01722765, + "epoch": 0.8530888321058169, + "flos": 26352127751040.0, + "grad_norm": 1.5784033567624374, + "language_loss": 0.79075825, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.81172812, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37304688, + "step": 14189, + "time_per_iteration": 2.4401755332946777 + }, + { + "auxiliary_loss_clip": 0.01051684, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.01327157, + "balance_loss_mlp": 1.01562703, + "epoch": 0.8531489553584849, + "flos": 20520933805440.0, + "grad_norm": 2.867577547316191, + "language_loss": 0.81399238, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.83488131, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 14190, + "time_per_iteration": 2.3590621948242188 + }, + { + "auxiliary_loss_clip": 0.01051313, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.01876283, + "balance_loss_mlp": 1.01648748, + "epoch": 0.8532090786111529, + "flos": 20703250258560.0, + "grad_norm": 1.8029502890416584, + "language_loss": 0.77141535, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.79233694, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 14191, + "time_per_iteration": 2.3620171546936035 + }, + { + "auxiliary_loss_clip": 0.01050508, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.01158845, + "balance_loss_mlp": 1.01554465, + "epoch": 0.8532692018638208, + "flos": 19827872939520.0, + "grad_norm": 1.8905590225072815, + "language_loss": 0.69905359, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71991408, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.34960938, + "step": 14192, + "time_per_iteration": 2.3388164043426514 + }, + { + "auxiliary_loss_clip": 0.01054948, + "auxiliary_loss_mlp": 0.01044151, + "balance_loss_clip": 1.01713824, + "balance_loss_mlp": 1.01632106, + "epoch": 0.8533293251164888, + "flos": 20995717651200.0, + "grad_norm": 2.669462044371613, + "language_loss": 0.64359683, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.66458786, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.38671875, + "step": 14193, + "time_per_iteration": 2.3587467670440674 + }, + { + "auxiliary_loss_clip": 0.0105103, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.01456213, + "balance_loss_mlp": 1.01539803, + "epoch": 0.8533894483691568, + "flos": 22418498177280.0, + "grad_norm": 1.8496558471054128, + "language_loss": 0.77399087, + "learning_rate": 2.211894078044365e-07, + "loss": 0.79486436, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 14194, + "time_per_iteration": 2.3582582473754883 + }, + { + "auxiliary_loss_clip": 0.01052397, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.01187754, + "balance_loss_mlp": 1.01606321, + "epoch": 0.8534495716218248, + "flos": 21615460928640.0, + "grad_norm": 2.066792351149847, + "language_loss": 0.70763743, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.72850704, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 14195, + "time_per_iteration": 2.3598520755767822 + }, + { + "auxiliary_loss_clip": 0.01050898, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.01157784, + "balance_loss_mlp": 1.0150435, + "epoch": 0.8535096948744927, + "flos": 22345180588800.0, + "grad_norm": 1.9056908114137008, + "language_loss": 0.86577547, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88663077, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 14196, + "time_per_iteration": 3.8925931453704834 + }, + { + "auxiliary_loss_clip": 0.0100731, + "auxiliary_loss_mlp": 0.01003825, + "balance_loss_clip": 1.00127351, + "balance_loss_mlp": 1.00069582, + "epoch": 0.8535698181271607, + "flos": 52759906076160.0, + "grad_norm": 0.7696621663842129, + "language_loss": 0.55205905, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57217044, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.0255127, + "router_z_loss_mlp": 0.06640625, + "step": 14197, + "time_per_iteration": 2.9587671756744385 + }, + { + "auxiliary_loss_clip": 0.01048838, + "auxiliary_loss_mlp": 0.01035225, + "balance_loss_clip": 1.01376748, + "balance_loss_mlp": 1.01482749, + "epoch": 0.8536299413798286, + "flos": 19061878510080.0, + "grad_norm": 1.5881670446152643, + "language_loss": 0.82180727, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.84264791, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 14198, + "time_per_iteration": 2.4055604934692383 + }, + { + "auxiliary_loss_clip": 0.01050786, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.01317692, + "balance_loss_mlp": 1.01573122, + "epoch": 0.8536900646324966, + "flos": 49342922801280.0, + "grad_norm": 1.660344287652332, + "language_loss": 0.69876492, + "learning_rate": 2.203000984963035e-07, + "loss": 0.71962333, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 14199, + "time_per_iteration": 2.597259283065796 + }, + { + "auxiliary_loss_clip": 0.01049143, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.01245475, + "balance_loss_mlp": 1.01466286, + "epoch": 0.8537501878851645, + "flos": 21761258232960.0, + "grad_norm": 1.571579475047364, + "language_loss": 0.87316275, + "learning_rate": 2.201224390669072e-07, + "loss": 0.89398307, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.34375, + "step": 14200, + "time_per_iteration": 2.366915225982666 + }, + { + "auxiliary_loss_clip": 0.01052058, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.0121392, + "balance_loss_mlp": 1.01576877, + "epoch": 0.8538103111378326, + "flos": 22268197307520.0, + "grad_norm": 1.786208890817532, + "language_loss": 0.79266798, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.81353676, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 14201, + "time_per_iteration": 2.375302791595459 + }, + { + "auxiliary_loss_clip": 0.01050845, + "auxiliary_loss_mlp": 0.0103639, + "balance_loss_clip": 1.01487279, + "balance_loss_mlp": 1.01639652, + "epoch": 0.8538704343905005, + "flos": 20302866253440.0, + "grad_norm": 1.6048446643939962, + "language_loss": 0.6972723, + "learning_rate": 2.19767322694256e-07, + "loss": 0.71814466, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34570312, + "step": 14202, + "time_per_iteration": 2.3649210929870605 + }, + { + "auxiliary_loss_clip": 0.01051001, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.00957155, + "balance_loss_mlp": 1.01560009, + "epoch": 0.8539305576431685, + "flos": 24753978132480.0, + "grad_norm": 1.5520969485652851, + "language_loss": 0.81808656, + "learning_rate": 2.195898657644666e-07, + "loss": 0.83894241, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35351562, + "step": 14203, + "time_per_iteration": 2.4086856842041016 + }, + { + "auxiliary_loss_clip": 0.01053794, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.01344347, + "balance_loss_mlp": 1.01701748, + "epoch": 0.8539906808958365, + "flos": 26686420819200.0, + "grad_norm": 2.25833859985307, + "language_loss": 0.67175484, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.6926744, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 14204, + "time_per_iteration": 2.397388219833374 + }, + { + "auxiliary_loss_clip": 0.01051077, + "auxiliary_loss_mlp": 0.01033434, + "balance_loss_clip": 1.01046205, + "balance_loss_mlp": 1.01507998, + "epoch": 0.8540508041485044, + "flos": 13364821474560.0, + "grad_norm": 2.0632766775772726, + "language_loss": 0.61908352, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.6399287, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 14205, + "time_per_iteration": 2.3288393020629883 + }, + { + "auxiliary_loss_clip": 0.01052071, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.01213765, + "balance_loss_mlp": 1.01591587, + "epoch": 0.8541109274011724, + "flos": 32779497939840.0, + "grad_norm": 2.158883787216666, + "language_loss": 0.73175228, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.75262469, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 14206, + "time_per_iteration": 2.4556050300598145 + }, + { + "auxiliary_loss_clip": 0.01052875, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.01397264, + "balance_loss_mlp": 1.01686835, + "epoch": 0.8541710506538404, + "flos": 17638329934080.0, + "grad_norm": 2.4879431028472667, + "language_loss": 0.77912146, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.80001813, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 14207, + "time_per_iteration": 2.332979679107666 + }, + { + "auxiliary_loss_clip": 0.01052472, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.01504457, + "balance_loss_mlp": 1.01637459, + "epoch": 0.8542311739065084, + "flos": 20262122830080.0, + "grad_norm": 1.6525265142816643, + "language_loss": 0.85677254, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87769979, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36132812, + "step": 14208, + "time_per_iteration": 2.420612335205078 + }, + { + "auxiliary_loss_clip": 0.010537, + "auxiliary_loss_mlp": 0.01039353, + "balance_loss_clip": 1.01622677, + "balance_loss_mlp": 1.0175786, + "epoch": 0.8542912971591763, + "flos": 17784685820160.0, + "grad_norm": 1.5302790424408783, + "language_loss": 0.67651868, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.69744921, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 14209, + "time_per_iteration": 2.387730360031128 + }, + { + "auxiliary_loss_clip": 0.01052756, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.01286829, + "balance_loss_mlp": 1.01628327, + "epoch": 0.8543514204118443, + "flos": 26978294718720.0, + "grad_norm": 2.0974091085793907, + "language_loss": 0.71548915, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.73638177, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 14210, + "time_per_iteration": 2.424652338027954 + }, + { + "auxiliary_loss_clip": 0.01051718, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.01138353, + "balance_loss_mlp": 1.01580715, + "epoch": 0.8544115436645122, + "flos": 24023455511040.0, + "grad_norm": 1.53735490380864, + "language_loss": 0.71192479, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.73277795, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 14211, + "time_per_iteration": 2.438556432723999 + }, + { + "auxiliary_loss_clip": 0.01052, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.01490211, + "balance_loss_mlp": 1.01606941, + "epoch": 0.8544716669171802, + "flos": 16617050576640.0, + "grad_norm": 2.124177690237166, + "language_loss": 0.82404345, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.8449465, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 14212, + "time_per_iteration": 2.372735023498535 + }, + { + "auxiliary_loss_clip": 0.01052454, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.01443493, + "balance_loss_mlp": 1.01552439, + "epoch": 0.8545317901698481, + "flos": 40004179914240.0, + "grad_norm": 1.8247932034805714, + "language_loss": 0.67364275, + "learning_rate": 2.178190108088105e-07, + "loss": 0.69456416, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36914062, + "step": 14213, + "time_per_iteration": 3.740384578704834 + }, + { + "auxiliary_loss_clip": 0.01050114, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.01269627, + "balance_loss_mlp": 1.01519704, + "epoch": 0.8545919134225162, + "flos": 19901469818880.0, + "grad_norm": 2.9897743998709374, + "language_loss": 0.78679276, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80763352, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34960938, + "step": 14214, + "time_per_iteration": 2.3919677734375 + }, + { + "auxiliary_loss_clip": 0.01054205, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.01340199, + "balance_loss_mlp": 1.01628804, + "epoch": 0.8546520366751841, + "flos": 18951971950080.0, + "grad_norm": 2.130046224009511, + "language_loss": 0.6902495, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.71118128, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 14215, + "time_per_iteration": 2.3381853103637695 + }, + { + "auxiliary_loss_clip": 0.01051757, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.01253343, + "balance_loss_mlp": 1.01607323, + "epoch": 0.8547121599278521, + "flos": 35620136490240.0, + "grad_norm": 4.324591868684738, + "language_loss": 0.63581395, + "learning_rate": 2.172890718362279e-07, + "loss": 0.65667903, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35742188, + "step": 14216, + "time_per_iteration": 2.4723329544067383 + }, + { + "auxiliary_loss_clip": 0.01053618, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.01296389, + "balance_loss_mlp": 1.01671433, + "epoch": 0.8547722831805201, + "flos": 16909099032960.0, + "grad_norm": 1.9909534802835311, + "language_loss": 0.66755551, + "learning_rate": 2.17112560704259e-07, + "loss": 0.68846965, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 14217, + "time_per_iteration": 2.357816457748413 + }, + { + "auxiliary_loss_clip": 0.01050066, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.0143733, + "balance_loss_mlp": 1.01618361, + "epoch": 0.854832406433188, + "flos": 23001512837760.0, + "grad_norm": 1.5198784206054226, + "language_loss": 0.66223073, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.68310452, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.33984375, + "step": 14218, + "time_per_iteration": 2.3877785205841064 + }, + { + "auxiliary_loss_clip": 0.01051886, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.00925565, + "balance_loss_mlp": 1.01528692, + "epoch": 0.854892529685856, + "flos": 20411550915840.0, + "grad_norm": 2.3259191433876545, + "language_loss": 0.71423805, + "learning_rate": 2.167597412688238e-07, + "loss": 0.7350744, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36523438, + "step": 14219, + "time_per_iteration": 2.356041431427002 + }, + { + "auxiliary_loss_clip": 0.01052875, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.01639986, + "balance_loss_mlp": 1.01532984, + "epoch": 0.854952652938524, + "flos": 16397796038400.0, + "grad_norm": 2.522881955866012, + "language_loss": 0.69621986, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.71716642, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 14220, + "time_per_iteration": 2.3231122493743896 + }, + { + "auxiliary_loss_clip": 0.01050349, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.01254845, + "balance_loss_mlp": 1.01571393, + "epoch": 0.855012776191192, + "flos": 21177615168000.0, + "grad_norm": 2.4250903324870943, + "language_loss": 0.73312223, + "learning_rate": 2.164071923159827e-07, + "loss": 0.75397146, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14221, + "time_per_iteration": 3.7627100944519043 + }, + { + "auxiliary_loss_clip": 0.01051569, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.01489711, + "balance_loss_mlp": 1.01512456, + "epoch": 0.8550728994438599, + "flos": 26139785662080.0, + "grad_norm": 1.9670247942293195, + "language_loss": 0.61314988, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.63406157, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 14222, + "time_per_iteration": 2.423718214035034 + }, + { + "auxiliary_loss_clip": 0.0105083, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.01232576, + "balance_loss_mlp": 1.01593256, + "epoch": 0.8551330226965279, + "flos": 22785609790080.0, + "grad_norm": 1.511916637479822, + "language_loss": 0.8490172, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86986244, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34960938, + "step": 14223, + "time_per_iteration": 3.7547054290771484 + }, + { + "auxiliary_loss_clip": 0.01050431, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.00906277, + "balance_loss_mlp": 1.01581049, + "epoch": 0.8551931459491958, + "flos": 22417939595520.0, + "grad_norm": 1.6480598158879873, + "language_loss": 0.75182533, + "learning_rate": 2.158788761585515e-07, + "loss": 0.77263725, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 14224, + "time_per_iteration": 2.382056474685669 + }, + { + "auxiliary_loss_clip": 0.01050507, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.01441288, + "balance_loss_mlp": 1.01537418, + "epoch": 0.8552532692018638, + "flos": 19572169075200.0, + "grad_norm": 9.458703346235927, + "language_loss": 0.75876081, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77964532, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 14225, + "time_per_iteration": 2.4040286540985107 + }, + { + "auxiliary_loss_clip": 0.01052107, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.01441658, + "balance_loss_mlp": 1.01683223, + "epoch": 0.8553133924545318, + "flos": 26431554827520.0, + "grad_norm": 1.9218871709224337, + "language_loss": 0.78189874, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.80277944, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 14226, + "time_per_iteration": 2.4230284690856934 + }, + { + "auxiliary_loss_clip": 0.01053126, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.01188314, + "balance_loss_mlp": 1.01575351, + "epoch": 0.8553735157071998, + "flos": 16361521269120.0, + "grad_norm": 2.2183632225212047, + "language_loss": 0.56626618, + "learning_rate": 2.153511688875702e-07, + "loss": 0.58716953, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 14227, + "time_per_iteration": 2.378129482269287 + }, + { + "auxiliary_loss_clip": 0.0105169, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01587653, + "balance_loss_mlp": 1.01668382, + "epoch": 0.8554336389598677, + "flos": 20886264938880.0, + "grad_norm": 2.2175202841180606, + "language_loss": 0.67226744, + "learning_rate": 2.151754018031442e-07, + "loss": 0.69317502, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14228, + "time_per_iteration": 2.3727986812591553 + }, + { + "auxiliary_loss_clip": 0.01052052, + "auxiliary_loss_mlp": 0.01045281, + "balance_loss_clip": 1.02069962, + "balance_loss_mlp": 1.01532936, + "epoch": 0.8554937622125357, + "flos": 21283751301120.0, + "grad_norm": 2.2674112597620417, + "language_loss": 0.75739515, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.77836847, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 14229, + "time_per_iteration": 2.3637821674346924 + }, + { + "auxiliary_loss_clip": 0.01051418, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.01253295, + "balance_loss_mlp": 1.01584828, + "epoch": 0.8555538854652037, + "flos": 22412249043840.0, + "grad_norm": 1.803547560593422, + "language_loss": 0.73233318, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.75319517, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14230, + "time_per_iteration": 2.3771896362304688 + }, + { + "auxiliary_loss_clip": 0.01051859, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.01408863, + "balance_loss_mlp": 1.01615, + "epoch": 0.8556140087178716, + "flos": 20192680402560.0, + "grad_norm": 2.0178630894043863, + "language_loss": 0.8413595, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.86225611, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 14231, + "time_per_iteration": 2.3776204586029053 + }, + { + "auxiliary_loss_clip": 0.010547, + "auxiliary_loss_mlp": 0.01037284, + "balance_loss_clip": 1.01220202, + "balance_loss_mlp": 1.01779008, + "epoch": 0.8556741319705397, + "flos": 22637019576960.0, + "grad_norm": 1.9655391492153602, + "language_loss": 0.68818259, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.70910239, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 14232, + "time_per_iteration": 2.375871181488037 + }, + { + "auxiliary_loss_clip": 0.01052115, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.01123428, + "balance_loss_mlp": 1.01577032, + "epoch": 0.8557342552232076, + "flos": 23548217817600.0, + "grad_norm": 1.4676649118817033, + "language_loss": 0.68283737, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.70371199, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 14233, + "time_per_iteration": 2.3809025287628174 + }, + { + "auxiliary_loss_clip": 0.01050678, + "auxiliary_loss_mlp": 0.01039657, + "balance_loss_clip": 1.01644695, + "balance_loss_mlp": 1.01551259, + "epoch": 0.8557943784758756, + "flos": 19608862780800.0, + "grad_norm": 1.9298030289160908, + "language_loss": 0.77886283, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.79976618, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14234, + "time_per_iteration": 2.380378484725952 + }, + { + "auxiliary_loss_clip": 0.01007508, + "auxiliary_loss_mlp": 0.01001938, + "balance_loss_clip": 0.99985152, + "balance_loss_mlp": 1.00088191, + "epoch": 0.8558545017285435, + "flos": 70638753432960.0, + "grad_norm": 0.7569250946905336, + "language_loss": 0.58087528, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60096973, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06640625, + "step": 14235, + "time_per_iteration": 2.980517864227295 + }, + { + "auxiliary_loss_clip": 0.01007466, + "auxiliary_loss_mlp": 0.01003114, + "balance_loss_clip": 1.00103974, + "balance_loss_mlp": 1.00082135, + "epoch": 0.8559146249812115, + "flos": 56646715783680.0, + "grad_norm": 0.7785273872823135, + "language_loss": 0.5666039, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58670974, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.06640625, + "step": 14236, + "time_per_iteration": 4.343386888504028 + }, + { + "auxiliary_loss_clip": 0.01051548, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.01405668, + "balance_loss_mlp": 1.01606917, + "epoch": 0.8559747482338794, + "flos": 22887277269120.0, + "grad_norm": 1.884092665027232, + "language_loss": 0.71141207, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.73230153, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 14237, + "time_per_iteration": 2.412303924560547 + }, + { + "auxiliary_loss_clip": 0.01049734, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.00846481, + "balance_loss_mlp": 1.01466119, + "epoch": 0.8560348714865474, + "flos": 22600814630400.0, + "grad_norm": 2.184706797758299, + "language_loss": 0.64232361, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.66312623, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 14238, + "time_per_iteration": 2.363118886947632 + }, + { + "auxiliary_loss_clip": 0.0104832, + "auxiliary_loss_mlp": 0.01035094, + "balance_loss_clip": 1.01511478, + "balance_loss_mlp": 1.01544166, + "epoch": 0.8560949947392154, + "flos": 17930483124480.0, + "grad_norm": 1.4307236124273763, + "language_loss": 0.70268726, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.72352147, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.328125, + "step": 14239, + "time_per_iteration": 2.3594250679016113 + }, + { + "auxiliary_loss_clip": 0.01053543, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.01596141, + "balance_loss_mlp": 1.0159142, + "epoch": 0.8561551179918834, + "flos": 31024972874880.0, + "grad_norm": 2.079543085468315, + "language_loss": 0.68040776, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.70133388, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37695312, + "step": 14240, + "time_per_iteration": 2.432443857192993 + }, + { + "auxiliary_loss_clip": 0.01052478, + "auxiliary_loss_mlp": 0.01037258, + "balance_loss_clip": 1.01309466, + "balance_loss_mlp": 1.01621294, + "epoch": 0.8562152412445513, + "flos": 30663866016000.0, + "grad_norm": 1.5933112816632753, + "language_loss": 0.63404679, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.65494418, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 14241, + "time_per_iteration": 2.4248340129852295 + }, + { + "auxiliary_loss_clip": 0.01053192, + "auxiliary_loss_mlp": 0.01037964, + "balance_loss_clip": 1.01279914, + "balance_loss_mlp": 1.0159682, + "epoch": 0.8562753644972193, + "flos": 31574819877120.0, + "grad_norm": 1.9067559980636262, + "language_loss": 0.74761885, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76853043, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 14242, + "time_per_iteration": 2.4423553943634033 + }, + { + "auxiliary_loss_clip": 0.01055461, + "auxiliary_loss_mlp": 0.0104386, + "balance_loss_clip": 1.01860011, + "balance_loss_mlp": 1.01687384, + "epoch": 0.8563354877498872, + "flos": 26212439934720.0, + "grad_norm": 2.0904098676968768, + "language_loss": 0.77858227, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.79957545, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 14243, + "time_per_iteration": 2.3952407836914062 + }, + { + "auxiliary_loss_clip": 0.01051913, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.01240563, + "balance_loss_mlp": 1.01547074, + "epoch": 0.8563956110025552, + "flos": 24133187514240.0, + "grad_norm": 1.8238690772012056, + "language_loss": 0.69196421, + "learning_rate": 2.123723375556974e-07, + "loss": 0.71285117, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 14244, + "time_per_iteration": 2.3828656673431396 + }, + { + "auxiliary_loss_clip": 0.01007498, + "auxiliary_loss_mlp": 0.01004, + "balance_loss_clip": 1.00199735, + "balance_loss_mlp": 1.0008502, + "epoch": 0.8564557342552233, + "flos": 56269095851520.0, + "grad_norm": 0.7550752114065635, + "language_loss": 0.58548033, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60559529, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06640625, + "step": 14245, + "time_per_iteration": 2.9039578437805176 + }, + { + "auxiliary_loss_clip": 0.01054329, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.01141405, + "balance_loss_mlp": 1.01694345, + "epoch": 0.8565158575078912, + "flos": 23439498243840.0, + "grad_norm": 1.7162919033430326, + "language_loss": 0.7848593, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.80576301, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 14246, + "time_per_iteration": 2.3933281898498535 + }, + { + "auxiliary_loss_clip": 0.01050706, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.01322174, + "balance_loss_mlp": 1.01480865, + "epoch": 0.8565759807605592, + "flos": 20374892121600.0, + "grad_norm": 2.0939872339994974, + "language_loss": 0.82936156, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.85023308, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 14247, + "time_per_iteration": 2.5168938636779785 + }, + { + "auxiliary_loss_clip": 0.01053194, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.01377308, + "balance_loss_mlp": 1.01631308, + "epoch": 0.8566361040132271, + "flos": 18806104823040.0, + "grad_norm": 2.3658429764996107, + "language_loss": 0.78642792, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.8073315, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36914062, + "step": 14248, + "time_per_iteration": 2.343745231628418 + }, + { + "auxiliary_loss_clip": 0.01050366, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.01192319, + "balance_loss_mlp": 1.01528597, + "epoch": 0.8566962272658951, + "flos": 24534199923840.0, + "grad_norm": 1.6366104947513447, + "language_loss": 0.78706264, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80791682, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 14249, + "time_per_iteration": 2.388091802597046 + }, + { + "auxiliary_loss_clip": 0.01048937, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_clip": 1.02015543, + "balance_loss_mlp": 1.01502645, + "epoch": 0.856756350518563, + "flos": 23177580157440.0, + "grad_norm": 1.714982138587563, + "language_loss": 0.78929508, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.81020701, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33984375, + "step": 14250, + "time_per_iteration": 2.3757753372192383 + }, + { + "auxiliary_loss_clip": 0.01049549, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.01590157, + "balance_loss_mlp": 1.01633346, + "epoch": 0.856816473771231, + "flos": 20807675735040.0, + "grad_norm": 1.6969024760921536, + "language_loss": 0.80752194, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.82839102, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33203125, + "step": 14251, + "time_per_iteration": 2.3598287105560303 + }, + { + "auxiliary_loss_clip": 0.0104949, + "auxiliary_loss_mlp": 0.0103883, + "balance_loss_clip": 1.01567948, + "balance_loss_mlp": 1.01543725, + "epoch": 0.856876597023899, + "flos": 20227174692480.0, + "grad_norm": 2.111820165092905, + "language_loss": 0.6268791, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.6477623, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.33984375, + "step": 14252, + "time_per_iteration": 3.5945284366607666 + }, + { + "auxiliary_loss_clip": 0.01053918, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.01612961, + "balance_loss_mlp": 1.01718855, + "epoch": 0.856936720276567, + "flos": 18295150942080.0, + "grad_norm": 1.8122178497914783, + "language_loss": 0.71517867, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.73613495, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 14253, + "time_per_iteration": 2.351261854171753 + }, + { + "auxiliary_loss_clip": 0.01007522, + "auxiliary_loss_mlp": 0.01003026, + "balance_loss_clip": 1.00087988, + "balance_loss_mlp": 1.00096035, + "epoch": 0.8569968435292349, + "flos": 69875202798720.0, + "grad_norm": 0.7986697264614281, + "language_loss": 0.59286714, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61297262, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06542969, + "step": 14254, + "time_per_iteration": 3.0478641986846924 + }, + { + "auxiliary_loss_clip": 0.0105131, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.0128206, + "balance_loss_mlp": 1.01564097, + "epoch": 0.8570569667819029, + "flos": 25847388092160.0, + "grad_norm": 2.0315172060200353, + "language_loss": 0.8204416, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.84131664, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 14255, + "time_per_iteration": 2.411677360534668 + }, + { + "auxiliary_loss_clip": 0.01050346, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.01249051, + "balance_loss_mlp": 1.01563919, + "epoch": 0.8571170900345708, + "flos": 23256029715840.0, + "grad_norm": 2.1891324917075554, + "language_loss": 0.67867517, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69952023, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 14256, + "time_per_iteration": 2.3813443183898926 + }, + { + "auxiliary_loss_clip": 0.01051572, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.01218951, + "balance_loss_mlp": 1.01620877, + "epoch": 0.8571772132872388, + "flos": 18916639787520.0, + "grad_norm": 1.575102866038727, + "language_loss": 0.70956153, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.73042357, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35351562, + "step": 14257, + "time_per_iteration": 2.363162040710449 + }, + { + "auxiliary_loss_clip": 0.01050804, + "auxiliary_loss_mlp": 0.0103628, + "balance_loss_clip": 1.01386905, + "balance_loss_mlp": 1.01609731, + "epoch": 0.8572373365399069, + "flos": 33248870524800.0, + "grad_norm": 2.055132774878838, + "language_loss": 0.78328735, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.80415815, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 14258, + "time_per_iteration": 2.458268404006958 + }, + { + "auxiliary_loss_clip": 0.01049797, + "auxiliary_loss_mlp": 0.01037751, + "balance_loss_clip": 1.01451755, + "balance_loss_mlp": 1.01563358, + "epoch": 0.8572974597925748, + "flos": 23326519484160.0, + "grad_norm": 1.5571999058495358, + "language_loss": 0.68618089, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.7070564, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34179688, + "step": 14259, + "time_per_iteration": 2.4487814903259277 + }, + { + "auxiliary_loss_clip": 0.01051607, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.0168159, + "balance_loss_mlp": 1.01633787, + "epoch": 0.8573575830452428, + "flos": 24534688682880.0, + "grad_norm": 1.8429491871247183, + "language_loss": 0.78116131, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.80207419, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 14260, + "time_per_iteration": 3.8206326961517334 + }, + { + "auxiliary_loss_clip": 0.01052416, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.01208949, + "balance_loss_mlp": 1.01595271, + "epoch": 0.8574177062979107, + "flos": 24164400136320.0, + "grad_norm": 1.9160344093529227, + "language_loss": 0.75255936, + "learning_rate": 2.09413096654806e-07, + "loss": 0.77344334, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 14261, + "time_per_iteration": 2.3912618160247803 + }, + { + "auxiliary_loss_clip": 0.01054041, + "auxiliary_loss_mlp": 0.01041511, + "balance_loss_clip": 1.01658463, + "balance_loss_mlp": 1.01685858, + "epoch": 0.8574778295505787, + "flos": 17929784897280.0, + "grad_norm": 1.746663444508514, + "language_loss": 0.79363006, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81458557, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 14262, + "time_per_iteration": 3.7280969619750977 + }, + { + "auxiliary_loss_clip": 0.01050114, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.01371336, + "balance_loss_mlp": 1.01598251, + "epoch": 0.8575379528032466, + "flos": 21579605095680.0, + "grad_norm": 1.6254130578689368, + "language_loss": 0.6898821, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.7107361, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34179688, + "step": 14263, + "time_per_iteration": 2.383579969406128 + }, + { + "auxiliary_loss_clip": 0.01052156, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.01108658, + "balance_loss_mlp": 1.01592088, + "epoch": 0.8575980760559146, + "flos": 21760525094400.0, + "grad_norm": 1.639666032765686, + "language_loss": 0.80461317, + "learning_rate": 2.088929137266986e-07, + "loss": 0.82548422, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 14264, + "time_per_iteration": 2.414418935775757 + }, + { + "auxiliary_loss_clip": 0.01052036, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.01412594, + "balance_loss_mlp": 1.01745415, + "epoch": 0.8576581993085826, + "flos": 34385013855360.0, + "grad_norm": 1.2575650837592116, + "language_loss": 0.70033234, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.72120476, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 14265, + "time_per_iteration": 2.4751060009002686 + }, + { + "auxiliary_loss_clip": 0.01049131, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.00975907, + "balance_loss_mlp": 1.01568902, + "epoch": 0.8577183225612506, + "flos": 23221360869120.0, + "grad_norm": 1.6958172045399718, + "language_loss": 0.67114872, + "learning_rate": 2.085464646918027e-07, + "loss": 0.69195998, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33398438, + "step": 14266, + "time_per_iteration": 2.3920364379882812 + }, + { + "auxiliary_loss_clip": 0.01049978, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.01427221, + "balance_loss_mlp": 1.01572132, + "epoch": 0.8577784458139185, + "flos": 28802890615680.0, + "grad_norm": 1.7363513264435786, + "language_loss": 0.75919098, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.78005278, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 14267, + "time_per_iteration": 2.432713508605957 + }, + { + "auxiliary_loss_clip": 0.01050346, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.0148654, + "balance_loss_mlp": 1.01635575, + "epoch": 0.8578385690665865, + "flos": 19754555351040.0, + "grad_norm": 1.664977856879449, + "language_loss": 0.88182515, + "learning_rate": 2.082002873852946e-07, + "loss": 0.90269029, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 14268, + "time_per_iteration": 2.3639976978302 + }, + { + "auxiliary_loss_clip": 0.01052205, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.0164845, + "epoch": 0.8578986923192544, + "flos": 20703040790400.0, + "grad_norm": 2.086719359556617, + "language_loss": 0.74296671, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.76391506, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 14269, + "time_per_iteration": 2.363553285598755 + }, + { + "auxiliary_loss_clip": 0.01053532, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.01163042, + "balance_loss_mlp": 1.0172832, + "epoch": 0.8579588155719224, + "flos": 36100226862720.0, + "grad_norm": 1.617598010344271, + "language_loss": 0.66815686, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68905008, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 14270, + "time_per_iteration": 2.5219831466674805 + }, + { + "auxiliary_loss_clip": 0.0104933, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.0100863, + "balance_loss_mlp": 1.01510286, + "epoch": 0.8580189388245905, + "flos": 22852468776960.0, + "grad_norm": 1.7479383806637516, + "language_loss": 0.74351501, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76432675, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34179688, + "step": 14271, + "time_per_iteration": 2.395087242126465 + }, + { + "auxiliary_loss_clip": 0.01007498, + "auxiliary_loss_mlp": 0.01001966, + "balance_loss_clip": 0.99990374, + "balance_loss_mlp": 1.00103784, + "epoch": 0.8580790620772584, + "flos": 69638385980160.0, + "grad_norm": 0.804146333283602, + "language_loss": 0.59781289, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61790752, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.06445312, + "step": 14272, + "time_per_iteration": 3.103320837020874 + }, + { + "auxiliary_loss_clip": 0.01054445, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.01628637, + "epoch": 0.8581391853299264, + "flos": 13333399384320.0, + "grad_norm": 1.7911138025259181, + "language_loss": 0.75680757, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77774596, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 14273, + "time_per_iteration": 2.3307783603668213 + }, + { + "auxiliary_loss_clip": 0.01050441, + "auxiliary_loss_mlp": 0.01036704, + "balance_loss_clip": 1.01486456, + "balance_loss_mlp": 1.01533103, + "epoch": 0.8581993085825943, + "flos": 19644648791040.0, + "grad_norm": 1.9080315461119712, + "language_loss": 0.82863688, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84950829, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 14274, + "time_per_iteration": 2.365957498550415 + }, + { + "auxiliary_loss_clip": 0.01007366, + "auxiliary_loss_mlp": 0.01004797, + "balance_loss_clip": 1.00274622, + "balance_loss_mlp": 1.00067723, + "epoch": 0.8582594318352623, + "flos": 55822452428160.0, + "grad_norm": 0.7965737159428715, + "language_loss": 0.60912901, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62925065, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06689453, + "step": 14275, + "time_per_iteration": 3.077043294906616 + }, + { + "auxiliary_loss_clip": 0.01051016, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.01172614, + "balance_loss_mlp": 1.01506591, + "epoch": 0.8583195550879302, + "flos": 24278426236800.0, + "grad_norm": 2.003225086837967, + "language_loss": 0.60258901, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.62345749, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 14276, + "time_per_iteration": 3.8302817344665527 + }, + { + "auxiliary_loss_clip": 0.01050378, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.0180434, + "balance_loss_mlp": 1.01547384, + "epoch": 0.8583796783405983, + "flos": 13443271032960.0, + "grad_norm": 11.323914245953544, + "language_loss": 0.7723316, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.79323518, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 14277, + "time_per_iteration": 2.3553404808044434 + }, + { + "auxiliary_loss_clip": 0.01051879, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.01429915, + "balance_loss_mlp": 1.01612473, + "epoch": 0.8584398015932662, + "flos": 16179344461440.0, + "grad_norm": 1.4691060314085298, + "language_loss": 0.84632963, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.86722231, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 14278, + "time_per_iteration": 2.3435792922973633 + }, + { + "auxiliary_loss_clip": 0.01052869, + "auxiliary_loss_mlp": 0.01041406, + "balance_loss_clip": 1.01689637, + "balance_loss_mlp": 1.01601231, + "epoch": 0.8584999248459342, + "flos": 17449659613440.0, + "grad_norm": 2.60581225637081, + "language_loss": 0.75925934, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.78020209, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 14279, + "time_per_iteration": 2.3565292358398438 + }, + { + "auxiliary_loss_clip": 0.01051778, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.01630712, + "balance_loss_mlp": 1.01689458, + "epoch": 0.8585600480986021, + "flos": 23439882268800.0, + "grad_norm": 2.837020086690907, + "language_loss": 0.68103063, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.70193607, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 14280, + "time_per_iteration": 2.4041364192962646 + }, + { + "auxiliary_loss_clip": 0.01048989, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.01393104, + "balance_loss_mlp": 1.01488876, + "epoch": 0.8586201713512701, + "flos": 19936906715520.0, + "grad_norm": 2.0851398902811606, + "language_loss": 0.6362251, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.65705836, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.34179688, + "step": 14281, + "time_per_iteration": 2.375554323196411 + }, + { + "auxiliary_loss_clip": 0.01051034, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.011904, + "balance_loss_mlp": 1.01541352, + "epoch": 0.858680294603938, + "flos": 15303862408320.0, + "grad_norm": 1.6953979716714958, + "language_loss": 0.74497187, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.76582885, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 14282, + "time_per_iteration": 2.3513193130493164 + }, + { + "auxiliary_loss_clip": 0.01048934, + "auxiliary_loss_mlp": 0.01031159, + "balance_loss_clip": 1.00961757, + "balance_loss_mlp": 1.01412809, + "epoch": 0.858740417856606, + "flos": 22710127697280.0, + "grad_norm": 1.94571691345977, + "language_loss": 0.7696082, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.79040915, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 14283, + "time_per_iteration": 2.366105794906616 + }, + { + "auxiliary_loss_clip": 0.01051815, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.01476073, + "balance_loss_mlp": 1.01581943, + "epoch": 0.8588005411092741, + "flos": 34052990025600.0, + "grad_norm": 2.0381824159204456, + "language_loss": 0.60741448, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62831151, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 14284, + "time_per_iteration": 2.4761359691619873 + }, + { + "auxiliary_loss_clip": 0.01049281, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.01636159, + "epoch": 0.858860664361942, + "flos": 28912308416640.0, + "grad_norm": 1.7844210426601728, + "language_loss": 0.76096022, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.78182828, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.328125, + "step": 14285, + "time_per_iteration": 2.431771755218506 + }, + { + "auxiliary_loss_clip": 0.01053461, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.00869477, + "balance_loss_mlp": 1.01679206, + "epoch": 0.85892078761461, + "flos": 19791493436160.0, + "grad_norm": 1.7883562408957088, + "language_loss": 0.75329101, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.77413929, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3671875, + "step": 14286, + "time_per_iteration": 2.374467134475708 + }, + { + "auxiliary_loss_clip": 0.01007215, + "auxiliary_loss_mlp": 0.01001952, + "balance_loss_clip": 0.99990201, + "balance_loss_mlp": 1.00078678, + "epoch": 0.8589809108672779, + "flos": 67103483005440.0, + "grad_norm": 0.7660428542429637, + "language_loss": 0.49564639, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51573813, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06445312, + "step": 14287, + "time_per_iteration": 2.9578208923339844 + }, + { + "auxiliary_loss_clip": 0.01051981, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.01498604, + "balance_loss_mlp": 1.01636612, + "epoch": 0.8590410341199459, + "flos": 29714961640320.0, + "grad_norm": 4.74191911870384, + "language_loss": 0.79904681, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81994045, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 14288, + "time_per_iteration": 2.431166887283325 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.01335466, + "balance_loss_mlp": 1.01607442, + "epoch": 0.8591011573726138, + "flos": 23986307957760.0, + "grad_norm": 1.9208755598081817, + "language_loss": 0.81716287, + "learning_rate": 2.045818444528553e-07, + "loss": 0.83805358, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 14289, + "time_per_iteration": 2.3694446086883545 + }, + { + "auxiliary_loss_clip": 0.0105105, + "auxiliary_loss_mlp": 0.01037522, + "balance_loss_clip": 1.01515818, + "balance_loss_mlp": 1.01574528, + "epoch": 0.8591612806252819, + "flos": 14427786862080.0, + "grad_norm": 1.7271681425759704, + "language_loss": 0.66202283, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.68290854, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 14290, + "time_per_iteration": 2.3477470874786377 + }, + { + "auxiliary_loss_clip": 0.01051823, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.01260567, + "balance_loss_mlp": 1.01622641, + "epoch": 0.8592214038779498, + "flos": 31575797395200.0, + "grad_norm": 1.7649327256791993, + "language_loss": 0.55867732, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57956183, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 14291, + "time_per_iteration": 2.4352805614471436 + }, + { + "auxiliary_loss_clip": 0.01052389, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.01274562, + "balance_loss_mlp": 1.0161953, + "epoch": 0.8592815271306178, + "flos": 17456327683200.0, + "grad_norm": 2.0065295702718875, + "language_loss": 0.7327435, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.75361878, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 14292, + "time_per_iteration": 3.5918936729431152 + }, + { + "auxiliary_loss_clip": 0.01050944, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01217127, + "balance_loss_mlp": 1.01599348, + "epoch": 0.8593416503832857, + "flos": 25410170736000.0, + "grad_norm": 1.4564133341793501, + "language_loss": 0.72554511, + "learning_rate": 2.038960195018542e-07, + "loss": 0.74639976, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34960938, + "step": 14293, + "time_per_iteration": 2.39924955368042 + }, + { + "auxiliary_loss_clip": 0.01049444, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.01459527, + "balance_loss_mlp": 1.01551938, + "epoch": 0.8594017736359537, + "flos": 20995578005760.0, + "grad_norm": 2.4168352524979326, + "language_loss": 0.70295858, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.72380853, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 14294, + "time_per_iteration": 2.408008337020874 + }, + { + "auxiliary_loss_clip": 0.01049265, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.01335311, + "balance_loss_mlp": 1.01453781, + "epoch": 0.8594618968886216, + "flos": 22089965483520.0, + "grad_norm": 1.9259815318842464, + "language_loss": 0.78439629, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80523634, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34765625, + "step": 14295, + "time_per_iteration": 2.3979897499084473 + }, + { + "auxiliary_loss_clip": 0.01054854, + "auxiliary_loss_mlp": 0.01041268, + "balance_loss_clip": 1.01468468, + "balance_loss_mlp": 1.01707149, + "epoch": 0.8595220201412896, + "flos": 11655438664320.0, + "grad_norm": 3.8886276433957514, + "language_loss": 0.70455551, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.72551674, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 14296, + "time_per_iteration": 2.399430274963379 + }, + { + "auxiliary_loss_clip": 0.01051278, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.01461434, + "balance_loss_mlp": 1.01663315, + "epoch": 0.8595821433939577, + "flos": 25039358519040.0, + "grad_norm": 2.904456616152101, + "language_loss": 0.80465829, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.82554746, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34570312, + "step": 14297, + "time_per_iteration": 2.4327590465545654 + }, + { + "auxiliary_loss_clip": 0.01050095, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.01043022, + "balance_loss_mlp": 1.01557171, + "epoch": 0.8596422666466256, + "flos": 28510283577600.0, + "grad_norm": 1.5413836096475462, + "language_loss": 0.68538052, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70619762, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 14298, + "time_per_iteration": 2.489471673965454 + }, + { + "auxiliary_loss_clip": 0.01049544, + "auxiliary_loss_mlp": 0.01040998, + "balance_loss_clip": 1.01901555, + "balance_loss_mlp": 1.0156616, + "epoch": 0.8597023898992936, + "flos": 13588300287360.0, + "grad_norm": 2.0945718458667484, + "language_loss": 0.69365275, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.71455818, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33984375, + "step": 14299, + "time_per_iteration": 2.3402888774871826 + }, + { + "auxiliary_loss_clip": 0.01053073, + "auxiliary_loss_mlp": 0.01043404, + "balance_loss_clip": 1.01963401, + "balance_loss_mlp": 1.01679659, + "epoch": 0.8597625131519615, + "flos": 32299617035520.0, + "grad_norm": 2.253863799311946, + "language_loss": 0.72622991, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.74719465, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 14300, + "time_per_iteration": 2.4405646324157715 + }, + { + "auxiliary_loss_clip": 0.01050988, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.01119673, + "balance_loss_mlp": 1.01567006, + "epoch": 0.8598226364046295, + "flos": 28729119179520.0, + "grad_norm": 1.5785591915022912, + "language_loss": 0.70384169, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.72469306, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 14301, + "time_per_iteration": 3.804576873779297 + }, + { + "auxiliary_loss_clip": 0.01051304, + "auxiliary_loss_mlp": 0.01035694, + "balance_loss_clip": 1.01425982, + "balance_loss_mlp": 1.0167371, + "epoch": 0.8598827596572974, + "flos": 21870745856640.0, + "grad_norm": 1.8275376379621842, + "language_loss": 0.75553656, + "learning_rate": 2.023568983386641e-07, + "loss": 0.77640659, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 14302, + "time_per_iteration": 3.6765034198760986 + }, + { + "auxiliary_loss_clip": 0.01048582, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.01341796, + "balance_loss_mlp": 1.01561189, + "epoch": 0.8599428829099655, + "flos": 23766215546880.0, + "grad_norm": 1.7524840453271138, + "language_loss": 0.84753084, + "learning_rate": 2.02186225623733e-07, + "loss": 0.86835378, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.328125, + "step": 14303, + "time_per_iteration": 2.381174325942993 + }, + { + "auxiliary_loss_clip": 0.01051905, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.01858091, + "balance_loss_mlp": 1.01564586, + "epoch": 0.8600030061626334, + "flos": 16211953537920.0, + "grad_norm": 2.011442267863653, + "language_loss": 0.77156448, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79248804, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 14304, + "time_per_iteration": 2.4121642112731934 + }, + { + "auxiliary_loss_clip": 0.01050593, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.01174915, + "balance_loss_mlp": 1.01584327, + "epoch": 0.8600631294153014, + "flos": 15668460403200.0, + "grad_norm": 2.173074361289434, + "language_loss": 0.54854405, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56939447, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14305, + "time_per_iteration": 2.3727962970733643 + }, + { + "auxiliary_loss_clip": 0.01050595, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.01297987, + "balance_loss_mlp": 1.01570785, + "epoch": 0.8601232526679693, + "flos": 17492148604800.0, + "grad_norm": 2.233975617450759, + "language_loss": 0.84506172, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86593491, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34960938, + "step": 14306, + "time_per_iteration": 2.3420698642730713 + }, + { + "auxiliary_loss_clip": 0.01048712, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.01669276, + "balance_loss_mlp": 1.01491821, + "epoch": 0.8601833759206373, + "flos": 26984543852160.0, + "grad_norm": 1.4220146544779362, + "language_loss": 0.72079504, + "learning_rate": 2.01504216561474e-07, + "loss": 0.74165154, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33789062, + "step": 14307, + "time_per_iteration": 2.481304883956909 + }, + { + "auxiliary_loss_clip": 0.01052464, + "auxiliary_loss_mlp": 0.01040093, + "balance_loss_clip": 1.0151664, + "balance_loss_mlp": 1.01587772, + "epoch": 0.8602434991733052, + "flos": 25228552510080.0, + "grad_norm": 2.014513141332092, + "language_loss": 0.65132952, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.67225504, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36523438, + "step": 14308, + "time_per_iteration": 2.40107798576355 + }, + { + "auxiliary_loss_clip": 0.01007695, + "auxiliary_loss_mlp": 0.01001662, + "balance_loss_clip": 0.99955243, + "balance_loss_mlp": 1.0010612, + "epoch": 0.8603036224259732, + "flos": 71011974533760.0, + "grad_norm": 0.6259472051918687, + "language_loss": 0.48565644, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50575, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06640625, + "step": 14309, + "time_per_iteration": 3.0910515785217285 + }, + { + "auxiliary_loss_clip": 0.01052397, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.01051188, + "balance_loss_mlp": 1.01576686, + "epoch": 0.8603637456786413, + "flos": 20299654408320.0, + "grad_norm": 2.7904357798905934, + "language_loss": 0.67898214, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69985545, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 14310, + "time_per_iteration": 2.375838279724121 + }, + { + "auxiliary_loss_clip": 0.01051833, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.01669812, + "balance_loss_mlp": 1.01603186, + "epoch": 0.8604238689313092, + "flos": 21834750378240.0, + "grad_norm": 2.699264454757523, + "language_loss": 0.79315019, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.81405473, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 14311, + "time_per_iteration": 2.430875778198242 + }, + { + "auxiliary_loss_clip": 0.01051185, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.01717627, + "balance_loss_mlp": 1.01589739, + "epoch": 0.8604839921839772, + "flos": 18003242131200.0, + "grad_norm": 2.1299861413331134, + "language_loss": 0.7267797, + "learning_rate": 2.006532397626639e-07, + "loss": 0.74768376, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 14312, + "time_per_iteration": 2.3423750400543213 + }, + { + "auxiliary_loss_clip": 0.01049377, + "auxiliary_loss_mlp": 0.0104078, + "balance_loss_clip": 1.01834512, + "balance_loss_mlp": 1.01483762, + "epoch": 0.8605441154366451, + "flos": 16251265595520.0, + "grad_norm": 2.2950885733746356, + "language_loss": 0.78492045, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80582201, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 14313, + "time_per_iteration": 2.3421685695648193 + }, + { + "auxiliary_loss_clip": 0.01050213, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.01271021, + "balance_loss_mlp": 1.01531577, + "epoch": 0.8606042386893131, + "flos": 32265786061440.0, + "grad_norm": 1.4580167708436145, + "language_loss": 0.73356831, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75443673, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.34960938, + "step": 14314, + "time_per_iteration": 2.474968194961548 + }, + { + "auxiliary_loss_clip": 0.01050585, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.01729321, + "balance_loss_mlp": 1.01553988, + "epoch": 0.860664361941981, + "flos": 20228745703680.0, + "grad_norm": 2.125162981915688, + "language_loss": 0.69836718, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71927446, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 14315, + "time_per_iteration": 2.3753459453582764 + }, + { + "auxiliary_loss_clip": 0.01050815, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.01575255, + "balance_loss_mlp": 1.0160141, + "epoch": 0.8607244851946491, + "flos": 25190462350080.0, + "grad_norm": 2.002087728913297, + "language_loss": 0.72825456, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74913967, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14316, + "time_per_iteration": 3.892972946166992 + }, + { + "auxiliary_loss_clip": 0.0105297, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.01515937, + "balance_loss_mlp": 1.01706851, + "epoch": 0.860784608447317, + "flos": 20481132988800.0, + "grad_norm": 2.885796569819384, + "language_loss": 0.83959895, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.86050719, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 14317, + "time_per_iteration": 2.376652956008911 + }, + { + "auxiliary_loss_clip": 0.01050511, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.01268303, + "balance_loss_mlp": 1.01684248, + "epoch": 0.860844731699985, + "flos": 50474178541440.0, + "grad_norm": 1.5623948753763375, + "language_loss": 0.67926365, + "learning_rate": 1.996343193113108e-07, + "loss": 0.70011544, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3359375, + "step": 14318, + "time_per_iteration": 2.624621868133545 + }, + { + "auxiliary_loss_clip": 0.01049271, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.01605439, + "balance_loss_mlp": 1.01540899, + "epoch": 0.8609048549526529, + "flos": 41171151841920.0, + "grad_norm": 1.6503342385204527, + "language_loss": 0.72015297, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.74101067, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 14319, + "time_per_iteration": 2.5291357040405273 + }, + { + "auxiliary_loss_clip": 0.0105344, + "auxiliary_loss_mlp": 0.01037629, + "balance_loss_clip": 1.01532483, + "balance_loss_mlp": 1.01774752, + "epoch": 0.8609649782053209, + "flos": 23950068099840.0, + "grad_norm": 1.6908877784155507, + "language_loss": 0.6833666, + "learning_rate": 1.992952252525839e-07, + "loss": 0.70427728, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 14320, + "time_per_iteration": 2.38474178314209 + }, + { + "auxiliary_loss_clip": 0.01052042, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.01183844, + "balance_loss_mlp": 1.01461947, + "epoch": 0.8610251014579888, + "flos": 23111454309120.0, + "grad_norm": 1.9278673034584484, + "language_loss": 0.80917871, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.83006614, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 14321, + "time_per_iteration": 2.3748888969421387 + }, + { + "auxiliary_loss_clip": 0.0104944, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.01200628, + "balance_loss_mlp": 1.01502228, + "epoch": 0.8610852247106568, + "flos": 19425813189120.0, + "grad_norm": 1.8475049782445703, + "language_loss": 0.72163641, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.74249041, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.34375, + "step": 14322, + "time_per_iteration": 2.3470282554626465 + }, + { + "auxiliary_loss_clip": 0.01053422, + "auxiliary_loss_mlp": 0.01044485, + "balance_loss_clip": 1.01860404, + "balance_loss_mlp": 1.01651335, + "epoch": 0.8611453479633249, + "flos": 19311228506880.0, + "grad_norm": 2.6296418918061364, + "language_loss": 0.57540858, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.59638762, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.36914062, + "step": 14323, + "time_per_iteration": 2.388249158859253 + }, + { + "auxiliary_loss_clip": 0.0104977, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.01290119, + "balance_loss_mlp": 1.01524818, + "epoch": 0.8612054712159928, + "flos": 23252678225280.0, + "grad_norm": 1.9455941518702309, + "language_loss": 0.7661097, + "learning_rate": 1.986178565813801e-07, + "loss": 0.78695339, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 14324, + "time_per_iteration": 2.4001431465148926 + }, + { + "auxiliary_loss_clip": 0.01051992, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.01267719, + "balance_loss_mlp": 1.01645923, + "epoch": 0.8612655944686608, + "flos": 16027682048640.0, + "grad_norm": 2.029742945289351, + "language_loss": 0.6790992, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.69998801, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 14325, + "time_per_iteration": 2.3445866107940674 + }, + { + "auxiliary_loss_clip": 0.0105256, + "auxiliary_loss_mlp": 0.01042515, + "balance_loss_clip": 1.01681304, + "balance_loss_mlp": 1.01618719, + "epoch": 0.8613257177213287, + "flos": 22491606297600.0, + "grad_norm": 1.6746860454760584, + "language_loss": 0.66470534, + "learning_rate": 1.982795820716472e-07, + "loss": 0.68565607, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36523438, + "step": 14326, + "time_per_iteration": 2.3663687705993652 + }, + { + "auxiliary_loss_clip": 0.01050923, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.01146555, + "balance_loss_mlp": 1.01570451, + "epoch": 0.8613858409739967, + "flos": 17237108056320.0, + "grad_norm": 1.9942290564813254, + "language_loss": 0.86150301, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.88236105, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 14327, + "time_per_iteration": 2.3654677867889404 + }, + { + "auxiliary_loss_clip": 0.01050941, + "auxiliary_loss_mlp": 0.0103676, + "balance_loss_clip": 1.01384795, + "balance_loss_mlp": 1.01534045, + "epoch": 0.8614459642266646, + "flos": 22819999345920.0, + "grad_norm": 2.366396813292019, + "language_loss": 0.76209134, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.78296828, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 14328, + "time_per_iteration": 2.5113308429718018 + }, + { + "auxiliary_loss_clip": 0.01050349, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.0143615, + "balance_loss_mlp": 1.01539612, + "epoch": 0.8615060874793327, + "flos": 26503126848000.0, + "grad_norm": 1.6427311277253742, + "language_loss": 0.80718976, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82804489, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34960938, + "step": 14329, + "time_per_iteration": 2.42568039894104 + }, + { + "auxiliary_loss_clip": 0.01050812, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.01137412, + "balance_loss_mlp": 1.01572895, + "epoch": 0.8615662107320006, + "flos": 24059869925760.0, + "grad_norm": 2.499847368144723, + "language_loss": 0.78202534, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.80287659, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 14330, + "time_per_iteration": 2.4003398418426514 + }, + { + "auxiliary_loss_clip": 0.01051292, + "auxiliary_loss_mlp": 0.01038156, + "balance_loss_clip": 1.01494622, + "balance_loss_mlp": 1.015553, + "epoch": 0.8616263339846686, + "flos": 24164051022720.0, + "grad_norm": 1.9712306196427594, + "language_loss": 0.65905011, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67994457, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35742188, + "step": 14331, + "time_per_iteration": 3.643087387084961 + }, + { + "auxiliary_loss_clip": 0.01051346, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.01648355, + "balance_loss_mlp": 1.01591659, + "epoch": 0.8616864572373365, + "flos": 21723307718400.0, + "grad_norm": 1.6294641329704256, + "language_loss": 0.76799405, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78889364, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 14332, + "time_per_iteration": 2.4314746856689453 + }, + { + "auxiliary_loss_clip": 0.01052422, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.0117228, + "balance_loss_mlp": 1.01593924, + "epoch": 0.8617465804900045, + "flos": 23765587142400.0, + "grad_norm": 2.250019182161868, + "language_loss": 0.67943347, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.70032787, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 14333, + "time_per_iteration": 2.4071218967437744 + }, + { + "auxiliary_loss_clip": 0.01055634, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.01410937, + "balance_loss_mlp": 1.01748788, + "epoch": 0.8618067037426724, + "flos": 37702496021760.0, + "grad_norm": 1.8526800354880604, + "language_loss": 0.63645357, + "learning_rate": 1.969292174019157e-07, + "loss": 0.65740877, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 14334, + "time_per_iteration": 2.5634424686431885 + }, + { + "auxiliary_loss_clip": 0.01054261, + "auxiliary_loss_mlp": 0.01043255, + "balance_loss_clip": 1.02014005, + "balance_loss_mlp": 1.01693225, + "epoch": 0.8618668269953405, + "flos": 21469942915200.0, + "grad_norm": 1.9024144078805274, + "language_loss": 0.69899261, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71996778, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37304688, + "step": 14335, + "time_per_iteration": 2.4713752269744873 + }, + { + "auxiliary_loss_clip": 0.01051627, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.01010966, + "balance_loss_mlp": 1.01590121, + "epoch": 0.8619269502480085, + "flos": 22231713070080.0, + "grad_norm": 1.4017113721890082, + "language_loss": 0.83432913, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85517734, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 14336, + "time_per_iteration": 2.4397692680358887 + }, + { + "auxiliary_loss_clip": 0.0105277, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.01524234, + "balance_loss_mlp": 1.01596212, + "epoch": 0.8619870735006764, + "flos": 22709534204160.0, + "grad_norm": 1.7051428021440802, + "language_loss": 0.68666488, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.70757812, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 14337, + "time_per_iteration": 2.3845314979553223 + }, + { + "auxiliary_loss_clip": 0.01050431, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.01124537, + "balance_loss_mlp": 1.01548409, + "epoch": 0.8620471967533444, + "flos": 37518887848320.0, + "grad_norm": 2.333968655212592, + "language_loss": 0.67523372, + "learning_rate": 1.962556758053089e-07, + "loss": 0.6960817, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34960938, + "step": 14338, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.01051701, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.01160169, + "balance_loss_mlp": 1.01659822, + "epoch": 0.8621073200060123, + "flos": 19681447230720.0, + "grad_norm": 1.8923093599751015, + "language_loss": 0.63474262, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.65559471, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 14339, + "time_per_iteration": 2.382248878479004 + }, + { + "auxiliary_loss_clip": 0.0105041, + "auxiliary_loss_mlp": 0.01041433, + "balance_loss_clip": 1.01979649, + "balance_loss_mlp": 1.01513827, + "epoch": 0.8621674432586803, + "flos": 14536017676800.0, + "grad_norm": 1.9824973763259925, + "language_loss": 0.64125097, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.66216934, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35351562, + "step": 14340, + "time_per_iteration": 3.781794309616089 + }, + { + "auxiliary_loss_clip": 0.0104745, + "auxiliary_loss_mlp": 0.01030302, + "balance_loss_clip": 1.00977385, + "balance_loss_mlp": 1.01478195, + "epoch": 0.8622275665113482, + "flos": 20739071180160.0, + "grad_norm": 2.0654071128468656, + "language_loss": 0.80803549, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82881296, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.32617188, + "step": 14341, + "time_per_iteration": 3.6583330631256104 + }, + { + "auxiliary_loss_clip": 0.01051132, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01499987, + "balance_loss_mlp": 1.01590526, + "epoch": 0.8622876897640163, + "flos": 24714805720320.0, + "grad_norm": 1.878808157870589, + "language_loss": 0.75433564, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.77521026, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 14342, + "time_per_iteration": 2.4565579891204834 + }, + { + "auxiliary_loss_clip": 0.01053242, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.01262236, + "balance_loss_mlp": 1.01669288, + "epoch": 0.8623478130166842, + "flos": 17456397505920.0, + "grad_norm": 1.8404107092150621, + "language_loss": 0.69993186, + "learning_rate": 1.95415287816028e-07, + "loss": 0.72082508, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 14343, + "time_per_iteration": 2.370647430419922 + }, + { + "auxiliary_loss_clip": 0.01051841, + "auxiliary_loss_mlp": 0.01043298, + "balance_loss_clip": 1.01840687, + "balance_loss_mlp": 1.01561022, + "epoch": 0.8624079362693522, + "flos": 18108330923520.0, + "grad_norm": 1.7591341628445278, + "language_loss": 0.68916768, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.71011907, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36132812, + "step": 14344, + "time_per_iteration": 2.345428943634033 + }, + { + "auxiliary_loss_clip": 0.01052533, + "auxiliary_loss_mlp": 0.01042553, + "balance_loss_clip": 1.0191046, + "balance_loss_mlp": 1.01591182, + "epoch": 0.8624680595220201, + "flos": 30665087913600.0, + "grad_norm": 1.4525951709770133, + "language_loss": 0.82339704, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.84434795, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 14345, + "time_per_iteration": 2.4513206481933594 + }, + { + "auxiliary_loss_clip": 0.01054163, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.01420414, + "balance_loss_mlp": 1.01691103, + "epoch": 0.8625281827746881, + "flos": 37997058096000.0, + "grad_norm": 2.3635798710389566, + "language_loss": 0.52574247, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.5466702, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 14346, + "time_per_iteration": 2.516200304031372 + }, + { + "auxiliary_loss_clip": 0.01051196, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.01213098, + "balance_loss_mlp": 1.01593232, + "epoch": 0.862588306027356, + "flos": 26248540147200.0, + "grad_norm": 1.6003013885391215, + "language_loss": 0.76417136, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.78503352, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 14347, + "time_per_iteration": 2.394943952560425 + }, + { + "auxiliary_loss_clip": 0.01051796, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.01248717, + "balance_loss_mlp": 1.01602435, + "epoch": 0.862648429280024, + "flos": 25877797752960.0, + "grad_norm": 2.010399028019852, + "language_loss": 0.82088959, + "learning_rate": 1.945766105774449e-07, + "loss": 0.84178442, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35742188, + "step": 14348, + "time_per_iteration": 2.41615891456604 + }, + { + "auxiliary_loss_clip": 0.01048798, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.01391292, + "balance_loss_mlp": 1.01460934, + "epoch": 0.862708552532692, + "flos": 37814881288320.0, + "grad_norm": 4.126265416725106, + "language_loss": 0.67220789, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.69305158, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34179688, + "step": 14349, + "time_per_iteration": 2.5754201412200928 + }, + { + "auxiliary_loss_clip": 0.01051086, + "auxiliary_loss_mlp": 0.01040874, + "balance_loss_clip": 1.01619744, + "balance_loss_mlp": 1.01603961, + "epoch": 0.86276867578536, + "flos": 19090996450560.0, + "grad_norm": 2.6717548638374966, + "language_loss": 0.71867704, + "learning_rate": 1.942416188703573e-07, + "loss": 0.73959661, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.34960938, + "step": 14350, + "time_per_iteration": 2.3782923221588135 + }, + { + "auxiliary_loss_clip": 0.01051203, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.01139951, + "balance_loss_mlp": 1.01553881, + "epoch": 0.862828799038028, + "flos": 22163178337920.0, + "grad_norm": 1.7674119769877688, + "language_loss": 0.78347218, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.80433488, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 14351, + "time_per_iteration": 2.4134812355041504 + }, + { + "auxiliary_loss_clip": 0.01052545, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.01271379, + "balance_loss_mlp": 1.01665211, + "epoch": 0.8628889222906959, + "flos": 23144552144640.0, + "grad_norm": 2.1450596668276676, + "language_loss": 0.86003006, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.8809011, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 14352, + "time_per_iteration": 2.3906478881835938 + }, + { + "auxiliary_loss_clip": 0.0100733, + "auxiliary_loss_mlp": 0.01001909, + "balance_loss_clip": 0.99990678, + "balance_loss_mlp": 1.00068617, + "epoch": 0.8629490455433639, + "flos": 57814455761280.0, + "grad_norm": 0.7956931311048218, + "language_loss": 0.62031937, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.64041179, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06640625, + "step": 14353, + "time_per_iteration": 3.022263288497925 + }, + { + "auxiliary_loss_clip": 0.01049525, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.01125169, + "balance_loss_mlp": 1.01534081, + "epoch": 0.8630091687960318, + "flos": 15918892652160.0, + "grad_norm": 1.8322733634112012, + "language_loss": 0.82273853, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84356016, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34179688, + "step": 14354, + "time_per_iteration": 2.376633405685425 + }, + { + "auxiliary_loss_clip": 0.01051105, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.01025856, + "balance_loss_mlp": 1.01586366, + "epoch": 0.8630692920486999, + "flos": 17960892785280.0, + "grad_norm": 1.9655131639202645, + "language_loss": 0.87276268, + "learning_rate": 1.934053380181031e-07, + "loss": 0.89360946, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3515625, + "step": 14355, + "time_per_iteration": 3.8565993309020996 + }, + { + "auxiliary_loss_clip": 0.01050629, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.00962079, + "balance_loss_mlp": 1.01504719, + "epoch": 0.8631294153013678, + "flos": 22454074719360.0, + "grad_norm": 2.065636535395998, + "language_loss": 0.60051262, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.62134224, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 14356, + "time_per_iteration": 2.37774658203125 + }, + { + "auxiliary_loss_clip": 0.01052833, + "auxiliary_loss_mlp": 0.0103945, + "balance_loss_clip": 1.0146668, + "balance_loss_mlp": 1.0161097, + "epoch": 0.8631895385540358, + "flos": 16836060735360.0, + "grad_norm": 12.54296107964176, + "language_loss": 0.77781641, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79873925, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 14357, + "time_per_iteration": 2.333808422088623 + }, + { + "auxiliary_loss_clip": 0.01053088, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.01299238, + "balance_loss_mlp": 1.01665878, + "epoch": 0.8632496618067037, + "flos": 18696233174400.0, + "grad_norm": 4.554164092518857, + "language_loss": 0.78753829, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.80843455, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 14358, + "time_per_iteration": 2.3766281604766846 + }, + { + "auxiliary_loss_clip": 0.01051743, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.01708496, + "balance_loss_mlp": 1.01627469, + "epoch": 0.8633097850593717, + "flos": 24278775350400.0, + "grad_norm": 1.5046146057602634, + "language_loss": 0.75798953, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.7789284, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35546875, + "step": 14359, + "time_per_iteration": 2.398514747619629 + }, + { + "auxiliary_loss_clip": 0.01048632, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.01039791, + "balance_loss_mlp": 1.0143218, + "epoch": 0.8633699083120396, + "flos": 21177510433920.0, + "grad_norm": 2.187515548298292, + "language_loss": 0.71698326, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.7377916, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 14360, + "time_per_iteration": 2.351353645324707 + }, + { + "auxiliary_loss_clip": 0.0105326, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.0175097, + "balance_loss_mlp": 1.01674414, + "epoch": 0.8634300315647077, + "flos": 19243880760960.0, + "grad_norm": 1.9331844430707534, + "language_loss": 0.77378923, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.79473364, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 14361, + "time_per_iteration": 2.3651018142700195 + }, + { + "auxiliary_loss_clip": 0.01007342, + "auxiliary_loss_mlp": 0.01002054, + "balance_loss_clip": 0.99994451, + "balance_loss_mlp": 1.00092745, + "epoch": 0.8634901548173756, + "flos": 66192494232960.0, + "grad_norm": 0.9617329510203322, + "language_loss": 0.58919775, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60929173, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06445312, + "step": 14362, + "time_per_iteration": 3.033932685852051 + }, + { + "auxiliary_loss_clip": 0.01052205, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.01748848, + "balance_loss_mlp": 1.01498532, + "epoch": 0.8635502780700436, + "flos": 24788402599680.0, + "grad_norm": 1.841984012020704, + "language_loss": 0.81514573, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.8361119, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37109375, + "step": 14363, + "time_per_iteration": 2.396483898162842 + }, + { + "auxiliary_loss_clip": 0.01053613, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.01542783, + "balance_loss_mlp": 1.01639414, + "epoch": 0.8636104013227116, + "flos": 25188856427520.0, + "grad_norm": 2.842748680840053, + "language_loss": 0.74851406, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.76946998, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37109375, + "step": 14364, + "time_per_iteration": 2.403127670288086 + }, + { + "auxiliary_loss_clip": 0.01051714, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.01245761, + "balance_loss_mlp": 1.01549816, + "epoch": 0.8636705245753795, + "flos": 23877309093120.0, + "grad_norm": 1.6259453615953252, + "language_loss": 0.72654533, + "learning_rate": 1.917379150731755e-07, + "loss": 0.74740934, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36132812, + "step": 14365, + "time_per_iteration": 2.373814821243286 + }, + { + "auxiliary_loss_clip": 0.01055015, + "auxiliary_loss_mlp": 0.0104197, + "balance_loss_clip": 1.01601839, + "balance_loss_mlp": 1.01703584, + "epoch": 0.8637306478280475, + "flos": 23109394538880.0, + "grad_norm": 2.07534741671618, + "language_loss": 0.72905439, + "learning_rate": 1.915715498065993e-07, + "loss": 0.7500242, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 14366, + "time_per_iteration": 2.386399030685425 + }, + { + "auxiliary_loss_clip": 0.01050302, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.01652527, + "balance_loss_mlp": 1.01628542, + "epoch": 0.8637907710807154, + "flos": 21905763816960.0, + "grad_norm": 1.5370760280974591, + "language_loss": 0.82754475, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.8484174, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 14367, + "time_per_iteration": 2.362192392349243 + }, + { + "auxiliary_loss_clip": 0.01052496, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.01479793, + "balance_loss_mlp": 1.01601171, + "epoch": 0.8638508943333835, + "flos": 23579570085120.0, + "grad_norm": 2.09377123851104, + "language_loss": 0.62257791, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.64349604, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36523438, + "step": 14368, + "time_per_iteration": 2.3991174697875977 + }, + { + "auxiliary_loss_clip": 0.01051586, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.01126552, + "balance_loss_mlp": 1.01682699, + "epoch": 0.8639110175860514, + "flos": 25774663996800.0, + "grad_norm": 1.961123152056968, + "language_loss": 0.7722922, + "learning_rate": 1.91072865486821e-07, + "loss": 0.79314697, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14369, + "time_per_iteration": 2.4198153018951416 + }, + { + "auxiliary_loss_clip": 0.01053645, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.01813269, + "balance_loss_mlp": 1.01624858, + "epoch": 0.8639711408387194, + "flos": 23368275336960.0, + "grad_norm": 1.8161075868717163, + "language_loss": 0.6493926, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.67036092, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 14370, + "time_per_iteration": 2.4210610389709473 + }, + { + "auxiliary_loss_clip": 0.01051526, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.01470923, + "balance_loss_mlp": 1.01619887, + "epoch": 0.8640312640913873, + "flos": 22126135518720.0, + "grad_norm": 1.6874168667824085, + "language_loss": 0.67241448, + "learning_rate": 1.907407522366209e-07, + "loss": 0.6933198, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35351562, + "step": 14371, + "time_per_iteration": 3.682610511779785 + }, + { + "auxiliary_loss_clip": 0.01007367, + "auxiliary_loss_mlp": 0.01002545, + "balance_loss_clip": 1.00050676, + "balance_loss_mlp": 1.00080836, + "epoch": 0.8640913873440553, + "flos": 57569192259840.0, + "grad_norm": 0.8595969799600731, + "language_loss": 0.56965387, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58975291, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06542969, + "step": 14372, + "time_per_iteration": 2.939011812210083 + }, + { + "auxiliary_loss_clip": 0.01050444, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.01483476, + "balance_loss_mlp": 1.01576161, + "epoch": 0.8641515105967232, + "flos": 23986307957760.0, + "grad_norm": 1.7173762939578163, + "language_loss": 0.80282629, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.823704, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 14373, + "time_per_iteration": 2.41999888420105 + }, + { + "auxiliary_loss_clip": 0.01050385, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.01310408, + "balance_loss_mlp": 1.0148499, + "epoch": 0.8642116338493913, + "flos": 19061738864640.0, + "grad_norm": 2.4164510511952635, + "language_loss": 0.64601362, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.66687101, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14374, + "time_per_iteration": 2.3702383041381836 + }, + { + "auxiliary_loss_clip": 0.01050734, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.01782155, + "balance_loss_mlp": 1.01619589, + "epoch": 0.8642717571020592, + "flos": 18253325266560.0, + "grad_norm": 1.698429218989576, + "language_loss": 0.78087282, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.80177665, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 14375, + "time_per_iteration": 2.361356019973755 + }, + { + "auxiliary_loss_clip": 0.01052118, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.01300704, + "balance_loss_mlp": 1.01605022, + "epoch": 0.8643318803547272, + "flos": 57661224203520.0, + "grad_norm": 1.6261428268643123, + "language_loss": 0.61944938, + "learning_rate": 1.899116698488117e-07, + "loss": 0.64034164, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36132812, + "step": 14376, + "time_per_iteration": 2.7170045375823975 + }, + { + "auxiliary_loss_clip": 0.01050528, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.01449978, + "balance_loss_mlp": 1.0156827, + "epoch": 0.8643920036073952, + "flos": 19608513667200.0, + "grad_norm": 1.4370469823802037, + "language_loss": 0.67760539, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.69847685, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 14377, + "time_per_iteration": 2.403916120529175 + }, + { + "auxiliary_loss_clip": 0.01051711, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.01263356, + "balance_loss_mlp": 1.01576114, + "epoch": 0.8644521268600631, + "flos": 20849291942400.0, + "grad_norm": 1.6009919638063392, + "language_loss": 0.718279, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.7391575, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 14378, + "time_per_iteration": 2.3924341201782227 + }, + { + "auxiliary_loss_clip": 0.01007542, + "auxiliary_loss_mlp": 0.01003609, + "balance_loss_clip": 1.00135636, + "balance_loss_mlp": 1.00093508, + "epoch": 0.8645122501127311, + "flos": 66716016203520.0, + "grad_norm": 0.8038415321168, + "language_loss": 0.60323489, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62334645, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.06640625, + "step": 14379, + "time_per_iteration": 3.0264432430267334 + }, + { + "auxiliary_loss_clip": 0.01051591, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.015275, + "balance_loss_mlp": 1.01614761, + "epoch": 0.864572373365399, + "flos": 21688918162560.0, + "grad_norm": 1.707260192037767, + "language_loss": 0.75568128, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.7765702, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 14380, + "time_per_iteration": 3.7975189685821533 + }, + { + "auxiliary_loss_clip": 0.01053147, + "auxiliary_loss_mlp": 0.01041017, + "balance_loss_clip": 1.01871324, + "balance_loss_mlp": 1.01673937, + "epoch": 0.8646324966180671, + "flos": 20265369586560.0, + "grad_norm": 3.7721776289491857, + "language_loss": 0.76114023, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.78208184, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36328125, + "step": 14381, + "time_per_iteration": 3.6527624130249023 + }, + { + "auxiliary_loss_clip": 0.01050319, + "auxiliary_loss_mlp": 0.01036139, + "balance_loss_clip": 1.01490808, + "balance_loss_mlp": 1.01609433, + "epoch": 0.864692619870735, + "flos": 11945427350400.0, + "grad_norm": 2.226706374907213, + "language_loss": 0.85747313, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.87833768, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 14382, + "time_per_iteration": 2.354745864868164 + }, + { + "auxiliary_loss_clip": 0.01053107, + "auxiliary_loss_mlp": 0.01039356, + "balance_loss_clip": 1.01526403, + "balance_loss_mlp": 1.01624107, + "epoch": 0.864752743123403, + "flos": 21469628712960.0, + "grad_norm": 1.6667954444799133, + "language_loss": 0.77066052, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.79158521, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 14383, + "time_per_iteration": 2.3565421104431152 + }, + { + "auxiliary_loss_clip": 0.01051655, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.01342332, + "balance_loss_mlp": 1.0164783, + "epoch": 0.8648128663760709, + "flos": 19529191324800.0, + "grad_norm": 1.869589342949668, + "language_loss": 0.86133265, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.8822155, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14384, + "time_per_iteration": 2.389897584915161 + }, + { + "auxiliary_loss_clip": 0.01050808, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.01470077, + "balance_loss_mlp": 1.01608753, + "epoch": 0.8648729896287389, + "flos": 21286893323520.0, + "grad_norm": 1.7868591558117337, + "language_loss": 0.81718117, + "learning_rate": 1.884236463176072e-07, + "loss": 0.83805633, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 14385, + "time_per_iteration": 2.359178304672241 + }, + { + "auxiliary_loss_clip": 0.01054765, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.0143218, + "balance_loss_mlp": 1.01774347, + "epoch": 0.8649331128814068, + "flos": 24603432883200.0, + "grad_norm": 2.4094531132280257, + "language_loss": 0.73892832, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.75985199, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37109375, + "step": 14386, + "time_per_iteration": 2.426994800567627 + }, + { + "auxiliary_loss_clip": 0.0105088, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.0151403, + "balance_loss_mlp": 1.01556253, + "epoch": 0.8649932361340749, + "flos": 15376900705920.0, + "grad_norm": 2.068668149610385, + "language_loss": 0.83156645, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.852449, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 14387, + "time_per_iteration": 2.3287112712860107 + }, + { + "auxiliary_loss_clip": 0.01048785, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.01487589, + "balance_loss_mlp": 1.01562548, + "epoch": 0.8650533593867428, + "flos": 19900213009920.0, + "grad_norm": 2.0065247593543645, + "language_loss": 0.69954193, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.72039968, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33203125, + "step": 14388, + "time_per_iteration": 2.349789619445801 + }, + { + "auxiliary_loss_clip": 0.01049068, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.01295233, + "balance_loss_mlp": 1.01549351, + "epoch": 0.8651134826394108, + "flos": 25625829404160.0, + "grad_norm": 2.162655054502081, + "language_loss": 0.91206992, + "learning_rate": 1.877640883285283e-07, + "loss": 0.93289161, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.3359375, + "step": 14389, + "time_per_iteration": 2.4075024127960205 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.0123502, + "balance_loss_mlp": 1.01581693, + "epoch": 0.8651736058920788, + "flos": 18733520373120.0, + "grad_norm": 1.4856391960708895, + "language_loss": 0.72011459, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.74096143, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 14390, + "time_per_iteration": 2.369751453399658 + }, + { + "auxiliary_loss_clip": 0.01052255, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.02074957, + "balance_loss_mlp": 1.01562285, + "epoch": 0.8652337291447467, + "flos": 20775729974400.0, + "grad_norm": 1.730005687610173, + "language_loss": 0.82822776, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84918809, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 14391, + "time_per_iteration": 2.3752269744873047 + }, + { + "auxiliary_loss_clip": 0.01007018, + "auxiliary_loss_mlp": 0.01003983, + "balance_loss_clip": 1.00182509, + "balance_loss_mlp": 1.00064504, + "epoch": 0.8652938523974147, + "flos": 64224719383680.0, + "grad_norm": 0.8628401988850583, + "language_loss": 0.68091345, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70102346, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06347656, + "step": 14392, + "time_per_iteration": 2.9022059440612793 + }, + { + "auxiliary_loss_clip": 0.01053554, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.01194334, + "balance_loss_mlp": 1.01569998, + "epoch": 0.8653539756500827, + "flos": 18039621634560.0, + "grad_norm": 2.071500595190967, + "language_loss": 0.76634955, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.78724855, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 14393, + "time_per_iteration": 2.3607017993927 + }, + { + "auxiliary_loss_clip": 0.01051799, + "auxiliary_loss_mlp": 0.01041451, + "balance_loss_clip": 1.01790738, + "balance_loss_mlp": 1.01525664, + "epoch": 0.8654140989027507, + "flos": 17381508906240.0, + "grad_norm": 1.8099869682145857, + "language_loss": 0.76135468, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.78228718, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 14394, + "time_per_iteration": 3.7599117755889893 + }, + { + "auxiliary_loss_clip": 0.01053142, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.01269603, + "balance_loss_mlp": 1.01582539, + "epoch": 0.8654742221554186, + "flos": 53282941153920.0, + "grad_norm": 2.060777797731929, + "language_loss": 0.67130232, + "learning_rate": 1.867768130747036e-07, + "loss": 0.69220579, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37304688, + "step": 14395, + "time_per_iteration": 2.67030668258667 + }, + { + "auxiliary_loss_clip": 0.01052287, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.01103795, + "balance_loss_mlp": 1.01616418, + "epoch": 0.8655343454080866, + "flos": 23913583862400.0, + "grad_norm": 1.82963079221799, + "language_loss": 0.68918121, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.71004319, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 14396, + "time_per_iteration": 2.3892412185668945 + }, + { + "auxiliary_loss_clip": 0.01053584, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.01550865, + "balance_loss_mlp": 1.01651955, + "epoch": 0.8655944686607545, + "flos": 24096074872320.0, + "grad_norm": 2.495892751588844, + "language_loss": 0.70968735, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.73061156, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 14397, + "time_per_iteration": 2.380506753921509 + }, + { + "auxiliary_loss_clip": 0.01050429, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.01767373, + "balance_loss_mlp": 1.01532412, + "epoch": 0.8656545919134225, + "flos": 23111593954560.0, + "grad_norm": 1.6536543215745072, + "language_loss": 0.6430977, + "learning_rate": 1.86284103591253e-07, + "loss": 0.66400599, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 14398, + "time_per_iteration": 2.3721985816955566 + }, + { + "auxiliary_loss_clip": 0.01051443, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.0132513, + "balance_loss_mlp": 1.01669979, + "epoch": 0.8657147151660904, + "flos": 21140711994240.0, + "grad_norm": 2.119871165372035, + "language_loss": 0.77818179, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.79905093, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 14399, + "time_per_iteration": 2.361816644668579 + }, + { + "auxiliary_loss_clip": 0.01050696, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.01341522, + "balance_loss_mlp": 1.01528633, + "epoch": 0.8657748384187585, + "flos": 16288517882880.0, + "grad_norm": 1.8588538403020105, + "language_loss": 0.94026971, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.96112955, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 14400, + "time_per_iteration": 2.341435670852661 + }, + { + "auxiliary_loss_clip": 0.01053545, + "auxiliary_loss_mlp": 0.01039816, + "balance_loss_clip": 1.01785803, + "balance_loss_mlp": 1.01708591, + "epoch": 0.8658349616714264, + "flos": 30842656421760.0, + "grad_norm": 3.068645084377434, + "language_loss": 0.68208051, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.70301414, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36523438, + "step": 14401, + "time_per_iteration": 2.447643280029297 + }, + { + "auxiliary_loss_clip": 0.01052571, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.01203465, + "balance_loss_mlp": 1.01624489, + "epoch": 0.8658950849240944, + "flos": 18951867216000.0, + "grad_norm": 7.024235610908877, + "language_loss": 0.7531473, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.77403724, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 14402, + "time_per_iteration": 2.4042270183563232 + }, + { + "auxiliary_loss_clip": 0.01050233, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.01482224, + "balance_loss_mlp": 1.01544046, + "epoch": 0.8659552081767624, + "flos": 23363317923840.0, + "grad_norm": 1.705171018002368, + "language_loss": 0.76789141, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.78876483, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 14403, + "time_per_iteration": 2.377317190170288 + }, + { + "auxiliary_loss_clip": 0.01053144, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.00961649, + "balance_loss_mlp": 1.0163691, + "epoch": 0.8660153314294303, + "flos": 23840859767040.0, + "grad_norm": 1.794015014646919, + "language_loss": 0.7434516, + "learning_rate": 1.853005417520368e-07, + "loss": 0.76432729, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 14404, + "time_per_iteration": 2.4152848720550537 + }, + { + "auxiliary_loss_clip": 0.01051017, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.01186001, + "balance_loss_mlp": 1.01630807, + "epoch": 0.8660754546820983, + "flos": 23111349575040.0, + "grad_norm": 1.6507702660645553, + "language_loss": 0.72616911, + "learning_rate": 1.851368555901447e-07, + "loss": 0.74702525, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14405, + "time_per_iteration": 2.360077381134033 + }, + { + "auxiliary_loss_clip": 0.01052882, + "auxiliary_loss_mlp": 0.01041312, + "balance_loss_clip": 1.01730371, + "balance_loss_mlp": 1.01667666, + "epoch": 0.8661355779347663, + "flos": 14391128067840.0, + "grad_norm": 2.2623503851460907, + "language_loss": 0.67973387, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.70067585, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 14406, + "time_per_iteration": 2.3535852432250977 + }, + { + "auxiliary_loss_clip": 0.0105086, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.01113546, + "balance_loss_mlp": 1.01559234, + "epoch": 0.8661957011874343, + "flos": 21869105022720.0, + "grad_norm": 1.7634541150599348, + "language_loss": 0.84436381, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.86519527, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35351562, + "step": 14407, + "time_per_iteration": 2.367490768432617 + }, + { + "auxiliary_loss_clip": 0.01051043, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.01682734, + "balance_loss_mlp": 1.01588809, + "epoch": 0.8662558244401022, + "flos": 21834087062400.0, + "grad_norm": 1.8713919160069212, + "language_loss": 0.71585619, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.73676789, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14408, + "time_per_iteration": 2.379157066345215 + }, + { + "auxiliary_loss_clip": 0.01048091, + "auxiliary_loss_mlp": 0.01037957, + "balance_loss_clip": 1.01684475, + "balance_loss_mlp": 1.01480699, + "epoch": 0.8663159476927702, + "flos": 17383149740160.0, + "grad_norm": 4.607523568653544, + "language_loss": 0.78163075, + "learning_rate": 1.844827992025304e-07, + "loss": 0.80249131, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33203125, + "step": 14409, + "time_per_iteration": 2.3353817462921143 + }, + { + "auxiliary_loss_clip": 0.0105469, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.01198912, + "balance_loss_mlp": 1.01688671, + "epoch": 0.8663760709454381, + "flos": 22746611934720.0, + "grad_norm": 1.7686131202826405, + "language_loss": 0.7830891, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.80399787, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37890625, + "step": 14410, + "time_per_iteration": 2.3880598545074463 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.01840699, + "balance_loss_mlp": 1.01571, + "epoch": 0.8664361941981061, + "flos": 17376097645440.0, + "grad_norm": 3.058786885765909, + "language_loss": 0.78603214, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.80696344, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 14411, + "time_per_iteration": 3.53472638130188 + }, + { + "auxiliary_loss_clip": 0.01049385, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.01666641, + "balance_loss_mlp": 1.01530814, + "epoch": 0.866496317450774, + "flos": 16033512245760.0, + "grad_norm": 1.6339132344309752, + "language_loss": 0.74860048, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.76946801, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34179688, + "step": 14412, + "time_per_iteration": 2.3892323970794678 + }, + { + "auxiliary_loss_clip": 0.01047502, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.01021862, + "balance_loss_mlp": 1.0145421, + "epoch": 0.8665564407034421, + "flos": 20813750311680.0, + "grad_norm": 1.6685640436507865, + "language_loss": 0.70674813, + "learning_rate": 1.83829844328371e-07, + "loss": 0.72751617, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.33007812, + "step": 14413, + "time_per_iteration": 2.367943286895752 + }, + { + "auxiliary_loss_clip": 0.01051068, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.01747358, + "balance_loss_mlp": 1.01612329, + "epoch": 0.86661656395611, + "flos": 15814257707520.0, + "grad_norm": 2.022487475375983, + "language_loss": 0.63735592, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65826535, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34960938, + "step": 14414, + "time_per_iteration": 2.4305784702301025 + }, + { + "auxiliary_loss_clip": 0.01052466, + "auxiliary_loss_mlp": 0.01039151, + "balance_loss_clip": 1.01721621, + "balance_loss_mlp": 1.01660895, + "epoch": 0.866676687208778, + "flos": 23035867482240.0, + "grad_norm": 1.7742663599636839, + "language_loss": 0.64442754, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.66534376, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.359375, + "step": 14415, + "time_per_iteration": 2.392693519592285 + }, + { + "auxiliary_loss_clip": 0.01007711, + "auxiliary_loss_mlp": 0.01002493, + "balance_loss_clip": 1.00047863, + "balance_loss_mlp": 1.00120735, + "epoch": 0.866736810461446, + "flos": 63798778396800.0, + "grad_norm": 0.7952035184378787, + "language_loss": 0.60459179, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62469381, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.06542969, + "step": 14416, + "time_per_iteration": 3.1082167625427246 + }, + { + "auxiliary_loss_clip": 0.01053715, + "auxiliary_loss_mlp": 0.01038531, + "balance_loss_clip": 1.0128895, + "balance_loss_mlp": 1.01580429, + "epoch": 0.8667969337141139, + "flos": 20448314444160.0, + "grad_norm": 1.6381203733142533, + "language_loss": 0.76190567, + "learning_rate": 1.831779913638285e-07, + "loss": 0.78282809, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 14417, + "time_per_iteration": 2.3641412258148193 + }, + { + "auxiliary_loss_clip": 0.01050511, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.01327181, + "balance_loss_mlp": 1.01556015, + "epoch": 0.866857056966782, + "flos": 21652608481920.0, + "grad_norm": 1.8006581663878356, + "language_loss": 0.76299393, + "learning_rate": 1.830152003424319e-07, + "loss": 0.78385937, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34960938, + "step": 14418, + "time_per_iteration": 2.4161219596862793 + }, + { + "auxiliary_loss_clip": 0.01049106, + "auxiliary_loss_mlp": 0.01038762, + "balance_loss_clip": 1.01703048, + "balance_loss_mlp": 1.0142715, + "epoch": 0.8669171802194499, + "flos": 22851840372480.0, + "grad_norm": 1.4525876246499663, + "language_loss": 0.68668652, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70756519, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 14419, + "time_per_iteration": 3.8454902172088623 + }, + { + "auxiliary_loss_clip": 0.01051181, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.01372814, + "balance_loss_mlp": 1.01570344, + "epoch": 0.8669773034721179, + "flos": 18733171259520.0, + "grad_norm": 1.658256954937257, + "language_loss": 0.79460526, + "learning_rate": 1.826898250065465e-07, + "loss": 0.81547678, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14420, + "time_per_iteration": 2.363663911819458 + }, + { + "auxiliary_loss_clip": 0.01050323, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.01623821, + "balance_loss_mlp": 1.01631415, + "epoch": 0.8670374267247858, + "flos": 18915033864960.0, + "grad_norm": 1.4537718602653396, + "language_loss": 0.83897996, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85986316, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.33984375, + "step": 14421, + "time_per_iteration": 3.7353038787841797 + }, + { + "auxiliary_loss_clip": 0.01007271, + "auxiliary_loss_mlp": 0.01003496, + "balance_loss_clip": 1.0013268, + "balance_loss_mlp": 1.00086844, + "epoch": 0.8670975499774538, + "flos": 48811227840000.0, + "grad_norm": 0.7863018082135329, + "language_loss": 0.49235007, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51245773, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.06396484, + "step": 14422, + "time_per_iteration": 3.04862642288208 + }, + { + "auxiliary_loss_clip": 0.01050754, + "auxiliary_loss_mlp": 0.01038854, + "balance_loss_clip": 1.01716971, + "balance_loss_mlp": 1.01593828, + "epoch": 0.8671576732301217, + "flos": 26135072628480.0, + "grad_norm": 2.1788932053276024, + "language_loss": 0.74472761, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.76562369, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 14423, + "time_per_iteration": 2.4050352573394775 + }, + { + "auxiliary_loss_clip": 0.01048005, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.01052046, + "balance_loss_mlp": 1.0144403, + "epoch": 0.8672177964827897, + "flos": 18366513494400.0, + "grad_norm": 1.58939400368069, + "language_loss": 0.77634406, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.79712582, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.3359375, + "step": 14424, + "time_per_iteration": 2.3670294284820557 + }, + { + "auxiliary_loss_clip": 0.01048521, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.01490867, + "balance_loss_mlp": 1.01528001, + "epoch": 0.8672779197354576, + "flos": 28544184374400.0, + "grad_norm": 1.7764334806323139, + "language_loss": 0.7202909, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.74113679, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33203125, + "step": 14425, + "time_per_iteration": 2.439204216003418 + }, + { + "auxiliary_loss_clip": 0.0105399, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.01708591, + "balance_loss_mlp": 1.01685369, + "epoch": 0.8673380429881257, + "flos": 22381385535360.0, + "grad_norm": 1.4706355324244202, + "language_loss": 0.6903196, + "learning_rate": 1.817153530980926e-07, + "loss": 0.71127558, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37109375, + "step": 14426, + "time_per_iteration": 2.4000675678253174 + }, + { + "auxiliary_loss_clip": 0.01053567, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.01230907, + "balance_loss_mlp": 1.01638472, + "epoch": 0.8673981662407936, + "flos": 20995368537600.0, + "grad_norm": 1.720471478332221, + "language_loss": 0.71173418, + "learning_rate": 1.815531824008234e-07, + "loss": 0.73263264, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37304688, + "step": 14427, + "time_per_iteration": 2.3671512603759766 + }, + { + "auxiliary_loss_clip": 0.01052061, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.01237226, + "balance_loss_mlp": 1.01729631, + "epoch": 0.8674582894934616, + "flos": 24425619995520.0, + "grad_norm": 1.5745276244735862, + "language_loss": 0.6892997, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.71016836, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 14428, + "time_per_iteration": 2.4152557849884033 + }, + { + "auxiliary_loss_clip": 0.01051072, + "auxiliary_loss_mlp": 0.01036357, + "balance_loss_clip": 1.01563859, + "balance_loss_mlp": 1.01599622, + "epoch": 0.8675184127461296, + "flos": 20736557562240.0, + "grad_norm": 2.7347542750204332, + "language_loss": 0.71543837, + "learning_rate": 1.812290478794889e-07, + "loss": 0.73631263, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3515625, + "step": 14429, + "time_per_iteration": 2.38466477394104 + }, + { + "auxiliary_loss_clip": 0.01051669, + "auxiliary_loss_mlp": 0.01035094, + "balance_loss_clip": 1.01168108, + "balance_loss_mlp": 1.01558828, + "epoch": 0.8675785359987975, + "flos": 19134637516800.0, + "grad_norm": 2.0855148343500876, + "language_loss": 0.68227589, + "learning_rate": 1.810670840677151e-07, + "loss": 0.70314348, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36132812, + "step": 14430, + "time_per_iteration": 2.348679542541504 + }, + { + "auxiliary_loss_clip": 0.01053568, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.01724386, + "balance_loss_mlp": 1.01619172, + "epoch": 0.8676386592514655, + "flos": 22709569115520.0, + "grad_norm": 1.8197723765073757, + "language_loss": 0.70474076, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.72568905, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37304688, + "step": 14431, + "time_per_iteration": 2.3773789405822754 + }, + { + "auxiliary_loss_clip": 0.01052522, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.01276672, + "balance_loss_mlp": 1.01659822, + "epoch": 0.8676987825041335, + "flos": 14208986171520.0, + "grad_norm": 2.851282654389925, + "language_loss": 0.64794946, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.66882741, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 14432, + "time_per_iteration": 2.3374879360198975 + }, + { + "auxiliary_loss_clip": 0.01052054, + "auxiliary_loss_mlp": 0.01036839, + "balance_loss_clip": 1.01378429, + "balance_loss_mlp": 1.01602423, + "epoch": 0.8677589057568015, + "flos": 13589068337280.0, + "grad_norm": 2.072126781277754, + "language_loss": 0.79673481, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.81762373, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 14433, + "time_per_iteration": 2.392008066177368 + }, + { + "auxiliary_loss_clip": 0.010079, + "auxiliary_loss_mlp": 0.01002924, + "balance_loss_clip": 1.00090969, + "balance_loss_mlp": 1.00143969, + "epoch": 0.8678190290094694, + "flos": 68930383052160.0, + "grad_norm": 0.7064654112680087, + "language_loss": 0.58629304, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60640132, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.06445312, + "step": 14434, + "time_per_iteration": 4.567859649658203 + }, + { + "auxiliary_loss_clip": 0.01049316, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.01193571, + "balance_loss_mlp": 1.01519513, + "epoch": 0.8678791522621374, + "flos": 32556472974720.0, + "grad_norm": 1.7541212996513071, + "language_loss": 0.80602515, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82683849, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 14435, + "time_per_iteration": 2.4611613750457764 + }, + { + "auxiliary_loss_clip": 0.01050317, + "auxiliary_loss_mlp": 0.0103664, + "balance_loss_clip": 1.01205921, + "balance_loss_mlp": 1.01459384, + "epoch": 0.8679392755148053, + "flos": 35041206458880.0, + "grad_norm": 1.9115427732680266, + "language_loss": 0.63658369, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.6574533, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 14436, + "time_per_iteration": 2.4748575687408447 + }, + { + "auxiliary_loss_clip": 0.01051616, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.01267338, + "balance_loss_mlp": 1.01567459, + "epoch": 0.8679993987674733, + "flos": 18551483210880.0, + "grad_norm": 1.9427905691954463, + "language_loss": 0.71655893, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.73744535, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 14437, + "time_per_iteration": 2.3624138832092285 + }, + { + "auxiliary_loss_clip": 0.01051034, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.01196766, + "balance_loss_mlp": 1.01595938, + "epoch": 0.8680595220201412, + "flos": 27453148387200.0, + "grad_norm": 1.9693064498905066, + "language_loss": 0.82057613, + "learning_rate": 1.797738571571381e-07, + "loss": 0.8414318, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 14438, + "time_per_iteration": 2.412100076675415 + }, + { + "auxiliary_loss_clip": 0.0104801, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.012362, + "balance_loss_mlp": 1.01486444, + "epoch": 0.8681196452728093, + "flos": 19207780548480.0, + "grad_norm": 1.8655223814872073, + "language_loss": 0.68529022, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.70610851, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33203125, + "step": 14439, + "time_per_iteration": 2.405155897140503 + }, + { + "auxiliary_loss_clip": 0.01049645, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.01780617, + "balance_loss_mlp": 1.0155679, + "epoch": 0.8681797685254772, + "flos": 37558933044480.0, + "grad_norm": 1.579482142597671, + "language_loss": 0.65052712, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.67139959, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.33984375, + "step": 14440, + "time_per_iteration": 2.5525436401367188 + }, + { + "auxiliary_loss_clip": 0.01048789, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.01345026, + "balance_loss_mlp": 1.01561809, + "epoch": 0.8682398917781452, + "flos": 23288952994560.0, + "grad_norm": 1.5864778673107185, + "language_loss": 0.66462296, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68545723, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33203125, + "step": 14441, + "time_per_iteration": 2.414846181869507 + }, + { + "auxiliary_loss_clip": 0.01048139, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.01035953, + "balance_loss_mlp": 1.01475143, + "epoch": 0.8683000150308132, + "flos": 21871688463360.0, + "grad_norm": 1.5129986953260706, + "language_loss": 0.66906655, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.6898551, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33398438, + "step": 14442, + "time_per_iteration": 2.3914287090301514 + }, + { + "auxiliary_loss_clip": 0.01052985, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.00887752, + "balance_loss_mlp": 1.01600635, + "epoch": 0.8683601382834811, + "flos": 14646343173120.0, + "grad_norm": 1.795210209118752, + "language_loss": 0.73153412, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.75240958, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37109375, + "step": 14443, + "time_per_iteration": 2.375741481781006 + }, + { + "auxiliary_loss_clip": 0.01051288, + "auxiliary_loss_mlp": 0.01037773, + "balance_loss_clip": 1.01468205, + "balance_loss_mlp": 1.01554728, + "epoch": 0.8684202615361492, + "flos": 26358691086720.0, + "grad_norm": 1.7248452203181228, + "language_loss": 0.84083539, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.86172599, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 14444, + "time_per_iteration": 2.432027578353882 + }, + { + "auxiliary_loss_clip": 0.01052421, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.01354527, + "balance_loss_mlp": 1.01605988, + "epoch": 0.8684803847888171, + "flos": 20702970967680.0, + "grad_norm": 1.9117899248915144, + "language_loss": 0.78204179, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.80292881, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 14445, + "time_per_iteration": 2.380584478378296 + }, + { + "auxiliary_loss_clip": 0.01051294, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.01223826, + "balance_loss_mlp": 1.0161109, + "epoch": 0.8685405080414851, + "flos": 22637019576960.0, + "grad_norm": 2.0183425071375516, + "language_loss": 0.68651801, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.70737618, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 14446, + "time_per_iteration": 2.3673148155212402 + }, + { + "auxiliary_loss_clip": 0.01050737, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.01185453, + "balance_loss_mlp": 1.01601982, + "epoch": 0.868600631294153, + "flos": 24821046587520.0, + "grad_norm": 3.5811218468262975, + "language_loss": 0.83564538, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.8564893, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34765625, + "step": 14447, + "time_per_iteration": 2.401456594467163 + }, + { + "auxiliary_loss_clip": 0.010511, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.00959206, + "balance_loss_mlp": 1.01524067, + "epoch": 0.868660754546821, + "flos": 25112955398400.0, + "grad_norm": 1.627511418538142, + "language_loss": 0.74921435, + "learning_rate": 1.781635359686515e-07, + "loss": 0.7700392, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.359375, + "step": 14448, + "time_per_iteration": 2.3894741535186768 + }, + { + "auxiliary_loss_clip": 0.01052124, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.01475525, + "balance_loss_mlp": 1.01640427, + "epoch": 0.8687208777994889, + "flos": 12676997312640.0, + "grad_norm": 1.837328683434655, + "language_loss": 0.81810379, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.8390075, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35742188, + "step": 14449, + "time_per_iteration": 2.342402458190918 + }, + { + "auxiliary_loss_clip": 0.01007504, + "auxiliary_loss_mlp": 0.01002702, + "balance_loss_clip": 1.00055635, + "balance_loss_mlp": 1.00111389, + "epoch": 0.8687810010521569, + "flos": 65613948226560.0, + "grad_norm": 0.8060752102195506, + "language_loss": 0.60662937, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.6267314, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06396484, + "step": 14450, + "time_per_iteration": 4.334201335906982 + }, + { + "auxiliary_loss_clip": 0.01052669, + "auxiliary_loss_mlp": 0.01042182, + "balance_loss_clip": 1.01851881, + "balance_loss_mlp": 1.01640177, + "epoch": 0.8688411243048249, + "flos": 24242849694720.0, + "grad_norm": 1.665231092104808, + "language_loss": 0.77154803, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.79249656, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 14451, + "time_per_iteration": 2.3821632862091064 + }, + { + "auxiliary_loss_clip": 0.01050451, + "auxiliary_loss_mlp": 0.01037571, + "balance_loss_clip": 1.01443207, + "balance_loss_mlp": 1.01563513, + "epoch": 0.8689012475574929, + "flos": 18220890746880.0, + "grad_norm": 2.3118632568996795, + "language_loss": 0.73174465, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.75262487, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 14452, + "time_per_iteration": 2.322871208190918 + }, + { + "auxiliary_loss_clip": 0.01052745, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.01336479, + "balance_loss_mlp": 1.0163703, + "epoch": 0.8689613708101608, + "flos": 19645696131840.0, + "grad_norm": 3.2578683686783396, + "language_loss": 0.73253429, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.75343752, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 14453, + "time_per_iteration": 2.3865013122558594 + }, + { + "auxiliary_loss_clip": 0.01051441, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.0145694, + "balance_loss_mlp": 1.01679325, + "epoch": 0.8690214940628288, + "flos": 11727953291520.0, + "grad_norm": 2.113175952742342, + "language_loss": 0.75225008, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.77313614, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14454, + "time_per_iteration": 2.3500101566314697 + }, + { + "auxiliary_loss_clip": 0.01051902, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.0109725, + "balance_loss_mlp": 1.01703608, + "epoch": 0.8690816173154968, + "flos": 34934930680320.0, + "grad_norm": 1.7912316901110124, + "language_loss": 0.60556537, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.62642872, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 14455, + "time_per_iteration": 2.507681131362915 + }, + { + "auxiliary_loss_clip": 0.01051523, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.01244092, + "balance_loss_mlp": 1.01581407, + "epoch": 0.8691417405681647, + "flos": 11614136659200.0, + "grad_norm": 2.450685008244239, + "language_loss": 0.81517994, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.83603692, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35742188, + "step": 14456, + "time_per_iteration": 2.4141736030578613 + }, + { + "auxiliary_loss_clip": 0.01054681, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.01280785, + "balance_loss_mlp": 1.01712883, + "epoch": 0.8692018638208328, + "flos": 24606889107840.0, + "grad_norm": 2.0119713695527803, + "language_loss": 0.76657146, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.7875064, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 14457, + "time_per_iteration": 2.406972646713257 + }, + { + "auxiliary_loss_clip": 0.01049226, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.01735115, + "balance_loss_mlp": 1.01541305, + "epoch": 0.8692619870735007, + "flos": 25993918535040.0, + "grad_norm": 1.6818097507220557, + "language_loss": 0.79564953, + "learning_rate": 1.765601232001328e-07, + "loss": 0.81651723, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.33789062, + "step": 14458, + "time_per_iteration": 2.412250280380249 + }, + { + "auxiliary_loss_clip": 0.01050666, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.01395321, + "balance_loss_mlp": 1.01532936, + "epoch": 0.8693221103261687, + "flos": 18040808620800.0, + "grad_norm": 1.790909351944964, + "language_loss": 0.71755838, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73845601, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35351562, + "step": 14459, + "time_per_iteration": 3.7746572494506836 + }, + { + "auxiliary_loss_clip": 0.01047852, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.01564503, + "balance_loss_mlp": 1.0154531, + "epoch": 0.8693822335788366, + "flos": 27491063990400.0, + "grad_norm": 1.3897437694234784, + "language_loss": 0.74882376, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76964426, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.32421875, + "step": 14460, + "time_per_iteration": 3.8905177116394043 + }, + { + "auxiliary_loss_clip": 0.01053225, + "auxiliary_loss_mlp": 0.01042052, + "balance_loss_clip": 1.01596904, + "balance_loss_mlp": 1.01631093, + "epoch": 0.8694423568315046, + "flos": 24096563631360.0, + "grad_norm": 1.9965178057843482, + "language_loss": 0.66299862, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.68395138, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36914062, + "step": 14461, + "time_per_iteration": 2.4059970378875732 + }, + { + "auxiliary_loss_clip": 0.01052054, + "auxiliary_loss_mlp": 0.01039752, + "balance_loss_clip": 1.01539755, + "balance_loss_mlp": 1.0157187, + "epoch": 0.8695024800841725, + "flos": 18361346613120.0, + "grad_norm": 1.968614735224122, + "language_loss": 0.83271682, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.85363489, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 14462, + "time_per_iteration": 2.35707950592041 + }, + { + "auxiliary_loss_clip": 0.01051052, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.01272547, + "balance_loss_mlp": 1.01529264, + "epoch": 0.8695626033368405, + "flos": 14026879186560.0, + "grad_norm": 1.6442196783916103, + "language_loss": 0.65778005, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67864907, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 14463, + "time_per_iteration": 2.347097396850586 + }, + { + "auxiliary_loss_clip": 0.01055588, + "auxiliary_loss_mlp": 0.01042197, + "balance_loss_clip": 1.01670969, + "balance_loss_mlp": 1.01767278, + "epoch": 0.8696227265895085, + "flos": 16835921089920.0, + "grad_norm": 2.4967587124308617, + "language_loss": 0.67551196, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.69648981, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 14464, + "time_per_iteration": 2.349292039871216 + }, + { + "auxiliary_loss_clip": 0.01054044, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.01975131, + "balance_loss_mlp": 1.0164063, + "epoch": 0.8696828498421765, + "flos": 21797986849920.0, + "grad_norm": 2.0684708220169044, + "language_loss": 0.63398784, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.65497619, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 14465, + "time_per_iteration": 2.3800735473632812 + }, + { + "auxiliary_loss_clip": 0.01047876, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.01356494, + "balance_loss_mlp": 1.01501691, + "epoch": 0.8697429730948444, + "flos": 22893666048000.0, + "grad_norm": 1.4265899809777687, + "language_loss": 0.85213065, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.87294763, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.328125, + "step": 14466, + "time_per_iteration": 2.396258592605591 + }, + { + "auxiliary_loss_clip": 0.01054522, + "auxiliary_loss_mlp": 0.01045284, + "balance_loss_clip": 1.0182476, + "balance_loss_mlp": 1.01688814, + "epoch": 0.8698030963475124, + "flos": 24716306908800.0, + "grad_norm": 2.989678666768689, + "language_loss": 0.63806611, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.65906417, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37695312, + "step": 14467, + "time_per_iteration": 2.3986077308654785 + }, + { + "auxiliary_loss_clip": 0.01048712, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.0133462, + "balance_loss_mlp": 1.01577568, + "epoch": 0.8698632196001803, + "flos": 28440876061440.0, + "grad_norm": 1.3958473326239322, + "language_loss": 0.69412422, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71494776, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33007812, + "step": 14468, + "time_per_iteration": 2.476921319961548 + }, + { + "auxiliary_loss_clip": 0.01048431, + "auxiliary_loss_mlp": 0.01041413, + "balance_loss_clip": 1.01958597, + "balance_loss_mlp": 1.0143137, + "epoch": 0.8699233428528483, + "flos": 27635220460800.0, + "grad_norm": 1.5886291915105746, + "language_loss": 0.7165466, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.737445, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 14469, + "time_per_iteration": 2.4104373455047607 + }, + { + "auxiliary_loss_clip": 0.01048863, + "auxiliary_loss_mlp": 0.01031926, + "balance_loss_clip": 1.01146984, + "balance_loss_mlp": 1.01585186, + "epoch": 0.8699834661055164, + "flos": 20044683682560.0, + "grad_norm": 2.058547498557045, + "language_loss": 0.84875453, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86956239, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.328125, + "step": 14470, + "time_per_iteration": 2.3831186294555664 + }, + { + "auxiliary_loss_clip": 0.01052815, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.01122212, + "balance_loss_mlp": 1.01701951, + "epoch": 0.8700435893581843, + "flos": 23731651434240.0, + "grad_norm": 1.6053008113663312, + "language_loss": 0.73962528, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.76048118, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35742188, + "step": 14471, + "time_per_iteration": 2.3714077472686768 + }, + { + "auxiliary_loss_clip": 0.01050203, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.01129031, + "balance_loss_mlp": 1.01627195, + "epoch": 0.8701037126108523, + "flos": 23547345033600.0, + "grad_norm": 1.5205760103295891, + "language_loss": 0.79933476, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.82015228, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.33984375, + "step": 14472, + "time_per_iteration": 2.422560691833496 + }, + { + "auxiliary_loss_clip": 0.01050686, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.00781655, + "balance_loss_mlp": 1.01540375, + "epoch": 0.8701638358635202, + "flos": 18842449415040.0, + "grad_norm": 2.308038611795501, + "language_loss": 0.73918849, + "learning_rate": 1.741679706279644e-07, + "loss": 0.75999618, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 14473, + "time_per_iteration": 2.337561845779419 + }, + { + "auxiliary_loss_clip": 0.0105229, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.01612055, + "balance_loss_mlp": 1.01635146, + "epoch": 0.8702239591161882, + "flos": 27927094360320.0, + "grad_norm": 1.4834471818587378, + "language_loss": 0.73025572, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.75116396, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 14474, + "time_per_iteration": 3.856506109237671 + }, + { + "auxiliary_loss_clip": 0.01051024, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.01518977, + "balance_loss_mlp": 1.01552248, + "epoch": 0.8702840823688561, + "flos": 17233163072640.0, + "grad_norm": 1.870445442957608, + "language_loss": 0.69389832, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.71480238, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 14475, + "time_per_iteration": 2.3503222465515137 + }, + { + "auxiliary_loss_clip": 0.01050375, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.01314712, + "balance_loss_mlp": 1.01456714, + "epoch": 0.8703442056215241, + "flos": 19426546327680.0, + "grad_norm": 1.7174655084413035, + "language_loss": 0.78662527, + "learning_rate": 1.736914088262349e-07, + "loss": 0.80749685, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 14476, + "time_per_iteration": 2.4334428310394287 + }, + { + "auxiliary_loss_clip": 0.01050059, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.01166487, + "balance_loss_mlp": 1.01559615, + "epoch": 0.8704043288741921, + "flos": 22272735784320.0, + "grad_norm": 1.4319934102124585, + "language_loss": 0.72982097, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.75065565, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 14477, + "time_per_iteration": 2.4092743396759033 + }, + { + "auxiliary_loss_clip": 0.01050825, + "auxiliary_loss_mlp": 0.01037009, + "balance_loss_clip": 1.01382291, + "balance_loss_mlp": 1.01538324, + "epoch": 0.8704644521268601, + "flos": 16647948996480.0, + "grad_norm": 2.05573675346676, + "language_loss": 0.6060279, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.62690622, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35351562, + "step": 14478, + "time_per_iteration": 2.360492706298828 + }, + { + "auxiliary_loss_clip": 0.01051745, + "auxiliary_loss_mlp": 0.01041353, + "balance_loss_clip": 1.02069426, + "balance_loss_mlp": 1.01816034, + "epoch": 0.870524575379528, + "flos": 24279054641280.0, + "grad_norm": 1.6339579079715436, + "language_loss": 0.73044217, + "learning_rate": 1.732154703087323e-07, + "loss": 0.75137311, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3359375, + "step": 14479, + "time_per_iteration": 2.394862651824951 + }, + { + "auxiliary_loss_clip": 0.01050443, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.0138334, + "balance_loss_mlp": 1.015728, + "epoch": 0.870584698632196, + "flos": 28767383896320.0, + "grad_norm": 1.5170201079056684, + "language_loss": 0.72456378, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.74545008, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.34765625, + "step": 14480, + "time_per_iteration": 2.482680082321167 + }, + { + "auxiliary_loss_clip": 0.01052145, + "auxiliary_loss_mlp": 0.01040195, + "balance_loss_clip": 1.01733124, + "balance_loss_mlp": 1.01638448, + "epoch": 0.8706448218848639, + "flos": 32448346894080.0, + "grad_norm": 2.5486233998507024, + "language_loss": 0.71230781, + "learning_rate": 1.728985243129666e-07, + "loss": 0.73323119, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 14481, + "time_per_iteration": 2.445005416870117 + }, + { + "auxiliary_loss_clip": 0.01050126, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.0129329, + "balance_loss_mlp": 1.01566386, + "epoch": 0.8707049451375319, + "flos": 22746891225600.0, + "grad_norm": 1.6071471212135056, + "language_loss": 0.77790958, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79875696, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 14482, + "time_per_iteration": 2.3997795581817627 + }, + { + "auxiliary_loss_clip": 0.0105143, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.01248693, + "balance_loss_mlp": 1.01622081, + "epoch": 0.8707650683902, + "flos": 15851056147200.0, + "grad_norm": 1.677502106281115, + "language_loss": 0.77432203, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.79518366, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 14483, + "time_per_iteration": 2.341221570968628 + }, + { + "auxiliary_loss_clip": 0.01055598, + "auxiliary_loss_mlp": 0.01041658, + "balance_loss_clip": 1.01627827, + "balance_loss_mlp": 1.017133, + "epoch": 0.8708251916428679, + "flos": 16467308288640.0, + "grad_norm": 2.2441600747011923, + "language_loss": 0.62732434, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64829683, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 14484, + "time_per_iteration": 2.4026832580566406 + }, + { + "auxiliary_loss_clip": 0.01050857, + "auxiliary_loss_mlp": 0.0103996, + "balance_loss_clip": 1.01679778, + "balance_loss_mlp": 1.01587522, + "epoch": 0.8708853148955359, + "flos": 15376935617280.0, + "grad_norm": 1.7876971821430463, + "language_loss": 0.69705451, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.71796262, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34960938, + "step": 14485, + "time_per_iteration": 2.343864679336548 + }, + { + "auxiliary_loss_clip": 0.01050309, + "auxiliary_loss_mlp": 0.01036713, + "balance_loss_clip": 1.01436138, + "balance_loss_mlp": 1.01544642, + "epoch": 0.8709454381482038, + "flos": 30550119206400.0, + "grad_norm": 1.8368487178134811, + "language_loss": 0.64403749, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.66490769, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 14486, + "time_per_iteration": 2.4628937244415283 + }, + { + "auxiliary_loss_clip": 0.0105227, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.01523948, + "balance_loss_mlp": 1.01526737, + "epoch": 0.8710055614008718, + "flos": 22600325871360.0, + "grad_norm": 5.221181795989689, + "language_loss": 0.63132209, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.65223753, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 14487, + "time_per_iteration": 2.3619534969329834 + }, + { + "auxiliary_loss_clip": 0.01049032, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.01243448, + "balance_loss_mlp": 1.01481378, + "epoch": 0.8710656846535397, + "flos": 18442135232640.0, + "grad_norm": 2.052719560672066, + "language_loss": 0.69268131, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.71350223, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.34375, + "step": 14488, + "time_per_iteration": 2.3762807846069336 + }, + { + "auxiliary_loss_clip": 0.01051225, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.01322162, + "balance_loss_mlp": 1.01574314, + "epoch": 0.8711258079062077, + "flos": 16503059387520.0, + "grad_norm": 1.9807963629322713, + "language_loss": 0.86101049, + "learning_rate": 1.716335121648338e-07, + "loss": 0.88187623, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 14489, + "time_per_iteration": 2.3453447818756104 + }, + { + "auxiliary_loss_clip": 0.01055431, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.01506817, + "balance_loss_mlp": 1.01692426, + "epoch": 0.8711859311588757, + "flos": 15662595294720.0, + "grad_norm": 2.3086008732504024, + "language_loss": 0.77794933, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.79890233, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38476562, + "step": 14490, + "time_per_iteration": 3.625624418258667 + }, + { + "auxiliary_loss_clip": 0.01055086, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.01172256, + "balance_loss_mlp": 1.01745367, + "epoch": 0.8712460544115437, + "flos": 15556703541120.0, + "grad_norm": 2.007114664812903, + "language_loss": 0.77498358, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.79590589, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37695312, + "step": 14491, + "time_per_iteration": 2.3316125869750977 + }, + { + "auxiliary_loss_clip": 0.01052161, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.00988197, + "balance_loss_mlp": 1.01814306, + "epoch": 0.8713061776642116, + "flos": 16762638412800.0, + "grad_norm": 1.6867502573859026, + "language_loss": 0.67660707, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69744349, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33984375, + "step": 14492, + "time_per_iteration": 2.3669166564941406 + }, + { + "auxiliary_loss_clip": 0.01050428, + "auxiliary_loss_mlp": 0.01035942, + "balance_loss_clip": 1.01426959, + "balance_loss_mlp": 1.01611364, + "epoch": 0.8713663009168796, + "flos": 24278740439040.0, + "grad_norm": 1.820461415491572, + "language_loss": 0.70943332, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.73029709, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 14493, + "time_per_iteration": 2.3752059936523438 + }, + { + "auxiliary_loss_clip": 0.01054156, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.01300573, + "balance_loss_mlp": 1.0179106, + "epoch": 0.8714264241695475, + "flos": 23794739994240.0, + "grad_norm": 2.1534882901387653, + "language_loss": 0.90536761, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.92628729, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 14494, + "time_per_iteration": 2.393378257751465 + }, + { + "auxiliary_loss_clip": 0.01049794, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.01191652, + "balance_loss_mlp": 1.0162375, + "epoch": 0.8714865474222155, + "flos": 37996429691520.0, + "grad_norm": 1.8555462780225984, + "language_loss": 0.6101315, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.63094735, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.3359375, + "step": 14495, + "time_per_iteration": 2.5097391605377197 + }, + { + "auxiliary_loss_clip": 0.01051739, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.01685309, + "balance_loss_mlp": 1.01620626, + "epoch": 0.8715466706748836, + "flos": 22454598389760.0, + "grad_norm": 2.095277928906892, + "language_loss": 0.8206116, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.8415215, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 14496, + "time_per_iteration": 2.3912484645843506 + }, + { + "auxiliary_loss_clip": 0.01051751, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.01629639, + "balance_loss_mlp": 1.01602113, + "epoch": 0.8716067939275515, + "flos": 21214064494080.0, + "grad_norm": 1.9998890409542844, + "language_loss": 0.79575229, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.81667024, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 14497, + "time_per_iteration": 2.3593032360076904 + }, + { + "auxiliary_loss_clip": 0.01053479, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.01292944, + "balance_loss_mlp": 1.01671469, + "epoch": 0.8716669171802195, + "flos": 22996764892800.0, + "grad_norm": 2.0306023453279702, + "language_loss": 0.69066668, + "learning_rate": 1.70215677535406e-07, + "loss": 0.71156418, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 14498, + "time_per_iteration": 3.860302686691284 + }, + { + "auxiliary_loss_clip": 0.01049559, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01301527, + "balance_loss_mlp": 1.01414561, + "epoch": 0.8717270404328874, + "flos": 29782902879360.0, + "grad_norm": 1.5401694129570487, + "language_loss": 0.57816005, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59900773, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14499, + "time_per_iteration": 2.420132875442505 + }, + { + "auxiliary_loss_clip": 0.01051102, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.01134348, + "balance_loss_mlp": 1.01504695, + "epoch": 0.8717871636855554, + "flos": 22017031920000.0, + "grad_norm": 2.0625017157527727, + "language_loss": 0.81390321, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.83477283, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 14500, + "time_per_iteration": 3.6935768127441406 + }, + { + "auxiliary_loss_clip": 0.0105087, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.01198328, + "balance_loss_mlp": 1.01636529, + "epoch": 0.8718472869382233, + "flos": 16653325345920.0, + "grad_norm": 1.9855667166709514, + "language_loss": 0.74026549, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.76111507, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 14501, + "time_per_iteration": 2.4110333919525146 + }, + { + "auxiliary_loss_clip": 0.01055428, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.01618457, + "balance_loss_mlp": 1.01769507, + "epoch": 0.8719074101908914, + "flos": 19494452655360.0, + "grad_norm": 1.9311537528659417, + "language_loss": 0.65361047, + "learning_rate": 1.695873325782482e-07, + "loss": 0.67457014, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37695312, + "step": 14502, + "time_per_iteration": 2.352646827697754 + }, + { + "auxiliary_loss_clip": 0.01051681, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.01454318, + "balance_loss_mlp": 1.01553571, + "epoch": 0.8719675334435593, + "flos": 33069556448640.0, + "grad_norm": 1.7127206568203082, + "language_loss": 0.69185269, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71276063, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 14503, + "time_per_iteration": 2.4910473823547363 + }, + { + "auxiliary_loss_clip": 0.01052784, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.01627719, + "balance_loss_mlp": 1.01734006, + "epoch": 0.8720276566962273, + "flos": 13625412929280.0, + "grad_norm": 4.592120890562074, + "language_loss": 0.70193446, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.72284669, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 14504, + "time_per_iteration": 2.365865468978882 + }, + { + "auxiliary_loss_clip": 0.01052583, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.01123297, + "balance_loss_mlp": 1.01667786, + "epoch": 0.8720877799488952, + "flos": 23513025300480.0, + "grad_norm": 1.7856525028519383, + "language_loss": 0.71443051, + "learning_rate": 1.691168026385552e-07, + "loss": 0.7353003, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 14505, + "time_per_iteration": 2.3962557315826416 + }, + { + "auxiliary_loss_clip": 0.01049628, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.01190889, + "balance_loss_mlp": 1.01549053, + "epoch": 0.8721479032015632, + "flos": 20813086995840.0, + "grad_norm": 1.5191908036521902, + "language_loss": 0.79223692, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.81307101, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 14506, + "time_per_iteration": 2.366384983062744 + }, + { + "auxiliary_loss_clip": 0.01052852, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.0151825, + "balance_loss_mlp": 1.01578021, + "epoch": 0.8722080264542311, + "flos": 19462646540160.0, + "grad_norm": 2.3520222184498745, + "language_loss": 0.74663454, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76756305, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 14507, + "time_per_iteration": 2.398972511291504 + }, + { + "auxiliary_loss_clip": 0.01054852, + "auxiliary_loss_mlp": 0.01039936, + "balance_loss_clip": 1.01535535, + "balance_loss_mlp": 1.01757598, + "epoch": 0.8722681497068991, + "flos": 21760804385280.0, + "grad_norm": 1.9474351656384616, + "language_loss": 0.7391237, + "learning_rate": 1.686468975443156e-07, + "loss": 0.76007164, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 14508, + "time_per_iteration": 2.3806002140045166 + }, + { + "auxiliary_loss_clip": 0.01053356, + "auxiliary_loss_mlp": 0.01040464, + "balance_loss_clip": 1.01640725, + "balance_loss_mlp": 1.01679063, + "epoch": 0.8723282729595672, + "flos": 28875859090560.0, + "grad_norm": 1.7050479592082792, + "language_loss": 0.69515884, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.716097, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 14509, + "time_per_iteration": 2.46449613571167 + }, + { + "auxiliary_loss_clip": 0.010516, + "auxiliary_loss_mlp": 0.01039152, + "balance_loss_clip": 1.01707482, + "balance_loss_mlp": 1.01544428, + "epoch": 0.8723883962122351, + "flos": 26467934330880.0, + "grad_norm": 1.7507779816856157, + "language_loss": 0.59780121, + "learning_rate": 1.683339746970558e-07, + "loss": 0.61870873, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36132812, + "step": 14510, + "time_per_iteration": 2.4332053661346436 + }, + { + "auxiliary_loss_clip": 0.01055904, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.01461506, + "balance_loss_mlp": 1.01713979, + "epoch": 0.8724485194649031, + "flos": 20520445046400.0, + "grad_norm": 2.252989554385775, + "language_loss": 0.69177431, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.71274543, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38671875, + "step": 14511, + "time_per_iteration": 2.38542103767395 + }, + { + "auxiliary_loss_clip": 0.01052701, + "auxiliary_loss_mlp": 0.01039107, + "balance_loss_clip": 1.01513374, + "balance_loss_mlp": 1.01591074, + "epoch": 0.872508642717571, + "flos": 24352197672960.0, + "grad_norm": 1.587789810817385, + "language_loss": 0.83077025, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.85168827, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 14512, + "time_per_iteration": 2.4233531951904297 + }, + { + "auxiliary_loss_clip": 0.01007307, + "auxiliary_loss_mlp": 0.01002751, + "balance_loss_clip": 1.00066495, + "balance_loss_mlp": 1.00077724, + "epoch": 0.872568765970239, + "flos": 61407159108480.0, + "grad_norm": 0.7886194555220868, + "language_loss": 0.58663827, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60673887, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06542969, + "step": 14513, + "time_per_iteration": 4.3166656494140625 + }, + { + "auxiliary_loss_clip": 0.01051115, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.01428914, + "balance_loss_mlp": 1.01566899, + "epoch": 0.8726288892229069, + "flos": 22597044203520.0, + "grad_norm": 1.6527009067617198, + "language_loss": 0.7720412, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.79292029, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35351562, + "step": 14514, + "time_per_iteration": 2.3793115615844727 + }, + { + "auxiliary_loss_clip": 0.010541, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.01347113, + "balance_loss_mlp": 1.01719427, + "epoch": 0.872689012475575, + "flos": 25884011975040.0, + "grad_norm": 2.1200130228847054, + "language_loss": 0.66979003, + "learning_rate": 1.675528831794055e-07, + "loss": 0.69069624, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36914062, + "step": 14515, + "time_per_iteration": 2.4392335414886475 + }, + { + "auxiliary_loss_clip": 0.01052655, + "auxiliary_loss_mlp": 0.01038016, + "balance_loss_clip": 1.01385236, + "balance_loss_mlp": 1.01684189, + "epoch": 0.8727491357282429, + "flos": 21505659102720.0, + "grad_norm": 5.866820584193284, + "language_loss": 0.79754555, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81845224, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 14516, + "time_per_iteration": 2.3702218532562256 + }, + { + "auxiliary_loss_clip": 0.01054421, + "auxiliary_loss_mlp": 0.01040559, + "balance_loss_clip": 1.01558459, + "balance_loss_mlp": 1.01682353, + "epoch": 0.8728092589809109, + "flos": 19206523739520.0, + "grad_norm": 2.5694117224712216, + "language_loss": 0.73533773, + "learning_rate": 1.672409329369453e-07, + "loss": 0.75628752, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 14517, + "time_per_iteration": 2.375077486038208 + }, + { + "auxiliary_loss_clip": 0.0104948, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01011229, + "balance_loss_mlp": 1.01501429, + "epoch": 0.8728693822335788, + "flos": 20594251393920.0, + "grad_norm": 1.9441985151446917, + "language_loss": 0.74341333, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.76421553, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.34570312, + "step": 14518, + "time_per_iteration": 2.359325647354126 + }, + { + "auxiliary_loss_clip": 0.01050169, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.01712942, + "balance_loss_mlp": 1.01608574, + "epoch": 0.8729295054862468, + "flos": 21727462170240.0, + "grad_norm": 1.689296842198873, + "language_loss": 0.74494874, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76583087, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 14519, + "time_per_iteration": 2.412550449371338 + }, + { + "auxiliary_loss_clip": 0.01053817, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.0095855, + "balance_loss_mlp": 1.01669669, + "epoch": 0.8729896287389147, + "flos": 17672544933120.0, + "grad_norm": 2.4443099882717734, + "language_loss": 0.77425474, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.79513735, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 14520, + "time_per_iteration": 2.3367722034454346 + }, + { + "auxiliary_loss_clip": 0.01053092, + "auxiliary_loss_mlp": 0.01038121, + "balance_loss_clip": 1.01436305, + "balance_loss_mlp": 1.01602495, + "epoch": 0.8730497519915827, + "flos": 24570649249920.0, + "grad_norm": 1.9674378611307657, + "language_loss": 0.83253276, + "learning_rate": 1.666178664801816e-07, + "loss": 0.85344487, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 14521, + "time_per_iteration": 2.4174301624298096 + }, + { + "auxiliary_loss_clip": 0.01054234, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.01457453, + "balance_loss_mlp": 1.01745558, + "epoch": 0.8731098752442508, + "flos": 13442887008000.0, + "grad_norm": 2.961688995869925, + "language_loss": 0.77622581, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.79716098, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 14522, + "time_per_iteration": 2.340801954269409 + }, + { + "auxiliary_loss_clip": 0.01050229, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01354361, + "balance_loss_mlp": 1.01572609, + "epoch": 0.8731699984969187, + "flos": 23473399040640.0, + "grad_norm": 1.8871823987824776, + "language_loss": 0.76788521, + "learning_rate": 1.66306750360385e-07, + "loss": 0.78874075, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34570312, + "step": 14523, + "time_per_iteration": 2.472548484802246 + }, + { + "auxiliary_loss_clip": 0.01049082, + "auxiliary_loss_mlp": 0.01034835, + "balance_loss_clip": 1.01279354, + "balance_loss_mlp": 1.01532483, + "epoch": 0.8732301217495867, + "flos": 17711682433920.0, + "grad_norm": 2.089044532158217, + "language_loss": 0.8006084, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.82144761, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3359375, + "step": 14524, + "time_per_iteration": 2.3702359199523926 + }, + { + "auxiliary_loss_clip": 0.01049882, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.01522851, + "balance_loss_mlp": 1.01564193, + "epoch": 0.8732902450022546, + "flos": 22053271777920.0, + "grad_norm": 1.953926729326464, + "language_loss": 0.78962189, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.81048411, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 14525, + "time_per_iteration": 2.389815092086792 + }, + { + "auxiliary_loss_clip": 0.01053026, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.01226199, + "balance_loss_mlp": 1.01678085, + "epoch": 0.8733503682549226, + "flos": 22271897911680.0, + "grad_norm": 1.7026300731526534, + "language_loss": 0.69832736, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71921098, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 14526, + "time_per_iteration": 2.426753520965576 + }, + { + "auxiliary_loss_clip": 0.01053958, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02117229, + "balance_loss_mlp": 1.01672864, + "epoch": 0.8734104915075905, + "flos": 23363317923840.0, + "grad_norm": 1.7820131665151953, + "language_loss": 0.6197983, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.64081186, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37109375, + "step": 14527, + "time_per_iteration": 2.406776189804077 + }, + { + "auxiliary_loss_clip": 0.01057224, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.01392543, + "balance_loss_mlp": 1.01766706, + "epoch": 0.8734706147602586, + "flos": 17711333320320.0, + "grad_norm": 2.204870710376478, + "language_loss": 0.66951531, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.6905247, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.39648438, + "step": 14528, + "time_per_iteration": 2.339491844177246 + }, + { + "auxiliary_loss_clip": 0.01050713, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.0131669, + "balance_loss_mlp": 1.0158658, + "epoch": 0.8735307380129265, + "flos": 22048419098880.0, + "grad_norm": 2.0907935662216826, + "language_loss": 0.90990615, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.93075943, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 14529, + "time_per_iteration": 2.4254465103149414 + }, + { + "auxiliary_loss_clip": 0.01052493, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.01663339, + "balance_loss_mlp": 1.01721466, + "epoch": 0.8735908612655945, + "flos": 25337237172480.0, + "grad_norm": 1.642666456580712, + "language_loss": 0.85604131, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87697554, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3515625, + "step": 14530, + "time_per_iteration": 3.6752138137817383 + }, + { + "auxiliary_loss_clip": 0.01053145, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.018013, + "balance_loss_mlp": 1.01692295, + "epoch": 0.8736509845182624, + "flos": 21539909013120.0, + "grad_norm": 1.473058567954134, + "language_loss": 0.74442333, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76536614, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36328125, + "step": 14531, + "time_per_iteration": 2.401484489440918 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.01321852, + "balance_loss_mlp": 1.01527548, + "epoch": 0.8737111077709304, + "flos": 22016159136000.0, + "grad_norm": 1.8921286767174363, + "language_loss": 0.62653184, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.64738208, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.33984375, + "step": 14532, + "time_per_iteration": 2.3815221786499023 + }, + { + "auxiliary_loss_clip": 0.01007108, + "auxiliary_loss_mlp": 0.01003071, + "balance_loss_clip": 1.00111592, + "balance_loss_mlp": 1.00057578, + "epoch": 0.8737712310235983, + "flos": 70062965424000.0, + "grad_norm": 0.8174100637871623, + "language_loss": 0.58716989, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60727167, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.06542969, + "step": 14533, + "time_per_iteration": 3.1141157150268555 + }, + { + "auxiliary_loss_clip": 0.01050982, + "auxiliary_loss_mlp": 0.01040712, + "balance_loss_clip": 1.0191946, + "balance_loss_mlp": 1.01593113, + "epoch": 0.8738313542762663, + "flos": 28657058400000.0, + "grad_norm": 1.4832637211699515, + "language_loss": 0.77388275, + "learning_rate": 1.646005846335954e-07, + "loss": 0.79479969, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 14534, + "time_per_iteration": 2.424967050552368 + }, + { + "auxiliary_loss_clip": 0.01051047, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.01146924, + "balance_loss_mlp": 1.01546597, + "epoch": 0.8738914775289344, + "flos": 22345285322880.0, + "grad_norm": 1.719036952047885, + "language_loss": 0.76095951, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.78180903, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 14535, + "time_per_iteration": 2.428898572921753 + }, + { + "auxiliary_loss_clip": 0.01050641, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.01356959, + "balance_loss_mlp": 1.01499295, + "epoch": 0.8739516007816023, + "flos": 31758288405120.0, + "grad_norm": 1.820469728259462, + "language_loss": 0.75391686, + "learning_rate": 1.64291277235048e-07, + "loss": 0.77480972, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35546875, + "step": 14536, + "time_per_iteration": 2.4703774452209473 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.01035047, + "balance_loss_clip": 1.01318383, + "balance_loss_mlp": 1.01588941, + "epoch": 0.8740117240342703, + "flos": 21210747914880.0, + "grad_norm": 1.838014669475169, + "language_loss": 0.65413988, + "learning_rate": 1.641367279482304e-07, + "loss": 0.67501289, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 14537, + "time_per_iteration": 2.3703253269195557 + }, + { + "auxiliary_loss_clip": 0.01050386, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.01122117, + "balance_loss_mlp": 1.01532269, + "epoch": 0.8740718472869382, + "flos": 25185644582400.0, + "grad_norm": 1.7210764117688369, + "language_loss": 0.5859949, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60683084, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 14538, + "time_per_iteration": 3.8798182010650635 + }, + { + "auxiliary_loss_clip": 0.01051181, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.01377785, + "balance_loss_mlp": 1.01682007, + "epoch": 0.8741319705396062, + "flos": 19499898827520.0, + "grad_norm": 1.769523144263107, + "language_loss": 0.69233698, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.71320486, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 14539, + "time_per_iteration": 3.725053548812866 + }, + { + "auxiliary_loss_clip": 0.01051779, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.01329947, + "balance_loss_mlp": 1.01462531, + "epoch": 0.8741920937922741, + "flos": 14100022218240.0, + "grad_norm": 2.0232703006988486, + "language_loss": 0.75708842, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.77798378, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37109375, + "step": 14540, + "time_per_iteration": 2.3647208213806152 + }, + { + "auxiliary_loss_clip": 0.01052665, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.0131824, + "balance_loss_mlp": 1.01579523, + "epoch": 0.8742522170449422, + "flos": 27708607872000.0, + "grad_norm": 1.725496990558571, + "language_loss": 0.79850203, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81939828, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 14541, + "time_per_iteration": 2.411863088607788 + }, + { + "auxiliary_loss_clip": 0.01055318, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.01450276, + "balance_loss_mlp": 1.01687515, + "epoch": 0.8743123402976101, + "flos": 21141514955520.0, + "grad_norm": 2.0477198441044173, + "language_loss": 0.67706835, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.69802856, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38476562, + "step": 14542, + "time_per_iteration": 2.4290144443511963 + }, + { + "auxiliary_loss_clip": 0.01007257, + "auxiliary_loss_mlp": 0.01005621, + "balance_loss_clip": 1.00362992, + "balance_loss_mlp": 1.00086927, + "epoch": 0.8743724635502781, + "flos": 60866982552960.0, + "grad_norm": 0.7809254347538565, + "language_loss": 0.54495132, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56508017, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06396484, + "step": 14543, + "time_per_iteration": 2.82061767578125 + }, + { + "auxiliary_loss_clip": 0.01052569, + "auxiliary_loss_mlp": 0.01042899, + "balance_loss_clip": 1.01854479, + "balance_loss_mlp": 1.0155344, + "epoch": 0.874432586802946, + "flos": 28108084181760.0, + "grad_norm": 1.807680649800455, + "language_loss": 0.70575893, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.72671366, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 14544, + "time_per_iteration": 2.444471597671509 + }, + { + "auxiliary_loss_clip": 0.01049393, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.01183271, + "balance_loss_mlp": 1.01587033, + "epoch": 0.874492710055614, + "flos": 23549160424320.0, + "grad_norm": 1.5150320853556156, + "language_loss": 0.76764488, + "learning_rate": 1.62902840325714e-07, + "loss": 0.78845191, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.33398438, + "step": 14545, + "time_per_iteration": 2.3829143047332764 + }, + { + "auxiliary_loss_clip": 0.01051985, + "auxiliary_loss_mlp": 0.01043729, + "balance_loss_clip": 1.01855242, + "balance_loss_mlp": 1.01539457, + "epoch": 0.8745528333082819, + "flos": 40914086434560.0, + "grad_norm": 1.7039030296454296, + "language_loss": 0.66989863, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.6908558, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 14546, + "time_per_iteration": 2.5625557899475098 + }, + { + "auxiliary_loss_clip": 0.01051674, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.01356375, + "balance_loss_mlp": 1.01612723, + "epoch": 0.87461295656095, + "flos": 23621779785600.0, + "grad_norm": 1.810372117369066, + "language_loss": 0.73926306, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.76015264, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 14547, + "time_per_iteration": 2.3816723823547363 + }, + { + "auxiliary_loss_clip": 0.0105583, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.0163312, + "balance_loss_mlp": 1.01665401, + "epoch": 0.874673079813618, + "flos": 38792763959040.0, + "grad_norm": 3.0371995697866874, + "language_loss": 0.70599997, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.72699159, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39257812, + "step": 14548, + "time_per_iteration": 2.509713649749756 + }, + { + "auxiliary_loss_clip": 0.01054339, + "auxiliary_loss_mlp": 0.01042837, + "balance_loss_clip": 1.01860237, + "balance_loss_mlp": 1.01703501, + "epoch": 0.8747332030662859, + "flos": 23695027551360.0, + "grad_norm": 2.1337145397838, + "language_loss": 0.72025084, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.74122262, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 14549, + "time_per_iteration": 2.3755037784576416 + }, + { + "auxiliary_loss_clip": 0.01054215, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.0137676, + "balance_loss_mlp": 1.01620555, + "epoch": 0.8747933263189539, + "flos": 24461301271680.0, + "grad_norm": 2.3213538990769496, + "language_loss": 0.85580772, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.87674356, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37890625, + "step": 14550, + "time_per_iteration": 2.4042165279388428 + }, + { + "auxiliary_loss_clip": 0.01053971, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.01949668, + "balance_loss_mlp": 1.01746249, + "epoch": 0.8748534495716218, + "flos": 13808287964160.0, + "grad_norm": 2.097664514765953, + "language_loss": 0.7280342, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.74899888, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 14551, + "time_per_iteration": 2.371159791946411 + }, + { + "auxiliary_loss_clip": 0.01050929, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_clip": 1.0198524, + "balance_loss_mlp": 1.01522851, + "epoch": 0.8749135728242898, + "flos": 29860793856000.0, + "grad_norm": 1.8083656889802748, + "language_loss": 0.64748687, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.66842616, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 14552, + "time_per_iteration": 2.4948740005493164 + }, + { + "auxiliary_loss_clip": 0.01053529, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.01229072, + "balance_loss_mlp": 1.01654243, + "epoch": 0.8749736960769577, + "flos": 24132349641600.0, + "grad_norm": 1.7323296978630636, + "language_loss": 0.8056643, + "learning_rate": 1.616734111284479e-07, + "loss": 0.82657909, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36914062, + "step": 14553, + "time_per_iteration": 3.8931126594543457 + }, + { + "auxiliary_loss_clip": 0.01051794, + "auxiliary_loss_mlp": 0.01038953, + "balance_loss_clip": 1.01649439, + "balance_loss_mlp": 1.01537132, + "epoch": 0.8750338193296258, + "flos": 17201566425600.0, + "grad_norm": 2.0588183092556793, + "language_loss": 0.70751387, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72842133, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36523438, + "step": 14554, + "time_per_iteration": 2.3445024490356445 + }, + { + "auxiliary_loss_clip": 0.01051617, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.01027226, + "balance_loss_mlp": 1.0163368, + "epoch": 0.8750939425822937, + "flos": 23732070370560.0, + "grad_norm": 1.76698966551834, + "language_loss": 0.84692323, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.86777163, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 14555, + "time_per_iteration": 2.421133518218994 + }, + { + "auxiliary_loss_clip": 0.01052183, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.01572943, + "balance_loss_mlp": 1.01623774, + "epoch": 0.8751540658349617, + "flos": 26540483869440.0, + "grad_norm": 1.5760057221753692, + "language_loss": 0.71861625, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73953485, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 14556, + "time_per_iteration": 2.419344425201416 + }, + { + "auxiliary_loss_clip": 0.0105301, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.01330972, + "balance_loss_mlp": 1.01612353, + "epoch": 0.8752141890876296, + "flos": 19385907638400.0, + "grad_norm": 1.7303926539880536, + "language_loss": 0.78168011, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.80258942, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 14557, + "time_per_iteration": 2.3595898151397705 + }, + { + "auxiliary_loss_clip": 0.01052922, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.01849544, + "balance_loss_mlp": 1.01725745, + "epoch": 0.8752743123402976, + "flos": 25373232650880.0, + "grad_norm": 1.9822413690921272, + "language_loss": 0.83609843, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85705507, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 14558, + "time_per_iteration": 2.4252378940582275 + }, + { + "auxiliary_loss_clip": 0.01007222, + "auxiliary_loss_mlp": 0.01003125, + "balance_loss_clip": 1.00102711, + "balance_loss_mlp": 1.00060964, + "epoch": 0.8753344355929655, + "flos": 59949535178880.0, + "grad_norm": 0.8076429301721307, + "language_loss": 0.56120491, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58130836, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06640625, + "step": 14559, + "time_per_iteration": 3.0158135890960693 + }, + { + "auxiliary_loss_clip": 0.01051633, + "auxiliary_loss_mlp": 0.0104322, + "balance_loss_clip": 1.02016449, + "balance_loss_mlp": 1.01657104, + "epoch": 0.8753945588456336, + "flos": 17893684684800.0, + "grad_norm": 1.6997829775980096, + "language_loss": 0.67151904, + "learning_rate": 1.606013202286407e-07, + "loss": 0.69246757, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 14560, + "time_per_iteration": 2.3558578491210938 + }, + { + "auxiliary_loss_clip": 0.01050842, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.01133728, + "balance_loss_mlp": 1.01563036, + "epoch": 0.8754546820983016, + "flos": 30913704771840.0, + "grad_norm": 2.2355422896787727, + "language_loss": 0.79957747, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.82042611, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 14561, + "time_per_iteration": 2.4576961994171143 + }, + { + "auxiliary_loss_clip": 0.0105279, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.01135337, + "balance_loss_mlp": 1.01576674, + "epoch": 0.8755148053509695, + "flos": 20630037404160.0, + "grad_norm": 1.9554554275711065, + "language_loss": 0.78238583, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.80327725, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 14562, + "time_per_iteration": 2.389127731323242 + }, + { + "auxiliary_loss_clip": 0.01048043, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.01201653, + "balance_loss_mlp": 1.01499462, + "epoch": 0.8755749286036375, + "flos": 34968307806720.0, + "grad_norm": 1.7356531363076508, + "language_loss": 0.72123349, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74203908, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33007812, + "step": 14563, + "time_per_iteration": 2.5335230827331543 + }, + { + "auxiliary_loss_clip": 0.01053368, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.01631546, + "balance_loss_mlp": 1.01715851, + "epoch": 0.8756350518563054, + "flos": 18185488761600.0, + "grad_norm": 2.165037869642114, + "language_loss": 0.66895819, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.68989861, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 14564, + "time_per_iteration": 2.386122703552246 + }, + { + "auxiliary_loss_clip": 0.01051184, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_clip": 1.01866114, + "balance_loss_mlp": 1.01511157, + "epoch": 0.8756951751089734, + "flos": 20082983310720.0, + "grad_norm": 1.7031422890689045, + "language_loss": 0.72178292, + "learning_rate": 1.598376334037408e-07, + "loss": 0.742715, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 14565, + "time_per_iteration": 2.366636037826538 + }, + { + "auxiliary_loss_clip": 0.01055865, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.01493299, + "balance_loss_mlp": 1.01725125, + "epoch": 0.8757552983616413, + "flos": 27524057091840.0, + "grad_norm": 1.5352819349930902, + "language_loss": 0.78520077, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.8061555, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38671875, + "step": 14566, + "time_per_iteration": 2.417046546936035 + }, + { + "auxiliary_loss_clip": 0.01054976, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.0155803, + "balance_loss_mlp": 1.01835096, + "epoch": 0.8758154216143094, + "flos": 18072160888320.0, + "grad_norm": 1.6332418398382826, + "language_loss": 0.72486234, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.74580854, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 14567, + "time_per_iteration": 2.344057083129883 + }, + { + "auxiliary_loss_clip": 0.01049945, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.01331544, + "balance_loss_mlp": 1.01509142, + "epoch": 0.8758755448669773, + "flos": 25044525400320.0, + "grad_norm": 1.7431235141523838, + "language_loss": 0.74950659, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.77036291, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 14568, + "time_per_iteration": 2.394897937774658 + }, + { + "auxiliary_loss_clip": 0.01051578, + "auxiliary_loss_mlp": 0.01035769, + "balance_loss_clip": 1.01282096, + "balance_loss_mlp": 1.01653695, + "epoch": 0.8759356681196453, + "flos": 22855715533440.0, + "grad_norm": 2.5902977471189135, + "language_loss": 0.87785709, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.89873052, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 14569, + "time_per_iteration": 3.623642683029175 + }, + { + "auxiliary_loss_clip": 0.01051271, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.01290107, + "balance_loss_mlp": 1.01533735, + "epoch": 0.8759957913723132, + "flos": 21031468750080.0, + "grad_norm": 1.7201117260667607, + "language_loss": 0.74533105, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76619887, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 14570, + "time_per_iteration": 2.3672828674316406 + }, + { + "auxiliary_loss_clip": 0.01052354, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.0128119, + "balance_loss_mlp": 1.0162636, + "epoch": 0.8760559146249812, + "flos": 20009456254080.0, + "grad_norm": 1.947766536055009, + "language_loss": 0.69155496, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.71244079, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36132812, + "step": 14571, + "time_per_iteration": 2.374133348464966 + }, + { + "auxiliary_loss_clip": 0.01049984, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.01545644, + "balance_loss_mlp": 1.01541233, + "epoch": 0.8761160378776491, + "flos": 19973146573440.0, + "grad_norm": 1.9653663737465632, + "language_loss": 0.63232189, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.6531992, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 14572, + "time_per_iteration": 2.367926597595215 + }, + { + "auxiliary_loss_clip": 0.01049446, + "auxiliary_loss_mlp": 0.01035101, + "balance_loss_clip": 1.01360726, + "balance_loss_mlp": 1.01555097, + "epoch": 0.8761761611303172, + "flos": 28803134995200.0, + "grad_norm": 1.7277527700903688, + "language_loss": 0.74779129, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.76863676, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 14573, + "time_per_iteration": 2.4180846214294434 + }, + { + "auxiliary_loss_clip": 0.01050296, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.01247549, + "balance_loss_mlp": 1.01591909, + "epoch": 0.8762362843829851, + "flos": 18331530445440.0, + "grad_norm": 1.8716373587714052, + "language_loss": 0.74406058, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.76490736, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 14574, + "time_per_iteration": 2.369725465774536 + }, + { + "auxiliary_loss_clip": 0.01049162, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.01046062, + "balance_loss_mlp": 1.01485276, + "epoch": 0.8762964076356531, + "flos": 15778227317760.0, + "grad_norm": 1.6169473327908486, + "language_loss": 0.76850677, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.78931558, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 14575, + "time_per_iteration": 2.344646453857422 + }, + { + "auxiliary_loss_clip": 0.01052949, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.01201415, + "balance_loss_mlp": 1.0168047, + "epoch": 0.8763565308883211, + "flos": 33175518024960.0, + "grad_norm": 1.7028904320680918, + "language_loss": 0.67382884, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69469362, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.36132812, + "step": 14576, + "time_per_iteration": 2.5170536041259766 + }, + { + "auxiliary_loss_clip": 0.01049276, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.01248515, + "balance_loss_mlp": 1.01465869, + "epoch": 0.876416654140989, + "flos": 15887191271040.0, + "grad_norm": 1.9655599365030259, + "language_loss": 0.67828703, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.69911623, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 14577, + "time_per_iteration": 2.3722751140594482 + }, + { + "auxiliary_loss_clip": 0.01052683, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.01357675, + "balance_loss_mlp": 1.01667774, + "epoch": 0.876476777393657, + "flos": 25884046886400.0, + "grad_norm": 2.170936043630389, + "language_loss": 0.71898293, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.739887, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 14578, + "time_per_iteration": 3.8376705646514893 + }, + { + "auxiliary_loss_clip": 0.01052057, + "auxiliary_loss_mlp": 0.01039819, + "balance_loss_clip": 1.01671648, + "balance_loss_mlp": 1.01577091, + "epoch": 0.876536900646325, + "flos": 13588823957760.0, + "grad_norm": 1.9216888572285167, + "language_loss": 0.72142768, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.74234641, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 14579, + "time_per_iteration": 3.6995441913604736 + }, + { + "auxiliary_loss_clip": 0.01049241, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.01312149, + "balance_loss_mlp": 1.01583362, + "epoch": 0.876597023898993, + "flos": 12202527669120.0, + "grad_norm": 1.685802114630485, + "language_loss": 0.71417069, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.73500842, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33398438, + "step": 14580, + "time_per_iteration": 2.352154016494751 + }, + { + "auxiliary_loss_clip": 0.0105103, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.01079249, + "balance_loss_mlp": 1.01612902, + "epoch": 0.8766571471516609, + "flos": 25335631249920.0, + "grad_norm": 1.6660201199610032, + "language_loss": 0.66678977, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.68761563, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34765625, + "step": 14581, + "time_per_iteration": 2.4073681831359863 + }, + { + "auxiliary_loss_clip": 0.01049701, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.01605403, + "balance_loss_mlp": 1.01598465, + "epoch": 0.8767172704043289, + "flos": 30112098888960.0, + "grad_norm": 1.4715494736555645, + "language_loss": 0.74186057, + "learning_rate": 1.572541512164416e-07, + "loss": 0.76273423, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33789062, + "step": 14582, + "time_per_iteration": 2.464460849761963 + }, + { + "auxiliary_loss_clip": 0.01051707, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.01441669, + "balance_loss_mlp": 1.01591849, + "epoch": 0.8767773936569968, + "flos": 19280155530240.0, + "grad_norm": 2.0496277002457437, + "language_loss": 0.67323959, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.69412392, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 14583, + "time_per_iteration": 2.38077449798584 + }, + { + "auxiliary_loss_clip": 0.01051337, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.01031148, + "balance_loss_mlp": 1.01583374, + "epoch": 0.8768375169096648, + "flos": 21246289545600.0, + "grad_norm": 1.7764908910091652, + "language_loss": 0.79944104, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.82027316, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35546875, + "step": 14584, + "time_per_iteration": 2.3708746433258057 + }, + { + "auxiliary_loss_clip": 0.0105214, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01174378, + "balance_loss_mlp": 1.01572585, + "epoch": 0.8768976401623327, + "flos": 23294399166720.0, + "grad_norm": 1.6828564864289728, + "language_loss": 0.73508257, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.75595725, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 14585, + "time_per_iteration": 2.3895106315612793 + }, + { + "auxiliary_loss_clip": 0.010523, + "auxiliary_loss_mlp": 0.01038925, + "balance_loss_clip": 1.01499963, + "balance_loss_mlp": 1.01579821, + "epoch": 0.8769577634150008, + "flos": 21360176000640.0, + "grad_norm": 1.8822710455399656, + "language_loss": 0.754857, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.77576923, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 14586, + "time_per_iteration": 2.358318567276001 + }, + { + "auxiliary_loss_clip": 0.01051044, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.01385522, + "balance_loss_mlp": 1.01572907, + "epoch": 0.8770178866676687, + "flos": 23512920566400.0, + "grad_norm": 2.3911271177366746, + "language_loss": 0.79884398, + "learning_rate": 1.564981454895844e-07, + "loss": 0.81972265, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 14587, + "time_per_iteration": 2.40177845954895 + }, + { + "auxiliary_loss_clip": 0.0105083, + "auxiliary_loss_mlp": 0.01039767, + "balance_loss_clip": 1.01500762, + "balance_loss_mlp": 1.01550531, + "epoch": 0.8770780099203367, + "flos": 19718036202240.0, + "grad_norm": 1.5918394132006, + "language_loss": 0.74539125, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76629722, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35351562, + "step": 14588, + "time_per_iteration": 2.3632571697235107 + }, + { + "auxiliary_loss_clip": 0.01047629, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.01175046, + "balance_loss_mlp": 1.0140022, + "epoch": 0.8771381331730047, + "flos": 21394879758720.0, + "grad_norm": 1.7363272881171794, + "language_loss": 0.67724192, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.69804066, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3359375, + "step": 14589, + "time_per_iteration": 2.36016583442688 + }, + { + "auxiliary_loss_clip": 0.01052465, + "auxiliary_loss_mlp": 0.01040754, + "balance_loss_clip": 1.01787758, + "balance_loss_mlp": 1.01647973, + "epoch": 0.8771982564256726, + "flos": 20260866021120.0, + "grad_norm": 2.2065080097352814, + "language_loss": 0.72204363, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.74297589, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 14590, + "time_per_iteration": 2.357405662536621 + }, + { + "auxiliary_loss_clip": 0.01055103, + "auxiliary_loss_mlp": 0.01040892, + "balance_loss_clip": 1.01455855, + "balance_loss_mlp": 1.01622939, + "epoch": 0.8772583796783406, + "flos": 12488711016960.0, + "grad_norm": 2.3692051203805744, + "language_loss": 0.7565518, + "learning_rate": 1.558945991776086e-07, + "loss": 0.77751178, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.38867188, + "step": 14591, + "time_per_iteration": 2.3590657711029053 + }, + { + "auxiliary_loss_clip": 0.01048385, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.01121712, + "balance_loss_mlp": 1.0157299, + "epoch": 0.8773185029310085, + "flos": 15920289106560.0, + "grad_norm": 1.7135776101866866, + "language_loss": 0.81528795, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.83608961, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.32617188, + "step": 14592, + "time_per_iteration": 2.4044902324676514 + }, + { + "auxiliary_loss_clip": 0.01049295, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.01290798, + "balance_loss_mlp": 1.01628733, + "epoch": 0.8773786261836766, + "flos": 21503529509760.0, + "grad_norm": 1.633880048787702, + "language_loss": 0.83850247, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.859326, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33007812, + "step": 14593, + "time_per_iteration": 3.8078904151916504 + }, + { + "auxiliary_loss_clip": 0.01049461, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.00801611, + "balance_loss_mlp": 1.01557827, + "epoch": 0.8774387494363445, + "flos": 26760262078080.0, + "grad_norm": 1.4837288056804365, + "language_loss": 0.76988798, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.79067373, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33789062, + "step": 14594, + "time_per_iteration": 2.413724422454834 + }, + { + "auxiliary_loss_clip": 0.01052604, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.00937819, + "balance_loss_mlp": 1.01580286, + "epoch": 0.8774988726890125, + "flos": 18477851420160.0, + "grad_norm": 2.1321347766519962, + "language_loss": 0.792732, + "learning_rate": 1.552921717241651e-07, + "loss": 0.81358647, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 14595, + "time_per_iteration": 2.3622894287109375 + }, + { + "auxiliary_loss_clip": 0.01052856, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.01253033, + "balance_loss_mlp": 1.01693964, + "epoch": 0.8775589959416804, + "flos": 24425201059200.0, + "grad_norm": 1.517037972196789, + "language_loss": 0.7116797, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.73256505, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 14596, + "time_per_iteration": 2.3878815174102783 + }, + { + "auxiliary_loss_clip": 0.0105085, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.01314998, + "balance_loss_mlp": 1.01563644, + "epoch": 0.8776191191943484, + "flos": 23439044396160.0, + "grad_norm": 2.3188813631273018, + "language_loss": 0.86506951, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88594484, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 14597, + "time_per_iteration": 2.3991200923919678 + }, + { + "auxiliary_loss_clip": 0.01051171, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.01627123, + "balance_loss_mlp": 1.01639056, + "epoch": 0.8776792424470163, + "flos": 26829739416960.0, + "grad_norm": 1.9458280397770233, + "language_loss": 0.73625499, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.75714099, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34765625, + "step": 14598, + "time_per_iteration": 2.4084115028381348 + }, + { + "auxiliary_loss_clip": 0.01053178, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.01416504, + "balance_loss_mlp": 1.017066, + "epoch": 0.8777393656996844, + "flos": 15625447741440.0, + "grad_norm": 2.1006960461239808, + "language_loss": 0.78876412, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.80966568, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 14599, + "time_per_iteration": 2.375121593475342 + }, + { + "auxiliary_loss_clip": 0.01051647, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.01707137, + "balance_loss_mlp": 1.01592469, + "epoch": 0.8777994889523523, + "flos": 18879003475200.0, + "grad_norm": 2.422671686022171, + "language_loss": 0.69270754, + "learning_rate": 1.545407113589332e-07, + "loss": 0.71362603, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 14600, + "time_per_iteration": 2.372790575027466 + }, + { + "auxiliary_loss_clip": 0.01051432, + "auxiliary_loss_mlp": 0.01041611, + "balance_loss_clip": 1.01735187, + "balance_loss_mlp": 1.01517367, + "epoch": 0.8778596122050203, + "flos": 48824916825600.0, + "grad_norm": 1.8053161208507504, + "language_loss": 0.70565987, + "learning_rate": 1.543906292031072e-07, + "loss": 0.72659028, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 14601, + "time_per_iteration": 2.618393898010254 + }, + { + "auxiliary_loss_clip": 0.01054621, + "auxiliary_loss_mlp": 0.01041207, + "balance_loss_clip": 1.01945114, + "balance_loss_mlp": 1.01710284, + "epoch": 0.8779197354576883, + "flos": 25659171619200.0, + "grad_norm": 1.8624732162812827, + "language_loss": 0.74313915, + "learning_rate": 1.542406170329733e-07, + "loss": 0.76409745, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.375, + "step": 14602, + "time_per_iteration": 2.4199438095092773 + }, + { + "auxiliary_loss_clip": 0.01049891, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.01468301, + "balance_loss_mlp": 1.01506102, + "epoch": 0.8779798587103562, + "flos": 18842239946880.0, + "grad_norm": 2.0405340969556467, + "language_loss": 0.71652102, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73738396, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34765625, + "step": 14603, + "time_per_iteration": 2.3925294876098633 + }, + { + "auxiliary_loss_clip": 0.01007536, + "auxiliary_loss_mlp": 0.0100387, + "balance_loss_clip": 1.00192666, + "balance_loss_mlp": 1.00087619, + "epoch": 0.8780399819630242, + "flos": 68609635591680.0, + "grad_norm": 0.7491218798712806, + "language_loss": 0.54221612, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56233019, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.06640625, + "step": 14604, + "time_per_iteration": 3.0479793548583984 + }, + { + "auxiliary_loss_clip": 0.01007717, + "auxiliary_loss_mlp": 0.01003251, + "balance_loss_clip": 1.00109363, + "balance_loss_mlp": 1.0009892, + "epoch": 0.8781001052156922, + "flos": 65731290906240.0, + "grad_norm": 0.7025825843673941, + "language_loss": 0.59413135, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61424106, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06738281, + "step": 14605, + "time_per_iteration": 3.040858745574951 + }, + { + "auxiliary_loss_clip": 0.01053769, + "auxiliary_loss_mlp": 0.01036719, + "balance_loss_clip": 1.01281714, + "balance_loss_mlp": 1.0168134, + "epoch": 0.8781602284683602, + "flos": 22048698389760.0, + "grad_norm": 1.513932823924889, + "language_loss": 0.85803336, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87893832, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37109375, + "step": 14606, + "time_per_iteration": 2.443516969680786 + }, + { + "auxiliary_loss_clip": 0.01053705, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.01419508, + "balance_loss_mlp": 1.01663423, + "epoch": 0.8782203517210281, + "flos": 17562079791360.0, + "grad_norm": 1.8482310612406647, + "language_loss": 0.72071898, + "learning_rate": 1.534916061666931e-07, + "loss": 0.74164224, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 14607, + "time_per_iteration": 2.399864912033081 + }, + { + "auxiliary_loss_clip": 0.01050318, + "auxiliary_loss_mlp": 0.01043482, + "balance_loss_clip": 1.02281141, + "balance_loss_mlp": 1.01595402, + "epoch": 0.8782804749736961, + "flos": 25519239423360.0, + "grad_norm": 2.0546494897413425, + "language_loss": 0.72776079, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74869883, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 14608, + "time_per_iteration": 3.667001485824585 + }, + { + "auxiliary_loss_clip": 0.0105293, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.01193523, + "balance_loss_mlp": 1.0158608, + "epoch": 0.878340598226364, + "flos": 21797672647680.0, + "grad_norm": 2.2716431163487405, + "language_loss": 0.88559294, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.90646946, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.37109375, + "step": 14609, + "time_per_iteration": 2.354722738265991 + }, + { + "auxiliary_loss_clip": 0.01051517, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.01340985, + "balance_loss_mlp": 1.01593447, + "epoch": 0.878400721479032, + "flos": 21101434848000.0, + "grad_norm": 1.6374785333437896, + "language_loss": 0.71098799, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.73186779, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 14610, + "time_per_iteration": 2.3782598972320557 + }, + { + "auxiliary_loss_clip": 0.01049874, + "auxiliary_loss_mlp": 0.01036963, + "balance_loss_clip": 1.01496923, + "balance_loss_mlp": 1.01583982, + "epoch": 0.8784608447316999, + "flos": 20922469885440.0, + "grad_norm": 1.9184077921712457, + "language_loss": 0.82493496, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.84580338, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33984375, + "step": 14611, + "time_per_iteration": 2.3657169342041016 + }, + { + "auxiliary_loss_clip": 0.0105198, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.01630735, + "balance_loss_mlp": 1.01609111, + "epoch": 0.878520967984368, + "flos": 23329556772480.0, + "grad_norm": 1.6502364819181696, + "language_loss": 0.77763224, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.79854977, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 14612, + "time_per_iteration": 2.371793508529663 + }, + { + "auxiliary_loss_clip": 0.01051041, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.01566362, + "balance_loss_mlp": 1.01591372, + "epoch": 0.8785810912370359, + "flos": 25517842968960.0, + "grad_norm": 1.4762804016886035, + "language_loss": 0.73168856, + "learning_rate": 1.525951038422002e-07, + "loss": 0.75258791, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14613, + "time_per_iteration": 2.4040138721466064 + }, + { + "auxiliary_loss_clip": 0.01007744, + "auxiliary_loss_mlp": 0.01005047, + "balance_loss_clip": 1.00310385, + "balance_loss_mlp": 1.00104463, + "epoch": 0.8786412144897039, + "flos": 61838371710720.0, + "grad_norm": 2.2419735321524175, + "language_loss": 0.6466831, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66681099, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.06738281, + "step": 14614, + "time_per_iteration": 2.764796018600464 + }, + { + "auxiliary_loss_clip": 0.01007598, + "auxiliary_loss_mlp": 0.0100237, + "balance_loss_clip": 1.00014091, + "balance_loss_mlp": 1.00079381, + "epoch": 0.8787013377423719, + "flos": 70985684413440.0, + "grad_norm": 1.0612186889936714, + "language_loss": 0.5874182, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60751796, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.06835938, + "step": 14615, + "time_per_iteration": 3.0901243686676025 + }, + { + "auxiliary_loss_clip": 0.01049503, + "auxiliary_loss_mlp": 0.01035677, + "balance_loss_clip": 1.01376653, + "balance_loss_mlp": 1.0147258, + "epoch": 0.8787614609950398, + "flos": 17346456034560.0, + "grad_norm": 2.010199554561567, + "language_loss": 0.74217439, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.76302618, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 14616, + "time_per_iteration": 2.3388102054595947 + }, + { + "auxiliary_loss_clip": 0.01007132, + "auxiliary_loss_mlp": 0.01002786, + "balance_loss_clip": 1.00064027, + "balance_loss_mlp": 1.00061274, + "epoch": 0.8788215842477078, + "flos": 72507967914240.0, + "grad_norm": 0.8107854779223704, + "language_loss": 0.57991934, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60001856, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.06542969, + "step": 14617, + "time_per_iteration": 3.139739513397217 + }, + { + "auxiliary_loss_clip": 0.01049279, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01162148, + "balance_loss_mlp": 1.01525807, + "epoch": 0.8788817075003758, + "flos": 24826283291520.0, + "grad_norm": 1.7692779425853984, + "language_loss": 0.84165281, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.86247623, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33984375, + "step": 14618, + "time_per_iteration": 5.254082202911377 + }, + { + "auxiliary_loss_clip": 0.01048742, + "auxiliary_loss_mlp": 0.01038928, + "balance_loss_clip": 1.01688647, + "balance_loss_mlp": 1.01550448, + "epoch": 0.8789418307530438, + "flos": 22637647981440.0, + "grad_norm": 1.843018274050604, + "language_loss": 0.70210838, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.72298503, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33203125, + "step": 14619, + "time_per_iteration": 2.4150447845458984 + }, + { + "auxiliary_loss_clip": 0.01052496, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.01383507, + "balance_loss_mlp": 1.01588058, + "epoch": 0.8790019540057117, + "flos": 19784895189120.0, + "grad_norm": 1.7025136876897844, + "language_loss": 0.78141689, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.80231285, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 14620, + "time_per_iteration": 2.364327907562256 + }, + { + "auxiliary_loss_clip": 0.01052591, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.01484752, + "balance_loss_mlp": 1.01605034, + "epoch": 0.8790620772583797, + "flos": 20228745703680.0, + "grad_norm": 1.6534031541445695, + "language_loss": 0.80833733, + "learning_rate": 1.514036906317542e-07, + "loss": 0.82925975, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 14621, + "time_per_iteration": 2.3962936401367188 + }, + { + "auxiliary_loss_clip": 0.01053267, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.01547146, + "balance_loss_mlp": 1.0158186, + "epoch": 0.8791222005110476, + "flos": 24129731289600.0, + "grad_norm": 1.725273771530116, + "language_loss": 0.67544448, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.69637042, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 14622, + "time_per_iteration": 2.4168307781219482 + }, + { + "auxiliary_loss_clip": 0.01050569, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.01407015, + "balance_loss_mlp": 1.0158633, + "epoch": 0.8791823237637156, + "flos": 21613191690240.0, + "grad_norm": 1.958513423376429, + "language_loss": 0.73783863, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75870633, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 14623, + "time_per_iteration": 2.3885605335235596 + }, + { + "auxiliary_loss_clip": 0.01048686, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.01507115, + "balance_loss_mlp": 1.0140264, + "epoch": 0.8792424470163835, + "flos": 24242256201600.0, + "grad_norm": 1.6136285316277603, + "language_loss": 0.79784828, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.81868923, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34570312, + "step": 14624, + "time_per_iteration": 2.384242057800293 + }, + { + "auxiliary_loss_clip": 0.01051661, + "auxiliary_loss_mlp": 0.01038171, + "balance_loss_clip": 1.01341164, + "balance_loss_mlp": 1.01544905, + "epoch": 0.8793025702690516, + "flos": 24892234583040.0, + "grad_norm": 2.42789489661739, + "language_loss": 0.81030399, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.83120227, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 14625, + "time_per_iteration": 2.3979785442352295 + }, + { + "auxiliary_loss_clip": 0.01049851, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.01371908, + "balance_loss_mlp": 1.0157392, + "epoch": 0.8793626935217195, + "flos": 25371975841920.0, + "grad_norm": 1.497574969020073, + "language_loss": 0.74497068, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76583612, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34179688, + "step": 14626, + "time_per_iteration": 2.4881114959716797 + }, + { + "auxiliary_loss_clip": 0.01052014, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_clip": 1.01890397, + "balance_loss_mlp": 1.01526713, + "epoch": 0.8794228167743875, + "flos": 34676573552640.0, + "grad_norm": 1.7821006235267698, + "language_loss": 0.72075039, + "learning_rate": 1.505130747218246e-07, + "loss": 0.74171054, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 14627, + "time_per_iteration": 2.51969313621521 + }, + { + "auxiliary_loss_clip": 0.01051765, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.01341021, + "balance_loss_mlp": 1.0161643, + "epoch": 0.8794829400270555, + "flos": 19462995653760.0, + "grad_norm": 1.6852743405212474, + "language_loss": 0.7307049, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.75159073, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 14628, + "time_per_iteration": 2.382138967514038 + }, + { + "auxiliary_loss_clip": 0.01052383, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.01440644, + "balance_loss_mlp": 1.01642942, + "epoch": 0.8795430632797234, + "flos": 15230509908480.0, + "grad_norm": 3.1925889429096133, + "language_loss": 0.70181817, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.72272456, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 14629, + "time_per_iteration": 2.326552629470825 + }, + { + "auxiliary_loss_clip": 0.01048011, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.01375222, + "balance_loss_mlp": 1.01472783, + "epoch": 0.8796031865323914, + "flos": 27743521098240.0, + "grad_norm": 1.477212248349002, + "language_loss": 0.69722068, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.71803117, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.33203125, + "step": 14630, + "time_per_iteration": 2.431384563446045 + }, + { + "auxiliary_loss_clip": 0.01049563, + "auxiliary_loss_mlp": 0.01043413, + "balance_loss_clip": 1.01959467, + "balance_loss_mlp": 1.01495743, + "epoch": 0.8796633097850594, + "flos": 31284063141120.0, + "grad_norm": 1.4617566611865997, + "language_loss": 0.74890769, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76983744, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34570312, + "step": 14631, + "time_per_iteration": 2.4536116123199463 + }, + { + "auxiliary_loss_clip": 0.01049435, + "auxiliary_loss_mlp": 0.0103657, + "balance_loss_clip": 1.01407504, + "balance_loss_mlp": 1.01510763, + "epoch": 0.8797234330377274, + "flos": 24242011822080.0, + "grad_norm": 1.9741286991475018, + "language_loss": 0.7006917, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.72155178, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 14632, + "time_per_iteration": 3.8044238090515137 + }, + { + "auxiliary_loss_clip": 0.01051942, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.01493359, + "balance_loss_mlp": 1.01691949, + "epoch": 0.8797835562903953, + "flos": 24166355172480.0, + "grad_norm": 1.9216362738889932, + "language_loss": 0.65643287, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67731702, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 14633, + "time_per_iteration": 2.3925583362579346 + }, + { + "auxiliary_loss_clip": 0.01051569, + "auxiliary_loss_mlp": 0.01038081, + "balance_loss_clip": 1.01481128, + "balance_loss_mlp": 1.01610279, + "epoch": 0.8798436795430633, + "flos": 19283576843520.0, + "grad_norm": 1.5115313559840258, + "language_loss": 0.85194314, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.87283969, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 14634, + "time_per_iteration": 2.411674737930298 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01971459, + "balance_loss_mlp": 1.0155766, + "epoch": 0.8799038027957312, + "flos": 28178259747840.0, + "grad_norm": 1.7468707314417613, + "language_loss": 0.80740869, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.82837933, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 14635, + "time_per_iteration": 2.4323837757110596 + }, + { + "auxiliary_loss_clip": 0.01050131, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.01192188, + "balance_loss_mlp": 1.01516342, + "epoch": 0.8799639260483992, + "flos": 24643547902080.0, + "grad_norm": 1.676121815361769, + "language_loss": 0.65953028, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.68037093, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 14636, + "time_per_iteration": 2.3979039192199707 + }, + { + "auxiliary_loss_clip": 0.01052724, + "auxiliary_loss_mlp": 0.0103812, + "balance_loss_clip": 1.01338375, + "balance_loss_mlp": 1.01608467, + "epoch": 0.8800240493010671, + "flos": 22199383284480.0, + "grad_norm": 2.718790377766819, + "language_loss": 0.71030664, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.73121512, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 14637, + "time_per_iteration": 2.3874804973602295 + }, + { + "auxiliary_loss_clip": 0.01052782, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.01643085, + "balance_loss_mlp": 1.01729512, + "epoch": 0.8800841725537352, + "flos": 14245226029440.0, + "grad_norm": 1.9078911352764052, + "language_loss": 0.67874491, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.69964701, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.35546875, + "step": 14638, + "time_per_iteration": 2.3331735134124756 + }, + { + "auxiliary_loss_clip": 0.01050775, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.01314902, + "balance_loss_mlp": 1.01499021, + "epoch": 0.8801442958064031, + "flos": 37414252903680.0, + "grad_norm": 1.7899782009784433, + "language_loss": 0.59170234, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.61256486, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 14639, + "time_per_iteration": 2.516568183898926 + }, + { + "auxiliary_loss_clip": 0.01052193, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.01326501, + "balance_loss_mlp": 1.01601803, + "epoch": 0.8802044190590711, + "flos": 25046131322880.0, + "grad_norm": 1.8971500601668443, + "language_loss": 0.74952304, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.7704047, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 14640, + "time_per_iteration": 2.4171130657196045 + }, + { + "auxiliary_loss_clip": 0.01053827, + "auxiliary_loss_mlp": 0.0103965, + "balance_loss_clip": 1.0149262, + "balance_loss_mlp": 1.0169301, + "epoch": 0.8802645423117391, + "flos": 24132733666560.0, + "grad_norm": 1.9186908299080092, + "language_loss": 0.70992094, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.7308557, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36914062, + "step": 14641, + "time_per_iteration": 2.4263501167297363 + }, + { + "auxiliary_loss_clip": 0.01051999, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.01556897, + "balance_loss_mlp": 1.01586628, + "epoch": 0.880324665564407, + "flos": 17930238744960.0, + "grad_norm": 2.0801872974386564, + "language_loss": 0.86634523, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.88725644, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36132812, + "step": 14642, + "time_per_iteration": 2.3609280586242676 + }, + { + "auxiliary_loss_clip": 0.01052302, + "auxiliary_loss_mlp": 0.01039803, + "balance_loss_clip": 1.01623487, + "balance_loss_mlp": 1.01678514, + "epoch": 0.880384788817075, + "flos": 21286579121280.0, + "grad_norm": 1.6321026760155928, + "language_loss": 0.79960918, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.8205303, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35546875, + "step": 14643, + "time_per_iteration": 2.3718738555908203 + }, + { + "auxiliary_loss_clip": 0.01047577, + "auxiliary_loss_mlp": 0.01031469, + "balance_loss_clip": 1.01024973, + "balance_loss_mlp": 1.01444125, + "epoch": 0.880444912069743, + "flos": 12457672951680.0, + "grad_norm": 1.5451584873944582, + "language_loss": 0.73984587, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.76063633, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33007812, + "step": 14644, + "time_per_iteration": 2.375429391860962 + }, + { + "auxiliary_loss_clip": 0.01052189, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.01327872, + "balance_loss_mlp": 1.01619124, + "epoch": 0.880505035322411, + "flos": 13625098727040.0, + "grad_norm": 1.9761924518983958, + "language_loss": 0.80151892, + "learning_rate": 1.47856380505911e-07, + "loss": 0.8224026, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 14645, + "time_per_iteration": 2.3687925338745117 + }, + { + "auxiliary_loss_clip": 0.01049847, + "auxiliary_loss_mlp": 0.01040288, + "balance_loss_clip": 1.01714981, + "balance_loss_mlp": 1.01551723, + "epoch": 0.8805651585750789, + "flos": 23182013900160.0, + "grad_norm": 1.976120855431325, + "language_loss": 0.65226549, + "learning_rate": 1.477094533001364e-07, + "loss": 0.67316681, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34375, + "step": 14646, + "time_per_iteration": 2.4060981273651123 + }, + { + "auxiliary_loss_clip": 0.0105507, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.01517665, + "balance_loss_mlp": 1.0168494, + "epoch": 0.8806252818277469, + "flos": 14902116860160.0, + "grad_norm": 2.400269466136488, + "language_loss": 0.79461151, + "learning_rate": 1.475625963334055e-07, + "loss": 0.81557518, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 14647, + "time_per_iteration": 2.3389861583709717 + }, + { + "auxiliary_loss_clip": 0.01049218, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.01163614, + "balance_loss_mlp": 1.0157311, + "epoch": 0.8806854050804148, + "flos": 17638225200000.0, + "grad_norm": 2.0125180049269926, + "language_loss": 0.76866156, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.7894727, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33398438, + "step": 14648, + "time_per_iteration": 3.6478703022003174 + }, + { + "auxiliary_loss_clip": 0.01049628, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.0126543, + "balance_loss_mlp": 1.01422679, + "epoch": 0.8807455283330828, + "flos": 25331372064000.0, + "grad_norm": 1.6080929328129512, + "language_loss": 0.6663149, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.68715352, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35351562, + "step": 14649, + "time_per_iteration": 2.41279673576355 + }, + { + "auxiliary_loss_clip": 0.01052058, + "auxiliary_loss_mlp": 0.01040806, + "balance_loss_clip": 1.01683319, + "balance_loss_mlp": 1.01696026, + "epoch": 0.8808056515857507, + "flos": 25263989406720.0, + "grad_norm": 1.3329406428238044, + "language_loss": 0.6316244, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.65255308, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 14650, + "time_per_iteration": 2.417815923690796 + }, + { + "auxiliary_loss_clip": 0.01050257, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.01258087, + "balance_loss_mlp": 1.01643729, + "epoch": 0.8808657748384188, + "flos": 26577631422720.0, + "grad_norm": 1.4319901573395113, + "language_loss": 0.73330861, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.7541517, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33789062, + "step": 14651, + "time_per_iteration": 2.4618141651153564 + }, + { + "auxiliary_loss_clip": 0.01053462, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.01385057, + "balance_loss_mlp": 1.01649153, + "epoch": 0.8809258980910867, + "flos": 18660237696000.0, + "grad_norm": 1.871973372133509, + "language_loss": 0.7245481, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74546343, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 14652, + "time_per_iteration": 2.3723065853118896 + }, + { + "auxiliary_loss_clip": 0.01049086, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.01368034, + "balance_loss_mlp": 1.01459944, + "epoch": 0.8809860213437547, + "flos": 19791249056640.0, + "grad_norm": 6.689570025707998, + "language_loss": 0.76491225, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.78576511, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34375, + "step": 14653, + "time_per_iteration": 2.3501734733581543 + }, + { + "auxiliary_loss_clip": 0.01053274, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_clip": 1.01880479, + "balance_loss_mlp": 1.01666164, + "epoch": 0.8810461445964227, + "flos": 17893510128000.0, + "grad_norm": 1.8188093949543576, + "language_loss": 0.73121566, + "learning_rate": 1.465365647269421e-07, + "loss": 0.75216424, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 14654, + "time_per_iteration": 2.3689918518066406 + }, + { + "auxiliary_loss_clip": 0.01051011, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.01485336, + "balance_loss_mlp": 1.01657224, + "epoch": 0.8811062678490906, + "flos": 29162775576960.0, + "grad_norm": 2.0620574642171587, + "language_loss": 0.72510076, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.74599451, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.34375, + "step": 14655, + "time_per_iteration": 2.4395909309387207 + }, + { + "auxiliary_loss_clip": 0.01050533, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.01305604, + "balance_loss_mlp": 1.0154618, + "epoch": 0.8811663911017587, + "flos": 20337814391040.0, + "grad_norm": 1.8744210317203864, + "language_loss": 0.82878172, + "learning_rate": 1.462440453077449e-07, + "loss": 0.84964466, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 14656, + "time_per_iteration": 2.3804893493652344 + }, + { + "auxiliary_loss_clip": 0.01051011, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01362276, + "balance_loss_mlp": 1.01614571, + "epoch": 0.8812265143544266, + "flos": 25884500734080.0, + "grad_norm": 2.1344815841140194, + "language_loss": 0.69965732, + "learning_rate": 1.460978910372914e-07, + "loss": 0.72051948, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34765625, + "step": 14657, + "time_per_iteration": 3.837470531463623 + }, + { + "auxiliary_loss_clip": 0.01053081, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.01483345, + "balance_loss_mlp": 1.0172956, + "epoch": 0.8812866376070946, + "flos": 27194372323200.0, + "grad_norm": 1.8517487216035111, + "language_loss": 0.84944594, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.87035704, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 14658, + "time_per_iteration": 3.8206231594085693 + }, + { + "auxiliary_loss_clip": 0.01057772, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.0142796, + "balance_loss_mlp": 1.01898694, + "epoch": 0.8813467608597625, + "flos": 23806016363520.0, + "grad_norm": 2.0338500213524653, + "language_loss": 0.78058231, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.80155736, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 14659, + "time_per_iteration": 2.394940137863159 + }, + { + "auxiliary_loss_clip": 0.01051061, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.01250982, + "balance_loss_mlp": 1.01579213, + "epoch": 0.8814068841124305, + "flos": 21104402313600.0, + "grad_norm": 1.9921804425670369, + "language_loss": 0.61636269, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.63722467, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 14660, + "time_per_iteration": 2.3532655239105225 + }, + { + "auxiliary_loss_clip": 0.0105099, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.01180625, + "balance_loss_mlp": 1.01545763, + "epoch": 0.8814670073650984, + "flos": 24715853061120.0, + "grad_norm": 1.743900398888105, + "language_loss": 0.78071612, + "learning_rate": 1.455139770123972e-07, + "loss": 0.80158907, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.35546875, + "step": 14661, + "time_per_iteration": 2.403336763381958 + }, + { + "auxiliary_loss_clip": 0.01053497, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.01564491, + "balance_loss_mlp": 1.01727676, + "epoch": 0.8815271306177664, + "flos": 22965168245760.0, + "grad_norm": 2.428569271424006, + "language_loss": 0.77548337, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79640162, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 14662, + "time_per_iteration": 2.3714115619659424 + }, + { + "auxiliary_loss_clip": 0.01050505, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.01412249, + "balance_loss_mlp": 1.01601386, + "epoch": 0.8815872538704344, + "flos": 19459155404160.0, + "grad_norm": 2.2776404795246683, + "language_loss": 0.75055641, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.77140737, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34570312, + "step": 14663, + "time_per_iteration": 2.3559348583221436 + }, + { + "auxiliary_loss_clip": 0.01050208, + "auxiliary_loss_mlp": 0.01042705, + "balance_loss_clip": 1.01980519, + "balance_loss_mlp": 1.01623964, + "epoch": 0.8816473771231024, + "flos": 32155355831040.0, + "grad_norm": 1.5845917911179896, + "language_loss": 0.70784259, + "learning_rate": 1.450767798584489e-07, + "loss": 0.72877175, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.33984375, + "step": 14664, + "time_per_iteration": 2.455859422683716 + }, + { + "auxiliary_loss_clip": 0.01050418, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.013291, + "balance_loss_mlp": 1.01588047, + "epoch": 0.8817075003757703, + "flos": 19681272673920.0, + "grad_norm": 1.539708869698651, + "language_loss": 0.81569707, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83656877, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34570312, + "step": 14665, + "time_per_iteration": 2.368464469909668 + }, + { + "auxiliary_loss_clip": 0.01052585, + "auxiliary_loss_mlp": 0.01038663, + "balance_loss_clip": 1.01624012, + "balance_loss_mlp": 1.01632977, + "epoch": 0.8817676236284383, + "flos": 15667727264640.0, + "grad_norm": 2.0474168163263826, + "language_loss": 0.59539372, + "learning_rate": 1.447856667743117e-07, + "loss": 0.61630619, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36328125, + "step": 14666, + "time_per_iteration": 2.357415199279785 + }, + { + "auxiliary_loss_clip": 0.01052752, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_clip": 1.01678634, + "balance_loss_mlp": 1.01666522, + "epoch": 0.8818277468811063, + "flos": 17894208355200.0, + "grad_norm": 1.9218399007873008, + "language_loss": 0.85442472, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.87537593, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36132812, + "step": 14667, + "time_per_iteration": 2.394066333770752 + }, + { + "auxiliary_loss_clip": 0.01050194, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.0150398, + "balance_loss_mlp": 1.01538372, + "epoch": 0.8818878701337742, + "flos": 18769515851520.0, + "grad_norm": 2.145806635719087, + "language_loss": 0.63057584, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.65145445, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14668, + "time_per_iteration": 2.362997055053711 + }, + { + "auxiliary_loss_clip": 0.0104937, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.01071525, + "balance_loss_mlp": 1.01548839, + "epoch": 0.8819479933864423, + "flos": 17711333320320.0, + "grad_norm": 2.1400383437071757, + "language_loss": 0.5858646, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.60667193, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33984375, + "step": 14669, + "time_per_iteration": 2.3936922550201416 + }, + { + "auxiliary_loss_clip": 0.01050351, + "auxiliary_loss_mlp": 0.01035402, + "balance_loss_clip": 1.01417065, + "balance_loss_mlp": 1.01585484, + "epoch": 0.8820081166391102, + "flos": 11727953291520.0, + "grad_norm": 2.8966416637383867, + "language_loss": 0.72972214, + "learning_rate": 1.442042848491043e-07, + "loss": 0.75057971, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 14670, + "time_per_iteration": 2.3401994705200195 + }, + { + "auxiliary_loss_clip": 0.0105006, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.01188159, + "balance_loss_mlp": 1.01508272, + "epoch": 0.8820682398917782, + "flos": 27489143865600.0, + "grad_norm": 2.2853226442424344, + "language_loss": 0.74422979, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76508516, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.34960938, + "step": 14671, + "time_per_iteration": 2.410701036453247 + }, + { + "auxiliary_loss_clip": 0.01053555, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.01065016, + "balance_loss_mlp": 1.0156486, + "epoch": 0.8821283631444461, + "flos": 16872894086400.0, + "grad_norm": 2.0566855698135673, + "language_loss": 0.85772175, + "learning_rate": 1.43914016096218e-07, + "loss": 0.87862897, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 14672, + "time_per_iteration": 3.7538070678710938 + }, + { + "auxiliary_loss_clip": 0.01049391, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.00861669, + "balance_loss_mlp": 1.01528728, + "epoch": 0.8821884863971141, + "flos": 24279787779840.0, + "grad_norm": 1.5034026200337745, + "language_loss": 0.73692447, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.75773382, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.33984375, + "step": 14673, + "time_per_iteration": 2.3869614601135254 + }, + { + "auxiliary_loss_clip": 0.01007123, + "auxiliary_loss_mlp": 0.01003396, + "balance_loss_clip": 1.0013932, + "balance_loss_mlp": 1.00062084, + "epoch": 0.882248609649782, + "flos": 59428734428160.0, + "grad_norm": 0.8114662331786209, + "language_loss": 0.49533847, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51544362, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06494141, + "step": 14674, + "time_per_iteration": 3.1172256469726562 + }, + { + "auxiliary_loss_clip": 0.01052695, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.01649714, + "balance_loss_mlp": 1.01632392, + "epoch": 0.88230873290245, + "flos": 19936767070080.0, + "grad_norm": 1.8987643759859028, + "language_loss": 0.77921367, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.80015063, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36328125, + "step": 14675, + "time_per_iteration": 2.3450896739959717 + }, + { + "auxiliary_loss_clip": 0.01049249, + "auxiliary_loss_mlp": 0.01035826, + "balance_loss_clip": 1.01197219, + "balance_loss_mlp": 1.01452637, + "epoch": 0.882368856155118, + "flos": 16361835471360.0, + "grad_norm": 1.8177934753532914, + "language_loss": 0.8091265, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.82997721, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34765625, + "step": 14676, + "time_per_iteration": 2.3451733589172363 + }, + { + "auxiliary_loss_clip": 0.01007444, + "auxiliary_loss_mlp": 0.01002793, + "balance_loss_clip": 1.00059938, + "balance_loss_mlp": 1.00081897, + "epoch": 0.882428979407786, + "flos": 70590711669120.0, + "grad_norm": 0.6968071521777633, + "language_loss": 0.549335, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56943738, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06640625, + "step": 14677, + "time_per_iteration": 3.1387436389923096 + }, + { + "auxiliary_loss_clip": 0.01049464, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.01230037, + "balance_loss_mlp": 1.0149684, + "epoch": 0.8824891026604539, + "flos": 18149318726400.0, + "grad_norm": 2.214177729229667, + "language_loss": 0.66934329, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.69019634, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34570312, + "step": 14678, + "time_per_iteration": 2.3951282501220703 + }, + { + "auxiliary_loss_clip": 0.0105312, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.01032472, + "balance_loss_mlp": 1.01602066, + "epoch": 0.8825492259131219, + "flos": 27231554787840.0, + "grad_norm": 1.9129401106567923, + "language_loss": 0.72074437, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.7416153, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 14679, + "time_per_iteration": 2.416346549987793 + }, + { + "auxiliary_loss_clip": 0.01050591, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01294005, + "balance_loss_mlp": 1.0165031, + "epoch": 0.8826093491657898, + "flos": 22273294366080.0, + "grad_norm": 1.6948903633377934, + "language_loss": 0.64779937, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66864944, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 14680, + "time_per_iteration": 2.384524345397949 + }, + { + "auxiliary_loss_clip": 0.01049866, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.01093423, + "balance_loss_mlp": 1.01552081, + "epoch": 0.8826694724184578, + "flos": 14204028758400.0, + "grad_norm": 2.399927093588753, + "language_loss": 0.78623712, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.80706662, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34375, + "step": 14681, + "time_per_iteration": 2.311941146850586 + }, + { + "auxiliary_loss_clip": 0.0105255, + "auxiliary_loss_mlp": 0.01035676, + "balance_loss_clip": 1.01128626, + "balance_loss_mlp": 1.01663375, + "epoch": 0.8827295956711259, + "flos": 20630630897280.0, + "grad_norm": 2.238339378226848, + "language_loss": 0.74864483, + "learning_rate": 1.424668961888047e-07, + "loss": 0.76952714, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 14682, + "time_per_iteration": 2.3667376041412354 + }, + { + "auxiliary_loss_clip": 0.01053956, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.01178026, + "balance_loss_mlp": 1.01694298, + "epoch": 0.8827897189237938, + "flos": 18512136241920.0, + "grad_norm": 1.7613403498478073, + "language_loss": 0.76048768, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.78139758, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 14683, + "time_per_iteration": 2.3584680557250977 + }, + { + "auxiliary_loss_clip": 0.01051962, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.0094142, + "balance_loss_mlp": 1.01613927, + "epoch": 0.8828498421764618, + "flos": 22746297732480.0, + "grad_norm": 1.7964198103156148, + "language_loss": 0.66651493, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.68734628, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.359375, + "step": 14684, + "time_per_iteration": 2.3624069690704346 + }, + { + "auxiliary_loss_clip": 0.01049837, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.01442611, + "balance_loss_mlp": 1.01543307, + "epoch": 0.8829099654291297, + "flos": 15011499749760.0, + "grad_norm": 2.176683169484656, + "language_loss": 0.70344186, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.72431898, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34375, + "step": 14685, + "time_per_iteration": 2.3525054454803467 + }, + { + "auxiliary_loss_clip": 0.01052362, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.01353455, + "balance_loss_mlp": 1.01612818, + "epoch": 0.8829700886817977, + "flos": 16719800307840.0, + "grad_norm": 1.9047767798172004, + "language_loss": 0.75565302, + "learning_rate": 1.418900201783806e-07, + "loss": 0.77654034, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 14686, + "time_per_iteration": 2.3587822914123535 + }, + { + "auxiliary_loss_clip": 0.01049162, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01012969, + "balance_loss_mlp": 1.01504493, + "epoch": 0.8830302119344656, + "flos": 15262490580480.0, + "grad_norm": 1.787811810046663, + "language_loss": 0.64587873, + "learning_rate": 1.417459773114007e-07, + "loss": 0.6666823, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 14687, + "time_per_iteration": 3.609645366668701 + }, + { + "auxiliary_loss_clip": 0.01051997, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.01169693, + "balance_loss_mlp": 1.01634121, + "epoch": 0.8830903351871336, + "flos": 28616594267520.0, + "grad_norm": 1.7653816116382128, + "language_loss": 0.69407117, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71493661, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 14688, + "time_per_iteration": 2.426697254180908 + }, + { + "auxiliary_loss_clip": 0.01049821, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.00864613, + "balance_loss_mlp": 1.01560903, + "epoch": 0.8831504584398016, + "flos": 28000377037440.0, + "grad_norm": 1.6703653959792895, + "language_loss": 0.68168277, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.70248675, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 14689, + "time_per_iteration": 2.442763328552246 + }, + { + "auxiliary_loss_clip": 0.01053745, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.01518536, + "balance_loss_mlp": 1.01823211, + "epoch": 0.8832105816924696, + "flos": 26578399472640.0, + "grad_norm": 1.3949429825611912, + "language_loss": 0.75300866, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.77392268, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 14690, + "time_per_iteration": 2.5216026306152344 + }, + { + "auxiliary_loss_clip": 0.0105328, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.01534998, + "balance_loss_mlp": 1.01686525, + "epoch": 0.8832707049451375, + "flos": 24897645843840.0, + "grad_norm": 1.4484521300837756, + "language_loss": 0.74007756, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.76100761, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 14691, + "time_per_iteration": 2.417863607406616 + }, + { + "auxiliary_loss_clip": 0.01054594, + "auxiliary_loss_mlp": 0.01036329, + "balance_loss_clip": 1.0104841, + "balance_loss_mlp": 1.01661837, + "epoch": 0.8833308281978055, + "flos": 15450218294400.0, + "grad_norm": 1.9498668780143102, + "language_loss": 0.53616554, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.55707479, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 14692, + "time_per_iteration": 2.3652570247650146 + }, + { + "auxiliary_loss_clip": 0.01052606, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.01020598, + "balance_loss_mlp": 1.01648331, + "epoch": 0.8833909514504734, + "flos": 20300527192320.0, + "grad_norm": 3.702451205214102, + "language_loss": 0.61470395, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.63556272, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 14693, + "time_per_iteration": 2.35591459274292 + }, + { + "auxiliary_loss_clip": 0.01049797, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.00978625, + "balance_loss_mlp": 1.01666665, + "epoch": 0.8834510747031414, + "flos": 20373041819520.0, + "grad_norm": 1.4888641858912761, + "language_loss": 0.76384771, + "learning_rate": 1.407396505730898e-07, + "loss": 0.78464603, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.33203125, + "step": 14694, + "time_per_iteration": 2.3568222522735596 + }, + { + "auxiliary_loss_clip": 0.01052312, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.01148927, + "balance_loss_mlp": 1.01519537, + "epoch": 0.8835111979558095, + "flos": 29750747650560.0, + "grad_norm": 1.8730629576668674, + "language_loss": 0.73703134, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.75790668, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37109375, + "step": 14695, + "time_per_iteration": 2.423276424407959 + }, + { + "auxiliary_loss_clip": 0.01047532, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.00857687, + "balance_loss_mlp": 1.01509774, + "epoch": 0.8835713212084774, + "flos": 24133396982400.0, + "grad_norm": 1.6818178710568183, + "language_loss": 0.81305408, + "learning_rate": 1.404527630961998e-07, + "loss": 0.83382058, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.32421875, + "step": 14696, + "time_per_iteration": 2.375077486038208 + }, + { + "auxiliary_loss_clip": 0.01051062, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.01529026, + "balance_loss_mlp": 1.01603413, + "epoch": 0.8836314444611454, + "flos": 27671111205120.0, + "grad_norm": 1.5379292139354235, + "language_loss": 0.76208556, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.78298599, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34960938, + "step": 14697, + "time_per_iteration": 3.811378240585327 + }, + { + "auxiliary_loss_clip": 0.01051016, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.01105762, + "balance_loss_mlp": 1.01660299, + "epoch": 0.8836915677138133, + "flos": 16836025824000.0, + "grad_norm": 2.0535885235468787, + "language_loss": 0.73154414, + "learning_rate": 1.401661576761779e-07, + "loss": 0.75239205, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34375, + "step": 14698, + "time_per_iteration": 3.6180996894836426 + }, + { + "auxiliary_loss_clip": 0.0100735, + "auxiliary_loss_mlp": 0.01003856, + "balance_loss_clip": 1.00167501, + "balance_loss_mlp": 1.00084519, + "epoch": 0.8837516909664813, + "flos": 69306920732160.0, + "grad_norm": 0.7989470629199014, + "language_loss": 0.53810358, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55821562, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.06494141, + "step": 14699, + "time_per_iteration": 3.01292085647583 + }, + { + "auxiliary_loss_clip": 0.01053815, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.01017833, + "balance_loss_mlp": 1.01712084, + "epoch": 0.8838118142191492, + "flos": 21323656851840.0, + "grad_norm": 1.9141900783580514, + "language_loss": 0.78101885, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.80190128, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 14700, + "time_per_iteration": 2.365626573562622 + }, + { + "auxiliary_loss_clip": 0.01049466, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.01040888, + "balance_loss_mlp": 1.01542211, + "epoch": 0.8838719374718172, + "flos": 21469489067520.0, + "grad_norm": 2.359922764632413, + "language_loss": 0.74135756, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.76216912, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 14701, + "time_per_iteration": 2.3747668266296387 + }, + { + "auxiliary_loss_clip": 0.01053228, + "auxiliary_loss_mlp": 0.01042608, + "balance_loss_clip": 1.01855171, + "balance_loss_mlp": 1.01667762, + "epoch": 0.8839320607244852, + "flos": 26467724862720.0, + "grad_norm": 3.2824409452121195, + "language_loss": 0.72016478, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.74112308, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 14702, + "time_per_iteration": 2.44448184967041 + }, + { + "auxiliary_loss_clip": 0.0105161, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.01390672, + "balance_loss_mlp": 1.01631165, + "epoch": 0.8839921839771532, + "flos": 45220553084160.0, + "grad_norm": 1.5725787898043782, + "language_loss": 0.72624266, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.7471453, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3515625, + "step": 14703, + "time_per_iteration": 2.5852670669555664 + }, + { + "auxiliary_loss_clip": 0.01050646, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.01414037, + "balance_loss_mlp": 1.0164783, + "epoch": 0.8840523072298211, + "flos": 20005965118080.0, + "grad_norm": 1.6828188912172477, + "language_loss": 0.67782617, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.69867229, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.34179688, + "step": 14704, + "time_per_iteration": 2.3586885929107666 + }, + { + "auxiliary_loss_clip": 0.01048882, + "auxiliary_loss_mlp": 0.01033918, + "balance_loss_clip": 1.01342618, + "balance_loss_mlp": 1.01516843, + "epoch": 0.8841124304824891, + "flos": 24424851945600.0, + "grad_norm": 1.6599694063504173, + "language_loss": 0.71256185, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.73338985, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3359375, + "step": 14705, + "time_per_iteration": 2.4272680282592773 + }, + { + "auxiliary_loss_clip": 0.0105052, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.01505423, + "balance_loss_mlp": 1.01620805, + "epoch": 0.884172553735157, + "flos": 31283295091200.0, + "grad_norm": 1.588152852573069, + "language_loss": 0.71838397, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.7392441, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 14706, + "time_per_iteration": 2.4467341899871826 + }, + { + "auxiliary_loss_clip": 0.01049402, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.01127243, + "balance_loss_mlp": 1.01464963, + "epoch": 0.884232676987825, + "flos": 21390271459200.0, + "grad_norm": 2.4598819558046205, + "language_loss": 0.75650251, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.77732873, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14707, + "time_per_iteration": 2.356515407562256 + }, + { + "auxiliary_loss_clip": 0.01007273, + "auxiliary_loss_mlp": 0.01001697, + "balance_loss_clip": 0.99965864, + "balance_loss_mlp": 1.0007714, + "epoch": 0.8842928002404931, + "flos": 57908582386560.0, + "grad_norm": 0.795656255042947, + "language_loss": 0.60505176, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62514138, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06494141, + "step": 14708, + "time_per_iteration": 2.85077166557312 + }, + { + "auxiliary_loss_clip": 0.0104803, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.01656508, + "balance_loss_mlp": 1.01502299, + "epoch": 0.884352923493161, + "flos": 41462292602880.0, + "grad_norm": 1.6374904675139312, + "language_loss": 0.67917585, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.70002663, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33007812, + "step": 14709, + "time_per_iteration": 2.5372962951660156 + }, + { + "auxiliary_loss_clip": 0.01053454, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.01723993, + "balance_loss_mlp": 1.01567674, + "epoch": 0.884413046745829, + "flos": 46539327070080.0, + "grad_norm": 1.5333441710305382, + "language_loss": 0.63940382, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.660357, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37695312, + "step": 14710, + "time_per_iteration": 2.621408224105835 + }, + { + "auxiliary_loss_clip": 0.01049688, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.00985754, + "balance_loss_mlp": 1.01560938, + "epoch": 0.8844731699984969, + "flos": 19134323314560.0, + "grad_norm": 2.3491182491360996, + "language_loss": 0.64527941, + "learning_rate": 1.38310100580431e-07, + "loss": 0.6660828, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33984375, + "step": 14711, + "time_per_iteration": 3.8813204765319824 + }, + { + "auxiliary_loss_clip": 0.01052795, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.01097965, + "balance_loss_mlp": 1.01516509, + "epoch": 0.8845332932511649, + "flos": 23259451029120.0, + "grad_norm": 2.186700473621865, + "language_loss": 0.76560366, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78648031, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37695312, + "step": 14712, + "time_per_iteration": 2.3994243144989014 + }, + { + "auxiliary_loss_clip": 0.01050583, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.01016128, + "balance_loss_mlp": 1.0152328, + "epoch": 0.8845934165038328, + "flos": 17563685713920.0, + "grad_norm": 2.158509219939818, + "language_loss": 0.8210606, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.84188646, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 14713, + "time_per_iteration": 2.3216567039489746 + }, + { + "auxiliary_loss_clip": 0.0105115, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.01248717, + "balance_loss_mlp": 1.01582003, + "epoch": 0.8846535397565009, + "flos": 27484640300160.0, + "grad_norm": 1.4340263077516178, + "language_loss": 0.56665593, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.58752179, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 14714, + "time_per_iteration": 2.4429399967193604 + }, + { + "auxiliary_loss_clip": 0.01050037, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.01343179, + "balance_loss_mlp": 1.01549053, + "epoch": 0.8847136630091688, + "flos": 28760331801600.0, + "grad_norm": 2.6361253651312553, + "language_loss": 0.75395221, + "learning_rate": 1.377414057838755e-07, + "loss": 0.77481115, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 14715, + "time_per_iteration": 2.4262616634368896 + }, + { + "auxiliary_loss_clip": 0.01050731, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.01589096, + "balance_loss_mlp": 1.01564837, + "epoch": 0.8847737862618368, + "flos": 23475807924480.0, + "grad_norm": 1.5739359227451473, + "language_loss": 0.75882447, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77971023, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 14716, + "time_per_iteration": 2.377725124359131 + }, + { + "auxiliary_loss_clip": 0.0105159, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.01428473, + "balance_loss_mlp": 1.01720953, + "epoch": 0.8848339095145047, + "flos": 18659888582400.0, + "grad_norm": 2.0499895720712336, + "language_loss": 0.72050154, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.74137974, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 14717, + "time_per_iteration": 2.3739850521087646 + }, + { + "auxiliary_loss_clip": 0.01048642, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.01408613, + "balance_loss_mlp": 1.01594162, + "epoch": 0.8848940327671727, + "flos": 32268928083840.0, + "grad_norm": 1.9122782425007487, + "language_loss": 0.7586118, + "learning_rate": 1.373156261464208e-07, + "loss": 0.77943885, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.328125, + "step": 14718, + "time_per_iteration": 2.440760850906372 + }, + { + "auxiliary_loss_clip": 0.01053039, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01301634, + "balance_loss_mlp": 1.01659489, + "epoch": 0.8849541560198406, + "flos": 24020767336320.0, + "grad_norm": 1.4719794843831806, + "language_loss": 0.79551363, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.81640351, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 14719, + "time_per_iteration": 2.448371410369873 + }, + { + "auxiliary_loss_clip": 0.01051962, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.01079273, + "balance_loss_mlp": 1.01623249, + "epoch": 0.8850142792725086, + "flos": 16872126036480.0, + "grad_norm": 1.6746753510534922, + "language_loss": 0.72914875, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.75000918, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35742188, + "step": 14720, + "time_per_iteration": 2.321080446243286 + }, + { + "auxiliary_loss_clip": 0.01052599, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.01577258, + "balance_loss_mlp": 1.01611102, + "epoch": 0.8850744025251767, + "flos": 24022931840640.0, + "grad_norm": 1.9529949028842144, + "language_loss": 0.83635426, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.85726821, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 14721, + "time_per_iteration": 2.387066602706909 + }, + { + "auxiliary_loss_clip": 0.01050375, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.00934911, + "balance_loss_mlp": 1.01481175, + "epoch": 0.8851345257778446, + "flos": 47953868515200.0, + "grad_norm": 1.9481566897519562, + "language_loss": 0.63804162, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.65888608, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35546875, + "step": 14722, + "time_per_iteration": 2.5906805992126465 + }, + { + "auxiliary_loss_clip": 0.01052908, + "auxiliary_loss_mlp": 0.01034638, + "balance_loss_clip": 1.01145184, + "balance_loss_mlp": 1.01658392, + "epoch": 0.8851946490305126, + "flos": 36609539909760.0, + "grad_norm": 1.979313157385034, + "language_loss": 0.70039159, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.7212671, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36328125, + "step": 14723, + "time_per_iteration": 2.50056529045105 + }, + { + "auxiliary_loss_clip": 0.01052215, + "auxiliary_loss_mlp": 0.01032693, + "balance_loss_clip": 1.01124787, + "balance_loss_mlp": 1.01702714, + "epoch": 0.8852547722831805, + "flos": 21543155769600.0, + "grad_norm": 2.243513411189543, + "language_loss": 0.7863248, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80717397, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 14724, + "time_per_iteration": 2.374720335006714 + }, + { + "auxiliary_loss_clip": 0.01007401, + "auxiliary_loss_mlp": 0.01004621, + "balance_loss_clip": 1.00277317, + "balance_loss_mlp": 1.00095606, + "epoch": 0.8853148955358485, + "flos": 63056141533440.0, + "grad_norm": 0.7989246139628524, + "language_loss": 0.59033334, + "learning_rate": 1.363246127376143e-07, + "loss": 0.61045361, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.06445312, + "step": 14725, + "time_per_iteration": 2.890690803527832 + }, + { + "auxiliary_loss_clip": 0.01053474, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.01595581, + "balance_loss_mlp": 1.01566041, + "epoch": 0.8853750187885164, + "flos": 18148864878720.0, + "grad_norm": 2.0890331679198177, + "language_loss": 0.70723808, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.72817194, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37890625, + "step": 14726, + "time_per_iteration": 2.3660340309143066 + }, + { + "auxiliary_loss_clip": 0.01050444, + "auxiliary_loss_mlp": 0.01044051, + "balance_loss_clip": 1.02171135, + "balance_loss_mlp": 1.01638317, + "epoch": 0.8854351420411845, + "flos": 39568882682880.0, + "grad_norm": 1.9561295117410014, + "language_loss": 0.70124018, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.72218513, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.33984375, + "step": 14727, + "time_per_iteration": 3.797562599182129 + }, + { + "auxiliary_loss_clip": 0.01052469, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.01334381, + "balance_loss_mlp": 1.0171572, + "epoch": 0.8854952652938524, + "flos": 23768170583040.0, + "grad_norm": 1.8164049728761797, + "language_loss": 0.71159363, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.73247921, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 14728, + "time_per_iteration": 2.408064842224121 + }, + { + "auxiliary_loss_clip": 0.01050462, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.01621866, + "balance_loss_mlp": 1.01482725, + "epoch": 0.8855553885465204, + "flos": 18289495301760.0, + "grad_norm": 2.741858823085044, + "language_loss": 0.67745548, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.69834352, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 14729, + "time_per_iteration": 2.4033734798431396 + }, + { + "auxiliary_loss_clip": 0.0105042, + "auxiliary_loss_mlp": 0.01037747, + "balance_loss_clip": 1.01636076, + "balance_loss_mlp": 1.01600695, + "epoch": 0.8856155117991883, + "flos": 36865907089920.0, + "grad_norm": 1.5516827242790097, + "language_loss": 0.6382696, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.6591512, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34570312, + "step": 14730, + "time_per_iteration": 2.5201025009155273 + }, + { + "auxiliary_loss_clip": 0.01048155, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.01282787, + "balance_loss_mlp": 1.0146184, + "epoch": 0.8856756350518563, + "flos": 22162794312960.0, + "grad_norm": 1.4653830231208116, + "language_loss": 0.80333817, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.82416111, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33398438, + "step": 14731, + "time_per_iteration": 2.4182229042053223 + }, + { + "auxiliary_loss_clip": 0.01052241, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.01616931, + "balance_loss_mlp": 1.01631618, + "epoch": 0.8857357583045242, + "flos": 20739909052800.0, + "grad_norm": 4.395653692647698, + "language_loss": 0.8436166, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.86453402, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 14732, + "time_per_iteration": 2.397047996520996 + }, + { + "auxiliary_loss_clip": 0.01007368, + "auxiliary_loss_mlp": 0.01003941, + "balance_loss_clip": 1.00180757, + "balance_loss_mlp": 1.00076258, + "epoch": 0.8857958815571922, + "flos": 69888748406400.0, + "grad_norm": 0.9096958495326123, + "language_loss": 0.6009661, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.62107921, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.06640625, + "step": 14733, + "time_per_iteration": 3.0329487323760986 + }, + { + "auxiliary_loss_clip": 0.01052529, + "auxiliary_loss_mlp": 0.01042485, + "balance_loss_clip": 1.01996636, + "balance_loss_mlp": 1.01688588, + "epoch": 0.8858560048098603, + "flos": 15121057196160.0, + "grad_norm": 2.092835623421941, + "language_loss": 0.67863953, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.69958973, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 14734, + "time_per_iteration": 2.4193387031555176 + }, + { + "auxiliary_loss_clip": 0.01051231, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.01223874, + "balance_loss_mlp": 1.01647568, + "epoch": 0.8859161280625282, + "flos": 16610277772800.0, + "grad_norm": 1.9607925285537038, + "language_loss": 0.76260614, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.78346455, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 14735, + "time_per_iteration": 2.3928422927856445 + }, + { + "auxiliary_loss_clip": 0.01051038, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.01626885, + "balance_loss_mlp": 1.01537526, + "epoch": 0.8859762513151962, + "flos": 18693579911040.0, + "grad_norm": 1.7051663776893324, + "language_loss": 0.71580184, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.73670012, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 14736, + "time_per_iteration": 2.359177827835083 + }, + { + "auxiliary_loss_clip": 0.0105326, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.01058936, + "balance_loss_mlp": 1.01679039, + "epoch": 0.8860363745678641, + "flos": 19535859394560.0, + "grad_norm": 1.7685525722404087, + "language_loss": 0.85681939, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.87768233, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36328125, + "step": 14737, + "time_per_iteration": 3.7893126010894775 + }, + { + "auxiliary_loss_clip": 0.01053817, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.01852572, + "balance_loss_mlp": 1.01583982, + "epoch": 0.8860964978205321, + "flos": 35953452040320.0, + "grad_norm": 2.0431015404724406, + "language_loss": 0.69739789, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.71836519, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37890625, + "step": 14738, + "time_per_iteration": 3.8634164333343506 + }, + { + "auxiliary_loss_clip": 0.010536, + "auxiliary_loss_mlp": 0.01041016, + "balance_loss_clip": 1.01598167, + "balance_loss_mlp": 1.01619601, + "epoch": 0.8861566210732, + "flos": 21211585787520.0, + "grad_norm": 1.6946612579399456, + "language_loss": 0.75922263, + "learning_rate": 1.343529763547222e-07, + "loss": 0.78016877, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37304688, + "step": 14739, + "time_per_iteration": 2.3862528800964355 + }, + { + "auxiliary_loss_clip": 0.01049536, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.01403451, + "balance_loss_mlp": 1.01528418, + "epoch": 0.886216744325868, + "flos": 14608253013120.0, + "grad_norm": 1.9359119899609276, + "language_loss": 0.88140976, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.90226126, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34179688, + "step": 14740, + "time_per_iteration": 2.3734538555145264 + }, + { + "auxiliary_loss_clip": 0.01052029, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.01405609, + "balance_loss_mlp": 1.01648736, + "epoch": 0.886276867578536, + "flos": 26650425340800.0, + "grad_norm": 1.987199339541, + "language_loss": 0.64086175, + "learning_rate": 1.34072445601471e-07, + "loss": 0.66176087, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 14741, + "time_per_iteration": 2.481691360473633 + }, + { + "auxiliary_loss_clip": 0.01051026, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.01090848, + "balance_loss_mlp": 1.01556277, + "epoch": 0.886336990831204, + "flos": 16763127171840.0, + "grad_norm": 1.73085471409354, + "language_loss": 0.73896277, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75980484, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14742, + "time_per_iteration": 2.363487958908081 + }, + { + "auxiliary_loss_clip": 0.0105034, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.01181316, + "balance_loss_mlp": 1.01550126, + "epoch": 0.8863971140838719, + "flos": 25264094140800.0, + "grad_norm": 1.9694307494184315, + "language_loss": 0.60638952, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.62722057, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34960938, + "step": 14743, + "time_per_iteration": 2.420274257659912 + }, + { + "auxiliary_loss_clip": 0.01052608, + "auxiliary_loss_mlp": 0.01040359, + "balance_loss_clip": 1.01637459, + "balance_loss_mlp": 1.01574945, + "epoch": 0.8864572373365399, + "flos": 23403188563200.0, + "grad_norm": 1.6898113977038467, + "language_loss": 0.61042082, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.63135052, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 14744, + "time_per_iteration": 2.397066831588745 + }, + { + "auxiliary_loss_clip": 0.01052659, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.01392484, + "balance_loss_mlp": 1.01627779, + "epoch": 0.8865173605892078, + "flos": 18547852429440.0, + "grad_norm": 1.9789426535969046, + "language_loss": 0.77245796, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.79335105, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 14745, + "time_per_iteration": 2.437195301055908 + }, + { + "auxiliary_loss_clip": 0.0105159, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.01316404, + "balance_loss_mlp": 1.01646519, + "epoch": 0.8865774838418758, + "flos": 19024870602240.0, + "grad_norm": 1.6584406010776997, + "language_loss": 0.78799057, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.80887079, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14746, + "time_per_iteration": 2.377253532409668 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01038827, + "balance_loss_clip": 1.0156287, + "balance_loss_mlp": 1.01566684, + "epoch": 0.8866376070945439, + "flos": 22162096085760.0, + "grad_norm": 2.075396867836792, + "language_loss": 0.77899182, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.79991055, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37304688, + "step": 14747, + "time_per_iteration": 2.407752513885498 + }, + { + "auxiliary_loss_clip": 0.01049117, + "auxiliary_loss_mlp": 0.01038714, + "balance_loss_clip": 1.01722097, + "balance_loss_mlp": 1.01566017, + "epoch": 0.8866977303472118, + "flos": 20703215347200.0, + "grad_norm": 1.826140113927889, + "language_loss": 0.83613455, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.85701287, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33398438, + "step": 14748, + "time_per_iteration": 2.3926098346710205 + }, + { + "auxiliary_loss_clip": 0.01051464, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.01603162, + "balance_loss_mlp": 1.01532435, + "epoch": 0.8867578535998798, + "flos": 48792726685440.0, + "grad_norm": 1.9137885497676963, + "language_loss": 0.78548574, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.80638826, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 14749, + "time_per_iteration": 2.617220878601074 + }, + { + "auxiliary_loss_clip": 0.01053265, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.01458168, + "balance_loss_mlp": 1.01593399, + "epoch": 0.8868179768525477, + "flos": 21104262668160.0, + "grad_norm": 1.837622621768525, + "language_loss": 0.71329427, + "learning_rate": 1.328135602550451e-07, + "loss": 0.7342146, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 14750, + "time_per_iteration": 2.419569730758667 + }, + { + "auxiliary_loss_clip": 0.01050963, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.01352942, + "balance_loss_mlp": 1.01599884, + "epoch": 0.8868781001052157, + "flos": 21829967521920.0, + "grad_norm": 2.059354886972471, + "language_loss": 0.60393858, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.62480307, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34960938, + "step": 14751, + "time_per_iteration": 3.86102557182312 + }, + { + "auxiliary_loss_clip": 0.01051463, + "auxiliary_loss_mlp": 0.01041304, + "balance_loss_clip": 1.01663971, + "balance_loss_mlp": 1.01568723, + "epoch": 0.8869382233578836, + "flos": 13515576192000.0, + "grad_norm": 4.517423045563117, + "language_loss": 0.83219039, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.85311806, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 14752, + "time_per_iteration": 2.3574717044830322 + }, + { + "auxiliary_loss_clip": 0.01052169, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.01562786, + "balance_loss_mlp": 1.01575184, + "epoch": 0.8869983466105517, + "flos": 22704053120640.0, + "grad_norm": 1.8931821346306634, + "language_loss": 0.80657399, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82748634, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 14753, + "time_per_iteration": 2.4221737384796143 + }, + { + "auxiliary_loss_clip": 0.01050975, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.01032281, + "balance_loss_mlp": 1.01622844, + "epoch": 0.8870584698632196, + "flos": 15339857886720.0, + "grad_norm": 1.7974820393194133, + "language_loss": 0.66230422, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.68313682, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 14754, + "time_per_iteration": 2.386648416519165 + }, + { + "auxiliary_loss_clip": 0.01052492, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.01563501, + "balance_loss_mlp": 1.01616526, + "epoch": 0.8871185931158876, + "flos": 26614394951040.0, + "grad_norm": 1.99764462649781, + "language_loss": 0.7563355, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.77725947, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 14755, + "time_per_iteration": 2.4014854431152344 + }, + { + "auxiliary_loss_clip": 0.01053283, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.01613152, + "balance_loss_mlp": 1.01636887, + "epoch": 0.8871787163685555, + "flos": 21797951938560.0, + "grad_norm": 1.4774004448743587, + "language_loss": 0.78767437, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.80860794, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36914062, + "step": 14756, + "time_per_iteration": 2.4362778663635254 + }, + { + "auxiliary_loss_clip": 0.01052271, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01498914, + "balance_loss_mlp": 1.01693153, + "epoch": 0.8872388396212235, + "flos": 14902081948800.0, + "grad_norm": 1.965636163090672, + "language_loss": 0.77675962, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.79766792, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 14757, + "time_per_iteration": 2.379281520843506 + }, + { + "auxiliary_loss_clip": 0.0104894, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.01148963, + "balance_loss_mlp": 1.01568437, + "epoch": 0.8872989628738914, + "flos": 26430961334400.0, + "grad_norm": 2.112806029794963, + "language_loss": 0.69003123, + "learning_rate": 1.316993656021632e-07, + "loss": 0.71083891, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33203125, + "step": 14758, + "time_per_iteration": 2.5208089351654053 + }, + { + "auxiliary_loss_clip": 0.01051905, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.01354051, + "balance_loss_mlp": 1.01623714, + "epoch": 0.8873590861265594, + "flos": 48140723445120.0, + "grad_norm": 2.144936148702777, + "language_loss": 0.70096582, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.72185463, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 14759, + "time_per_iteration": 2.681598663330078 + }, + { + "auxiliary_loss_clip": 0.01050447, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.01253402, + "balance_loss_mlp": 1.01530719, + "epoch": 0.8874192093792275, + "flos": 18331984293120.0, + "grad_norm": 1.8033045556908833, + "language_loss": 0.754673, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.77552712, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14760, + "time_per_iteration": 2.4065093994140625 + }, + { + "auxiliary_loss_clip": 0.01052082, + "auxiliary_loss_mlp": 0.01045206, + "balance_loss_clip": 1.02114928, + "balance_loss_mlp": 1.01609313, + "epoch": 0.8874793326318954, + "flos": 17893265748480.0, + "grad_norm": 2.262152577854511, + "language_loss": 0.77920634, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.80017924, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 14761, + "time_per_iteration": 2.3795619010925293 + }, + { + "auxiliary_loss_clip": 0.0105301, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.01881206, + "balance_loss_mlp": 1.01619816, + "epoch": 0.8875394558845634, + "flos": 31100908815360.0, + "grad_norm": 1.6834610228874083, + "language_loss": 0.62578404, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.64673352, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 14762, + "time_per_iteration": 2.497728109359741 + }, + { + "auxiliary_loss_clip": 0.01051459, + "auxiliary_loss_mlp": 0.01045682, + "balance_loss_clip": 1.02073121, + "balance_loss_mlp": 1.01524389, + "epoch": 0.8875995791372313, + "flos": 21140991285120.0, + "grad_norm": 3.3397768794389915, + "language_loss": 0.65686208, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.67783344, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36132812, + "step": 14763, + "time_per_iteration": 2.3707215785980225 + }, + { + "auxiliary_loss_clip": 0.01052006, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.0162549, + "balance_loss_mlp": 1.01583886, + "epoch": 0.8876597023898993, + "flos": 17454233001600.0, + "grad_norm": 2.1742792726391076, + "language_loss": 0.71997464, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.74090338, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 14764, + "time_per_iteration": 2.3449761867523193 + }, + { + "auxiliary_loss_clip": 0.01052846, + "auxiliary_loss_mlp": 0.01037564, + "balance_loss_clip": 1.01360285, + "balance_loss_mlp": 1.01532328, + "epoch": 0.8877198256425672, + "flos": 22706915852160.0, + "grad_norm": 2.1996305781841947, + "language_loss": 0.67721736, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.69812143, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 14765, + "time_per_iteration": 2.45389986038208 + }, + { + "auxiliary_loss_clip": 0.01048485, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.01403773, + "balance_loss_mlp": 1.01534534, + "epoch": 0.8877799488952353, + "flos": 24533955544320.0, + "grad_norm": 1.5272913023761607, + "language_loss": 0.77066612, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.79149461, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33203125, + "step": 14766, + "time_per_iteration": 3.679873466491699 + }, + { + "auxiliary_loss_clip": 0.01050068, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.01459157, + "balance_loss_mlp": 1.01591074, + "epoch": 0.8878400721479032, + "flos": 20958151161600.0, + "grad_norm": 4.558702523987892, + "language_loss": 0.74464536, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.76551402, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34179688, + "step": 14767, + "time_per_iteration": 2.393247365951538 + }, + { + "auxiliary_loss_clip": 0.01048318, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.01009035, + "balance_loss_mlp": 1.01479852, + "epoch": 0.8879001954005712, + "flos": 25294259422080.0, + "grad_norm": 1.8852588770781378, + "language_loss": 0.71725512, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73805249, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3359375, + "step": 14768, + "time_per_iteration": 2.454174041748047 + }, + { + "auxiliary_loss_clip": 0.01050301, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.01419675, + "balance_loss_mlp": 1.01566052, + "epoch": 0.8879603186532391, + "flos": 23184213315840.0, + "grad_norm": 1.9259597198050724, + "language_loss": 0.7154426, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.73630726, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34570312, + "step": 14769, + "time_per_iteration": 2.3738369941711426 + }, + { + "auxiliary_loss_clip": 0.01050298, + "auxiliary_loss_mlp": 0.01038597, + "balance_loss_clip": 1.01734161, + "balance_loss_mlp": 1.01657081, + "epoch": 0.8880204419059071, + "flos": 13654775249280.0, + "grad_norm": 1.8785305647633457, + "language_loss": 0.67695844, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69784737, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3359375, + "step": 14770, + "time_per_iteration": 2.3863718509674072 + }, + { + "auxiliary_loss_clip": 0.01049258, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.01101255, + "balance_loss_mlp": 1.01618695, + "epoch": 0.888080565158575, + "flos": 20630805454080.0, + "grad_norm": 2.095149126672128, + "language_loss": 0.66787046, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.68868905, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33007812, + "step": 14771, + "time_per_iteration": 2.4183762073516846 + }, + { + "auxiliary_loss_clip": 0.01050898, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.01338935, + "balance_loss_mlp": 1.01586962, + "epoch": 0.888140688411243, + "flos": 28618793683200.0, + "grad_norm": 1.4485260225649235, + "language_loss": 0.83245158, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.85330147, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34960938, + "step": 14772, + "time_per_iteration": 2.4486985206604004 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.01019776, + "balance_loss_mlp": 1.01461923, + "epoch": 0.8882008116639111, + "flos": 25519064866560.0, + "grad_norm": 1.515875771520094, + "language_loss": 0.77382445, + "learning_rate": 1.296224737033258e-07, + "loss": 0.79460508, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33007812, + "step": 14773, + "time_per_iteration": 2.404783010482788 + }, + { + "auxiliary_loss_clip": 0.01048162, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.01320779, + "balance_loss_mlp": 1.01437831, + "epoch": 0.888260934916579, + "flos": 27672437836800.0, + "grad_norm": 1.5774279565386107, + "language_loss": 0.76007712, + "learning_rate": 1.294845814469907e-07, + "loss": 0.78091168, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33789062, + "step": 14774, + "time_per_iteration": 2.449115753173828 + }, + { + "auxiliary_loss_clip": 0.01051996, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.013659, + "balance_loss_mlp": 1.01580012, + "epoch": 0.888321058169247, + "flos": 21610154401920.0, + "grad_norm": 2.680319345499952, + "language_loss": 0.7298516, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.75074404, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 14775, + "time_per_iteration": 2.367400884628296 + }, + { + "auxiliary_loss_clip": 0.01049651, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.01124287, + "balance_loss_mlp": 1.01521349, + "epoch": 0.8883811814219149, + "flos": 18148166651520.0, + "grad_norm": 1.690030931142148, + "language_loss": 0.80999327, + "learning_rate": 1.292090097299432e-07, + "loss": 0.83081627, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 14776, + "time_per_iteration": 2.355612277984619 + }, + { + "auxiliary_loss_clip": 0.01054291, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.01272488, + "balance_loss_mlp": 1.01652992, + "epoch": 0.8884413046745829, + "flos": 28323533381760.0, + "grad_norm": 1.981990844084084, + "language_loss": 0.70823574, + "learning_rate": 1.290713302796802e-07, + "loss": 0.72915506, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37695312, + "step": 14777, + "time_per_iteration": 5.131423711776733 + }, + { + "auxiliary_loss_clip": 0.01049743, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.01646948, + "balance_loss_mlp": 1.015607, + "epoch": 0.8885014279272508, + "flos": 15157855635840.0, + "grad_norm": 1.7185832991799852, + "language_loss": 0.71828926, + "learning_rate": 1.2893372177522e-07, + "loss": 0.73917788, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34179688, + "step": 14778, + "time_per_iteration": 2.3240807056427 + }, + { + "auxiliary_loss_clip": 0.01049859, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.01321054, + "balance_loss_mlp": 1.0151149, + "epoch": 0.8885615511799189, + "flos": 19098572215680.0, + "grad_norm": 1.7804618223691089, + "language_loss": 0.78462696, + "learning_rate": 1.287961842217804e-07, + "loss": 0.80548, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 14779, + "time_per_iteration": 2.3631489276885986 + }, + { + "auxiliary_loss_clip": 0.01007713, + "auxiliary_loss_mlp": 0.01008249, + "balance_loss_clip": 1.00618684, + "balance_loss_mlp": 1.001297, + "epoch": 0.8886216744325868, + "flos": 51184206328320.0, + "grad_norm": 0.8820590879232184, + "language_loss": 0.56879252, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58895212, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.06445312, + "step": 14780, + "time_per_iteration": 2.8037467002868652 + }, + { + "auxiliary_loss_clip": 0.0100723, + "auxiliary_loss_mlp": 0.01002461, + "balance_loss_clip": 1.00021946, + "balance_loss_mlp": 1.00087976, + "epoch": 0.8886817976852548, + "flos": 61609549593600.0, + "grad_norm": 0.7866438008065298, + "language_loss": 0.62488997, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64498687, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06347656, + "step": 14781, + "time_per_iteration": 3.091899871826172 + }, + { + "auxiliary_loss_clip": 0.01007503, + "auxiliary_loss_mlp": 0.01004209, + "balance_loss_clip": 1.00201523, + "balance_loss_mlp": 1.00105512, + "epoch": 0.8887419209379227, + "flos": 60644027410560.0, + "grad_norm": 0.7955781249151259, + "language_loss": 0.58187819, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60199523, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06445312, + "step": 14782, + "time_per_iteration": 2.9037725925445557 + }, + { + "auxiliary_loss_clip": 0.01049141, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.01235759, + "balance_loss_mlp": 1.01505232, + "epoch": 0.8888020441905907, + "flos": 29204566341120.0, + "grad_norm": 3.4175268029236574, + "language_loss": 0.66846347, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68928266, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33984375, + "step": 14783, + "time_per_iteration": 2.4568326473236084 + }, + { + "auxiliary_loss_clip": 0.0105186, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.01142955, + "balance_loss_mlp": 1.01521647, + "epoch": 0.8888621674432586, + "flos": 22161642238080.0, + "grad_norm": 1.5553596959163478, + "language_loss": 0.78983331, + "learning_rate": 1.281095609023415e-07, + "loss": 0.81068802, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36523438, + "step": 14784, + "time_per_iteration": 2.4145240783691406 + }, + { + "auxiliary_loss_clip": 0.01051273, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.01197219, + "balance_loss_mlp": 1.01549733, + "epoch": 0.8889222906959267, + "flos": 27671599964160.0, + "grad_norm": 2.59034721555454, + "language_loss": 0.61158818, + "learning_rate": 1.279724491644565e-07, + "loss": 0.63245142, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 14785, + "time_per_iteration": 2.4406211376190186 + }, + { + "auxiliary_loss_clip": 0.01051594, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.01424789, + "balance_loss_mlp": 1.01633835, + "epoch": 0.8889824139485947, + "flos": 14167893634560.0, + "grad_norm": 1.8074647057983573, + "language_loss": 0.65965152, + "learning_rate": 1.278354084140445e-07, + "loss": 0.68053854, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 14786, + "time_per_iteration": 2.4042420387268066 + }, + { + "auxiliary_loss_clip": 0.01054009, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.01151502, + "balance_loss_mlp": 1.01637793, + "epoch": 0.8890425372012626, + "flos": 12852366405120.0, + "grad_norm": 2.377474598809219, + "language_loss": 0.86342156, + "learning_rate": 1.276984386563009e-07, + "loss": 0.88433105, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 14787, + "time_per_iteration": 2.3607263565063477 + }, + { + "auxiliary_loss_clip": 0.01050108, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.01008439, + "balance_loss_mlp": 1.01599741, + "epoch": 0.8891026604539306, + "flos": 21688219935360.0, + "grad_norm": 2.3406044648916864, + "language_loss": 0.72038424, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.74121201, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.33984375, + "step": 14788, + "time_per_iteration": 2.3660459518432617 + }, + { + "auxiliary_loss_clip": 0.0104856, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.01004815, + "balance_loss_mlp": 1.01619482, + "epoch": 0.8891627837065985, + "flos": 21870361831680.0, + "grad_norm": 2.1497605820627235, + "language_loss": 0.70429915, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72510684, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.32421875, + "step": 14789, + "time_per_iteration": 2.378129720687866 + }, + { + "auxiliary_loss_clip": 0.01049992, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.01403379, + "balance_loss_mlp": 1.01523244, + "epoch": 0.8892229069592665, + "flos": 21579151248000.0, + "grad_norm": 1.4732683594268048, + "language_loss": 0.71496874, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.73582286, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34765625, + "step": 14790, + "time_per_iteration": 3.8277101516723633 + }, + { + "auxiliary_loss_clip": 0.01053107, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.01229262, + "balance_loss_mlp": 1.01688516, + "epoch": 0.8892830302119344, + "flos": 23074865337600.0, + "grad_norm": 1.6863984435486632, + "language_loss": 0.73735559, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75825471, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 14791, + "time_per_iteration": 2.366764545440674 + }, + { + "auxiliary_loss_clip": 0.01049966, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.01656103, + "balance_loss_mlp": 1.01624537, + "epoch": 0.8893431534646025, + "flos": 23071129822080.0, + "grad_norm": 1.6272401953017068, + "language_loss": 0.7520504, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.77293718, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33789062, + "step": 14792, + "time_per_iteration": 2.4044735431671143 + }, + { + "auxiliary_loss_clip": 0.01053977, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.01570964, + "balance_loss_mlp": 1.01686168, + "epoch": 0.8894032767172704, + "flos": 22453900162560.0, + "grad_norm": 1.9892672163664173, + "language_loss": 0.67928368, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.70022476, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 14793, + "time_per_iteration": 2.3843812942504883 + }, + { + "auxiliary_loss_clip": 0.01052246, + "auxiliary_loss_mlp": 0.01037764, + "balance_loss_clip": 1.01436281, + "balance_loss_mlp": 1.01587629, + "epoch": 0.8894633999699384, + "flos": 25337062615680.0, + "grad_norm": 1.544871638271269, + "language_loss": 0.72641563, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.7473157, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 14794, + "time_per_iteration": 2.433540105819702 + }, + { + "auxiliary_loss_clip": 0.01055015, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.01316965, + "balance_loss_mlp": 1.01777434, + "epoch": 0.8895235232226063, + "flos": 20993099299200.0, + "grad_norm": 1.618307003249504, + "language_loss": 0.76069713, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.78162682, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 14795, + "time_per_iteration": 2.375746488571167 + }, + { + "auxiliary_loss_clip": 0.01007995, + "auxiliary_loss_mlp": 0.01002499, + "balance_loss_clip": 1.00011444, + "balance_loss_mlp": 1.00140536, + "epoch": 0.8895836464752743, + "flos": 69729754608000.0, + "grad_norm": 0.7651769198396768, + "language_loss": 0.5611679, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58127278, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.06591797, + "step": 14796, + "time_per_iteration": 2.9150376319885254 + }, + { + "auxiliary_loss_clip": 0.01053131, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.0092628, + "balance_loss_mlp": 1.01688278, + "epoch": 0.8896437697279422, + "flos": 23220697553280.0, + "grad_norm": 2.425223446016281, + "language_loss": 0.70581806, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72668815, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 14797, + "time_per_iteration": 2.380681276321411 + }, + { + "auxiliary_loss_clip": 0.01007021, + "auxiliary_loss_mlp": 0.01002791, + "balance_loss_clip": 1.0006336, + "balance_loss_mlp": 1.00060475, + "epoch": 0.8897038929806103, + "flos": 70749532776960.0, + "grad_norm": 0.7649326163082742, + "language_loss": 0.58066183, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60075992, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06396484, + "step": 14798, + "time_per_iteration": 3.0925769805908203 + }, + { + "auxiliary_loss_clip": 0.01051292, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.01561499, + "balance_loss_mlp": 1.01582026, + "epoch": 0.8897640162332782, + "flos": 19244090229120.0, + "grad_norm": 1.5288634003414678, + "language_loss": 0.80042708, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.82131839, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14799, + "time_per_iteration": 2.3624792098999023 + }, + { + "auxiliary_loss_clip": 0.01007309, + "auxiliary_loss_mlp": 0.01003132, + "balance_loss_clip": 1.00085545, + "balance_loss_mlp": 1.00095177, + "epoch": 0.8898241394859462, + "flos": 41353606074240.0, + "grad_norm": 0.897152418930086, + "language_loss": 0.58214629, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.6022507, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.06347656, + "step": 14800, + "time_per_iteration": 2.9968645572662354 + }, + { + "auxiliary_loss_clip": 0.01050908, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.01147056, + "balance_loss_mlp": 1.01610756, + "epoch": 0.8898842627386142, + "flos": 18985383987840.0, + "grad_norm": 1.4546437636246483, + "language_loss": 0.67665118, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.69748765, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34765625, + "step": 14801, + "time_per_iteration": 2.3665432929992676 + }, + { + "auxiliary_loss_clip": 0.01052583, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_clip": 1.0164535, + "balance_loss_mlp": 1.01628172, + "epoch": 0.8899443859912821, + "flos": 13216545463680.0, + "grad_norm": 2.404747042440108, + "language_loss": 0.77291739, + "learning_rate": 1.256524149358682e-07, + "loss": 0.7938652, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36328125, + "step": 14802, + "time_per_iteration": 2.3704888820648193 + }, + { + "auxiliary_loss_clip": 0.01053053, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.01437616, + "balance_loss_mlp": 1.01747477, + "epoch": 0.8900045092439501, + "flos": 22673573637120.0, + "grad_norm": 1.7041265086199322, + "language_loss": 0.73510641, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75600505, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 14803, + "time_per_iteration": 2.413618326187134 + }, + { + "auxiliary_loss_clip": 0.01050523, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.01497197, + "balance_loss_mlp": 1.01557899, + "epoch": 0.890064632496618, + "flos": 21140572348800.0, + "grad_norm": 2.098513660000098, + "language_loss": 0.73455548, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.75544393, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34960938, + "step": 14804, + "time_per_iteration": 2.385745048522949 + }, + { + "auxiliary_loss_clip": 0.01051503, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.0145843, + "balance_loss_mlp": 1.01593518, + "epoch": 0.8901247557492861, + "flos": 23396136468480.0, + "grad_norm": 1.9303981503520953, + "language_loss": 0.82411623, + "learning_rate": 1.252451286713123e-07, + "loss": 0.8450132, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 14805, + "time_per_iteration": 2.397770643234253 + }, + { + "auxiliary_loss_clip": 0.0105326, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.01354587, + "balance_loss_mlp": 1.01615953, + "epoch": 0.890184879001954, + "flos": 29168291571840.0, + "grad_norm": 1.9865125433152264, + "language_loss": 0.6901961, + "learning_rate": 1.251095087580505e-07, + "loss": 0.71110737, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 14806, + "time_per_iteration": 3.641875743865967 + }, + { + "auxiliary_loss_clip": 0.01050682, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.01350689, + "balance_loss_mlp": 1.01547277, + "epoch": 0.890245002254622, + "flos": 14426983900800.0, + "grad_norm": 1.8028826758602752, + "language_loss": 0.68688166, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.70776451, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3515625, + "step": 14807, + "time_per_iteration": 2.349215030670166 + }, + { + "auxiliary_loss_clip": 0.01050523, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.00927067, + "balance_loss_mlp": 1.01573682, + "epoch": 0.8903051255072899, + "flos": 22381106244480.0, + "grad_norm": 1.7297845186649055, + "language_loss": 0.75641936, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77722633, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 14808, + "time_per_iteration": 2.39782977104187 + }, + { + "auxiliary_loss_clip": 0.01051277, + "auxiliary_loss_mlp": 0.01046327, + "balance_loss_clip": 1.02477419, + "balance_loss_mlp": 1.0153954, + "epoch": 0.8903652487599579, + "flos": 20776323467520.0, + "grad_norm": 1.867037083597602, + "language_loss": 0.82792574, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.84890181, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.359375, + "step": 14809, + "time_per_iteration": 2.3776838779449463 + }, + { + "auxiliary_loss_clip": 0.01051733, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01144075, + "balance_loss_mlp": 1.01648307, + "epoch": 0.8904253720126258, + "flos": 24423385668480.0, + "grad_norm": 1.682411020551403, + "language_loss": 0.69539607, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.71625441, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 14810, + "time_per_iteration": 2.4153411388397217 + }, + { + "auxiliary_loss_clip": 0.01053412, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.01300669, + "balance_loss_mlp": 1.01612043, + "epoch": 0.8904854952652939, + "flos": 19462856008320.0, + "grad_norm": 1.8888440844050092, + "language_loss": 0.71727234, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.73817682, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 14811, + "time_per_iteration": 2.361935615539551 + }, + { + "auxiliary_loss_clip": 0.01051305, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.011814, + "balance_loss_mlp": 1.01564109, + "epoch": 0.8905456185179618, + "flos": 50798975719680.0, + "grad_norm": 1.8431598131958493, + "language_loss": 0.66688633, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.68774128, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 14812, + "time_per_iteration": 2.6266841888427734 + }, + { + "auxiliary_loss_clip": 0.0105101, + "auxiliary_loss_mlp": 0.01039772, + "balance_loss_clip": 1.01787329, + "balance_loss_mlp": 1.016096, + "epoch": 0.8906057417706298, + "flos": 17783917770240.0, + "grad_norm": 1.913007960483859, + "language_loss": 0.6963203, + "learning_rate": 1.24162160341861e-07, + "loss": 0.71722817, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 14813, + "time_per_iteration": 2.333292245864868 + }, + { + "auxiliary_loss_clip": 0.01054906, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.01516926, + "balance_loss_mlp": 1.01625705, + "epoch": 0.8906658650232978, + "flos": 21943784154240.0, + "grad_norm": 1.7552626828301867, + "language_loss": 0.76554704, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.78650022, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38671875, + "step": 14814, + "time_per_iteration": 2.375821590423584 + }, + { + "auxiliary_loss_clip": 0.01053447, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.01510298, + "balance_loss_mlp": 1.01610875, + "epoch": 0.8907259882759657, + "flos": 21286753678080.0, + "grad_norm": 1.930595412168764, + "language_loss": 0.75528151, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.77621317, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 14815, + "time_per_iteration": 2.3721117973327637 + }, + { + "auxiliary_loss_clip": 0.0104874, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.01190615, + "balance_loss_mlp": 1.01527357, + "epoch": 0.8907861115286337, + "flos": 20119397725440.0, + "grad_norm": 1.8758049164226078, + "language_loss": 0.76662827, + "learning_rate": 1.237572207545914e-07, + "loss": 0.78744495, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.3359375, + "step": 14816, + "time_per_iteration": 3.7810137271881104 + }, + { + "auxiliary_loss_clip": 0.01051629, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.01494265, + "balance_loss_mlp": 1.01556253, + "epoch": 0.8908462347813016, + "flos": 20083122956160.0, + "grad_norm": 2.093226320481197, + "language_loss": 0.78321278, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.80411541, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 14817, + "time_per_iteration": 3.724142551422119 + }, + { + "auxiliary_loss_clip": 0.01007061, + "auxiliary_loss_mlp": 0.01002756, + "balance_loss_clip": 1.0007894, + "balance_loss_mlp": 1.00064349, + "epoch": 0.8909063580339697, + "flos": 65500480707840.0, + "grad_norm": 0.7653813251518286, + "language_loss": 0.5654977, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58559591, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.06396484, + "step": 14818, + "time_per_iteration": 3.050084114074707 + }, + { + "auxiliary_loss_clip": 0.01050766, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.01486468, + "balance_loss_mlp": 1.01597953, + "epoch": 0.8909664812866376, + "flos": 29861736462720.0, + "grad_norm": 1.7601078222839204, + "language_loss": 0.65451109, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.6753794, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34765625, + "step": 14819, + "time_per_iteration": 2.438056707382202 + }, + { + "auxiliary_loss_clip": 0.01051583, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.01447034, + "balance_loss_mlp": 1.01574457, + "epoch": 0.8910266045393056, + "flos": 25445956746240.0, + "grad_norm": 1.7672090581460367, + "language_loss": 0.79758418, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.81847459, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 14820, + "time_per_iteration": 2.395019054412842 + }, + { + "auxiliary_loss_clip": 0.01051323, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.01408219, + "balance_loss_mlp": 1.0157733, + "epoch": 0.8910867277919735, + "flos": 24497960065920.0, + "grad_norm": 1.7991922740093875, + "language_loss": 0.77500254, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.79587466, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 14821, + "time_per_iteration": 2.4189209938049316 + }, + { + "auxiliary_loss_clip": 0.0100718, + "auxiliary_loss_mlp": 0.01002397, + "balance_loss_clip": 1.00019181, + "balance_loss_mlp": 1.0006727, + "epoch": 0.8911468510446415, + "flos": 60685085036160.0, + "grad_norm": 0.8057418577401025, + "language_loss": 0.59398824, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61408401, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.06494141, + "step": 14822, + "time_per_iteration": 2.9050939083099365 + }, + { + "auxiliary_loss_clip": 0.01050869, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.01217341, + "balance_loss_mlp": 1.01496363, + "epoch": 0.8912069742973094, + "flos": 25336329477120.0, + "grad_norm": 2.157988280535342, + "language_loss": 0.69957972, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.72042555, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.359375, + "step": 14823, + "time_per_iteration": 2.399238109588623 + }, + { + "auxiliary_loss_clip": 0.0105084, + "auxiliary_loss_mlp": 0.01042345, + "balance_loss_clip": 1.01913476, + "balance_loss_mlp": 1.0162909, + "epoch": 0.8912670975499775, + "flos": 18222531580800.0, + "grad_norm": 1.5192741174416835, + "language_loss": 0.70008218, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.72101402, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34375, + "step": 14824, + "time_per_iteration": 2.340325355529785 + }, + { + "auxiliary_loss_clip": 0.01053697, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.01624072, + "balance_loss_mlp": 1.01677942, + "epoch": 0.8913272208026454, + "flos": 26503301404800.0, + "grad_norm": 1.9066080746696903, + "language_loss": 0.71257818, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.73351717, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 14825, + "time_per_iteration": 2.4077060222625732 + }, + { + "auxiliary_loss_clip": 0.01051736, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.01572204, + "balance_loss_mlp": 1.01655555, + "epoch": 0.8913873440553134, + "flos": 18801461612160.0, + "grad_norm": 1.7807751593300052, + "language_loss": 0.72414076, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.74504137, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 14826, + "time_per_iteration": 2.349210739135742 + }, + { + "auxiliary_loss_clip": 0.0104995, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.00998247, + "balance_loss_mlp": 1.01577604, + "epoch": 0.8914474673079814, + "flos": 20883890966400.0, + "grad_norm": 2.1159058810326594, + "language_loss": 0.76709688, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.78792381, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34179688, + "step": 14827, + "time_per_iteration": 2.361990451812744 + }, + { + "auxiliary_loss_clip": 0.01052306, + "auxiliary_loss_mlp": 0.01037731, + "balance_loss_clip": 1.01433063, + "balance_loss_mlp": 1.0163734, + "epoch": 0.8915075905606493, + "flos": 20951587825920.0, + "grad_norm": 1.7226516453612053, + "language_loss": 0.79597795, + "learning_rate": 1.221438670423336e-07, + "loss": 0.81687832, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 14828, + "time_per_iteration": 2.3604185581207275 + }, + { + "auxiliary_loss_clip": 0.01049461, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.01528525, + "balance_loss_mlp": 1.01507998, + "epoch": 0.8915677138133173, + "flos": 23075179539840.0, + "grad_norm": 1.6219881018422477, + "language_loss": 0.75917113, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.78004187, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 14829, + "time_per_iteration": 2.4182090759277344 + }, + { + "auxiliary_loss_clip": 0.01050804, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.01529217, + "balance_loss_mlp": 1.01494634, + "epoch": 0.8916278370659853, + "flos": 23439149130240.0, + "grad_norm": 1.6045890988892912, + "language_loss": 0.85306025, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.87394238, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 14830, + "time_per_iteration": 3.851357936859131 + }, + { + "auxiliary_loss_clip": 0.01048312, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.01368177, + "balance_loss_mlp": 1.01490557, + "epoch": 0.8916879603186533, + "flos": 25159179905280.0, + "grad_norm": 1.4609527648802036, + "language_loss": 0.76009005, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.78092277, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33398438, + "step": 14831, + "time_per_iteration": 2.4423270225524902 + }, + { + "auxiliary_loss_clip": 0.01050776, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.01248693, + "balance_loss_mlp": 1.01514781, + "epoch": 0.8917480835713212, + "flos": 20228815526400.0, + "grad_norm": 1.8041159341666464, + "language_loss": 0.74326825, + "learning_rate": 1.216083607088847e-07, + "loss": 0.76412219, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 14832, + "time_per_iteration": 2.355379104614258 + }, + { + "auxiliary_loss_clip": 0.01050955, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.0108757, + "balance_loss_mlp": 1.01472044, + "epoch": 0.8918082068239892, + "flos": 26100787806720.0, + "grad_norm": 2.0005695687082827, + "language_loss": 0.6809386, + "learning_rate": 1.214746621848355e-07, + "loss": 0.70177543, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 14833, + "time_per_iteration": 2.44321346282959 + }, + { + "auxiliary_loss_clip": 0.01055507, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.01319671, + "balance_loss_mlp": 1.01688218, + "epoch": 0.8918683300766571, + "flos": 24830158452480.0, + "grad_norm": 1.9759516784930489, + "language_loss": 0.75258577, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.77353561, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 14834, + "time_per_iteration": 2.454921245574951 + }, + { + "auxiliary_loss_clip": 0.01050036, + "auxiliary_loss_mlp": 0.01041297, + "balance_loss_clip": 1.01870692, + "balance_loss_mlp": 1.01515317, + "epoch": 0.8919284533293251, + "flos": 22304192785920.0, + "grad_norm": 1.8434126733695482, + "language_loss": 0.8028841, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.82379735, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 14835, + "time_per_iteration": 2.3849573135375977 + }, + { + "auxiliary_loss_clip": 0.01048931, + "auxiliary_loss_mlp": 0.01032174, + "balance_loss_clip": 1.01182485, + "balance_loss_mlp": 1.01494336, + "epoch": 0.891988576581993, + "flos": 30372201584640.0, + "grad_norm": 1.31271614792042, + "language_loss": 0.75358534, + "learning_rate": 1.210739940361689e-07, + "loss": 0.7743963, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33984375, + "step": 14836, + "time_per_iteration": 2.4360358715057373 + }, + { + "auxiliary_loss_clip": 0.01051151, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.01724374, + "balance_loss_mlp": 1.01620698, + "epoch": 0.8920486998346611, + "flos": 15552234887040.0, + "grad_norm": 3.0379453071869857, + "language_loss": 0.69573683, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.71664798, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 14837, + "time_per_iteration": 2.3673036098480225 + }, + { + "auxiliary_loss_clip": 0.01052796, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.0172627, + "balance_loss_mlp": 1.01556599, + "epoch": 0.892108823087329, + "flos": 21213924848640.0, + "grad_norm": 1.8960288765815292, + "language_loss": 0.68573689, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.70667183, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37304688, + "step": 14838, + "time_per_iteration": 2.3757171630859375 + }, + { + "auxiliary_loss_clip": 0.01052031, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.01641309, + "balance_loss_mlp": 1.01611972, + "epoch": 0.892168946339997, + "flos": 21977964241920.0, + "grad_norm": 1.9506570729956223, + "language_loss": 0.76815999, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78907233, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 14839, + "time_per_iteration": 2.394498109817505 + }, + { + "auxiliary_loss_clip": 0.01007086, + "auxiliary_loss_mlp": 0.01002901, + "balance_loss_clip": 1.00080252, + "balance_loss_mlp": 1.00069702, + "epoch": 0.892229069592665, + "flos": 67472025984000.0, + "grad_norm": 0.6856539739649039, + "language_loss": 0.49558571, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51568556, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06396484, + "step": 14840, + "time_per_iteration": 2.976696729660034 + }, + { + "auxiliary_loss_clip": 0.01054505, + "auxiliary_loss_mlp": 0.01041835, + "balance_loss_clip": 1.01541853, + "balance_loss_mlp": 1.0165602, + "epoch": 0.8922891928453329, + "flos": 19458666645120.0, + "grad_norm": 2.364006681994037, + "language_loss": 0.65899092, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.67995429, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.37890625, + "step": 14841, + "time_per_iteration": 2.3947315216064453 + }, + { + "auxiliary_loss_clip": 0.01048322, + "auxiliary_loss_mlp": 0.01038593, + "balance_loss_clip": 1.01680136, + "balance_loss_mlp": 1.01553035, + "epoch": 0.8923493160980009, + "flos": 23366285389440.0, + "grad_norm": 1.6233230426512433, + "language_loss": 0.69275796, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.7136271, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.328125, + "step": 14842, + "time_per_iteration": 2.4083244800567627 + }, + { + "auxiliary_loss_clip": 0.01050709, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.01622331, + "balance_loss_mlp": 1.01604009, + "epoch": 0.8924094393506689, + "flos": 26175850963200.0, + "grad_norm": 1.8091688648881643, + "language_loss": 0.81451178, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.83539367, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34765625, + "step": 14843, + "time_per_iteration": 2.4307589530944824 + }, + { + "auxiliary_loss_clip": 0.01051939, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.01139545, + "balance_loss_mlp": 1.01602364, + "epoch": 0.8924695626033369, + "flos": 22017415944960.0, + "grad_norm": 1.7244105222734682, + "language_loss": 0.69486403, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.71575439, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.359375, + "step": 14844, + "time_per_iteration": 2.3709352016448975 + }, + { + "auxiliary_loss_clip": 0.01052165, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.01474524, + "balance_loss_mlp": 1.0165503, + "epoch": 0.8925296858560048, + "flos": 14793048172800.0, + "grad_norm": 4.42305541046238, + "language_loss": 0.92664987, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.94753921, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35546875, + "step": 14845, + "time_per_iteration": 2.365661859512329 + }, + { + "auxiliary_loss_clip": 0.01049437, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.01311231, + "balance_loss_mlp": 1.01509356, + "epoch": 0.8925898091086728, + "flos": 22345529702400.0, + "grad_norm": 2.3884934870669436, + "language_loss": 0.73384041, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.75467461, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 14846, + "time_per_iteration": 3.7307493686676025 + }, + { + "auxiliary_loss_clip": 0.01052221, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.0131402, + "balance_loss_mlp": 1.0160265, + "epoch": 0.8926499323613407, + "flos": 45804580174080.0, + "grad_norm": 1.6652193887878242, + "language_loss": 0.58007586, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.6009627, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 14847, + "time_per_iteration": 2.5949532985687256 + }, + { + "auxiliary_loss_clip": 0.01051661, + "auxiliary_loss_mlp": 0.01038394, + "balance_loss_clip": 1.01655507, + "balance_loss_mlp": 1.01553416, + "epoch": 0.8927100556140087, + "flos": 22125960961920.0, + "grad_norm": 2.1367234469815592, + "language_loss": 0.78104532, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.80194587, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 14848, + "time_per_iteration": 2.3722219467163086 + }, + { + "auxiliary_loss_clip": 0.01050672, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01223886, + "balance_loss_mlp": 1.01609766, + "epoch": 0.8927701788666766, + "flos": 28328874819840.0, + "grad_norm": 3.05689587058186, + "language_loss": 0.70047039, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.72132468, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 14849, + "time_per_iteration": 2.4361462593078613 + }, + { + "auxiliary_loss_clip": 0.01052132, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.01910448, + "balance_loss_mlp": 1.01673102, + "epoch": 0.8928303021193447, + "flos": 25293980131200.0, + "grad_norm": 2.161323663643168, + "language_loss": 0.81212139, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83305979, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 14850, + "time_per_iteration": 2.4341299533843994 + }, + { + "auxiliary_loss_clip": 0.0104961, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.01572013, + "balance_loss_mlp": 1.01556706, + "epoch": 0.8928904253720126, + "flos": 22235623142400.0, + "grad_norm": 1.5395075069989408, + "language_loss": 0.76271588, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.78359377, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.33984375, + "step": 14851, + "time_per_iteration": 2.3653006553649902 + }, + { + "auxiliary_loss_clip": 0.0104922, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.01419163, + "balance_loss_mlp": 1.01531935, + "epoch": 0.8929505486246806, + "flos": 27091064010240.0, + "grad_norm": 4.843097906590092, + "language_loss": 0.79422939, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.81509215, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.33789062, + "step": 14852, + "time_per_iteration": 2.4754979610443115 + }, + { + "auxiliary_loss_clip": 0.01049915, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.01439118, + "balance_loss_mlp": 1.01651001, + "epoch": 0.8930106718773486, + "flos": 23038241454720.0, + "grad_norm": 1.4157552946801468, + "language_loss": 0.69985759, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.72069752, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.33398438, + "step": 14853, + "time_per_iteration": 2.40934157371521 + }, + { + "auxiliary_loss_clip": 0.0105199, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.01092386, + "balance_loss_mlp": 1.01638603, + "epoch": 0.8930707951300165, + "flos": 35625198637440.0, + "grad_norm": 1.6617548273687268, + "language_loss": 0.68832666, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.70919454, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 14854, + "time_per_iteration": 2.5686371326446533 + }, + { + "auxiliary_loss_clip": 0.01049586, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.01331615, + "balance_loss_mlp": 1.01558971, + "epoch": 0.8931309183826845, + "flos": 23038765125120.0, + "grad_norm": 1.4611403327704033, + "language_loss": 0.75849593, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.77932698, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 14855, + "time_per_iteration": 2.386436939239502 + }, + { + "auxiliary_loss_clip": 0.0104978, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.01554978, + "balance_loss_mlp": 1.01490414, + "epoch": 0.8931910416353525, + "flos": 26503441050240.0, + "grad_norm": 1.8815796286748594, + "language_loss": 0.65241063, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.67328715, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 14856, + "time_per_iteration": 3.8436403274536133 + }, + { + "auxiliary_loss_clip": 0.01051013, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.01333547, + "balance_loss_mlp": 1.01614642, + "epoch": 0.8932511648880205, + "flos": 24972429709440.0, + "grad_norm": 1.7686468559502548, + "language_loss": 0.67592967, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.69679129, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 14857, + "time_per_iteration": 3.7630090713500977 + }, + { + "auxiliary_loss_clip": 0.01052192, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_clip": 1.0177815, + "balance_loss_mlp": 1.01644111, + "epoch": 0.8933112881406884, + "flos": 24459625526400.0, + "grad_norm": 2.5745242543170246, + "language_loss": 0.76602131, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.78695714, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 14858, + "time_per_iteration": 2.4342620372772217 + }, + { + "auxiliary_loss_clip": 0.01050327, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.01047635, + "balance_loss_mlp": 1.01556945, + "epoch": 0.8933714113933564, + "flos": 28291832000640.0, + "grad_norm": 1.5943165359000278, + "language_loss": 0.7055527, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.72639167, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 14859, + "time_per_iteration": 2.558547019958496 + }, + { + "auxiliary_loss_clip": 0.0104713, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.0133996, + "balance_loss_mlp": 1.01487541, + "epoch": 0.8934315346460243, + "flos": 21433772880000.0, + "grad_norm": 1.8166937381377912, + "language_loss": 0.7648983, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.78569686, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.32421875, + "step": 14860, + "time_per_iteration": 2.41283917427063 + }, + { + "auxiliary_loss_clip": 0.01053441, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.01557446, + "balance_loss_mlp": 1.0170486, + "epoch": 0.8934916578986923, + "flos": 23768449873920.0, + "grad_norm": 1.7940440570536653, + "language_loss": 0.5912354, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.61216664, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 14861, + "time_per_iteration": 2.3816370964050293 + }, + { + "auxiliary_loss_clip": 0.01048758, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.01494741, + "balance_loss_mlp": 1.01441264, + "epoch": 0.8935517811513602, + "flos": 18915173510400.0, + "grad_norm": 1.8436424259842377, + "language_loss": 0.64547014, + "learning_rate": 1.176284122190685e-07, + "loss": 0.66632187, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34375, + "step": 14862, + "time_per_iteration": 2.3634543418884277 + }, + { + "auxiliary_loss_clip": 0.01049466, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.01497626, + "balance_loss_mlp": 1.01551104, + "epoch": 0.8936119044040283, + "flos": 24060219039360.0, + "grad_norm": 1.6675190657866936, + "language_loss": 0.7917698, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.81263542, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33984375, + "step": 14863, + "time_per_iteration": 2.39382004737854 + }, + { + "auxiliary_loss_clip": 0.01048595, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.01497972, + "balance_loss_mlp": 1.0147177, + "epoch": 0.8936720276566962, + "flos": 21323028447360.0, + "grad_norm": 1.8049602549685346, + "language_loss": 0.71942008, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.74026239, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33984375, + "step": 14864, + "time_per_iteration": 2.37341046333313 + }, + { + "auxiliary_loss_clip": 0.01054097, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.0194782, + "balance_loss_mlp": 1.01658177, + "epoch": 0.8937321509093642, + "flos": 18405127324800.0, + "grad_norm": 2.052172941113189, + "language_loss": 0.77792966, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.79889941, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 14865, + "time_per_iteration": 2.3459110260009766 + }, + { + "auxiliary_loss_clip": 0.01049055, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.01500595, + "balance_loss_mlp": 1.01516318, + "epoch": 0.8937922741620322, + "flos": 22053655802880.0, + "grad_norm": 1.5636695727912613, + "language_loss": 0.728338, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.74917829, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33984375, + "step": 14866, + "time_per_iteration": 2.42179536819458 + }, + { + "auxiliary_loss_clip": 0.01053365, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.0146327, + "balance_loss_mlp": 1.0164324, + "epoch": 0.8938523974147001, + "flos": 25663256248320.0, + "grad_norm": 1.9238564542844627, + "language_loss": 0.84980315, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.87072974, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 14867, + "time_per_iteration": 2.3887078762054443 + }, + { + "auxiliary_loss_clip": 0.01050782, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.01399803, + "balance_loss_mlp": 1.01521313, + "epoch": 0.8939125206673681, + "flos": 25741566161280.0, + "grad_norm": 1.6311325385807318, + "language_loss": 0.81606942, + "learning_rate": 1.168401272009567e-07, + "loss": 0.83692914, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.35546875, + "step": 14868, + "time_per_iteration": 2.445683240890503 + }, + { + "auxiliary_loss_clip": 0.01051987, + "auxiliary_loss_mlp": 0.01042979, + "balance_loss_clip": 1.01981664, + "balance_loss_mlp": 1.0159502, + "epoch": 0.8939726439200361, + "flos": 27343276738560.0, + "grad_norm": 1.7472753569003732, + "language_loss": 0.78087652, + "learning_rate": 1.167089962692056e-07, + "loss": 0.80182618, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 14869, + "time_per_iteration": 3.870619773864746 + }, + { + "auxiliary_loss_clip": 0.01050143, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.00813973, + "balance_loss_mlp": 1.01540112, + "epoch": 0.8940327671727041, + "flos": 20337814391040.0, + "grad_norm": 1.629715546094554, + "language_loss": 0.66370142, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.68449247, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34765625, + "step": 14870, + "time_per_iteration": 2.356036901473999 + }, + { + "auxiliary_loss_clip": 0.01007202, + "auxiliary_loss_mlp": 0.01005391, + "balance_loss_clip": 1.00344801, + "balance_loss_mlp": 1.00076461, + "epoch": 0.894092890425372, + "flos": 58408015518720.0, + "grad_norm": 0.800322759913264, + "language_loss": 0.55945396, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57957995, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.06445312, + "step": 14871, + "time_per_iteration": 3.0734031200408936 + }, + { + "auxiliary_loss_clip": 0.01050125, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.0156672, + "balance_loss_mlp": 1.01646757, + "epoch": 0.89415301367804, + "flos": 19828606078080.0, + "grad_norm": 1.7949662647018405, + "language_loss": 0.77935767, + "learning_rate": 1.16316031981331e-07, + "loss": 0.80022585, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.3359375, + "step": 14872, + "time_per_iteration": 2.420847177505493 + }, + { + "auxiliary_loss_clip": 0.01046851, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.01291442, + "balance_loss_mlp": 1.01451278, + "epoch": 0.8942131369307079, + "flos": 25774594174080.0, + "grad_norm": 1.6718826344661868, + "language_loss": 0.67990822, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.70070434, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.32421875, + "step": 14873, + "time_per_iteration": 2.424628734588623 + }, + { + "auxiliary_loss_clip": 0.01049096, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.01342726, + "balance_loss_mlp": 1.01523137, + "epoch": 0.8942732601833759, + "flos": 23147903635200.0, + "grad_norm": 1.6180229024717137, + "language_loss": 0.60626721, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.62710565, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33789062, + "step": 14874, + "time_per_iteration": 2.379300594329834 + }, + { + "auxiliary_loss_clip": 0.01053543, + "auxiliary_loss_mlp": 0.01038705, + "balance_loss_clip": 1.01581693, + "balance_loss_mlp": 1.01715636, + "epoch": 0.8943333834360438, + "flos": 27854300442240.0, + "grad_norm": 1.912092342744865, + "language_loss": 0.77353221, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.79445469, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 14875, + "time_per_iteration": 2.433391809463501 + }, + { + "auxiliary_loss_clip": 0.01053762, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.01922858, + "balance_loss_mlp": 1.01584637, + "epoch": 0.8943935066887119, + "flos": 22162864135680.0, + "grad_norm": 1.855332862834475, + "language_loss": 0.78396523, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.80496019, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 14876, + "time_per_iteration": 2.399177312850952 + }, + { + "auxiliary_loss_clip": 0.01050012, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.01018882, + "balance_loss_mlp": 1.01546669, + "epoch": 0.8944536299413798, + "flos": 21469000308480.0, + "grad_norm": 1.6770990792964833, + "language_loss": 0.7972399, + "learning_rate": 1.156625201573287e-07, + "loss": 0.81805158, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34570312, + "step": 14877, + "time_per_iteration": 2.383795738220215 + }, + { + "auxiliary_loss_clip": 0.0105043, + "auxiliary_loss_mlp": 0.010351, + "balance_loss_clip": 1.01267624, + "balance_loss_mlp": 1.01588964, + "epoch": 0.8945137531940478, + "flos": 17747817557760.0, + "grad_norm": 2.305642842773134, + "language_loss": 0.76421052, + "learning_rate": 1.155320321355151e-07, + "loss": 0.78506583, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34570312, + "step": 14878, + "time_per_iteration": 2.3365564346313477 + }, + { + "auxiliary_loss_clip": 0.01050698, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.01343298, + "balance_loss_mlp": 1.01549721, + "epoch": 0.8945738764467158, + "flos": 21141200753280.0, + "grad_norm": 1.7712033576593, + "language_loss": 0.77257764, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.7934438, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 14879, + "time_per_iteration": 2.393078088760376 + }, + { + "auxiliary_loss_clip": 0.01052155, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.01235616, + "balance_loss_mlp": 1.01615703, + "epoch": 0.8946339996993837, + "flos": 14902116860160.0, + "grad_norm": 1.9076792557482565, + "language_loss": 0.76041067, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.78128004, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 14880, + "time_per_iteration": 2.3671092987060547 + }, + { + "auxiliary_loss_clip": 0.01050783, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.01316023, + "balance_loss_mlp": 1.01548636, + "epoch": 0.8946941229520518, + "flos": 27380913050880.0, + "grad_norm": 1.8168109603602274, + "language_loss": 0.84126222, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.86212504, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 14881, + "time_per_iteration": 2.437927007675171 + }, + { + "auxiliary_loss_clip": 0.01049666, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.01284957, + "balance_loss_mlp": 1.01545405, + "epoch": 0.8947542462047197, + "flos": 31794912288000.0, + "grad_norm": 1.5486726370160948, + "language_loss": 0.67834449, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69918382, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34179688, + "step": 14882, + "time_per_iteration": 2.4777891635894775 + }, + { + "auxiliary_loss_clip": 0.0105358, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.01535988, + "balance_loss_mlp": 1.01601171, + "epoch": 0.8948143694573877, + "flos": 20882634157440.0, + "grad_norm": 2.8837520252788424, + "language_loss": 0.77034658, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.79129761, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 14883, + "time_per_iteration": 2.3403522968292236 + }, + { + "auxiliary_loss_clip": 0.01049713, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.01839161, + "balance_loss_mlp": 1.01566184, + "epoch": 0.8948744927100556, + "flos": 28214429783040.0, + "grad_norm": 1.5326318713516347, + "language_loss": 0.72989583, + "learning_rate": 1.147506048211253e-07, + "loss": 0.75078821, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 14884, + "time_per_iteration": 2.441246747970581 + }, + { + "auxiliary_loss_clip": 0.01048983, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.01125407, + "balance_loss_mlp": 1.0147717, + "epoch": 0.8949346159627236, + "flos": 21901749010560.0, + "grad_norm": 1.6431452866740375, + "language_loss": 0.77318865, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.79399657, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34179688, + "step": 14885, + "time_per_iteration": 3.6117777824401855 + }, + { + "auxiliary_loss_clip": 0.01050511, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.01382101, + "balance_loss_mlp": 1.01426768, + "epoch": 0.8949947392153915, + "flos": 21358116230400.0, + "grad_norm": 2.1325965054203078, + "language_loss": 0.82853413, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.84941119, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 14886, + "time_per_iteration": 2.3813095092773438 + }, + { + "auxiliary_loss_clip": 0.01050745, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.01658154, + "balance_loss_mlp": 1.01598704, + "epoch": 0.8950548624680595, + "flos": 52443454579200.0, + "grad_norm": 1.4708031377994466, + "language_loss": 0.64703596, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.66792148, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34765625, + "step": 14887, + "time_per_iteration": 2.6677768230438232 + }, + { + "auxiliary_loss_clip": 0.01052932, + "auxiliary_loss_mlp": 0.01044163, + "balance_loss_clip": 1.01923633, + "balance_loss_mlp": 1.01624417, + "epoch": 0.8951149857207275, + "flos": 20120270509440.0, + "grad_norm": 2.0096663346377177, + "language_loss": 0.6189748, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63994581, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 14888, + "time_per_iteration": 2.366886854171753 + }, + { + "auxiliary_loss_clip": 0.01052677, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.01038527, + "balance_loss_mlp": 1.01622319, + "epoch": 0.8951751089733955, + "flos": 29861317526400.0, + "grad_norm": 5.351501224210998, + "language_loss": 0.71170735, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.73254943, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.36523438, + "step": 14889, + "time_per_iteration": 2.440108299255371 + }, + { + "auxiliary_loss_clip": 0.01053239, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.01656508, + "balance_loss_mlp": 1.01639569, + "epoch": 0.8952352322260634, + "flos": 15262036732800.0, + "grad_norm": 2.4822836599644225, + "language_loss": 0.73111838, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.75207078, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 14890, + "time_per_iteration": 2.3153316974639893 + }, + { + "auxiliary_loss_clip": 0.01050816, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.00971317, + "balance_loss_mlp": 1.01551521, + "epoch": 0.8952953554787314, + "flos": 26797095429120.0, + "grad_norm": 1.5308592940142571, + "language_loss": 0.76785731, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.7886914, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 14891, + "time_per_iteration": 2.421882152557373 + }, + { + "auxiliary_loss_clip": 0.01051884, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.01020813, + "balance_loss_mlp": 1.01535416, + "epoch": 0.8953554787313994, + "flos": 14136331898880.0, + "grad_norm": 1.7993306790696701, + "language_loss": 0.77655447, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.79740727, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 14892, + "time_per_iteration": 2.3328733444213867 + }, + { + "auxiliary_loss_clip": 0.01050942, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.01338959, + "balance_loss_mlp": 1.01561952, + "epoch": 0.8954156019840673, + "flos": 25702114458240.0, + "grad_norm": 1.2762769056415926, + "language_loss": 0.82328975, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.84415346, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 14893, + "time_per_iteration": 2.443012237548828 + }, + { + "auxiliary_loss_clip": 0.01048357, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.01459026, + "balance_loss_mlp": 1.01501882, + "epoch": 0.8954757252367354, + "flos": 21906915891840.0, + "grad_norm": 1.6395153263410058, + "language_loss": 0.75543582, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.77626944, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33398438, + "step": 14894, + "time_per_iteration": 2.38569712638855 + }, + { + "auxiliary_loss_clip": 0.01052984, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.01717782, + "balance_loss_mlp": 1.01659513, + "epoch": 0.8955358484894033, + "flos": 12969534528000.0, + "grad_norm": 2.153770844264711, + "language_loss": 0.68327785, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.70422447, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36328125, + "step": 14895, + "time_per_iteration": 3.7975454330444336 + }, + { + "auxiliary_loss_clip": 0.01054446, + "auxiliary_loss_mlp": 0.01039442, + "balance_loss_clip": 1.01481295, + "balance_loss_mlp": 1.01683092, + "epoch": 0.8955959717420713, + "flos": 17273033712000.0, + "grad_norm": 1.693988328474845, + "language_loss": 0.67511004, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69604897, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 14896, + "time_per_iteration": 3.721370220184326 + }, + { + "auxiliary_loss_clip": 0.01050763, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.01307631, + "balance_loss_mlp": 1.01543355, + "epoch": 0.8956560949947392, + "flos": 14792978350080.0, + "grad_norm": 1.72764249851666, + "language_loss": 0.76612449, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.78697842, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35351562, + "step": 14897, + "time_per_iteration": 2.3594586849212646 + }, + { + "auxiliary_loss_clip": 0.01007246, + "auxiliary_loss_mlp": 0.01003289, + "balance_loss_clip": 1.00107145, + "balance_loss_mlp": 1.00086248, + "epoch": 0.8957162182474072, + "flos": 63604661904000.0, + "grad_norm": 0.7467298203132786, + "language_loss": 0.55416936, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57427466, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.0222168, + "router_z_loss_mlp": 0.06396484, + "step": 14898, + "time_per_iteration": 3.07427978515625 + }, + { + "auxiliary_loss_clip": 0.01051918, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.01413059, + "balance_loss_mlp": 1.01599193, + "epoch": 0.8957763415000751, + "flos": 25008669567360.0, + "grad_norm": 1.5921317808078892, + "language_loss": 0.71856821, + "learning_rate": 1.12808298352008e-07, + "loss": 0.73945892, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 14899, + "time_per_iteration": 2.4115021228790283 + }, + { + "auxiliary_loss_clip": 0.01051737, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.01487148, + "balance_loss_mlp": 1.01618242, + "epoch": 0.8958364647527431, + "flos": 19827593648640.0, + "grad_norm": 1.7385056279148108, + "language_loss": 0.74539518, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.76629817, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 14900, + "time_per_iteration": 2.3962490558624268 + }, + { + "auxiliary_loss_clip": 0.01007272, + "auxiliary_loss_mlp": 0.01002463, + "balance_loss_clip": 1.00055552, + "balance_loss_mlp": 1.00079656, + "epoch": 0.895896588005411, + "flos": 65534102213760.0, + "grad_norm": 0.7747427620410309, + "language_loss": 0.61916459, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63926184, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.06494141, + "step": 14901, + "time_per_iteration": 3.017338991165161 + }, + { + "auxiliary_loss_clip": 0.01049571, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.01610839, + "balance_loss_mlp": 1.01444566, + "epoch": 0.8959567112580791, + "flos": 25589903748480.0, + "grad_norm": 1.7682574213499602, + "language_loss": 0.71641767, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.73729777, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 14902, + "time_per_iteration": 2.4123973846435547 + }, + { + "auxiliary_loss_clip": 0.0104826, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.01157975, + "balance_loss_mlp": 1.01450551, + "epoch": 0.896016834510747, + "flos": 24200779639680.0, + "grad_norm": 1.9030319450004336, + "language_loss": 0.79045039, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.81126845, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33789062, + "step": 14903, + "time_per_iteration": 2.3912999629974365 + }, + { + "auxiliary_loss_clip": 0.01054546, + "auxiliary_loss_mlp": 0.0104068, + "balance_loss_clip": 1.01562238, + "balance_loss_mlp": 1.01734209, + "epoch": 0.896076957763415, + "flos": 23074830426240.0, + "grad_norm": 1.6710328682266278, + "language_loss": 0.73746455, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75841677, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 14904, + "time_per_iteration": 2.4287121295928955 + }, + { + "auxiliary_loss_clip": 0.01051581, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.01089025, + "balance_loss_mlp": 1.01550245, + "epoch": 0.8961370810160829, + "flos": 22235518408320.0, + "grad_norm": 1.8512715429452302, + "language_loss": 0.76073432, + "learning_rate": 1.12035883275166e-07, + "loss": 0.78158975, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36132812, + "step": 14905, + "time_per_iteration": 2.365452527999878 + }, + { + "auxiliary_loss_clip": 0.01050247, + "auxiliary_loss_mlp": 0.01036668, + "balance_loss_clip": 1.01420903, + "balance_loss_mlp": 1.01592159, + "epoch": 0.8961972042687509, + "flos": 23071304378880.0, + "grad_norm": 1.6113117048297951, + "language_loss": 0.78029549, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.80116463, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34179688, + "step": 14906, + "time_per_iteration": 2.4077351093292236 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01039554, + "balance_loss_clip": 1.01899004, + "balance_loss_mlp": 1.01803231, + "epoch": 0.896257327521419, + "flos": 18184930179840.0, + "grad_norm": 1.5567097323443335, + "language_loss": 0.75146401, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.77239001, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34960938, + "step": 14907, + "time_per_iteration": 2.3558757305145264 + }, + { + "auxiliary_loss_clip": 0.01051067, + "auxiliary_loss_mlp": 0.01047359, + "balance_loss_clip": 1.02364802, + "balance_loss_mlp": 1.01639163, + "epoch": 0.8963174507740869, + "flos": 17894487646080.0, + "grad_norm": 1.647002550382285, + "language_loss": 0.834548, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85553223, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34765625, + "step": 14908, + "time_per_iteration": 3.8091914653778076 + }, + { + "auxiliary_loss_clip": 0.01053066, + "auxiliary_loss_mlp": 0.01041182, + "balance_loss_clip": 1.01584983, + "balance_loss_mlp": 1.01608658, + "epoch": 0.8963775740267549, + "flos": 21031224370560.0, + "grad_norm": 1.607222181074257, + "language_loss": 0.71699393, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.73793644, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 14909, + "time_per_iteration": 2.410095691680908 + }, + { + "auxiliary_loss_clip": 0.01051675, + "auxiliary_loss_mlp": 0.01040286, + "balance_loss_clip": 1.01688528, + "balance_loss_mlp": 1.01584005, + "epoch": 0.8964376972794228, + "flos": 23178662409600.0, + "grad_norm": 1.7379097705517068, + "language_loss": 0.73602635, + "learning_rate": 1.113941727737877e-07, + "loss": 0.75694597, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 14910, + "time_per_iteration": 2.4110944271087646 + }, + { + "auxiliary_loss_clip": 0.01050207, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.0132525, + "balance_loss_mlp": 1.01462257, + "epoch": 0.8964978205320908, + "flos": 24971836216320.0, + "grad_norm": 2.656775552875627, + "language_loss": 0.64188641, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.66274357, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 14911, + "time_per_iteration": 2.3925018310546875 + }, + { + "auxiliary_loss_clip": 0.01051542, + "auxiliary_loss_mlp": 0.01036905, + "balance_loss_clip": 1.01445758, + "balance_loss_mlp": 1.01627517, + "epoch": 0.8965579437847587, + "flos": 19171017020160.0, + "grad_norm": 1.792018470863097, + "language_loss": 0.76117301, + "learning_rate": 1.111379898520437e-07, + "loss": 0.78205746, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35351562, + "step": 14912, + "time_per_iteration": 2.374375343322754 + }, + { + "auxiliary_loss_clip": 0.01050144, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.01090765, + "balance_loss_mlp": 1.01497805, + "epoch": 0.8966180670374267, + "flos": 24275633328000.0, + "grad_norm": 1.822262085863558, + "language_loss": 0.83166128, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.85250902, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3515625, + "step": 14913, + "time_per_iteration": 2.39888596534729 + }, + { + "auxiliary_loss_clip": 0.01052328, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_clip": 1.02086329, + "balance_loss_mlp": 1.01581502, + "epoch": 0.8966781902900947, + "flos": 13552339720320.0, + "grad_norm": 2.324459160698388, + "language_loss": 0.62771755, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.64871162, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 14914, + "time_per_iteration": 2.377101182937622 + }, + { + "auxiliary_loss_clip": 0.01007422, + "auxiliary_loss_mlp": 0.01001358, + "balance_loss_clip": 0.99923593, + "balance_loss_mlp": 1.00106573, + "epoch": 0.8967383135427627, + "flos": 65062949149440.0, + "grad_norm": 0.7423087575752129, + "language_loss": 0.55170262, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57179046, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.06347656, + "step": 14915, + "time_per_iteration": 3.0207266807556152 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.01405478, + "balance_loss_mlp": 1.01540613, + "epoch": 0.8967984367954306, + "flos": 29711819617920.0, + "grad_norm": 1.4636715657799353, + "language_loss": 0.72063142, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.7414602, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33203125, + "step": 14916, + "time_per_iteration": 2.464329242706299 + }, + { + "auxiliary_loss_clip": 0.01051501, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.01220512, + "balance_loss_mlp": 1.01658988, + "epoch": 0.8968585600480986, + "flos": 25701311496960.0, + "grad_norm": 1.5899278302571878, + "language_loss": 0.78345811, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80432081, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 14917, + "time_per_iteration": 2.415252447128296 + }, + { + "auxiliary_loss_clip": 0.0105421, + "auxiliary_loss_mlp": 0.0104615, + "balance_loss_clip": 1.01992416, + "balance_loss_mlp": 1.01678526, + "epoch": 0.8969186833007665, + "flos": 30043389600000.0, + "grad_norm": 2.352300416065489, + "language_loss": 0.69692636, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.71792996, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 14918, + "time_per_iteration": 2.458951950073242 + }, + { + "auxiliary_loss_clip": 0.01050588, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.01661563, + "balance_loss_mlp": 1.01506186, + "epoch": 0.8969788065534345, + "flos": 22817101703040.0, + "grad_norm": 1.836198909314225, + "language_loss": 0.84787893, + "learning_rate": 1.102436060943881e-07, + "loss": 0.86876482, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35546875, + "step": 14919, + "time_per_iteration": 2.4026193618774414 + }, + { + "auxiliary_loss_clip": 0.01052331, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.01133358, + "balance_loss_mlp": 1.01585066, + "epoch": 0.8970389298061026, + "flos": 13260640377600.0, + "grad_norm": 2.081612636702083, + "language_loss": 0.74426544, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.76514506, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 14920, + "time_per_iteration": 2.3832204341888428 + }, + { + "auxiliary_loss_clip": 0.01051676, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.01136804, + "balance_loss_mlp": 1.01599586, + "epoch": 0.8970990530587705, + "flos": 10265406860160.0, + "grad_norm": 2.3758720078494227, + "language_loss": 0.9174735, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93835193, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35742188, + "step": 14921, + "time_per_iteration": 2.3324625492095947 + }, + { + "auxiliary_loss_clip": 0.01052756, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.01124895, + "balance_loss_mlp": 1.01590562, + "epoch": 0.8971591763114385, + "flos": 20301679267200.0, + "grad_norm": 1.8697688024333523, + "language_loss": 0.74941194, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.77029121, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 14922, + "time_per_iteration": 2.38751220703125 + }, + { + "auxiliary_loss_clip": 0.01051093, + "auxiliary_loss_mlp": 0.01037092, + "balance_loss_clip": 1.01420355, + "balance_loss_mlp": 1.0159421, + "epoch": 0.8972192995641064, + "flos": 23255959893120.0, + "grad_norm": 1.822642768230374, + "language_loss": 0.71662772, + "learning_rate": 1.097341060694219e-07, + "loss": 0.73750961, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 14923, + "time_per_iteration": 2.387789726257324 + }, + { + "auxiliary_loss_clip": 0.01051935, + "auxiliary_loss_mlp": 0.01034228, + "balance_loss_clip": 1.00995719, + "balance_loss_mlp": 1.01584339, + "epoch": 0.8972794228167744, + "flos": 18368608176000.0, + "grad_norm": 2.5426642020377597, + "language_loss": 0.72294468, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.7438063, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 14924, + "time_per_iteration": 3.5978314876556396 + }, + { + "auxiliary_loss_clip": 0.01050154, + "auxiliary_loss_mlp": 0.0103844, + "balance_loss_clip": 1.01760197, + "balance_loss_mlp": 1.01573396, + "epoch": 0.8973395460694423, + "flos": 23950905972480.0, + "grad_norm": 1.344889882450706, + "language_loss": 0.7329042, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.75379014, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34375, + "step": 14925, + "time_per_iteration": 2.5300843715667725 + }, + { + "auxiliary_loss_clip": 0.01054495, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.01306772, + "balance_loss_mlp": 1.01749253, + "epoch": 0.8973996693221103, + "flos": 24969741534720.0, + "grad_norm": 1.6239381845369998, + "language_loss": 0.83642715, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.85733986, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 14926, + "time_per_iteration": 2.413839101791382 + }, + { + "auxiliary_loss_clip": 0.01049514, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.014732, + "balance_loss_mlp": 1.0150435, + "epoch": 0.8974597925747783, + "flos": 25737760823040.0, + "grad_norm": 1.5140630076043755, + "language_loss": 0.79529804, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81615978, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 14927, + "time_per_iteration": 2.4088706970214844 + }, + { + "auxiliary_loss_clip": 0.01050663, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.01296735, + "balance_loss_mlp": 1.01553071, + "epoch": 0.8975199158274463, + "flos": 38070375684480.0, + "grad_norm": 2.048459350713193, + "language_loss": 0.67110544, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.6919564, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 14928, + "time_per_iteration": 2.516594171524048 + }, + { + "auxiliary_loss_clip": 0.01052606, + "auxiliary_loss_mlp": 0.01039431, + "balance_loss_clip": 1.01376545, + "balance_loss_mlp": 1.01620865, + "epoch": 0.8975800390801142, + "flos": 25410484938240.0, + "grad_norm": 3.0601042888359484, + "language_loss": 0.72473526, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.74565566, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 14929, + "time_per_iteration": 2.4213781356811523 + }, + { + "auxiliary_loss_clip": 0.01052851, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.01637006, + "balance_loss_mlp": 1.01721525, + "epoch": 0.8976401623327822, + "flos": 21758604969600.0, + "grad_norm": 2.9723918884752076, + "language_loss": 0.69092166, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.71183449, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 14930, + "time_per_iteration": 2.3944034576416016 + }, + { + "auxiliary_loss_clip": 0.01048284, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.01598191, + "balance_loss_mlp": 1.01402068, + "epoch": 0.8977002855854501, + "flos": 13844457999360.0, + "grad_norm": 1.9731137859716865, + "language_loss": 0.76278591, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.78365052, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34179688, + "step": 14931, + "time_per_iteration": 2.345442771911621 + }, + { + "auxiliary_loss_clip": 0.01050487, + "auxiliary_loss_mlp": 0.01030661, + "balance_loss_clip": 1.009323, + "balance_loss_mlp": 1.01621962, + "epoch": 0.8977604088381181, + "flos": 19426511416320.0, + "grad_norm": 1.849944007058618, + "language_loss": 0.64554191, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.66635334, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 14932, + "time_per_iteration": 2.378781318664551 + }, + { + "auxiliary_loss_clip": 0.0104812, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.00903511, + "balance_loss_mlp": 1.01489425, + "epoch": 0.8978205320907862, + "flos": 22741130851200.0, + "grad_norm": 1.9942351478130844, + "language_loss": 0.72741997, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.74819481, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33203125, + "step": 14933, + "time_per_iteration": 2.38289737701416 + }, + { + "auxiliary_loss_clip": 0.01051791, + "auxiliary_loss_mlp": 0.01039003, + "balance_loss_clip": 1.0164609, + "balance_loss_mlp": 1.01613188, + "epoch": 0.8978806553434541, + "flos": 21359477773440.0, + "grad_norm": 1.3938046570237874, + "language_loss": 0.75340277, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.77431071, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 14934, + "time_per_iteration": 2.4023547172546387 + }, + { + "auxiliary_loss_clip": 0.01051018, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.01757479, + "balance_loss_mlp": 1.01608062, + "epoch": 0.8979407785961221, + "flos": 20923098289920.0, + "grad_norm": 1.6240325149366295, + "language_loss": 0.62137556, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.6422919, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34960938, + "step": 14935, + "time_per_iteration": 3.7851572036743164 + }, + { + "auxiliary_loss_clip": 0.01049508, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.01211643, + "balance_loss_mlp": 1.01537526, + "epoch": 0.89800090184879, + "flos": 25227749548800.0, + "grad_norm": 2.4039495523314747, + "language_loss": 0.78287327, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.80372065, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34179688, + "step": 14936, + "time_per_iteration": 3.8326992988586426 + }, + { + "auxiliary_loss_clip": 0.01052156, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.01130927, + "balance_loss_mlp": 1.01683784, + "epoch": 0.898061025101458, + "flos": 22561642218240.0, + "grad_norm": 1.6030532497773822, + "language_loss": 0.75020897, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.77106726, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 14937, + "time_per_iteration": 2.406010389328003 + }, + { + "auxiliary_loss_clip": 0.01007309, + "auxiliary_loss_mlp": 0.01002336, + "balance_loss_clip": 1.00054801, + "balance_loss_mlp": 1.00103116, + "epoch": 0.8981211483541259, + "flos": 56189843331840.0, + "grad_norm": 0.9098764271575009, + "language_loss": 0.63613236, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65622878, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.06298828, + "step": 14938, + "time_per_iteration": 2.9196455478668213 + }, + { + "auxiliary_loss_clip": 0.01050698, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.01023757, + "balance_loss_mlp": 1.01614356, + "epoch": 0.898181271606794, + "flos": 16391965841280.0, + "grad_norm": 2.2618900503761643, + "language_loss": 0.81114024, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.83195972, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34570312, + "step": 14939, + "time_per_iteration": 2.369658946990967 + }, + { + "auxiliary_loss_clip": 0.01007664, + "auxiliary_loss_mlp": 0.01003141, + "balance_loss_clip": 1.00113869, + "balance_loss_mlp": 1.00118291, + "epoch": 0.8982413948594619, + "flos": 63436694019840.0, + "grad_norm": 0.7169113701967231, + "language_loss": 0.52947342, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54958147, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06494141, + "step": 14940, + "time_per_iteration": 3.1660187244415283 + }, + { + "auxiliary_loss_clip": 0.01051229, + "auxiliary_loss_mlp": 0.01040568, + "balance_loss_clip": 1.01623738, + "balance_loss_mlp": 1.01611114, + "epoch": 0.8983015181121299, + "flos": 21834261619200.0, + "grad_norm": 2.0616024069619168, + "language_loss": 0.79089344, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.81181133, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3515625, + "step": 14941, + "time_per_iteration": 2.4127047061920166 + }, + { + "auxiliary_loss_clip": 0.01052144, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.01266074, + "balance_loss_mlp": 1.01630378, + "epoch": 0.8983616413647978, + "flos": 28948687920000.0, + "grad_norm": 2.525730881317025, + "language_loss": 0.73958069, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.76045084, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 14942, + "time_per_iteration": 2.4382858276367188 + }, + { + "auxiliary_loss_clip": 0.0105242, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.01807678, + "balance_loss_mlp": 1.01629639, + "epoch": 0.8984217646174658, + "flos": 17784127238400.0, + "grad_norm": 2.2793270866070037, + "language_loss": 0.81660247, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.83753443, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 14943, + "time_per_iteration": 2.3747622966766357 + }, + { + "auxiliary_loss_clip": 0.01053311, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.01645064, + "balance_loss_mlp": 1.01639247, + "epoch": 0.8984818878701337, + "flos": 23403398031360.0, + "grad_norm": 1.4724150320221203, + "language_loss": 0.72212583, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.74306917, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 14944, + "time_per_iteration": 2.3862924575805664 + }, + { + "auxiliary_loss_clip": 0.01055455, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.01595032, + "balance_loss_mlp": 1.01809752, + "epoch": 0.8985420111228017, + "flos": 22344971120640.0, + "grad_norm": 2.03705519192667, + "language_loss": 0.77594727, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.79690093, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37304688, + "step": 14945, + "time_per_iteration": 2.388627529144287 + }, + { + "auxiliary_loss_clip": 0.010551, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.01308179, + "balance_loss_mlp": 1.01651609, + "epoch": 0.8986021343754698, + "flos": 21391842470400.0, + "grad_norm": 2.006542898845663, + "language_loss": 0.74621558, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.76715916, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.38671875, + "step": 14946, + "time_per_iteration": 2.3833260536193848 + }, + { + "auxiliary_loss_clip": 0.01050539, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.01284885, + "balance_loss_mlp": 1.0157361, + "epoch": 0.8986622576281377, + "flos": 21324285256320.0, + "grad_norm": 2.439889551593908, + "language_loss": 0.6509093, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.67177063, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 14947, + "time_per_iteration": 2.392840623855591 + }, + { + "auxiliary_loss_clip": 0.01051802, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.01486456, + "balance_loss_mlp": 1.016451, + "epoch": 0.8987223808808057, + "flos": 23987145830400.0, + "grad_norm": 1.5921533114050854, + "language_loss": 0.70678592, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72766441, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.35351562, + "step": 14948, + "time_per_iteration": 3.8085434436798096 + }, + { + "auxiliary_loss_clip": 0.01051254, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.01385617, + "balance_loss_mlp": 1.01553619, + "epoch": 0.8987825041334736, + "flos": 41499335422080.0, + "grad_norm": 1.781191535069777, + "language_loss": 0.7572571, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.77813917, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 14949, + "time_per_iteration": 2.5446434020996094 + }, + { + "auxiliary_loss_clip": 0.01051631, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.01572132, + "balance_loss_mlp": 1.01577568, + "epoch": 0.8988426273861416, + "flos": 27563020035840.0, + "grad_norm": 2.4348267718544365, + "language_loss": 0.76392531, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.78486216, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.359375, + "step": 14950, + "time_per_iteration": 2.4635443687438965 + }, + { + "auxiliary_loss_clip": 0.01051196, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.0129317, + "balance_loss_mlp": 1.01643538, + "epoch": 0.8989027506388095, + "flos": 17091694776960.0, + "grad_norm": 1.645194109234987, + "language_loss": 0.67752457, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.69836414, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.34765625, + "step": 14951, + "time_per_iteration": 2.362877607345581 + }, + { + "auxiliary_loss_clip": 0.01053806, + "auxiliary_loss_mlp": 0.01039431, + "balance_loss_clip": 1.01576805, + "balance_loss_mlp": 1.01595044, + "epoch": 0.8989628738914776, + "flos": 20554171286400.0, + "grad_norm": 1.9398627828402029, + "language_loss": 0.75155205, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.77248442, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37890625, + "step": 14952, + "time_per_iteration": 2.3772268295288086 + }, + { + "auxiliary_loss_clip": 0.01050727, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.0132041, + "balance_loss_mlp": 1.01585436, + "epoch": 0.8990229971441455, + "flos": 16250218254720.0, + "grad_norm": 2.135275354364461, + "language_loss": 0.58027595, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.60114115, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 14953, + "time_per_iteration": 2.3818588256835938 + }, + { + "auxiliary_loss_clip": 0.01050796, + "auxiliary_loss_mlp": 0.01042613, + "balance_loss_clip": 1.01972508, + "balance_loss_mlp": 1.01532292, + "epoch": 0.8990831203968135, + "flos": 21980233480320.0, + "grad_norm": 1.910669273643848, + "language_loss": 0.83547455, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.85640866, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 14954, + "time_per_iteration": 2.413179397583008 + }, + { + "auxiliary_loss_clip": 0.01050464, + "auxiliary_loss_mlp": 0.01036546, + "balance_loss_clip": 1.01419401, + "balance_loss_mlp": 1.01575589, + "epoch": 0.8991432436494814, + "flos": 27446131203840.0, + "grad_norm": 1.929985671989952, + "language_loss": 0.61545658, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.63632667, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 14955, + "time_per_iteration": 2.469252347946167 + }, + { + "auxiliary_loss_clip": 0.01049337, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.0142138, + "balance_loss_mlp": 1.01535225, + "epoch": 0.8992033669021494, + "flos": 21578767223040.0, + "grad_norm": 2.451157314835947, + "language_loss": 0.5651381, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.58598733, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33984375, + "step": 14956, + "time_per_iteration": 2.3690924644470215 + }, + { + "auxiliary_loss_clip": 0.01050374, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.01342547, + "balance_loss_mlp": 1.01563931, + "epoch": 0.8992634901548173, + "flos": 28582972761600.0, + "grad_norm": 1.9192830385260282, + "language_loss": 0.8114742, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.83234072, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 14957, + "time_per_iteration": 2.4296562671661377 + }, + { + "auxiliary_loss_clip": 0.01053283, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.01453018, + "balance_loss_mlp": 1.01642048, + "epoch": 0.8993236134074853, + "flos": 19866347124480.0, + "grad_norm": 1.582182770765239, + "language_loss": 0.79643983, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.81735611, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 14958, + "time_per_iteration": 2.3776373863220215 + }, + { + "auxiliary_loss_clip": 0.01052151, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.01241732, + "balance_loss_mlp": 1.01742506, + "epoch": 0.8993837366601534, + "flos": 19389643153920.0, + "grad_norm": 1.7122394836914494, + "language_loss": 0.75946271, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.78031611, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34765625, + "step": 14959, + "time_per_iteration": 2.3777871131896973 + }, + { + "auxiliary_loss_clip": 0.01049846, + "auxiliary_loss_mlp": 0.01038502, + "balance_loss_clip": 1.01692522, + "balance_loss_mlp": 1.01572418, + "epoch": 0.8994438599128213, + "flos": 18550750072320.0, + "grad_norm": 2.0879824995068628, + "language_loss": 0.70002806, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.72091156, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33984375, + "step": 14960, + "time_per_iteration": 2.3423211574554443 + }, + { + "auxiliary_loss_clip": 0.01051254, + "auxiliary_loss_mlp": 0.01033875, + "balance_loss_clip": 1.01154721, + "balance_loss_mlp": 1.01622081, + "epoch": 0.8995039831654893, + "flos": 24426388045440.0, + "grad_norm": 1.61849451171463, + "language_loss": 0.66495299, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68580425, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 14961, + "time_per_iteration": 2.435886859893799 + }, + { + "auxiliary_loss_clip": 0.01049697, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.0111562, + "balance_loss_mlp": 1.01522911, + "epoch": 0.8995641064181572, + "flos": 21250269440640.0, + "grad_norm": 1.5873022694003038, + "language_loss": 0.83700585, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85782874, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34375, + "step": 14962, + "time_per_iteration": 2.387817859649658 + }, + { + "auxiliary_loss_clip": 0.01054143, + "auxiliary_loss_mlp": 0.01039536, + "balance_loss_clip": 1.01167727, + "balance_loss_mlp": 1.01721537, + "epoch": 0.8996242296708252, + "flos": 23512536541440.0, + "grad_norm": 2.260017434165118, + "language_loss": 0.76958007, + "learning_rate": 1.047022340612298e-07, + "loss": 0.79051685, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.36914062, + "step": 14963, + "time_per_iteration": 2.4422810077667236 + }, + { + "auxiliary_loss_clip": 0.01007505, + "auxiliary_loss_mlp": 0.01005989, + "balance_loss_clip": 1.00401044, + "balance_loss_mlp": 1.00110185, + "epoch": 0.8996843529234931, + "flos": 62400123486720.0, + "grad_norm": 0.7834123166137601, + "language_loss": 0.57645494, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59658986, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06396484, + "step": 14964, + "time_per_iteration": 4.143456935882568 + }, + { + "auxiliary_loss_clip": 0.01055042, + "auxiliary_loss_mlp": 0.01040641, + "balance_loss_clip": 1.01480806, + "balance_loss_mlp": 1.01733744, + "epoch": 0.8997444761761612, + "flos": 24235867422720.0, + "grad_norm": 2.7084635992428168, + "language_loss": 0.69389927, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.71485609, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 14965, + "time_per_iteration": 2.371042013168335 + }, + { + "auxiliary_loss_clip": 0.01053591, + "auxiliary_loss_mlp": 0.01039692, + "balance_loss_clip": 1.01811528, + "balance_loss_mlp": 1.01699984, + "epoch": 0.8998045994288291, + "flos": 21360036355200.0, + "grad_norm": 2.0934087210725116, + "language_loss": 0.72811615, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.74904889, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36523438, + "step": 14966, + "time_per_iteration": 2.391310691833496 + }, + { + "auxiliary_loss_clip": 0.010534, + "auxiliary_loss_mlp": 0.01038714, + "balance_loss_clip": 1.01419246, + "balance_loss_mlp": 1.01733398, + "epoch": 0.8998647226814971, + "flos": 28984892866560.0, + "grad_norm": 1.661887708072744, + "language_loss": 0.74639052, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.76731169, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36132812, + "step": 14967, + "time_per_iteration": 2.4342055320739746 + }, + { + "auxiliary_loss_clip": 0.01050442, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.0089761, + "balance_loss_mlp": 1.01524508, + "epoch": 0.899924845934165, + "flos": 13625063815680.0, + "grad_norm": 2.1109447262525385, + "language_loss": 0.73194277, + "learning_rate": 1.040813291960323e-07, + "loss": 0.75276482, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 14968, + "time_per_iteration": 2.367720127105713 + }, + { + "auxiliary_loss_clip": 0.01051135, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.0157479, + "balance_loss_mlp": 1.0158534, + "epoch": 0.899984969186833, + "flos": 20881691550720.0, + "grad_norm": 1.7397159097506445, + "language_loss": 0.71853191, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73941189, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35351562, + "step": 14969, + "time_per_iteration": 2.3686838150024414 + }, + { + "auxiliary_loss_clip": 0.01053297, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.01507187, + "balance_loss_mlp": 1.01735163, + "epoch": 0.9000450924395009, + "flos": 20920794140160.0, + "grad_norm": 1.7884294444333897, + "language_loss": 0.77104056, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.79196364, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 14970, + "time_per_iteration": 2.379412889480591 + }, + { + "auxiliary_loss_clip": 0.01051797, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.01882458, + "balance_loss_mlp": 1.01554763, + "epoch": 0.900105215692169, + "flos": 17164104670080.0, + "grad_norm": 1.8430240125246662, + "language_loss": 0.73927224, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.76019663, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 14971, + "time_per_iteration": 2.3629794120788574 + }, + { + "auxiliary_loss_clip": 0.01051146, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.01093602, + "balance_loss_mlp": 1.01574969, + "epoch": 0.900165338944837, + "flos": 19931076518400.0, + "grad_norm": 2.2954249901035384, + "language_loss": 0.82939601, + "learning_rate": 1.035858993572476e-07, + "loss": 0.85025859, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35351562, + "step": 14972, + "time_per_iteration": 2.347663640975952 + }, + { + "auxiliary_loss_clip": 0.0105271, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.01604748, + "epoch": 0.9002254621975049, + "flos": 16106166518400.0, + "grad_norm": 2.0012384755721135, + "language_loss": 0.82860744, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.84952754, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 14973, + "time_per_iteration": 2.3555779457092285 + }, + { + "auxiliary_loss_clip": 0.01052145, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.01415873, + "balance_loss_mlp": 1.01700425, + "epoch": 0.9002855854501729, + "flos": 28474846680960.0, + "grad_norm": 1.7986528908218993, + "language_loss": 0.5881601, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.6090557, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14974, + "time_per_iteration": 2.42008376121521 + }, + { + "auxiliary_loss_clip": 0.01052929, + "auxiliary_loss_mlp": 0.0103729, + "balance_loss_clip": 1.01406848, + "balance_loss_mlp": 1.01701641, + "epoch": 0.9003457087028408, + "flos": 25629111072000.0, + "grad_norm": 1.6279588594428829, + "language_loss": 0.64286929, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.66377151, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 14975, + "time_per_iteration": 5.183246374130249 + }, + { + "auxiliary_loss_clip": 0.01052896, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.01603532, + "balance_loss_mlp": 1.01589739, + "epoch": 0.9004058319555088, + "flos": 24388262974080.0, + "grad_norm": 1.5709603003272041, + "language_loss": 0.73836386, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75928742, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 14976, + "time_per_iteration": 2.409421682357788 + }, + { + "auxiliary_loss_clip": 0.01051726, + "auxiliary_loss_mlp": 0.01041406, + "balance_loss_clip": 1.01800513, + "balance_loss_mlp": 1.01641512, + "epoch": 0.9004659552081767, + "flos": 29058070809600.0, + "grad_norm": 1.9078653939737877, + "language_loss": 0.70985556, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.73078686, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 14977, + "time_per_iteration": 2.428184747695923 + }, + { + "auxiliary_loss_clip": 0.01052696, + "auxiliary_loss_mlp": 0.01037387, + "balance_loss_clip": 1.01241255, + "balance_loss_mlp": 1.01554286, + "epoch": 0.9005260784608448, + "flos": 16762917703680.0, + "grad_norm": 2.135211995319159, + "language_loss": 0.68265879, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.70355964, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 14978, + "time_per_iteration": 2.359135866165161 + }, + { + "auxiliary_loss_clip": 0.0105582, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_clip": 1.01900148, + "balance_loss_mlp": 1.01761985, + "epoch": 0.9005862017135127, + "flos": 20374961944320.0, + "grad_norm": 6.013381974961672, + "language_loss": 0.80486274, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.82587171, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3828125, + "step": 14979, + "time_per_iteration": 2.359754800796509 + }, + { + "auxiliary_loss_clip": 0.01007983, + "auxiliary_loss_mlp": 0.01002387, + "balance_loss_clip": 1.00034833, + "balance_loss_mlp": 1.00132608, + "epoch": 0.9006463249661807, + "flos": 67577114776320.0, + "grad_norm": 0.7245280421778713, + "language_loss": 0.53899711, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55910081, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06640625, + "step": 14980, + "time_per_iteration": 3.057779312133789 + }, + { + "auxiliary_loss_clip": 0.01053606, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.01774335, + "balance_loss_mlp": 1.01649356, + "epoch": 0.9007064482188486, + "flos": 28292076380160.0, + "grad_norm": 1.725214818972455, + "language_loss": 0.83419812, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.85515028, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 14981, + "time_per_iteration": 2.412033796310425 + }, + { + "auxiliary_loss_clip": 0.01050715, + "auxiliary_loss_mlp": 0.0103479, + "balance_loss_clip": 1.01095998, + "balance_loss_mlp": 1.01536357, + "epoch": 0.9007665714715166, + "flos": 21615251460480.0, + "grad_norm": 1.5332168605099683, + "language_loss": 0.82007635, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.84093142, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 14982, + "time_per_iteration": 2.3825197219848633 + }, + { + "auxiliary_loss_clip": 0.01049474, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.02039623, + "balance_loss_mlp": 1.01595032, + "epoch": 0.9008266947241845, + "flos": 26540658426240.0, + "grad_norm": 1.7730057233868943, + "language_loss": 0.7324748, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.75339156, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3359375, + "step": 14983, + "time_per_iteration": 2.428668737411499 + }, + { + "auxiliary_loss_clip": 0.01050846, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.01166868, + "balance_loss_mlp": 1.01683772, + "epoch": 0.9008868179768525, + "flos": 23109464361600.0, + "grad_norm": 1.3337988057115415, + "language_loss": 0.75941443, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.7802496, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33984375, + "step": 14984, + "time_per_iteration": 2.429661512374878 + }, + { + "auxiliary_loss_clip": 0.01049813, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_clip": 1.01560795, + "balance_loss_mlp": 1.01546824, + "epoch": 0.9009469412295206, + "flos": 19059853651200.0, + "grad_norm": 2.092608885873575, + "language_loss": 0.71092248, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.73179853, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34375, + "step": 14985, + "time_per_iteration": 2.37290620803833 + }, + { + "auxiliary_loss_clip": 0.01052092, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.01024759, + "balance_loss_mlp": 1.01527977, + "epoch": 0.9010070644821885, + "flos": 23221151400960.0, + "grad_norm": 2.292884473473629, + "language_loss": 0.7159552, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.73679328, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3671875, + "step": 14986, + "time_per_iteration": 2.385183095932007 + }, + { + "auxiliary_loss_clip": 0.01051797, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.01675248, + "balance_loss_mlp": 1.01589131, + "epoch": 0.9010671877348565, + "flos": 17383847967360.0, + "grad_norm": 1.7315819700580697, + "language_loss": 0.77379215, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.79471517, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 14987, + "time_per_iteration": 2.354480504989624 + }, + { + "auxiliary_loss_clip": 0.01051435, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.01370323, + "balance_loss_mlp": 1.01610994, + "epoch": 0.9011273109875244, + "flos": 21907090448640.0, + "grad_norm": 2.0141861638651415, + "language_loss": 0.75144511, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.77233076, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 14988, + "time_per_iteration": 3.813774347305298 + }, + { + "auxiliary_loss_clip": 0.01054688, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.01293993, + "balance_loss_mlp": 1.01779079, + "epoch": 0.9011874342401924, + "flos": 24059695368960.0, + "grad_norm": 2.325564068962338, + "language_loss": 0.70492053, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.72586083, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.36914062, + "step": 14989, + "time_per_iteration": 2.4426138401031494 + }, + { + "auxiliary_loss_clip": 0.0105247, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.01096964, + "balance_loss_mlp": 1.01669765, + "epoch": 0.9012475574928603, + "flos": 16757995201920.0, + "grad_norm": 1.9485138157820747, + "language_loss": 0.80976784, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.83063197, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35742188, + "step": 14990, + "time_per_iteration": 2.3571393489837646 + }, + { + "auxiliary_loss_clip": 0.01054177, + "auxiliary_loss_mlp": 0.01038546, + "balance_loss_clip": 1.01390505, + "balance_loss_mlp": 1.01627719, + "epoch": 0.9013076807455284, + "flos": 19970179107840.0, + "grad_norm": 3.200810007364867, + "language_loss": 0.79065984, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.8115871, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 14991, + "time_per_iteration": 2.417475938796997 + }, + { + "auxiliary_loss_clip": 0.01007436, + "auxiliary_loss_mlp": 0.01001704, + "balance_loss_clip": 0.99972475, + "balance_loss_mlp": 1.00107765, + "epoch": 0.9013678039981963, + "flos": 65176975249920.0, + "grad_norm": 0.7760474460414648, + "language_loss": 0.60323453, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62332594, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06347656, + "step": 14992, + "time_per_iteration": 2.99843430519104 + }, + { + "auxiliary_loss_clip": 0.01050659, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.01299214, + "balance_loss_mlp": 1.01558828, + "epoch": 0.9014279272508643, + "flos": 20520200666880.0, + "grad_norm": 1.8106218087939918, + "language_loss": 0.83915126, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.86001951, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 14993, + "time_per_iteration": 2.397394895553589 + }, + { + "auxiliary_loss_clip": 0.01051226, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.01153326, + "balance_loss_mlp": 1.01561332, + "epoch": 0.9014880505035322, + "flos": 17308156406400.0, + "grad_norm": 1.8798194760138012, + "language_loss": 0.74774289, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.76860714, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 14994, + "time_per_iteration": 2.359956979751587 + }, + { + "auxiliary_loss_clip": 0.01049268, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.01575971, + "balance_loss_mlp": 1.01503444, + "epoch": 0.9015481737562002, + "flos": 28401598915200.0, + "grad_norm": 1.8157113121104376, + "language_loss": 0.65301371, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.67387128, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 14995, + "time_per_iteration": 2.4588539600372314 + }, + { + "auxiliary_loss_clip": 0.0105111, + "auxiliary_loss_mlp": 0.01039816, + "balance_loss_clip": 1.01612902, + "balance_loss_mlp": 1.01496804, + "epoch": 0.9016082970088681, + "flos": 29751376055040.0, + "grad_norm": 1.6908080593759034, + "language_loss": 0.68207765, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.70298696, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 14996, + "time_per_iteration": 2.4428491592407227 + }, + { + "auxiliary_loss_clip": 0.01048706, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.01232696, + "balance_loss_mlp": 1.01507366, + "epoch": 0.9016684202615362, + "flos": 23512117605120.0, + "grad_norm": 1.8268472314489996, + "language_loss": 0.67126834, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.69209206, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3359375, + "step": 14997, + "time_per_iteration": 2.4170167446136475 + }, + { + "auxiliary_loss_clip": 0.0105094, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.01087403, + "balance_loss_mlp": 1.01565516, + "epoch": 0.9017285435142042, + "flos": 16978401815040.0, + "grad_norm": 1.7759333677244078, + "language_loss": 0.7839765, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.80481827, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 14998, + "time_per_iteration": 2.3681607246398926 + }, + { + "auxiliary_loss_clip": 0.01052701, + "auxiliary_loss_mlp": 0.0103883, + "balance_loss_clip": 1.01464236, + "balance_loss_mlp": 1.01654065, + "epoch": 0.9017886667668721, + "flos": 21392401052160.0, + "grad_norm": 1.6911029672543323, + "language_loss": 0.75853956, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77945483, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 14999, + "time_per_iteration": 2.3810958862304688 + }, + { + "auxiliary_loss_clip": 0.01051058, + "auxiliary_loss_mlp": 0.01037731, + "balance_loss_clip": 1.01347232, + "balance_loss_mlp": 1.01587367, + "epoch": 0.9018487900195401, + "flos": 20995508183040.0, + "grad_norm": 1.9517952414468158, + "language_loss": 0.76644188, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.78732973, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3515625, + "step": 15000, + "time_per_iteration": 2.391608238220215 + }, + { + "auxiliary_loss_clip": 0.01050462, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.00877094, + "balance_loss_mlp": 1.01581681, + "epoch": 0.901908913272208, + "flos": 53356503121920.0, + "grad_norm": 2.8663410385562913, + "language_loss": 0.81709802, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.83791637, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 15001, + "time_per_iteration": 2.6566247940063477 + }, + { + "auxiliary_loss_clip": 0.0105002, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.01052737, + "balance_loss_mlp": 1.01615834, + "epoch": 0.901969036524876, + "flos": 22088778497280.0, + "grad_norm": 1.4052938642882236, + "language_loss": 0.78998697, + "learning_rate": 9.990687143794407e-08, + "loss": 0.81079233, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.33984375, + "step": 15002, + "time_per_iteration": 2.3991551399230957 + }, + { + "auxiliary_loss_clip": 0.01051953, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.01230192, + "balance_loss_mlp": 1.01652622, + "epoch": 0.9020291597775439, + "flos": 23834086963200.0, + "grad_norm": 2.770052620205701, + "language_loss": 0.68741709, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70828772, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 15003, + "time_per_iteration": 3.716336250305176 + }, + { + "auxiliary_loss_clip": 0.01053068, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.01209319, + "balance_loss_mlp": 1.01662791, + "epoch": 0.902089283030212, + "flos": 18325211489280.0, + "grad_norm": 1.8995512798942527, + "language_loss": 0.86979282, + "learning_rate": 9.9663907182292e-08, + "loss": 0.89068621, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 15004, + "time_per_iteration": 2.3642349243164062 + }, + { + "auxiliary_loss_clip": 0.0105162, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.01704788, + "balance_loss_mlp": 1.01606667, + "epoch": 0.9021494062828799, + "flos": 24169217904000.0, + "grad_norm": 2.3091997467723004, + "language_loss": 0.73508811, + "learning_rate": 9.954253314356575e-08, + "loss": 0.75600266, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 15005, + "time_per_iteration": 2.406956911087036 + }, + { + "auxiliary_loss_clip": 0.01051284, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_clip": 1.01685286, + "balance_loss_mlp": 1.01528573, + "epoch": 0.9022095295355479, + "flos": 21615775130880.0, + "grad_norm": 1.9508833116161668, + "language_loss": 0.72676343, + "learning_rate": 9.942123117037748e-08, + "loss": 0.74768442, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 15006, + "time_per_iteration": 2.3939785957336426 + }, + { + "auxiliary_loss_clip": 0.0105262, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.01438296, + "balance_loss_mlp": 1.01610792, + "epoch": 0.9022696527882158, + "flos": 18725455848960.0, + "grad_norm": 1.9916306320954533, + "language_loss": 0.8559652, + "learning_rate": 9.930000126732618e-08, + "loss": 0.87685817, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.36523438, + "step": 15007, + "time_per_iteration": 2.3554325103759766 + }, + { + "auxiliary_loss_clip": 0.0105011, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.01095879, + "balance_loss_mlp": 1.01654315, + "epoch": 0.9023297760408838, + "flos": 26759982787200.0, + "grad_norm": 1.54843759899046, + "language_loss": 0.79450673, + "learning_rate": 9.917884343900928e-08, + "loss": 0.81534058, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3359375, + "step": 15008, + "time_per_iteration": 2.4071714878082275 + }, + { + "auxiliary_loss_clip": 0.01050235, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.01332867, + "balance_loss_mlp": 1.01637256, + "epoch": 0.9023898992935517, + "flos": 20521492387200.0, + "grad_norm": 1.71959046376413, + "language_loss": 0.74527127, + "learning_rate": 9.905775769002156e-08, + "loss": 0.76612663, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33984375, + "step": 15009, + "time_per_iteration": 2.3788228034973145 + }, + { + "auxiliary_loss_clip": 0.01050922, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.01242483, + "balance_loss_mlp": 1.01579547, + "epoch": 0.9024500225462198, + "flos": 17455699278720.0, + "grad_norm": 1.7724144213564452, + "language_loss": 0.74492258, + "learning_rate": 9.893674402495399e-08, + "loss": 0.76578534, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 15010, + "time_per_iteration": 2.3566486835479736 + }, + { + "auxiliary_loss_clip": 0.01050957, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.01117563, + "balance_loss_mlp": 1.01465726, + "epoch": 0.9025101457988878, + "flos": 20812563325440.0, + "grad_norm": 1.9031977484913598, + "language_loss": 0.76408154, + "learning_rate": 9.881580244839538e-08, + "loss": 0.78495848, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36328125, + "step": 15011, + "time_per_iteration": 2.3883235454559326 + }, + { + "auxiliary_loss_clip": 0.01053698, + "auxiliary_loss_mlp": 0.0104295, + "balance_loss_clip": 1.01808286, + "balance_loss_mlp": 1.0157665, + "epoch": 0.9025702690515557, + "flos": 19025359361280.0, + "grad_norm": 2.1770034700634917, + "language_loss": 0.74383104, + "learning_rate": 9.869493296493204e-08, + "loss": 0.76479757, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 15012, + "time_per_iteration": 2.3655524253845215 + }, + { + "auxiliary_loss_clip": 0.01052583, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.01478457, + "balance_loss_mlp": 1.01700544, + "epoch": 0.9026303923042237, + "flos": 19681796344320.0, + "grad_norm": 1.698880116206448, + "language_loss": 0.70181191, + "learning_rate": 9.857413557914763e-08, + "loss": 0.72269726, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.35546875, + "step": 15013, + "time_per_iteration": 2.376260757446289 + }, + { + "auxiliary_loss_clip": 0.01048519, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.01497746, + "balance_loss_mlp": 1.01544189, + "epoch": 0.9026905155568916, + "flos": 24606958930560.0, + "grad_norm": 1.427717531059623, + "language_loss": 0.73779076, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75863922, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33007812, + "step": 15014, + "time_per_iteration": 3.8816635608673096 + }, + { + "auxiliary_loss_clip": 0.01050209, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.01077271, + "balance_loss_mlp": 1.01480484, + "epoch": 0.9027506388095596, + "flos": 20520759248640.0, + "grad_norm": 1.860487741399592, + "language_loss": 0.73024958, + "learning_rate": 9.833275711893474e-08, + "loss": 0.75109196, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 15015, + "time_per_iteration": 3.698607921600342 + }, + { + "auxiliary_loss_clip": 0.01052068, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.01651633, + "balance_loss_mlp": 1.01630497, + "epoch": 0.9028107620622275, + "flos": 22783375463040.0, + "grad_norm": 1.811972082476494, + "language_loss": 0.70252675, + "learning_rate": 9.821217605365895e-08, + "loss": 0.7234261, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35742188, + "step": 15016, + "time_per_iteration": 2.3719584941864014 + }, + { + "auxiliary_loss_clip": 0.01050779, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.01182461, + "balance_loss_mlp": 1.01671278, + "epoch": 0.9028708853148956, + "flos": 25409367774720.0, + "grad_norm": 1.660263225613082, + "language_loss": 0.71542913, + "learning_rate": 9.809166710436855e-08, + "loss": 0.73625702, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.33984375, + "step": 15017, + "time_per_iteration": 2.4338834285736084 + }, + { + "auxiliary_loss_clip": 0.01050448, + "auxiliary_loss_mlp": 0.0104089, + "balance_loss_clip": 1.01905072, + "balance_loss_mlp": 1.01607239, + "epoch": 0.9029310085675635, + "flos": 21870257097600.0, + "grad_norm": 1.8098640209285484, + "language_loss": 0.70274669, + "learning_rate": 9.797123027563237e-08, + "loss": 0.72366011, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 15018, + "time_per_iteration": 2.380964517593384 + }, + { + "auxiliary_loss_clip": 0.01050908, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.01439905, + "balance_loss_mlp": 1.01544189, + "epoch": 0.9029911318202315, + "flos": 26213173073280.0, + "grad_norm": 1.6575315998300169, + "language_loss": 0.70012128, + "learning_rate": 9.785086557201782e-08, + "loss": 0.72100884, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 15019, + "time_per_iteration": 2.420362710952759 + }, + { + "auxiliary_loss_clip": 0.01049263, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.0186255, + "balance_loss_mlp": 1.0154829, + "epoch": 0.9030512550728994, + "flos": 15960439036800.0, + "grad_norm": 1.7399849799204408, + "language_loss": 0.73422062, + "learning_rate": 9.773057299808951e-08, + "loss": 0.75510848, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.3359375, + "step": 15020, + "time_per_iteration": 2.339611530303955 + }, + { + "auxiliary_loss_clip": 0.01052559, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.0156405, + "balance_loss_mlp": 1.01610541, + "epoch": 0.9031113783255674, + "flos": 23986482514560.0, + "grad_norm": 1.4757315254643746, + "language_loss": 0.74849725, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76940787, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 15021, + "time_per_iteration": 2.4290621280670166 + }, + { + "auxiliary_loss_clip": 0.01054006, + "auxiliary_loss_mlp": 0.01042672, + "balance_loss_clip": 1.01596951, + "balance_loss_mlp": 1.01659894, + "epoch": 0.9031715015782353, + "flos": 22236111901440.0, + "grad_norm": 2.2872791492004754, + "language_loss": 0.74369162, + "learning_rate": 9.749020425753251e-08, + "loss": 0.76465833, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.375, + "step": 15022, + "time_per_iteration": 2.3950164318084717 + }, + { + "auxiliary_loss_clip": 0.01048566, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.01241636, + "balance_loss_mlp": 1.01560092, + "epoch": 0.9032316248309034, + "flos": 26321962469760.0, + "grad_norm": 1.5536870713820645, + "language_loss": 0.73944461, + "learning_rate": 9.737012810001943e-08, + "loss": 0.76025891, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33007812, + "step": 15023, + "time_per_iteration": 2.4184203147888184 + }, + { + "auxiliary_loss_clip": 0.01050318, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.01721048, + "balance_loss_mlp": 1.01620567, + "epoch": 0.9032917480835713, + "flos": 22635623122560.0, + "grad_norm": 1.6906985553699183, + "language_loss": 0.83885241, + "learning_rate": 9.725012409042155e-08, + "loss": 0.85973489, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34179688, + "step": 15024, + "time_per_iteration": 2.4038708209991455 + }, + { + "auxiliary_loss_clip": 0.01051398, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01110303, + "balance_loss_mlp": 1.01594269, + "epoch": 0.9033518713362393, + "flos": 23877623295360.0, + "grad_norm": 1.4722755443858184, + "language_loss": 0.70766866, + "learning_rate": 9.713019223328966e-08, + "loss": 0.72850847, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35546875, + "step": 15025, + "time_per_iteration": 2.4378652572631836 + }, + { + "auxiliary_loss_clip": 0.01048882, + "auxiliary_loss_mlp": 0.01036791, + "balance_loss_clip": 1.01480842, + "balance_loss_mlp": 1.01447988, + "epoch": 0.9034119945889073, + "flos": 26904104346240.0, + "grad_norm": 1.6816604202423076, + "language_loss": 0.77952576, + "learning_rate": 9.70103325331717e-08, + "loss": 0.80038249, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 15026, + "time_per_iteration": 2.4539074897766113 + }, + { + "auxiliary_loss_clip": 0.0105104, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.01288998, + "balance_loss_mlp": 1.01608253, + "epoch": 0.9034721178415752, + "flos": 20849117385600.0, + "grad_norm": 1.8805552105351868, + "language_loss": 0.69643378, + "learning_rate": 9.68905449946129e-08, + "loss": 0.71729326, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 15027, + "time_per_iteration": 3.8217170238494873 + }, + { + "auxiliary_loss_clip": 0.01049468, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.01301754, + "balance_loss_mlp": 1.0165695, + "epoch": 0.9035322410942432, + "flos": 22233284081280.0, + "grad_norm": 1.509952892440184, + "language_loss": 0.76072389, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78157341, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.328125, + "step": 15028, + "time_per_iteration": 2.4096908569335938 + }, + { + "auxiliary_loss_clip": 0.01051263, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.01629972, + "balance_loss_mlp": 1.01622856, + "epoch": 0.9035923643469111, + "flos": 25922171957760.0, + "grad_norm": 1.5962815872142295, + "language_loss": 0.70081532, + "learning_rate": 9.665118642033765e-08, + "loss": 0.72172415, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 15029, + "time_per_iteration": 2.447610378265381 + }, + { + "auxiliary_loss_clip": 0.01052981, + "auxiliary_loss_mlp": 0.01038931, + "balance_loss_clip": 1.01415968, + "balance_loss_mlp": 1.01580667, + "epoch": 0.9036524875995792, + "flos": 20338756997760.0, + "grad_norm": 2.468780289047271, + "language_loss": 0.75103283, + "learning_rate": 9.653161539369858e-08, + "loss": 0.77195203, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 15030, + "time_per_iteration": 2.3669309616088867 + }, + { + "auxiliary_loss_clip": 0.01052317, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.01362967, + "balance_loss_mlp": 1.01579094, + "epoch": 0.9037126108522471, + "flos": 40113039133440.0, + "grad_norm": 2.0554752459526466, + "language_loss": 0.69224691, + "learning_rate": 9.641211654677151e-08, + "loss": 0.7131573, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 15031, + "time_per_iteration": 2.53265380859375 + }, + { + "auxiliary_loss_clip": 0.01049925, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.01084423, + "balance_loss_mlp": 1.01569736, + "epoch": 0.9037727341049151, + "flos": 23330883404160.0, + "grad_norm": 1.5495134266370991, + "language_loss": 0.77896374, + "learning_rate": 9.629268988408723e-08, + "loss": 0.79978031, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 15032, + "time_per_iteration": 2.4004437923431396 + }, + { + "auxiliary_loss_clip": 0.01051144, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.01519442, + "balance_loss_mlp": 1.01613653, + "epoch": 0.903832857357583, + "flos": 12821852010240.0, + "grad_norm": 1.994326742553331, + "language_loss": 0.75743032, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77831995, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15033, + "time_per_iteration": 2.355372667312622 + }, + { + "auxiliary_loss_clip": 0.01052192, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.01440144, + "balance_loss_mlp": 1.01693165, + "epoch": 0.903892980610251, + "flos": 25701835167360.0, + "grad_norm": 1.593568676809785, + "language_loss": 0.74832702, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76921451, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 15034, + "time_per_iteration": 2.43583083152771 + }, + { + "auxiliary_loss_clip": 0.01050708, + "auxiliary_loss_mlp": 0.01040544, + "balance_loss_clip": 1.0174768, + "balance_loss_mlp": 1.01573575, + "epoch": 0.9039531038629189, + "flos": 14683211435520.0, + "grad_norm": 1.9088404296549337, + "language_loss": 0.6545068, + "learning_rate": 9.593484304676791e-08, + "loss": 0.67541933, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34960938, + "step": 15035, + "time_per_iteration": 2.3851022720336914 + }, + { + "auxiliary_loss_clip": 0.01051071, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.01267362, + "balance_loss_mlp": 1.015674, + "epoch": 0.904013227115587, + "flos": 24023769713280.0, + "grad_norm": 5.028998722219779, + "language_loss": 0.64136469, + "learning_rate": 9.581570516631643e-08, + "loss": 0.66224515, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35351562, + "step": 15036, + "time_per_iteration": 2.387105703353882 + }, + { + "auxiliary_loss_clip": 0.01048599, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.01194549, + "balance_loss_mlp": 1.01508522, + "epoch": 0.9040733503682549, + "flos": 22855366419840.0, + "grad_norm": 1.7966164362945753, + "language_loss": 0.83081138, + "learning_rate": 9.569663949272455e-08, + "loss": 0.85162342, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33398438, + "step": 15037, + "time_per_iteration": 2.381335973739624 + }, + { + "auxiliary_loss_clip": 0.01052639, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.01470971, + "balance_loss_mlp": 1.01623249, + "epoch": 0.9041334736209229, + "flos": 19973914623360.0, + "grad_norm": 3.6043206392504956, + "language_loss": 0.68708056, + "learning_rate": 9.557764603050667e-08, + "loss": 0.70799762, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 15038, + "time_per_iteration": 2.361074686050415 + }, + { + "auxiliary_loss_clip": 0.01052138, + "auxiliary_loss_mlp": 0.01042129, + "balance_loss_clip": 1.01784635, + "balance_loss_mlp": 1.01568735, + "epoch": 0.9041935968735909, + "flos": 17529575448960.0, + "grad_norm": 1.958335196365414, + "language_loss": 0.77189374, + "learning_rate": 9.545872478417494e-08, + "loss": 0.79283643, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 15039, + "time_per_iteration": 2.358555316925049 + }, + { + "auxiliary_loss_clip": 0.01050756, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.01257849, + "balance_loss_mlp": 1.01551986, + "epoch": 0.9042537201262588, + "flos": 22779151188480.0, + "grad_norm": 1.609717570326033, + "language_loss": 0.70997131, + "learning_rate": 9.533987575823977e-08, + "loss": 0.73082352, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 15040, + "time_per_iteration": 2.383469581604004 + }, + { + "auxiliary_loss_clip": 0.01048331, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.01277912, + "balance_loss_mlp": 1.01446342, + "epoch": 0.9043138433789268, + "flos": 20594356128000.0, + "grad_norm": 1.6974305526461582, + "language_loss": 0.69299841, + "learning_rate": 9.522109895720709e-08, + "loss": 0.71383291, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 15041, + "time_per_iteration": 2.3906495571136475 + }, + { + "auxiliary_loss_clip": 0.01050623, + "auxiliary_loss_mlp": 0.01038189, + "balance_loss_clip": 1.01480031, + "balance_loss_mlp": 1.01550722, + "epoch": 0.9043739666315948, + "flos": 32961604924800.0, + "grad_norm": 1.5532698556749072, + "language_loss": 0.58364296, + "learning_rate": 9.510239438558155e-08, + "loss": 0.60453105, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 15042, + "time_per_iteration": 3.8006675243377686 + }, + { + "auxiliary_loss_clip": 0.01007478, + "auxiliary_loss_mlp": 0.01003809, + "balance_loss_clip": 1.00183046, + "balance_loss_mlp": 1.00088644, + "epoch": 0.9044340898842628, + "flos": 67293061021440.0, + "grad_norm": 0.78650599157204, + "language_loss": 0.5701077, + "learning_rate": 9.498376204786351e-08, + "loss": 0.59022057, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06591797, + "step": 15043, + "time_per_iteration": 2.989306926727295 + }, + { + "auxiliary_loss_clip": 0.01052446, + "auxiliary_loss_mlp": 0.01036835, + "balance_loss_clip": 1.01195586, + "balance_loss_mlp": 1.01566982, + "epoch": 0.9044942131369307, + "flos": 17712171192960.0, + "grad_norm": 1.6403142859375783, + "language_loss": 0.70930278, + "learning_rate": 9.486520194855274e-08, + "loss": 0.73019558, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 15044, + "time_per_iteration": 2.368720054626465 + }, + { + "auxiliary_loss_clip": 0.01052464, + "auxiliary_loss_mlp": 0.01040122, + "balance_loss_clip": 1.01486111, + "balance_loss_mlp": 1.01602292, + "epoch": 0.9045543363895987, + "flos": 17819633957760.0, + "grad_norm": 2.1132294337830406, + "language_loss": 0.70856953, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72949541, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 15045, + "time_per_iteration": 2.3572447299957275 + }, + { + "auxiliary_loss_clip": 0.01055857, + "auxiliary_loss_mlp": 0.01043117, + "balance_loss_clip": 1.01747537, + "balance_loss_mlp": 1.0177263, + "epoch": 0.9046144596422666, + "flos": 21871618640640.0, + "grad_norm": 1.8890897985016673, + "language_loss": 0.66188782, + "learning_rate": 9.462829848313081e-08, + "loss": 0.68287754, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3828125, + "step": 15046, + "time_per_iteration": 2.408220052719116 + }, + { + "auxiliary_loss_clip": 0.01051613, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.01605725, + "balance_loss_mlp": 1.0152036, + "epoch": 0.9046745828949346, + "flos": 17671776883200.0, + "grad_norm": 1.947826110234466, + "language_loss": 0.63388276, + "learning_rate": 9.450995512600379e-08, + "loss": 0.6547848, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 15047, + "time_per_iteration": 2.326202869415283 + }, + { + "auxiliary_loss_clip": 0.01051493, + "auxiliary_loss_mlp": 0.01033752, + "balance_loss_clip": 1.01114964, + "balance_loss_mlp": 1.0161922, + "epoch": 0.9047347061476025, + "flos": 25701381319680.0, + "grad_norm": 3.5907244419912363, + "language_loss": 0.71480751, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73565996, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15048, + "time_per_iteration": 2.427826404571533 + }, + { + "auxiliary_loss_clip": 0.01051379, + "auxiliary_loss_mlp": 0.01039327, + "balance_loss_clip": 1.01487672, + "balance_loss_mlp": 1.01517594, + "epoch": 0.9047948294002706, + "flos": 15157262142720.0, + "grad_norm": 2.1624912491406025, + "language_loss": 0.76115394, + "learning_rate": 9.427348518535483e-08, + "loss": 0.78206098, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 15049, + "time_per_iteration": 2.38037109375 + }, + { + "auxiliary_loss_clip": 0.01048985, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.01242769, + "balance_loss_mlp": 1.01559532, + "epoch": 0.9048549526529385, + "flos": 21871199704320.0, + "grad_norm": 1.8443898199392317, + "language_loss": 0.76894581, + "learning_rate": 9.415535861079993e-08, + "loss": 0.78977132, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33398438, + "step": 15050, + "time_per_iteration": 2.394037961959839 + }, + { + "auxiliary_loss_clip": 0.01052502, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.01472187, + "balance_loss_mlp": 1.01653755, + "epoch": 0.9049150759056065, + "flos": 23545599465600.0, + "grad_norm": 1.633753504857696, + "language_loss": 0.8283543, + "learning_rate": 9.403730430606472e-08, + "loss": 0.84924579, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.359375, + "step": 15051, + "time_per_iteration": 2.388979434967041 + }, + { + "auxiliary_loss_clip": 0.01050715, + "auxiliary_loss_mlp": 0.01035737, + "balance_loss_clip": 1.01479149, + "balance_loss_mlp": 1.01586437, + "epoch": 0.9049751991582745, + "flos": 19644893170560.0, + "grad_norm": 2.0132181455467135, + "language_loss": 0.90541101, + "learning_rate": 9.391932227562582e-08, + "loss": 0.92627549, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 15052, + "time_per_iteration": 2.3580541610717773 + }, + { + "auxiliary_loss_clip": 0.01054921, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.02030277, + "balance_loss_mlp": 1.01760721, + "epoch": 0.9050353224109424, + "flos": 15595317371520.0, + "grad_norm": 2.0167369452926747, + "language_loss": 0.78122228, + "learning_rate": 9.380141252395724e-08, + "loss": 0.80221224, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37304688, + "step": 15053, + "time_per_iteration": 2.364488124847412 + }, + { + "auxiliary_loss_clip": 0.01050399, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01094389, + "balance_loss_mlp": 1.01590133, + "epoch": 0.9050954456636104, + "flos": 28182344376960.0, + "grad_norm": 2.36758160023645, + "language_loss": 0.74296385, + "learning_rate": 9.368357505553049e-08, + "loss": 0.76378977, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 15054, + "time_per_iteration": 3.830526351928711 + }, + { + "auxiliary_loss_clip": 0.01051359, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.01455593, + "balance_loss_mlp": 1.01606274, + "epoch": 0.9051555689162784, + "flos": 25730638905600.0, + "grad_norm": 1.6494788135442955, + "language_loss": 0.84166622, + "learning_rate": 9.356580987481333e-08, + "loss": 0.86253887, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35351562, + "step": 15055, + "time_per_iteration": 3.7054131031036377 + }, + { + "auxiliary_loss_clip": 0.01050897, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.0175916, + "balance_loss_mlp": 1.0163573, + "epoch": 0.9052156921689464, + "flos": 23256169361280.0, + "grad_norm": 1.711353792575477, + "language_loss": 0.85895443, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87985778, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 15056, + "time_per_iteration": 2.3972790241241455 + }, + { + "auxiliary_loss_clip": 0.01050345, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.01261926, + "balance_loss_mlp": 1.01604986, + "epoch": 0.9052758154216143, + "flos": 29563159582080.0, + "grad_norm": 1.8294652794422117, + "language_loss": 0.73609209, + "learning_rate": 9.333049639436863e-08, + "loss": 0.75693572, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 15057, + "time_per_iteration": 2.471463203430176 + }, + { + "auxiliary_loss_clip": 0.01049643, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.01063836, + "balance_loss_mlp": 1.01493275, + "epoch": 0.9053359386742823, + "flos": 22126589366400.0, + "grad_norm": 4.933131153732209, + "language_loss": 0.82291132, + "learning_rate": 9.321294810356418e-08, + "loss": 0.84373492, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 15058, + "time_per_iteration": 2.401698350906372 + }, + { + "auxiliary_loss_clip": 0.01007132, + "auxiliary_loss_mlp": 0.01004767, + "balance_loss_clip": 1.00274038, + "balance_loss_mlp": 1.00074363, + "epoch": 0.9053960619269502, + "flos": 67086409484160.0, + "grad_norm": 0.6745160721413789, + "language_loss": 0.51416177, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53428078, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.06347656, + "step": 15059, + "time_per_iteration": 3.1173038482666016 + }, + { + "auxiliary_loss_clip": 0.01051667, + "auxiliary_loss_mlp": 0.01039166, + "balance_loss_clip": 1.01472783, + "balance_loss_mlp": 1.01619172, + "epoch": 0.9054561851796182, + "flos": 15814502087040.0, + "grad_norm": 1.7071141941421892, + "language_loss": 0.68593585, + "learning_rate": 9.297806844307831e-08, + "loss": 0.70684421, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 15060, + "time_per_iteration": 2.3875298500061035 + }, + { + "auxiliary_loss_clip": 0.0105366, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.01336741, + "balance_loss_mlp": 1.01699054, + "epoch": 0.9055163084322861, + "flos": 17566024775040.0, + "grad_norm": 1.925187037826481, + "language_loss": 0.65475637, + "learning_rate": 9.286073708230357e-08, + "loss": 0.67566091, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 15061, + "time_per_iteration": 2.403048276901245 + }, + { + "auxiliary_loss_clip": 0.01053012, + "auxiliary_loss_mlp": 0.0103967, + "balance_loss_clip": 1.01561356, + "balance_loss_mlp": 1.01659751, + "epoch": 0.9055764316849542, + "flos": 17638609224960.0, + "grad_norm": 1.771882182196225, + "language_loss": 0.72359645, + "learning_rate": 9.274347804044058e-08, + "loss": 0.74452335, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 15062, + "time_per_iteration": 2.392286777496338 + }, + { + "auxiliary_loss_clip": 0.01050158, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.02131581, + "balance_loss_mlp": 1.01544499, + "epoch": 0.9056365549376221, + "flos": 20119816661760.0, + "grad_norm": 1.5465106105507815, + "language_loss": 0.71571904, + "learning_rate": 9.2626291321936e-08, + "loss": 0.73665738, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 15063, + "time_per_iteration": 2.382380247116089 + }, + { + "auxiliary_loss_clip": 0.01049227, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.01168871, + "balance_loss_mlp": 1.01493716, + "epoch": 0.9056966781902901, + "flos": 27597584148480.0, + "grad_norm": 1.6540208077640417, + "language_loss": 0.73584652, + "learning_rate": 9.250917693123406e-08, + "loss": 0.75667727, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 15064, + "time_per_iteration": 2.4322423934936523 + }, + { + "auxiliary_loss_clip": 0.01050568, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.01213145, + "balance_loss_mlp": 1.01483679, + "epoch": 0.9057568014429581, + "flos": 25918960112640.0, + "grad_norm": 2.5746970093739168, + "language_loss": 0.71437275, + "learning_rate": 9.23921348727752e-08, + "loss": 0.73522639, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 15065, + "time_per_iteration": 2.39947772026062 + }, + { + "auxiliary_loss_clip": 0.01051536, + "auxiliary_loss_mlp": 0.01038455, + "balance_loss_clip": 1.01579309, + "balance_loss_mlp": 1.01646876, + "epoch": 0.905816924695626, + "flos": 22929242590080.0, + "grad_norm": 1.7819924703161123, + "language_loss": 0.64160269, + "learning_rate": 9.227516515099743e-08, + "loss": 0.66250259, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 15066, + "time_per_iteration": 3.9104833602905273 + }, + { + "auxiliary_loss_clip": 0.01054888, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.01135015, + "balance_loss_mlp": 1.0160929, + "epoch": 0.905877047948294, + "flos": 22156510268160.0, + "grad_norm": 1.8455575926783088, + "language_loss": 0.81439412, + "learning_rate": 9.215826777033675e-08, + "loss": 0.83533108, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38867188, + "step": 15067, + "time_per_iteration": 2.395298719406128 + }, + { + "auxiliary_loss_clip": 0.01052065, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.01142406, + "balance_loss_mlp": 1.01586938, + "epoch": 0.905937171200962, + "flos": 15303897319680.0, + "grad_norm": 1.5870875889522804, + "language_loss": 0.71375227, + "learning_rate": 9.204144273522563e-08, + "loss": 0.73464155, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36132812, + "step": 15068, + "time_per_iteration": 2.3575825691223145 + }, + { + "auxiliary_loss_clip": 0.01049715, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.01186419, + "balance_loss_mlp": 1.01497817, + "epoch": 0.90599729445363, + "flos": 19461983224320.0, + "grad_norm": 1.9423216839563149, + "language_loss": 0.87029517, + "learning_rate": 9.19246900500943e-08, + "loss": 0.89113438, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 15069, + "time_per_iteration": 2.3593192100524902 + }, + { + "auxiliary_loss_clip": 0.01054196, + "auxiliary_loss_mlp": 0.01042273, + "balance_loss_clip": 1.01717997, + "balance_loss_mlp": 1.01613247, + "epoch": 0.9060574177062979, + "flos": 23731825991040.0, + "grad_norm": 1.92445406522355, + "language_loss": 0.59954369, + "learning_rate": 9.180800971936987e-08, + "loss": 0.62050843, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.38085938, + "step": 15070, + "time_per_iteration": 2.3853113651275635 + }, + { + "auxiliary_loss_clip": 0.01052823, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.00903738, + "balance_loss_mlp": 1.01635206, + "epoch": 0.9061175409589659, + "flos": 17310181265280.0, + "grad_norm": 1.990371652200866, + "language_loss": 0.82297492, + "learning_rate": 9.169140174747724e-08, + "loss": 0.8438549, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36523438, + "step": 15071, + "time_per_iteration": 2.337263822555542 + }, + { + "auxiliary_loss_clip": 0.01055152, + "auxiliary_loss_mlp": 0.01042163, + "balance_loss_clip": 1.0163784, + "balance_loss_mlp": 1.01629782, + "epoch": 0.9061776642116338, + "flos": 17777668636800.0, + "grad_norm": 1.8765695336364918, + "language_loss": 0.62900305, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64997613, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38867188, + "step": 15072, + "time_per_iteration": 2.3437602519989014 + }, + { + "auxiliary_loss_clip": 0.01050633, + "auxiliary_loss_mlp": 0.01043568, + "balance_loss_clip": 1.01859331, + "balance_loss_mlp": 1.01552248, + "epoch": 0.9062377874643018, + "flos": 42776039352960.0, + "grad_norm": 2.0982128117301215, + "language_loss": 0.74455678, + "learning_rate": 9.145840289787021e-08, + "loss": 0.76549876, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3515625, + "step": 15073, + "time_per_iteration": 2.5954792499542236 + }, + { + "auxiliary_loss_clip": 0.01049934, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.01181602, + "balance_loss_mlp": 1.01576352, + "epoch": 0.9062979107169697, + "flos": 16360718307840.0, + "grad_norm": 1.9721110932895534, + "language_loss": 0.82780552, + "learning_rate": 9.134201202899161e-08, + "loss": 0.84863013, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 15074, + "time_per_iteration": 2.3595244884490967 + }, + { + "auxiliary_loss_clip": 0.0100718, + "auxiliary_loss_mlp": 0.01002609, + "balance_loss_clip": 1.00055826, + "balance_loss_mlp": 1.00064027, + "epoch": 0.9063580339696378, + "flos": 69310272222720.0, + "grad_norm": 0.766739562669384, + "language_loss": 0.52392209, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54402006, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06542969, + "step": 15075, + "time_per_iteration": 3.0974953174591064 + }, + { + "auxiliary_loss_clip": 0.0100709, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00007272, + "balance_loss_mlp": 1.00050139, + "epoch": 0.9064181572223057, + "flos": 58791640204800.0, + "grad_norm": 0.7298869990728761, + "language_loss": 0.62248528, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64257872, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.06591797, + "step": 15076, + "time_per_iteration": 2.9394147396087646 + }, + { + "auxiliary_loss_clip": 0.01050708, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.01450753, + "balance_loss_mlp": 1.01529968, + "epoch": 0.9064782804749737, + "flos": 21761607346560.0, + "grad_norm": 1.7391437454406076, + "language_loss": 0.83034539, + "learning_rate": 9.09932736990091e-08, + "loss": 0.85123014, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 15077, + "time_per_iteration": 2.3835256099700928 + }, + { + "auxiliary_loss_clip": 0.01048245, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.01177895, + "balance_loss_mlp": 1.0145278, + "epoch": 0.9065384037276417, + "flos": 21396311124480.0, + "grad_norm": 1.5505647438791716, + "language_loss": 0.85246086, + "learning_rate": 9.08771723625934e-08, + "loss": 0.87326837, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33789062, + "step": 15078, + "time_per_iteration": 2.378180980682373 + }, + { + "auxiliary_loss_clip": 0.0104929, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.01584172, + "balance_loss_mlp": 1.01559019, + "epoch": 0.9065985269803096, + "flos": 38282298837120.0, + "grad_norm": 1.5437044229509698, + "language_loss": 0.66722679, + "learning_rate": 9.076114342030617e-08, + "loss": 0.68810976, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.33789062, + "step": 15079, + "time_per_iteration": 2.549582004547119 + }, + { + "auxiliary_loss_clip": 0.01050635, + "auxiliary_loss_mlp": 0.01036099, + "balance_loss_clip": 1.01405692, + "balance_loss_mlp": 1.01543498, + "epoch": 0.9066586502329776, + "flos": 44816922322560.0, + "grad_norm": 1.4484849132229312, + "language_loss": 0.71272981, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73359716, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 15080, + "time_per_iteration": 2.5912885665893555 + }, + { + "auxiliary_loss_clip": 0.0105395, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.01694369, + "balance_loss_mlp": 1.01767802, + "epoch": 0.9067187734856456, + "flos": 18623020320000.0, + "grad_norm": 2.195968336407109, + "language_loss": 0.72021878, + "learning_rate": 9.052930273571547e-08, + "loss": 0.74117899, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 15081, + "time_per_iteration": 2.3888871669769287 + }, + { + "auxiliary_loss_clip": 0.0105084, + "auxiliary_loss_mlp": 0.01038531, + "balance_loss_clip": 1.01545238, + "balance_loss_mlp": 1.01576591, + "epoch": 0.9067788967383136, + "flos": 22746472289280.0, + "grad_norm": 2.0386660268943784, + "language_loss": 0.7510519, + "learning_rate": 9.04134910022032e-08, + "loss": 0.77194566, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 15082, + "time_per_iteration": 3.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.01050306, + "auxiliary_loss_mlp": 0.0103864, + "balance_loss_clip": 1.01724195, + "balance_loss_mlp": 1.01602054, + "epoch": 0.9068390199909815, + "flos": 27669610016640.0, + "grad_norm": 1.723431884258522, + "language_loss": 0.79344159, + "learning_rate": 9.029775168040266e-08, + "loss": 0.81433105, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 15083, + "time_per_iteration": 2.4238500595092773 + }, + { + "auxiliary_loss_clip": 0.01048815, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.01411307, + "balance_loss_mlp": 1.01566529, + "epoch": 0.9068991432436495, + "flos": 24242849694720.0, + "grad_norm": 1.5648634737053604, + "language_loss": 0.69420564, + "learning_rate": 9.01820847747028e-08, + "loss": 0.71503115, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.33203125, + "step": 15084, + "time_per_iteration": 2.390556573867798 + }, + { + "auxiliary_loss_clip": 0.01052229, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.01837385, + "balance_loss_mlp": 1.01630282, + "epoch": 0.9069592664963174, + "flos": 28032183152640.0, + "grad_norm": 2.502446390723708, + "language_loss": 0.67972487, + "learning_rate": 9.006649028948965e-08, + "loss": 0.70066845, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 15085, + "time_per_iteration": 2.464702606201172 + }, + { + "auxiliary_loss_clip": 0.01007813, + "auxiliary_loss_mlp": 0.01005012, + "balance_loss_clip": 1.00285399, + "balance_loss_mlp": 1.0011065, + "epoch": 0.9070193897489854, + "flos": 68775404060160.0, + "grad_norm": 0.7779539763661253, + "language_loss": 0.61293036, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63305861, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06738281, + "step": 15086, + "time_per_iteration": 3.0223729610443115 + }, + { + "auxiliary_loss_clip": 0.01050617, + "auxiliary_loss_mlp": 0.01038654, + "balance_loss_clip": 1.01506233, + "balance_loss_mlp": 1.01514626, + "epoch": 0.9070795130016533, + "flos": 23440475761920.0, + "grad_norm": 1.483012463809674, + "language_loss": 0.73327291, + "learning_rate": 8.983551859805416e-08, + "loss": 0.75416565, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 15087, + "time_per_iteration": 2.4166066646575928 + }, + { + "auxiliary_loss_clip": 0.01051434, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.01554465, + "balance_loss_mlp": 1.01593423, + "epoch": 0.9071396362543214, + "flos": 18915417889920.0, + "grad_norm": 2.1510288249933907, + "language_loss": 0.77817595, + "learning_rate": 8.972014140059058e-08, + "loss": 0.79906797, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 15088, + "time_per_iteration": 2.3813536167144775 + }, + { + "auxiliary_loss_clip": 0.01048757, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.01408398, + "balance_loss_mlp": 1.0149287, + "epoch": 0.9071997595069893, + "flos": 25227470257920.0, + "grad_norm": 2.0879784874212346, + "language_loss": 0.73960835, + "learning_rate": 8.960483664113038e-08, + "loss": 0.76046634, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.33789062, + "step": 15089, + "time_per_iteration": 2.405083179473877 + }, + { + "auxiliary_loss_clip": 0.01048388, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.01357067, + "balance_loss_mlp": 1.01501203, + "epoch": 0.9072598827596573, + "flos": 24345634337280.0, + "grad_norm": 1.9217342797439527, + "language_loss": 0.76313996, + "learning_rate": 8.948960432404628e-08, + "loss": 0.78397155, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33398438, + "step": 15090, + "time_per_iteration": 2.3929076194763184 + }, + { + "auxiliary_loss_clip": 0.01053538, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.0095222, + "balance_loss_mlp": 1.01634836, + "epoch": 0.9073200060123253, + "flos": 22673852928000.0, + "grad_norm": 2.819057224547379, + "language_loss": 0.78742826, + "learning_rate": 8.93744444537079e-08, + "loss": 0.80832338, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37304688, + "step": 15091, + "time_per_iteration": 2.4856503009796143 + }, + { + "auxiliary_loss_clip": 0.01048358, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.01490974, + "balance_loss_mlp": 1.01597047, + "epoch": 0.9073801292649932, + "flos": 23694364235520.0, + "grad_norm": 1.7239642381023876, + "language_loss": 0.86387908, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88471448, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.32421875, + "step": 15092, + "time_per_iteration": 2.4006941318511963 + }, + { + "auxiliary_loss_clip": 0.01050392, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.01578927, + "balance_loss_mlp": 1.01624763, + "epoch": 0.9074402525176612, + "flos": 25374210168960.0, + "grad_norm": 1.4991782335568298, + "language_loss": 0.79615426, + "learning_rate": 8.914434207073296e-08, + "loss": 0.81703746, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34179688, + "step": 15093, + "time_per_iteration": 3.8159642219543457 + }, + { + "auxiliary_loss_clip": 0.0100721, + "auxiliary_loss_mlp": 0.01003153, + "balance_loss_clip": 1.00095975, + "balance_loss_mlp": 1.00053358, + "epoch": 0.9075003757703292, + "flos": 67645998622080.0, + "grad_norm": 0.7373730473670456, + "language_loss": 0.57135832, + "learning_rate": 8.902939956682188e-08, + "loss": 0.5914619, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06689453, + "step": 15094, + "time_per_iteration": 4.389244318008423 + }, + { + "auxiliary_loss_clip": 0.01052649, + "auxiliary_loss_mlp": 0.01042437, + "balance_loss_clip": 1.01632988, + "balance_loss_mlp": 1.01594257, + "epoch": 0.9075604990229972, + "flos": 22452608442240.0, + "grad_norm": 1.9554738733298196, + "language_loss": 0.72562981, + "learning_rate": 8.891452952710742e-08, + "loss": 0.74658072, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3671875, + "step": 15095, + "time_per_iteration": 2.3751907348632812 + }, + { + "auxiliary_loss_clip": 0.01052571, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.01214647, + "balance_loss_mlp": 1.01669753, + "epoch": 0.9076206222756651, + "flos": 19535649926400.0, + "grad_norm": 1.7392709836890108, + "language_loss": 0.75233132, + "learning_rate": 8.879973195594526e-08, + "loss": 0.77320766, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 15096, + "time_per_iteration": 2.3783934116363525 + }, + { + "auxiliary_loss_clip": 0.0105207, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.01745868, + "balance_loss_mlp": 1.01620317, + "epoch": 0.9076807455283331, + "flos": 30115624936320.0, + "grad_norm": 2.2682570021621657, + "language_loss": 0.57862979, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59956938, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 15097, + "time_per_iteration": 2.4446113109588623 + }, + { + "auxiliary_loss_clip": 0.01049215, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01352441, + "balance_loss_mlp": 1.01484299, + "epoch": 0.907740868781001, + "flos": 18696547376640.0, + "grad_norm": 1.5831040943619765, + "language_loss": 0.80106419, + "learning_rate": 8.857035423668935e-08, + "loss": 0.82190651, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34375, + "step": 15098, + "time_per_iteration": 2.353168487548828 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.01525688, + "balance_loss_mlp": 1.01549828, + "epoch": 0.907800992033669, + "flos": 22637682892800.0, + "grad_norm": 1.7335278312022695, + "language_loss": 0.6724056, + "learning_rate": 8.845577409729266e-08, + "loss": 0.69331956, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 15099, + "time_per_iteration": 2.3713717460632324 + }, + { + "auxiliary_loss_clip": 0.01053799, + "auxiliary_loss_mlp": 0.01037878, + "balance_loss_clip": 1.01346409, + "balance_loss_mlp": 1.01686585, + "epoch": 0.907861115286337, + "flos": 21286614032640.0, + "grad_norm": 2.526494742452283, + "language_loss": 0.71223438, + "learning_rate": 8.834126644384477e-08, + "loss": 0.73315114, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 15100, + "time_per_iteration": 2.3790347576141357 + }, + { + "auxiliary_loss_clip": 0.01007062, + "auxiliary_loss_mlp": 0.01002655, + "balance_loss_clip": 1.00060499, + "balance_loss_mlp": 1.00051665, + "epoch": 0.907921238539005, + "flos": 69736108475520.0, + "grad_norm": 0.6266908712204965, + "language_loss": 0.53446585, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55456305, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06542969, + "step": 15101, + "time_per_iteration": 3.0830421447753906 + }, + { + "auxiliary_loss_clip": 0.01050876, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.01360345, + "balance_loss_mlp": 1.01555777, + "epoch": 0.9079813617916729, + "flos": 23476261772160.0, + "grad_norm": 1.841481246993074, + "language_loss": 0.69419622, + "learning_rate": 8.811246861216081e-08, + "loss": 0.71506739, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15102, + "time_per_iteration": 2.4149646759033203 + }, + { + "auxiliary_loss_clip": 0.01050648, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.01254535, + "balance_loss_mlp": 1.01632667, + "epoch": 0.9080414850443409, + "flos": 22928823653760.0, + "grad_norm": 1.8256161639597182, + "language_loss": 0.79907817, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81993377, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 15103, + "time_per_iteration": 2.375267267227173 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.00884271, + "balance_loss_mlp": 1.01522481, + "epoch": 0.9081016082970089, + "flos": 26175885874560.0, + "grad_norm": 2.3439553139288223, + "language_loss": 0.72949314, + "learning_rate": 8.78839607763413e-08, + "loss": 0.75031716, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36328125, + "step": 15104, + "time_per_iteration": 2.4130516052246094 + }, + { + "auxiliary_loss_clip": 0.01049418, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.01205969, + "balance_loss_mlp": 1.01549709, + "epoch": 0.9081617315496768, + "flos": 24461021980800.0, + "grad_norm": 1.7389292906329683, + "language_loss": 0.7827028, + "learning_rate": 8.77698156177138e-08, + "loss": 0.80352253, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 15105, + "time_per_iteration": 2.39373779296875 + }, + { + "auxiliary_loss_clip": 0.01052233, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.01516199, + "balance_loss_mlp": 1.01534963, + "epoch": 0.9082218548023449, + "flos": 24745913608320.0, + "grad_norm": 1.9704301923492076, + "language_loss": 0.74913359, + "learning_rate": 8.765574297104628e-08, + "loss": 0.77004075, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 15106, + "time_per_iteration": 3.8610799312591553 + }, + { + "auxiliary_loss_clip": 0.01051672, + "auxiliary_loss_mlp": 0.01040735, + "balance_loss_clip": 1.01706004, + "balance_loss_mlp": 1.01607203, + "epoch": 0.9082819780550128, + "flos": 24420278557440.0, + "grad_norm": 1.9964458730031656, + "language_loss": 0.8107537, + "learning_rate": 8.754174284066462e-08, + "loss": 0.83167779, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 15107, + "time_per_iteration": 2.4689202308654785 + }, + { + "auxiliary_loss_clip": 0.01007398, + "auxiliary_loss_mlp": 0.01001703, + "balance_loss_clip": 0.99959326, + "balance_loss_mlp": 1.00088382, + "epoch": 0.9083421013076808, + "flos": 59606233113600.0, + "grad_norm": 0.8081757828764939, + "language_loss": 0.59638691, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61647791, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06542969, + "step": 15108, + "time_per_iteration": 2.9910144805908203 + }, + { + "auxiliary_loss_clip": 0.01051774, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.01070809, + "balance_loss_mlp": 1.01529789, + "epoch": 0.9084022245603487, + "flos": 33618809957760.0, + "grad_norm": 1.6284268502527683, + "language_loss": 0.75199223, + "learning_rate": 8.73139601460482e-08, + "loss": 0.77283263, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.36523438, + "step": 15109, + "time_per_iteration": 2.4879775047302246 + }, + { + "auxiliary_loss_clip": 0.01049595, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.01155186, + "balance_loss_mlp": 1.01515746, + "epoch": 0.9084623478130167, + "flos": 24970579407360.0, + "grad_norm": 1.9353387512796933, + "language_loss": 0.72834414, + "learning_rate": 8.720017759045073e-08, + "loss": 0.74916291, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34375, + "step": 15110, + "time_per_iteration": 2.409074306488037 + }, + { + "auxiliary_loss_clip": 0.01049264, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.01213074, + "balance_loss_mlp": 1.01533818, + "epoch": 0.9085224710656846, + "flos": 31460619219840.0, + "grad_norm": 1.904317585394856, + "language_loss": 0.70277834, + "learning_rate": 8.708646756841421e-08, + "loss": 0.72359234, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.33984375, + "step": 15111, + "time_per_iteration": 2.504939317703247 + }, + { + "auxiliary_loss_clip": 0.01007403, + "auxiliary_loss_mlp": 0.0100267, + "balance_loss_clip": 1.00060797, + "balance_loss_mlp": 1.00082445, + "epoch": 0.9085825943183526, + "flos": 64913940000000.0, + "grad_norm": 0.6966348490962464, + "language_loss": 0.51867938, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53878009, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.06591797, + "step": 15112, + "time_per_iteration": 3.1745262145996094 + }, + { + "auxiliary_loss_clip": 0.01051399, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.01347733, + "balance_loss_mlp": 1.01539934, + "epoch": 0.9086427175710206, + "flos": 18952216329600.0, + "grad_norm": 1.7585975158096268, + "language_loss": 0.71713924, + "learning_rate": 8.685926514226837e-08, + "loss": 0.73801953, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 15113, + "time_per_iteration": 2.4937188625335693 + }, + { + "auxiliary_loss_clip": 0.01052041, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01008952, + "balance_loss_mlp": 1.01677155, + "epoch": 0.9087028408236886, + "flos": 34013643056640.0, + "grad_norm": 4.091196724395817, + "language_loss": 0.8000735, + "learning_rate": 8.674577274677508e-08, + "loss": 0.82091224, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35351562, + "step": 15114, + "time_per_iteration": 2.550168991088867 + }, + { + "auxiliary_loss_clip": 0.01053948, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.01091564, + "balance_loss_mlp": 1.01726079, + "epoch": 0.9087629640763565, + "flos": 21943504863360.0, + "grad_norm": 1.9589559650189217, + "language_loss": 0.72183037, + "learning_rate": 8.663235290207405e-08, + "loss": 0.74273801, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.3671875, + "step": 15115, + "time_per_iteration": 2.382282257080078 + }, + { + "auxiliary_loss_clip": 0.01054311, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.01215672, + "balance_loss_mlp": 1.01716256, + "epoch": 0.9088230873290245, + "flos": 21761816814720.0, + "grad_norm": 2.400897865998849, + "language_loss": 0.66502732, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68593413, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37109375, + "step": 15116, + "time_per_iteration": 2.374147891998291 + }, + { + "auxiliary_loss_clip": 0.01050182, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_clip": 1.01714599, + "balance_loss_mlp": 1.01608133, + "epoch": 0.9088832105816925, + "flos": 21540258126720.0, + "grad_norm": 1.89593917986977, + "language_loss": 0.70644617, + "learning_rate": 8.640573088224812e-08, + "loss": 0.72736323, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.34179688, + "step": 15117, + "time_per_iteration": 2.3851237297058105 + }, + { + "auxiliary_loss_clip": 0.01051535, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.0130161, + "balance_loss_mlp": 1.01674342, + "epoch": 0.9089433338343604, + "flos": 25995454634880.0, + "grad_norm": 1.590992920036761, + "language_loss": 0.75742626, + "learning_rate": 8.629252871571745e-08, + "loss": 0.77828932, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34765625, + "step": 15118, + "time_per_iteration": 2.4108362197875977 + }, + { + "auxiliary_loss_clip": 0.01054116, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.01097012, + "balance_loss_mlp": 1.01623249, + "epoch": 0.9090034570870285, + "flos": 21177370788480.0, + "grad_norm": 1.9344194008819082, + "language_loss": 0.74750936, + "learning_rate": 8.617939911716554e-08, + "loss": 0.76842523, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 15119, + "time_per_iteration": 2.402151107788086 + }, + { + "auxiliary_loss_clip": 0.01054619, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.01409137, + "balance_loss_mlp": 1.01669323, + "epoch": 0.9090635803396964, + "flos": 16140940099200.0, + "grad_norm": 2.66823603451073, + "language_loss": 0.7219981, + "learning_rate": 8.60663420908827e-08, + "loss": 0.74293685, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 15120, + "time_per_iteration": 2.358015298843384 + }, + { + "auxiliary_loss_clip": 0.01052429, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.0127666, + "balance_loss_mlp": 1.01639223, + "epoch": 0.9091237035923644, + "flos": 20590585701120.0, + "grad_norm": 1.9763142740486987, + "language_loss": 0.67467105, + "learning_rate": 8.595335764115596e-08, + "loss": 0.69555867, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 15121, + "time_per_iteration": 2.4375100135803223 + }, + { + "auxiliary_loss_clip": 0.01053056, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.02543736, + "balance_loss_mlp": 1.01607347, + "epoch": 0.9091838268450323, + "flos": 52225840874880.0, + "grad_norm": 1.9540074985116573, + "language_loss": 0.71468949, + "learning_rate": 8.58404457722699e-08, + "loss": 0.73572445, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 15122, + "time_per_iteration": 3.898468494415283 + }, + { + "auxiliary_loss_clip": 0.01050014, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.01775134, + "balance_loss_mlp": 1.01507652, + "epoch": 0.9092439500977003, + "flos": 20558535206400.0, + "grad_norm": 1.3075823038540777, + "language_loss": 0.75197941, + "learning_rate": 8.572760648850575e-08, + "loss": 0.77288163, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 15123, + "time_per_iteration": 2.3963544368743896 + }, + { + "auxiliary_loss_clip": 0.01050494, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.01434863, + "balance_loss_mlp": 1.01596045, + "epoch": 0.9093040733503682, + "flos": 28616699001600.0, + "grad_norm": 2.2086977231363316, + "language_loss": 0.76450837, + "learning_rate": 8.561483979414253e-08, + "loss": 0.78539133, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34570312, + "step": 15124, + "time_per_iteration": 2.4285025596618652 + }, + { + "auxiliary_loss_clip": 0.01051126, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.01244843, + "balance_loss_mlp": 1.01587653, + "epoch": 0.9093641966030362, + "flos": 23439079307520.0, + "grad_norm": 2.277560924770952, + "language_loss": 0.74033177, + "learning_rate": 8.55021456934566e-08, + "loss": 0.76119602, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 15125, + "time_per_iteration": 2.3974802494049072 + }, + { + "auxiliary_loss_clip": 0.01049545, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.01437557, + "balance_loss_mlp": 1.01580083, + "epoch": 0.9094243198557042, + "flos": 16799262295680.0, + "grad_norm": 1.5564219523144494, + "language_loss": 0.79704851, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81791091, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33789062, + "step": 15126, + "time_per_iteration": 2.3447511196136475 + }, + { + "auxiliary_loss_clip": 0.01050326, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.01467657, + "balance_loss_mlp": 1.01529455, + "epoch": 0.9094844431083722, + "flos": 24272316748800.0, + "grad_norm": 1.6360266289532175, + "language_loss": 0.759094, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77997887, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34960938, + "step": 15127, + "time_per_iteration": 2.4177520275115967 + }, + { + "auxiliary_loss_clip": 0.01050907, + "auxiliary_loss_mlp": 0.01037092, + "balance_loss_clip": 1.01420379, + "balance_loss_mlp": 1.01465201, + "epoch": 0.9095445663610401, + "flos": 21943574686080.0, + "grad_norm": 2.087739206623208, + "language_loss": 0.63932955, + "learning_rate": 8.516449899618173e-08, + "loss": 0.66020954, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 15128, + "time_per_iteration": 2.3760385513305664 + }, + { + "auxiliary_loss_clip": 0.01049457, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.01537108, + "balance_loss_mlp": 1.01513934, + "epoch": 0.9096046896137081, + "flos": 19791807638400.0, + "grad_norm": 1.5280348802564403, + "language_loss": 0.77613556, + "learning_rate": 8.505209531291013e-08, + "loss": 0.79699957, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 15129, + "time_per_iteration": 2.4115517139434814 + }, + { + "auxiliary_loss_clip": 0.01051768, + "auxiliary_loss_mlp": 0.01034571, + "balance_loss_clip": 1.01159954, + "balance_loss_mlp": 1.01605344, + "epoch": 0.909664812866376, + "flos": 22636984665600.0, + "grad_norm": 2.150948679465896, + "language_loss": 0.84293622, + "learning_rate": 8.49397642446552e-08, + "loss": 0.86379963, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 15130, + "time_per_iteration": 2.3717751502990723 + }, + { + "auxiliary_loss_clip": 0.01053313, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.01262617, + "balance_loss_mlp": 1.01695919, + "epoch": 0.909724936119044, + "flos": 39850771933440.0, + "grad_norm": 2.026457158063785, + "language_loss": 0.76110959, + "learning_rate": 8.482750579567644e-08, + "loss": 0.78200281, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 15131, + "time_per_iteration": 2.521912097930908 + }, + { + "auxiliary_loss_clip": 0.01053523, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.01648033, + "balance_loss_mlp": 1.01758432, + "epoch": 0.9097850593717121, + "flos": 35070394222080.0, + "grad_norm": 1.7878945259328018, + "language_loss": 0.60277736, + "learning_rate": 8.471531997023085e-08, + "loss": 0.62370312, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.359375, + "step": 15132, + "time_per_iteration": 2.4953842163085938 + }, + { + "auxiliary_loss_clip": 0.01052599, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.01446319, + "balance_loss_mlp": 1.01680982, + "epoch": 0.90984518262438, + "flos": 23366355212160.0, + "grad_norm": 1.5128296312448373, + "language_loss": 0.83333117, + "learning_rate": 8.460320677257193e-08, + "loss": 0.8542245, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 15133, + "time_per_iteration": 5.278148412704468 + }, + { + "auxiliary_loss_clip": 0.01050863, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.00955129, + "balance_loss_mlp": 1.01497197, + "epoch": 0.909905305877048, + "flos": 27522171878400.0, + "grad_norm": 1.7008559574402897, + "language_loss": 0.74556816, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76640731, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 15134, + "time_per_iteration": 2.425355911254883 + }, + { + "auxiliary_loss_clip": 0.01054305, + "auxiliary_loss_mlp": 0.01044568, + "balance_loss_clip": 1.01911664, + "balance_loss_mlp": 1.01631021, + "epoch": 0.9099654291297159, + "flos": 24346856234880.0, + "grad_norm": 1.5474451047086473, + "language_loss": 0.73785543, + "learning_rate": 8.437919827761786e-08, + "loss": 0.75884414, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 15135, + "time_per_iteration": 2.392383575439453 + }, + { + "auxiliary_loss_clip": 0.01051413, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.01211262, + "balance_loss_mlp": 1.01662838, + "epoch": 0.9100255523823839, + "flos": 21214169228160.0, + "grad_norm": 5.851622807173836, + "language_loss": 0.70939839, + "learning_rate": 8.426730298881702e-08, + "loss": 0.73025405, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 15136, + "time_per_iteration": 2.3700170516967773 + }, + { + "auxiliary_loss_clip": 0.01007405, + "auxiliary_loss_mlp": 0.01003918, + "balance_loss_clip": 1.00165284, + "balance_loss_mlp": 1.00082278, + "epoch": 0.9100856756350518, + "flos": 46049773852800.0, + "grad_norm": 0.8130697080179206, + "language_loss": 0.59352648, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61363971, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.06542969, + "step": 15137, + "time_per_iteration": 2.838423013687134 + }, + { + "auxiliary_loss_clip": 0.01050953, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.01432252, + "balance_loss_mlp": 1.01651621, + "epoch": 0.9101457988877198, + "flos": 20228885349120.0, + "grad_norm": 1.638694349653523, + "language_loss": 0.83351803, + "learning_rate": 8.40437303497834e-08, + "loss": 0.8543905, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 15138, + "time_per_iteration": 2.3761496543884277 + }, + { + "auxiliary_loss_clip": 0.01050716, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.0117135, + "balance_loss_mlp": 1.01632202, + "epoch": 0.9102059221403878, + "flos": 26613941103360.0, + "grad_norm": 1.5759700373381627, + "language_loss": 0.81780219, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83864772, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 15139, + "time_per_iteration": 2.40073299407959 + }, + { + "auxiliary_loss_clip": 0.0105248, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.01505637, + "balance_loss_mlp": 1.01638365, + "epoch": 0.9102660453930558, + "flos": 21907474473600.0, + "grad_norm": 4.954914010426568, + "language_loss": 0.78147817, + "learning_rate": 8.382044832376167e-08, + "loss": 0.8023864, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 15140, + "time_per_iteration": 2.396038293838501 + }, + { + "auxiliary_loss_clip": 0.01050806, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.01588929, + "balance_loss_mlp": 1.01537251, + "epoch": 0.9103261686457237, + "flos": 36175115462400.0, + "grad_norm": 1.8951737025454354, + "language_loss": 0.67241007, + "learning_rate": 8.370891630121569e-08, + "loss": 0.69330424, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 15141, + "time_per_iteration": 2.541255235671997 + }, + { + "auxiliary_loss_clip": 0.01052637, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.01578653, + "balance_loss_mlp": 1.01646638, + "epoch": 0.9103862918983917, + "flos": 23877413827200.0, + "grad_norm": 2.3500419409852555, + "language_loss": 0.75877553, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77970809, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36132812, + "step": 15142, + "time_per_iteration": 2.3918938636779785 + }, + { + "auxiliary_loss_clip": 0.01049129, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.01792622, + "balance_loss_mlp": 1.0146445, + "epoch": 0.9104464151510596, + "flos": 14938636008960.0, + "grad_norm": 1.7404042846548633, + "language_loss": 0.66151887, + "learning_rate": 8.348607025820076e-08, + "loss": 0.6823985, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 15143, + "time_per_iteration": 2.3613274097442627 + }, + { + "auxiliary_loss_clip": 0.01053026, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.01567459, + "balance_loss_mlp": 1.01651812, + "epoch": 0.9105065384037276, + "flos": 33654421411200.0, + "grad_norm": 1.9509932650818327, + "language_loss": 0.62228274, + "learning_rate": 8.337475624618152e-08, + "loss": 0.64321738, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 15144, + "time_per_iteration": 2.5000205039978027 + }, + { + "auxiliary_loss_clip": 0.01048841, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.01049256, + "balance_loss_mlp": 1.0155139, + "epoch": 0.9105666616563957, + "flos": 24315538878720.0, + "grad_norm": 1.752132722670743, + "language_loss": 0.7174207, + "learning_rate": 8.326351491278382e-08, + "loss": 0.73823488, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33398438, + "step": 15145, + "time_per_iteration": 3.8284499645233154 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.01341128, + "balance_loss_mlp": 1.01502657, + "epoch": 0.9106267849090636, + "flos": 29970386213760.0, + "grad_norm": 1.6115306752377747, + "language_loss": 0.71770406, + "learning_rate": 8.315234626222545e-08, + "loss": 0.73854232, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33789062, + "step": 15146, + "time_per_iteration": 2.4413132667541504 + }, + { + "auxiliary_loss_clip": 0.01050374, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.01489401, + "balance_loss_mlp": 1.01531005, + "epoch": 0.9106869081617316, + "flos": 25336573856640.0, + "grad_norm": 1.8361916159179228, + "language_loss": 0.7374891, + "learning_rate": 8.304125029872233e-08, + "loss": 0.75836426, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 15147, + "time_per_iteration": 2.4238028526306152 + }, + { + "auxiliary_loss_clip": 0.01052386, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.00981784, + "balance_loss_mlp": 1.01617026, + "epoch": 0.9107470314143995, + "flos": 18186047343360.0, + "grad_norm": 1.8725833452706726, + "language_loss": 0.81288815, + "learning_rate": 8.293022702648711e-08, + "loss": 0.83374798, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 15148, + "time_per_iteration": 2.3576431274414062 + }, + { + "auxiliary_loss_clip": 0.01052166, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.01684046, + "balance_loss_mlp": 1.01583433, + "epoch": 0.9108071546670675, + "flos": 23549684094720.0, + "grad_norm": 1.9110768849653725, + "language_loss": 0.69078678, + "learning_rate": 8.281927644972996e-08, + "loss": 0.71170169, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36328125, + "step": 15149, + "time_per_iteration": 2.5805552005767822 + }, + { + "auxiliary_loss_clip": 0.01050692, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.01444602, + "balance_loss_mlp": 1.01615763, + "epoch": 0.9108672779197354, + "flos": 25629111072000.0, + "grad_norm": 1.9590518326713922, + "language_loss": 0.64335132, + "learning_rate": 8.270839857265776e-08, + "loss": 0.66422546, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 15150, + "time_per_iteration": 2.411139965057373 + }, + { + "auxiliary_loss_clip": 0.01051167, + "auxiliary_loss_mlp": 0.01036537, + "balance_loss_clip": 1.01463878, + "balance_loss_mlp": 1.0161283, + "epoch": 0.9109274011724035, + "flos": 22338198316800.0, + "grad_norm": 1.8368262737891778, + "language_loss": 0.73909366, + "learning_rate": 8.259759339947514e-08, + "loss": 0.75997072, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 15151, + "time_per_iteration": 2.3840997219085693 + }, + { + "auxiliary_loss_clip": 0.01050637, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.01196659, + "balance_loss_mlp": 1.01558912, + "epoch": 0.9109875244250714, + "flos": 26686979400960.0, + "grad_norm": 1.6122835233176094, + "language_loss": 0.65755415, + "learning_rate": 8.248686093438429e-08, + "loss": 0.67840046, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 15152, + "time_per_iteration": 2.4152402877807617 + }, + { + "auxiliary_loss_clip": 0.01051372, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.01050889, + "balance_loss_mlp": 1.01612914, + "epoch": 0.9110476476777394, + "flos": 22928998210560.0, + "grad_norm": 1.685315895501896, + "language_loss": 0.7419796, + "learning_rate": 8.23762011815834e-08, + "loss": 0.76282203, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 15153, + "time_per_iteration": 2.393449544906616 + }, + { + "auxiliary_loss_clip": 0.01053226, + "auxiliary_loss_mlp": 0.01039402, + "balance_loss_clip": 1.01589358, + "balance_loss_mlp": 1.01697063, + "epoch": 0.9111077709304073, + "flos": 13472214416640.0, + "grad_norm": 1.915172448914857, + "language_loss": 0.73182607, + "learning_rate": 8.226561414526956e-08, + "loss": 0.75275242, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 15154, + "time_per_iteration": 2.372143507003784 + }, + { + "auxiliary_loss_clip": 0.01051764, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.0199194, + "balance_loss_mlp": 1.01671505, + "epoch": 0.9111678941830753, + "flos": 20849501410560.0, + "grad_norm": 1.7159807784218863, + "language_loss": 0.8331542, + "learning_rate": 8.215509982963564e-08, + "loss": 0.85408759, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 15155, + "time_per_iteration": 2.428544759750366 + }, + { + "auxiliary_loss_clip": 0.01050805, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.0108031, + "balance_loss_mlp": 1.01610923, + "epoch": 0.9112280174357432, + "flos": 19681237762560.0, + "grad_norm": 1.4983943006546294, + "language_loss": 0.60541642, + "learning_rate": 8.204465823887252e-08, + "loss": 0.62625813, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 15156, + "time_per_iteration": 2.380964517593384 + }, + { + "auxiliary_loss_clip": 0.0105298, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.01204205, + "balance_loss_mlp": 1.01558805, + "epoch": 0.9112881406884112, + "flos": 25445991657600.0, + "grad_norm": 2.890945081823685, + "language_loss": 0.75291383, + "learning_rate": 8.193428937716796e-08, + "loss": 0.77380884, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.375, + "step": 15157, + "time_per_iteration": 2.438042640686035 + }, + { + "auxiliary_loss_clip": 0.01053039, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.01300812, + "balance_loss_mlp": 1.01667082, + "epoch": 0.9113482639410793, + "flos": 33065751110400.0, + "grad_norm": 2.172238793268555, + "language_loss": 0.60491359, + "learning_rate": 8.182399324870747e-08, + "loss": 0.62578321, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.36328125, + "step": 15158, + "time_per_iteration": 2.483268976211548 + }, + { + "auxiliary_loss_clip": 0.01050692, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.01723194, + "balance_loss_mlp": 1.01582694, + "epoch": 0.9114083871937472, + "flos": 21834505998720.0, + "grad_norm": 1.5856096911597755, + "language_loss": 0.68724072, + "learning_rate": 8.171376985767375e-08, + "loss": 0.70814991, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34960938, + "step": 15159, + "time_per_iteration": 2.391744375228882 + }, + { + "auxiliary_loss_clip": 0.01050391, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.01429021, + "balance_loss_mlp": 1.01529956, + "epoch": 0.9114685104464152, + "flos": 27087782342400.0, + "grad_norm": 1.9932664159770659, + "language_loss": 0.79159427, + "learning_rate": 8.160361920824588e-08, + "loss": 0.81245637, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34960938, + "step": 15160, + "time_per_iteration": 2.3996379375457764 + }, + { + "auxiliary_loss_clip": 0.01052352, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.01227283, + "balance_loss_mlp": 1.01647377, + "epoch": 0.9115286336990831, + "flos": 17966094577920.0, + "grad_norm": 1.7115918854809238, + "language_loss": 0.70172203, + "learning_rate": 8.149354130460073e-08, + "loss": 0.72263247, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.359375, + "step": 15161, + "time_per_iteration": 2.3599627017974854 + }, + { + "auxiliary_loss_clip": 0.01052325, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.01384413, + "balance_loss_mlp": 1.0169313, + "epoch": 0.9115887569517511, + "flos": 22928753831040.0, + "grad_norm": 1.8999933672753313, + "language_loss": 0.77870959, + "learning_rate": 8.138353615091321e-08, + "loss": 0.79962105, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35546875, + "step": 15162, + "time_per_iteration": 3.670523166656494 + }, + { + "auxiliary_loss_clip": 0.01052142, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.01525283, + "balance_loss_mlp": 1.01655364, + "epoch": 0.911648880204419, + "flos": 23987285475840.0, + "grad_norm": 1.9189470796069488, + "language_loss": 0.67682475, + "learning_rate": 8.127360375135395e-08, + "loss": 0.69772828, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 15163, + "time_per_iteration": 2.4003565311431885 + }, + { + "auxiliary_loss_clip": 0.01054711, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.01404929, + "balance_loss_mlp": 1.01678014, + "epoch": 0.911709003457087, + "flos": 17054372666880.0, + "grad_norm": 3.454002876937307, + "language_loss": 0.72563648, + "learning_rate": 8.116374411009186e-08, + "loss": 0.74657255, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37890625, + "step": 15164, + "time_per_iteration": 2.3680005073547363 + }, + { + "auxiliary_loss_clip": 0.01050407, + "auxiliary_loss_mlp": 0.01036873, + "balance_loss_clip": 1.01501, + "balance_loss_mlp": 1.01659155, + "epoch": 0.911769126709755, + "flos": 21652259368320.0, + "grad_norm": 1.5449890068754966, + "language_loss": 0.76891905, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78979182, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33789062, + "step": 15165, + "time_per_iteration": 2.4615161418914795 + }, + { + "auxiliary_loss_clip": 0.01051632, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.01464963, + "balance_loss_mlp": 1.01600647, + "epoch": 0.911829249962423, + "flos": 24789170649600.0, + "grad_norm": 2.3311829899030463, + "language_loss": 0.74049878, + "learning_rate": 8.094424311912074e-08, + "loss": 0.76138687, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 15166, + "time_per_iteration": 2.4209272861480713 + }, + { + "auxiliary_loss_clip": 0.01052728, + "auxiliary_loss_mlp": 0.01041276, + "balance_loss_clip": 1.01612246, + "balance_loss_mlp": 1.01642561, + "epoch": 0.9118893732150909, + "flos": 20958360629760.0, + "grad_norm": 1.7403104786953025, + "language_loss": 0.73935509, + "learning_rate": 8.083460177773482e-08, + "loss": 0.76029515, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 15167, + "time_per_iteration": 2.36311411857605 + }, + { + "auxiliary_loss_clip": 0.01007478, + "auxiliary_loss_mlp": 0.0100183, + "balance_loss_clip": 0.99963641, + "balance_loss_mlp": 1.00093222, + "epoch": 0.9119494964677589, + "flos": 67913991285120.0, + "grad_norm": 0.770600437944389, + "language_loss": 0.65663201, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67672509, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06542969, + "step": 15168, + "time_per_iteration": 3.013223171234131 + }, + { + "auxiliary_loss_clip": 0.01052148, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.0130831, + "balance_loss_mlp": 1.01645923, + "epoch": 0.9120096197204268, + "flos": 18550540604160.0, + "grad_norm": 2.15358930308531, + "language_loss": 0.78987038, + "learning_rate": 8.061553742395033e-08, + "loss": 0.81074476, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 15169, + "time_per_iteration": 2.38236403465271 + }, + { + "auxiliary_loss_clip": 0.01050956, + "auxiliary_loss_mlp": 0.0104124, + "balance_loss_clip": 1.01775599, + "balance_loss_mlp": 1.01615334, + "epoch": 0.9120697429730948, + "flos": 19024730956800.0, + "grad_norm": 2.0548979037775634, + "language_loss": 0.83201832, + "learning_rate": 8.05061144198591e-08, + "loss": 0.85294026, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 15170, + "time_per_iteration": 2.3677990436553955 + }, + { + "auxiliary_loss_clip": 0.01052483, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_clip": 1.01989865, + "balance_loss_mlp": 1.01663983, + "epoch": 0.9121298662257629, + "flos": 17162778038400.0, + "grad_norm": 2.268632909555113, + "language_loss": 0.7827363, + "learning_rate": 8.039676420316799e-08, + "loss": 0.80369455, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 15171, + "time_per_iteration": 2.346559762954712 + }, + { + "auxiliary_loss_clip": 0.01050302, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.01274228, + "balance_loss_mlp": 1.0153954, + "epoch": 0.9121899894784308, + "flos": 19681691610240.0, + "grad_norm": 1.3129667978107689, + "language_loss": 0.67763567, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69848788, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 15172, + "time_per_iteration": 3.779686450958252 + }, + { + "auxiliary_loss_clip": 0.01053092, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.01166534, + "balance_loss_mlp": 1.016837, + "epoch": 0.9122501127310988, + "flos": 22234680535680.0, + "grad_norm": 1.6872854046019412, + "language_loss": 0.76255083, + "learning_rate": 8.017828214857103e-08, + "loss": 0.78343993, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 15173, + "time_per_iteration": 3.7590222358703613 + }, + { + "auxiliary_loss_clip": 0.01055235, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.01065612, + "balance_loss_mlp": 1.01679862, + "epoch": 0.9123102359837667, + "flos": 15956319496320.0, + "grad_norm": 5.680307255330199, + "language_loss": 0.6747005, + "learning_rate": 8.00691503189499e-08, + "loss": 0.69562447, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 15174, + "time_per_iteration": 2.33768630027771 + }, + { + "auxiliary_loss_clip": 0.01053821, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.01413846, + "balance_loss_mlp": 1.01693344, + "epoch": 0.9123703592364347, + "flos": 25154606517120.0, + "grad_norm": 2.120826259244081, + "language_loss": 0.75703007, + "learning_rate": 7.996009129329894e-08, + "loss": 0.77794623, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 15175, + "time_per_iteration": 2.4098305702209473 + }, + { + "auxiliary_loss_clip": 0.01007743, + "auxiliary_loss_mlp": 0.01002533, + "balance_loss_clip": 1.00036335, + "balance_loss_mlp": 1.00110388, + "epoch": 0.9124304824891026, + "flos": 60798482732160.0, + "grad_norm": 0.9714962190808286, + "language_loss": 0.58595693, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60605967, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.06640625, + "step": 15176, + "time_per_iteration": 3.0823464393615723 + }, + { + "auxiliary_loss_clip": 0.01052868, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.01515472, + "balance_loss_mlp": 1.01671171, + "epoch": 0.9124906057417707, + "flos": 18149947130880.0, + "grad_norm": 1.9024045963577905, + "language_loss": 0.6660167, + "learning_rate": 7.97421916704475e-08, + "loss": 0.68692982, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 15177, + "time_per_iteration": 2.3582041263580322 + }, + { + "auxiliary_loss_clip": 0.01051218, + "auxiliary_loss_mlp": 0.01036298, + "balance_loss_clip": 1.01567459, + "balance_loss_mlp": 1.0160085, + "epoch": 0.9125507289944386, + "flos": 11686127616000.0, + "grad_norm": 2.072631003055997, + "language_loss": 0.82370782, + "learning_rate": 7.963335108150926e-08, + "loss": 0.84458297, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.3515625, + "step": 15178, + "time_per_iteration": 2.335771083831787 + }, + { + "auxiliary_loss_clip": 0.01051896, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.01464248, + "balance_loss_mlp": 1.01661062, + "epoch": 0.9126108522471066, + "flos": 17747852469120.0, + "grad_norm": 2.437532408813133, + "language_loss": 0.80170488, + "learning_rate": 7.952458331306711e-08, + "loss": 0.82259154, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 15179, + "time_per_iteration": 2.360410213470459 + }, + { + "auxiliary_loss_clip": 0.01049871, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.01557231, + "balance_loss_mlp": 1.01540947, + "epoch": 0.9126709754997745, + "flos": 27634522233600.0, + "grad_norm": 1.6111438795304511, + "language_loss": 0.68785906, + "learning_rate": 7.941588836924507e-08, + "loss": 0.7087397, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34570312, + "step": 15180, + "time_per_iteration": 2.4125351905822754 + }, + { + "auxiliary_loss_clip": 0.01049377, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.00866485, + "balance_loss_mlp": 1.01473534, + "epoch": 0.9127310987524425, + "flos": 15924059533440.0, + "grad_norm": 1.6327314023138948, + "language_loss": 0.76227164, + "learning_rate": 7.930726625416495e-08, + "loss": 0.78306663, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 15181, + "time_per_iteration": 2.371516466140747 + }, + { + "auxiliary_loss_clip": 0.01053823, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.01231337, + "balance_loss_mlp": 1.01684618, + "epoch": 0.9127912220051104, + "flos": 21535998940800.0, + "grad_norm": 2.0867148906733304, + "language_loss": 0.75588834, + "learning_rate": 7.919871697194614e-08, + "loss": 0.77678168, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 15182, + "time_per_iteration": 2.3774423599243164 + }, + { + "auxiliary_loss_clip": 0.01052619, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.01722169, + "balance_loss_mlp": 1.01571989, + "epoch": 0.9128513452577784, + "flos": 24062348632320.0, + "grad_norm": 1.585485205354692, + "language_loss": 0.76992965, + "learning_rate": 7.909024052670421e-08, + "loss": 0.79087448, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 15183, + "time_per_iteration": 2.4083359241485596 + }, + { + "auxiliary_loss_clip": 0.01052591, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.0150075, + "balance_loss_mlp": 1.01598811, + "epoch": 0.9129114685104465, + "flos": 16215549408000.0, + "grad_norm": 2.4961495135799683, + "language_loss": 0.77035403, + "learning_rate": 7.898183692255256e-08, + "loss": 0.7912637, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 15184, + "time_per_iteration": 3.7914140224456787 + }, + { + "auxiliary_loss_clip": 0.01051189, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.01348376, + "balance_loss_mlp": 1.01563799, + "epoch": 0.9129715917631144, + "flos": 19383533665920.0, + "grad_norm": 1.6801316906135204, + "language_loss": 0.75811076, + "learning_rate": 7.887350616360233e-08, + "loss": 0.7789858, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 15185, + "time_per_iteration": 2.3593344688415527 + }, + { + "auxiliary_loss_clip": 0.01050945, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.01506889, + "balance_loss_mlp": 1.01619816, + "epoch": 0.9130317150157824, + "flos": 20589538360320.0, + "grad_norm": 1.9150249217978865, + "language_loss": 0.70266652, + "learning_rate": 7.876524825396158e-08, + "loss": 0.72355008, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 15186, + "time_per_iteration": 2.378145694732666 + }, + { + "auxiliary_loss_clip": 0.01054528, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.01167214, + "balance_loss_mlp": 1.01639903, + "epoch": 0.9130918382684503, + "flos": 20188316482560.0, + "grad_norm": 1.884954953161776, + "language_loss": 0.78709406, + "learning_rate": 7.865706319773502e-08, + "loss": 0.80800873, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 15187, + "time_per_iteration": 2.369201898574829 + }, + { + "auxiliary_loss_clip": 0.01052188, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.01523638, + "balance_loss_mlp": 1.01607633, + "epoch": 0.9131519615211183, + "flos": 25555688749440.0, + "grad_norm": 2.149920333985983, + "language_loss": 0.67276782, + "learning_rate": 7.854895099902515e-08, + "loss": 0.69366121, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36132812, + "step": 15188, + "time_per_iteration": 2.4424679279327393 + }, + { + "auxiliary_loss_clip": 0.01049902, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.01292562, + "balance_loss_mlp": 1.01496696, + "epoch": 0.9132120847737862, + "flos": 17930587858560.0, + "grad_norm": 2.4533419266181644, + "language_loss": 0.77551329, + "learning_rate": 7.844091166193157e-08, + "loss": 0.79636562, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 15189, + "time_per_iteration": 2.3504080772399902 + }, + { + "auxiliary_loss_clip": 0.01049254, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.01546693, + "balance_loss_mlp": 1.01560271, + "epoch": 0.9132722080264543, + "flos": 20046603807360.0, + "grad_norm": 1.8411083033954903, + "language_loss": 0.76709616, + "learning_rate": 7.8332945190551e-08, + "loss": 0.78795713, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.3359375, + "step": 15190, + "time_per_iteration": 2.368091583251953 + }, + { + "auxiliary_loss_clip": 0.01007135, + "auxiliary_loss_mlp": 0.01002809, + "balance_loss_clip": 1.00072265, + "balance_loss_mlp": 1.00053132, + "epoch": 0.9133323312791222, + "flos": 70436361081600.0, + "grad_norm": 0.7198672397282799, + "language_loss": 0.57463574, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59473515, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06640625, + "step": 15191, + "time_per_iteration": 3.077641487121582 + }, + { + "auxiliary_loss_clip": 0.01052443, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.01366425, + "balance_loss_mlp": 1.0161252, + "epoch": 0.9133924545317902, + "flos": 25482615540480.0, + "grad_norm": 2.029064650429031, + "language_loss": 0.75264323, + "learning_rate": 7.81172308613034e-08, + "loss": 0.77353913, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 15192, + "time_per_iteration": 2.4041147232055664 + }, + { + "auxiliary_loss_clip": 0.0105045, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.01177287, + "balance_loss_mlp": 1.01568532, + "epoch": 0.9134525777844581, + "flos": 39929151669120.0, + "grad_norm": 2.020019268129828, + "language_loss": 0.70341885, + "learning_rate": 7.800948301161647e-08, + "loss": 0.72426426, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 15193, + "time_per_iteration": 2.52298903465271 + }, + { + "auxiliary_loss_clip": 0.01050117, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.01270962, + "balance_loss_mlp": 1.01600301, + "epoch": 0.9135127010371261, + "flos": 20885671445760.0, + "grad_norm": 1.564126717186315, + "language_loss": 0.74198246, + "learning_rate": 7.790180804400215e-08, + "loss": 0.76282299, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33984375, + "step": 15194, + "time_per_iteration": 2.3826138973236084 + }, + { + "auxiliary_loss_clip": 0.01051937, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.01207721, + "balance_loss_mlp": 1.01544845, + "epoch": 0.913572824289794, + "flos": 20812214211840.0, + "grad_norm": 2.924128941778097, + "language_loss": 0.63117397, + "learning_rate": 7.779420596254383e-08, + "loss": 0.65205312, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 15195, + "time_per_iteration": 2.375117063522339 + }, + { + "auxiliary_loss_clip": 0.01053345, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.01517391, + "balance_loss_mlp": 1.01732695, + "epoch": 0.913632947542462, + "flos": 25702079546880.0, + "grad_norm": 1.9893008058260677, + "language_loss": 0.72085899, + "learning_rate": 7.768667677132201e-08, + "loss": 0.74177444, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 15196, + "time_per_iteration": 2.4274652004241943 + }, + { + "auxiliary_loss_clip": 0.01051555, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01284027, + "balance_loss_mlp": 1.01653194, + "epoch": 0.9136930707951301, + "flos": 26285024384640.0, + "grad_norm": 1.7181158208166285, + "language_loss": 0.72008133, + "learning_rate": 7.757922047441411e-08, + "loss": 0.74093306, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34960938, + "step": 15197, + "time_per_iteration": 2.4101450443267822 + }, + { + "auxiliary_loss_clip": 0.01052076, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.01174212, + "balance_loss_mlp": 1.01566434, + "epoch": 0.913753194047798, + "flos": 22090768444800.0, + "grad_norm": 1.7938382298687943, + "language_loss": 0.79273593, + "learning_rate": 7.747183707589489e-08, + "loss": 0.81360888, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 15198, + "time_per_iteration": 2.3840551376342773 + }, + { + "auxiliary_loss_clip": 0.0105007, + "auxiliary_loss_mlp": 0.0104055, + "balance_loss_clip": 1.0183053, + "balance_loss_mlp": 1.01494634, + "epoch": 0.913813317300466, + "flos": 23586063598080.0, + "grad_norm": 1.365875570503282, + "language_loss": 0.68353367, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70443988, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 15199, + "time_per_iteration": 2.4282310009002686 + }, + { + "auxiliary_loss_clip": 0.01050759, + "auxiliary_loss_mlp": 0.01043257, + "balance_loss_clip": 1.02054739, + "balance_loss_mlp": 1.01576662, + "epoch": 0.9138734405531339, + "flos": 28875195774720.0, + "grad_norm": 1.6045577719599164, + "language_loss": 0.68226153, + "learning_rate": 7.725728899030714e-08, + "loss": 0.70320165, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 15200, + "time_per_iteration": 2.4239394664764404 + }, + { + "auxiliary_loss_clip": 0.01050624, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.01489472, + "balance_loss_mlp": 1.01640856, + "epoch": 0.9139335638058019, + "flos": 22819964434560.0, + "grad_norm": 1.5719883871579239, + "language_loss": 0.72594863, + "learning_rate": 7.715012431137435e-08, + "loss": 0.74682415, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34179688, + "step": 15201, + "time_per_iteration": 3.627408742904663 + }, + { + "auxiliary_loss_clip": 0.01049404, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.0098412, + "balance_loss_mlp": 1.01489699, + "epoch": 0.9139936870584698, + "flos": 18003207219840.0, + "grad_norm": 1.8958109549469233, + "language_loss": 0.71828485, + "learning_rate": 7.704303254710165e-08, + "loss": 0.73908365, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.34570312, + "step": 15202, + "time_per_iteration": 2.42482852935791 + }, + { + "auxiliary_loss_clip": 0.01050204, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.01159358, + "balance_loss_mlp": 1.0150454, + "epoch": 0.9140538103111379, + "flos": 15812896164480.0, + "grad_norm": 2.131529127815075, + "language_loss": 0.6789434, + "learning_rate": 7.693601370155001e-08, + "loss": 0.69978786, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15203, + "time_per_iteration": 2.3323166370391846 + }, + { + "auxiliary_loss_clip": 0.01051064, + "auxiliary_loss_mlp": 0.01036708, + "balance_loss_clip": 1.01229429, + "balance_loss_mlp": 1.01593053, + "epoch": 0.9141139335638058, + "flos": 23985923932800.0, + "grad_norm": 1.5934040871832942, + "language_loss": 0.69665039, + "learning_rate": 7.682906777877751e-08, + "loss": 0.7175281, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3515625, + "step": 15204, + "time_per_iteration": 2.416658878326416 + }, + { + "auxiliary_loss_clip": 0.01050976, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.01333535, + "balance_loss_mlp": 1.01486599, + "epoch": 0.9141740568164738, + "flos": 24023280954240.0, + "grad_norm": 2.9163116236593307, + "language_loss": 0.60821784, + "learning_rate": 7.672219478283915e-08, + "loss": 0.62910008, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 15205, + "time_per_iteration": 2.3857386112213135 + }, + { + "auxiliary_loss_clip": 0.01049477, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.0114764, + "balance_loss_mlp": 1.01602638, + "epoch": 0.9142341800691417, + "flos": 27017013283200.0, + "grad_norm": 1.6268422028815976, + "language_loss": 0.82466835, + "learning_rate": 7.661539471778811e-08, + "loss": 0.84550595, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.33398438, + "step": 15206, + "time_per_iteration": 2.428603410720825 + }, + { + "auxiliary_loss_clip": 0.01051204, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.01071072, + "balance_loss_mlp": 1.01506901, + "epoch": 0.9142943033218097, + "flos": 20411446181760.0, + "grad_norm": 2.1544804636293877, + "language_loss": 0.74901009, + "learning_rate": 7.650866758767382e-08, + "loss": 0.769858, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 15207, + "time_per_iteration": 2.377981185913086 + }, + { + "auxiliary_loss_clip": 0.01050884, + "auxiliary_loss_mlp": 0.01040649, + "balance_loss_clip": 1.01591325, + "balance_loss_mlp": 1.0152483, + "epoch": 0.9143544265744776, + "flos": 19754310971520.0, + "grad_norm": 2.804695002610808, + "language_loss": 0.74172431, + "learning_rate": 7.640201339654373e-08, + "loss": 0.76263964, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.35546875, + "step": 15208, + "time_per_iteration": 2.5837960243225098 + }, + { + "auxiliary_loss_clip": 0.01049276, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.01138341, + "balance_loss_mlp": 1.01498711, + "epoch": 0.9144145498271457, + "flos": 17164488695040.0, + "grad_norm": 2.2342058106230245, + "language_loss": 0.8734349, + "learning_rate": 7.629543214844237e-08, + "loss": 0.89424682, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 15209, + "time_per_iteration": 2.397447109222412 + }, + { + "auxiliary_loss_clip": 0.01050908, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.01589072, + "balance_loss_mlp": 1.01581681, + "epoch": 0.9144746730798137, + "flos": 23725123009920.0, + "grad_norm": 1.6114912720708439, + "language_loss": 0.76500767, + "learning_rate": 7.618892384741093e-08, + "loss": 0.78588992, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.3515625, + "step": 15210, + "time_per_iteration": 2.4308536052703857 + }, + { + "auxiliary_loss_clip": 0.01050506, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.01306462, + "balance_loss_mlp": 1.01527405, + "epoch": 0.9145347963324816, + "flos": 25846689864960.0, + "grad_norm": 1.9211612723485298, + "language_loss": 0.78979707, + "learning_rate": 7.6082488497488e-08, + "loss": 0.81065953, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15211, + "time_per_iteration": 2.386380434036255 + }, + { + "auxiliary_loss_clip": 0.01053148, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.01646519, + "balance_loss_mlp": 1.01746535, + "epoch": 0.9145949195851496, + "flos": 19241820990720.0, + "grad_norm": 1.668432992085951, + "language_loss": 0.84114712, + "learning_rate": 7.597612610270986e-08, + "loss": 0.8620708, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35742188, + "step": 15212, + "time_per_iteration": 5.1538825035095215 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01174104, + "balance_loss_mlp": 1.01577914, + "epoch": 0.9146550428378175, + "flos": 18295395321600.0, + "grad_norm": 1.9530934591538154, + "language_loss": 0.84795952, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86879802, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 15213, + "time_per_iteration": 2.386683225631714 + }, + { + "auxiliary_loss_clip": 0.01051177, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.01428366, + "balance_loss_mlp": 1.0159049, + "epoch": 0.9147151660904855, + "flos": 20083227690240.0, + "grad_norm": 1.832691632277064, + "language_loss": 0.72772276, + "learning_rate": 7.576362019471894e-08, + "loss": 0.74860203, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35351562, + "step": 15214, + "time_per_iteration": 2.3498005867004395 + }, + { + "auxiliary_loss_clip": 0.01054464, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.01593983, + "balance_loss_mlp": 1.01735473, + "epoch": 0.9147752893431534, + "flos": 24387983683200.0, + "grad_norm": 1.8572626958974865, + "language_loss": 0.64050257, + "learning_rate": 7.565747668956413e-08, + "loss": 0.66144848, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37109375, + "step": 15215, + "time_per_iteration": 2.390673875808716 + }, + { + "auxiliary_loss_clip": 0.01056333, + "auxiliary_loss_mlp": 0.01041242, + "balance_loss_clip": 1.01688719, + "balance_loss_mlp": 1.01797485, + "epoch": 0.9148354125958215, + "flos": 18149423460480.0, + "grad_norm": 2.1065076185557503, + "language_loss": 0.7841273, + "learning_rate": 7.555140615567058e-08, + "loss": 0.80510306, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3828125, + "step": 15216, + "time_per_iteration": 2.35434627532959 + }, + { + "auxiliary_loss_clip": 0.01051497, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.01798153, + "balance_loss_mlp": 1.01592231, + "epoch": 0.9148955358484894, + "flos": 23366425034880.0, + "grad_norm": 2.2740874913123807, + "language_loss": 0.69148338, + "learning_rate": 7.544540859706062e-08, + "loss": 0.71244276, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.35546875, + "step": 15217, + "time_per_iteration": 2.3872904777526855 + }, + { + "auxiliary_loss_clip": 0.01050788, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.01315355, + "balance_loss_mlp": 1.016114, + "epoch": 0.9149556591011574, + "flos": 18075547290240.0, + "grad_norm": 1.8591707201129317, + "language_loss": 0.81166273, + "learning_rate": 7.533948401775347e-08, + "loss": 0.8325125, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34765625, + "step": 15218, + "time_per_iteration": 2.357994318008423 + }, + { + "auxiliary_loss_clip": 0.01006657, + "auxiliary_loss_mlp": 0.01002761, + "balance_loss_clip": 1.00063956, + "balance_loss_mlp": 1.00031471, + "epoch": 0.9150157823538253, + "flos": 54583733923200.0, + "grad_norm": 0.8456353491287902, + "language_loss": 0.5937959, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61389005, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.06347656, + "step": 15219, + "time_per_iteration": 2.9859468936920166 + }, + { + "auxiliary_loss_clip": 0.01049527, + "auxiliary_loss_mlp": 0.01037671, + "balance_loss_clip": 1.01516461, + "balance_loss_mlp": 1.01515436, + "epoch": 0.9150759056064933, + "flos": 17892183496320.0, + "grad_norm": 1.6954069829374863, + "language_loss": 0.79407543, + "learning_rate": 7.512785381311216e-08, + "loss": 0.81494743, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 15220, + "time_per_iteration": 2.3560376167297363 + }, + { + "auxiliary_loss_clip": 0.01052588, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.01142287, + "balance_loss_mlp": 1.01484036, + "epoch": 0.9151360288591612, + "flos": 18072649647360.0, + "grad_norm": 1.8189813266972759, + "language_loss": 0.66901416, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68989933, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37890625, + "step": 15221, + "time_per_iteration": 2.337916612625122 + }, + { + "auxiliary_loss_clip": 0.01051563, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.01229, + "balance_loss_mlp": 1.01568484, + "epoch": 0.9151961521118293, + "flos": 19353508030080.0, + "grad_norm": 1.7991319986858936, + "language_loss": 0.86449575, + "learning_rate": 7.491651557384692e-08, + "loss": 0.88535929, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 15222, + "time_per_iteration": 2.3781981468200684 + }, + { + "auxiliary_loss_clip": 0.01007402, + "auxiliary_loss_mlp": 0.0100438, + "balance_loss_clip": 1.00242484, + "balance_loss_mlp": 1.00101304, + "epoch": 0.9152562753644973, + "flos": 72143195362560.0, + "grad_norm": 0.7258642535300226, + "language_loss": 0.49734455, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51746237, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.06347656, + "step": 15223, + "time_per_iteration": 3.0696029663085938 + }, + { + "auxiliary_loss_clip": 0.01054618, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.01646042, + "balance_loss_mlp": 1.01729918, + "epoch": 0.9153163986171652, + "flos": 20775974353920.0, + "grad_norm": 1.826383762200019, + "language_loss": 0.73337221, + "learning_rate": 7.470546933201349e-08, + "loss": 0.75431681, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37304688, + "step": 15224, + "time_per_iteration": 3.815645456314087 + }, + { + "auxiliary_loss_clip": 0.01052067, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.01234317, + "balance_loss_mlp": 1.01651692, + "epoch": 0.9153765218698332, + "flos": 23038974593280.0, + "grad_norm": 3.3809896540328848, + "language_loss": 0.83071446, + "learning_rate": 7.460005572013895e-08, + "loss": 0.85158807, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 15225, + "time_per_iteration": 2.4253525733947754 + }, + { + "auxiliary_loss_clip": 0.01050531, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.01159596, + "balance_loss_mlp": 1.0151, + "epoch": 0.9154366451225011, + "flos": 28989501166080.0, + "grad_norm": 1.3491568782583614, + "language_loss": 0.7190702, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73991787, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 15226, + "time_per_iteration": 2.4518628120422363 + }, + { + "auxiliary_loss_clip": 0.01052552, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.01254487, + "balance_loss_mlp": 1.01615882, + "epoch": 0.9154967683751691, + "flos": 22308417060480.0, + "grad_norm": 1.8657266410321558, + "language_loss": 0.76493901, + "learning_rate": 7.43894475344613e-08, + "loss": 0.78582895, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 15227, + "time_per_iteration": 2.4098002910614014 + }, + { + "auxiliary_loss_clip": 0.01050591, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.01100397, + "balance_loss_mlp": 1.01583457, + "epoch": 0.915556891627837, + "flos": 24570335047680.0, + "grad_norm": 1.421226217442011, + "language_loss": 0.75421226, + "learning_rate": 7.428425296864404e-08, + "loss": 0.77503705, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 15228, + "time_per_iteration": 2.426741123199463 + }, + { + "auxiliary_loss_clip": 0.01049237, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.01894891, + "balance_loss_mlp": 1.01476216, + "epoch": 0.9156170148805051, + "flos": 22163562362880.0, + "grad_norm": 1.5871193047487895, + "language_loss": 0.72332346, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74422204, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 15229, + "time_per_iteration": 2.380918502807617 + }, + { + "auxiliary_loss_clip": 0.01052488, + "auxiliary_loss_mlp": 0.01045351, + "balance_loss_clip": 1.02143717, + "balance_loss_mlp": 1.01664889, + "epoch": 0.915677138133173, + "flos": 20919676976640.0, + "grad_norm": 1.5957277977597093, + "language_loss": 0.83790684, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85888523, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 15230, + "time_per_iteration": 2.380831480026245 + }, + { + "auxiliary_loss_clip": 0.01049861, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.01603651, + "balance_loss_mlp": 1.01547742, + "epoch": 0.915737261385841, + "flos": 24344202971520.0, + "grad_norm": 1.9596635675126788, + "language_loss": 0.84972274, + "learning_rate": 7.396910742713957e-08, + "loss": 0.87059379, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 15231, + "time_per_iteration": 2.421139717102051 + }, + { + "auxiliary_loss_clip": 0.01049364, + "auxiliary_loss_mlp": 0.01031898, + "balance_loss_clip": 1.01040506, + "balance_loss_mlp": 1.0153749, + "epoch": 0.9157973846385089, + "flos": 26760157344000.0, + "grad_norm": 1.6235288418079081, + "language_loss": 0.73563945, + "learning_rate": 7.386420497856516e-08, + "loss": 0.75645214, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 15232, + "time_per_iteration": 2.420163631439209 + }, + { + "auxiliary_loss_clip": 0.01051726, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.01327884, + "balance_loss_mlp": 1.01540422, + "epoch": 0.9158575078911769, + "flos": 18477746686080.0, + "grad_norm": 2.34912453097201, + "language_loss": 0.686607, + "learning_rate": 7.375937556925338e-08, + "loss": 0.70749319, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 15233, + "time_per_iteration": 2.3632662296295166 + }, + { + "auxiliary_loss_clip": 0.01051381, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.01736748, + "balance_loss_mlp": 1.01618958, + "epoch": 0.9159176311438448, + "flos": 21797847204480.0, + "grad_norm": 9.56671876820394, + "language_loss": 0.70367324, + "learning_rate": 7.365461920317861e-08, + "loss": 0.72458494, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 15234, + "time_per_iteration": 2.3705618381500244 + }, + { + "auxiliary_loss_clip": 0.01053195, + "auxiliary_loss_mlp": 0.01040999, + "balance_loss_clip": 1.01662087, + "balance_loss_mlp": 1.01671362, + "epoch": 0.9159777543965129, + "flos": 24782781870720.0, + "grad_norm": 1.6492032911816235, + "language_loss": 0.88812053, + "learning_rate": 7.354993588431391e-08, + "loss": 0.9090625, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 15235, + "time_per_iteration": 2.4041740894317627 + }, + { + "auxiliary_loss_clip": 0.01051611, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.01433778, + "balance_loss_mlp": 1.01566744, + "epoch": 0.9160378776491809, + "flos": 26867585197440.0, + "grad_norm": 1.8099074499446117, + "language_loss": 0.78750122, + "learning_rate": 7.344532561662853e-08, + "loss": 0.80839479, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 15236, + "time_per_iteration": 2.4411048889160156 + }, + { + "auxiliary_loss_clip": 0.01007534, + "auxiliary_loss_mlp": 0.01003718, + "balance_loss_clip": 1.00156009, + "balance_loss_mlp": 1.00109458, + "epoch": 0.9160980009018488, + "flos": 70574827000320.0, + "grad_norm": 0.6775751180957379, + "language_loss": 0.62298858, + "learning_rate": 7.334078840409019e-08, + "loss": 0.6431011, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06445312, + "step": 15237, + "time_per_iteration": 2.9446444511413574 + }, + { + "auxiliary_loss_clip": 0.01051716, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.01066566, + "balance_loss_mlp": 1.01535881, + "epoch": 0.9161581241545168, + "flos": 16288413148800.0, + "grad_norm": 2.1698597783131746, + "language_loss": 0.75677109, + "learning_rate": 7.323632425066151e-08, + "loss": 0.7776264, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 15238, + "time_per_iteration": 2.394984483718872 + }, + { + "auxiliary_loss_clip": 0.01052287, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.01320624, + "balance_loss_mlp": 1.01576185, + "epoch": 0.9162182474071847, + "flos": 18437282553600.0, + "grad_norm": 1.6658473238841707, + "language_loss": 0.75590789, + "learning_rate": 7.313193316030464e-08, + "loss": 0.77680469, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 15239, + "time_per_iteration": 2.3507447242736816 + }, + { + "auxiliary_loss_clip": 0.01051963, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.01726031, + "balance_loss_mlp": 1.01564741, + "epoch": 0.9162783706598527, + "flos": 19166373809280.0, + "grad_norm": 1.9588296428558092, + "language_loss": 0.64756405, + "learning_rate": 7.302761513697819e-08, + "loss": 0.66848755, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36328125, + "step": 15240, + "time_per_iteration": 2.4080207347869873 + }, + { + "auxiliary_loss_clip": 0.01050789, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.01163507, + "balance_loss_mlp": 1.01689959, + "epoch": 0.9163384939125206, + "flos": 20411934940800.0, + "grad_norm": 2.416902241058621, + "language_loss": 0.7738328, + "learning_rate": 7.292337018463746e-08, + "loss": 0.79467893, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33984375, + "step": 15241, + "time_per_iteration": 3.6248972415924072 + }, + { + "auxiliary_loss_clip": 0.01056223, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.01368499, + "balance_loss_mlp": 1.01687098, + "epoch": 0.9163986171651887, + "flos": 19644893170560.0, + "grad_norm": 2.7320828903678223, + "language_loss": 0.69351453, + "learning_rate": 7.281919830723549e-08, + "loss": 0.71448708, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.39453125, + "step": 15242, + "time_per_iteration": 2.4296491146087646 + }, + { + "auxiliary_loss_clip": 0.01052794, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.01345515, + "balance_loss_mlp": 1.01592207, + "epoch": 0.9164587404178566, + "flos": 12822236035200.0, + "grad_norm": 1.9571624116420188, + "language_loss": 0.82278931, + "learning_rate": 7.271509950872334e-08, + "loss": 0.84369469, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36914062, + "step": 15243, + "time_per_iteration": 2.3497073650360107 + }, + { + "auxiliary_loss_clip": 0.01052462, + "auxiliary_loss_mlp": 0.01040226, + "balance_loss_clip": 1.01560926, + "balance_loss_mlp": 1.01554632, + "epoch": 0.9165188636705246, + "flos": 22308312326400.0, + "grad_norm": 1.983480697879043, + "language_loss": 0.82979286, + "learning_rate": 7.261107379304721e-08, + "loss": 0.85071975, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36914062, + "step": 15244, + "time_per_iteration": 2.3992433547973633 + }, + { + "auxiliary_loss_clip": 0.01054707, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.01655269, + "balance_loss_mlp": 1.01656973, + "epoch": 0.9165789869231925, + "flos": 18222357024000.0, + "grad_norm": 2.3146243430817988, + "language_loss": 0.74770367, + "learning_rate": 7.250712116415214e-08, + "loss": 0.76867533, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.3828125, + "step": 15245, + "time_per_iteration": 2.389514446258545 + }, + { + "auxiliary_loss_clip": 0.01051012, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.01047707, + "balance_loss_mlp": 1.01604795, + "epoch": 0.9166391101758605, + "flos": 13690910373120.0, + "grad_norm": 1.7886818904393458, + "language_loss": 0.75658083, + "learning_rate": 7.240324162598033e-08, + "loss": 0.7774182, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 15246, + "time_per_iteration": 2.3617959022521973 + }, + { + "auxiliary_loss_clip": 0.01051738, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.01314235, + "balance_loss_mlp": 1.01686502, + "epoch": 0.9166992334285284, + "flos": 17345862541440.0, + "grad_norm": 4.110411356430959, + "language_loss": 0.76816154, + "learning_rate": 7.229943518247106e-08, + "loss": 0.78905797, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.34765625, + "step": 15247, + "time_per_iteration": 2.350221872329712 + }, + { + "auxiliary_loss_clip": 0.01052731, + "auxiliary_loss_mlp": 0.01038264, + "balance_loss_clip": 1.01517284, + "balance_loss_mlp": 1.01673031, + "epoch": 0.9167593566811965, + "flos": 23730045511680.0, + "grad_norm": 3.2974786381732875, + "language_loss": 0.77358812, + "learning_rate": 7.219570183756052e-08, + "loss": 0.79449809, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 15248, + "time_per_iteration": 2.421276807785034 + }, + { + "auxiliary_loss_clip": 0.01052699, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.01071393, + "balance_loss_mlp": 1.01629686, + "epoch": 0.9168194799338644, + "flos": 27816978332160.0, + "grad_norm": 4.805014955445688, + "language_loss": 0.74477547, + "learning_rate": 7.209204159518178e-08, + "loss": 0.76565719, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 15249, + "time_per_iteration": 2.42830753326416 + }, + { + "auxiliary_loss_clip": 0.01050866, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.01366854, + "balance_loss_mlp": 1.01569724, + "epoch": 0.9168796031865324, + "flos": 21716709471360.0, + "grad_norm": 2.38580984320093, + "language_loss": 0.77715647, + "learning_rate": 7.198845445926616e-08, + "loss": 0.79803574, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 15250, + "time_per_iteration": 2.358096122741699 + }, + { + "auxiliary_loss_clip": 0.01051102, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.01181138, + "balance_loss_mlp": 1.01518011, + "epoch": 0.9169397264392004, + "flos": 23403293297280.0, + "grad_norm": 1.617364535357278, + "language_loss": 0.76886821, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78974384, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 15251, + "time_per_iteration": 2.3793838024139404 + }, + { + "auxiliary_loss_clip": 0.01052283, + "auxiliary_loss_mlp": 0.01039576, + "balance_loss_clip": 1.01478028, + "balance_loss_mlp": 1.01561737, + "epoch": 0.9169998496918683, + "flos": 23949858631680.0, + "grad_norm": 2.15419184980716, + "language_loss": 0.82203096, + "learning_rate": 7.178149952253298e-08, + "loss": 0.84294951, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 15252, + "time_per_iteration": 5.0567896366119385 + }, + { + "auxiliary_loss_clip": 0.01050968, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.01441097, + "balance_loss_mlp": 1.01545238, + "epoch": 0.9170599729445363, + "flos": 18331495534080.0, + "grad_norm": 1.6743827170322876, + "language_loss": 0.78220487, + "learning_rate": 7.167813172956316e-08, + "loss": 0.80308819, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 15253, + "time_per_iteration": 2.3547821044921875 + }, + { + "auxiliary_loss_clip": 0.0105337, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.01399589, + "balance_loss_mlp": 1.01674914, + "epoch": 0.9171200961972042, + "flos": 22673748193920.0, + "grad_norm": 1.8556419328893934, + "language_loss": 0.74057031, + "learning_rate": 7.157483705875256e-08, + "loss": 0.76148188, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 15254, + "time_per_iteration": 2.387122392654419 + }, + { + "auxiliary_loss_clip": 0.01051319, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.01302075, + "balance_loss_mlp": 1.01662636, + "epoch": 0.9171802194498723, + "flos": 26718226934400.0, + "grad_norm": 1.5562298795790226, + "language_loss": 0.80451298, + "learning_rate": 7.14716155140167e-08, + "loss": 0.82537544, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 15255, + "time_per_iteration": 2.456204652786255 + }, + { + "auxiliary_loss_clip": 0.01052281, + "auxiliary_loss_mlp": 0.0104107, + "balance_loss_clip": 1.01738334, + "balance_loss_mlp": 1.0159657, + "epoch": 0.9172403427025402, + "flos": 37887710117760.0, + "grad_norm": 2.112400669519704, + "language_loss": 0.69488901, + "learning_rate": 7.136846709927047e-08, + "loss": 0.71582258, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 15256, + "time_per_iteration": 2.514353036880493 + }, + { + "auxiliary_loss_clip": 0.01050618, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.01268053, + "balance_loss_mlp": 1.01577115, + "epoch": 0.9173004659552082, + "flos": 17054233021440.0, + "grad_norm": 2.0625480365165347, + "language_loss": 0.8482269, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86908245, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 15257, + "time_per_iteration": 2.358159065246582 + }, + { + "auxiliary_loss_clip": 0.01049995, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.01596844, + "balance_loss_mlp": 1.01624894, + "epoch": 0.9173605892078761, + "flos": 22200465536640.0, + "grad_norm": 2.7633046242052486, + "language_loss": 0.78248632, + "learning_rate": 7.116238967539012e-08, + "loss": 0.80334693, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33789062, + "step": 15258, + "time_per_iteration": 2.4609692096710205 + }, + { + "auxiliary_loss_clip": 0.01052505, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.01436925, + "balance_loss_mlp": 1.01708698, + "epoch": 0.9174207124605441, + "flos": 16506864725760.0, + "grad_norm": 1.8132575453831952, + "language_loss": 0.79936719, + "learning_rate": 7.105946067406999e-08, + "loss": 0.820274, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 15259, + "time_per_iteration": 2.435352087020874 + }, + { + "auxiliary_loss_clip": 0.01050345, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.01418209, + "balance_loss_mlp": 1.01537561, + "epoch": 0.917480835713212, + "flos": 24534444303360.0, + "grad_norm": 1.8361949403427937, + "language_loss": 0.7680226, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78887987, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34960938, + "step": 15260, + "time_per_iteration": 2.4690897464752197 + }, + { + "auxiliary_loss_clip": 0.01049274, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.01364779, + "balance_loss_mlp": 1.01529634, + "epoch": 0.9175409589658801, + "flos": 20879841248640.0, + "grad_norm": 2.1492694629395843, + "language_loss": 0.62133217, + "learning_rate": 7.085382211218637e-08, + "loss": 0.64217025, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 15261, + "time_per_iteration": 2.377065896987915 + }, + { + "auxiliary_loss_clip": 0.01049834, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.01230359, + "balance_loss_mlp": 1.01484394, + "epoch": 0.917601082218548, + "flos": 14275356399360.0, + "grad_norm": 1.7926567536483071, + "language_loss": 0.75138152, + "learning_rate": 7.075111255942002e-08, + "loss": 0.77222276, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 15262, + "time_per_iteration": 2.337038278579712 + }, + { + "auxiliary_loss_clip": 0.01053107, + "auxiliary_loss_mlp": 0.01040831, + "balance_loss_clip": 1.01650047, + "balance_loss_mlp": 1.01602292, + "epoch": 0.917661205471216, + "flos": 19098223102080.0, + "grad_norm": 1.7918400136884522, + "language_loss": 0.79257345, + "learning_rate": 7.064847616396496e-08, + "loss": 0.8135128, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 15263, + "time_per_iteration": 2.3666024208068848 + }, + { + "auxiliary_loss_clip": 0.0105357, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.011181, + "balance_loss_mlp": 1.01638472, + "epoch": 0.917721328723884, + "flos": 21105484565760.0, + "grad_norm": 2.5690240537562947, + "language_loss": 0.77016926, + "learning_rate": 7.054591292971324e-08, + "loss": 0.79105365, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 15264, + "time_per_iteration": 3.7960450649261475 + }, + { + "auxiliary_loss_clip": 0.01051117, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.01404905, + "balance_loss_mlp": 1.01524067, + "epoch": 0.9177814519765519, + "flos": 21942178231680.0, + "grad_norm": 1.634163708861722, + "language_loss": 0.83884037, + "learning_rate": 7.044342286055394e-08, + "loss": 0.8597309, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 15265, + "time_per_iteration": 2.350616455078125 + }, + { + "auxiliary_loss_clip": 0.01054383, + "auxiliary_loss_mlp": 0.01043754, + "balance_loss_clip": 1.01885152, + "balance_loss_mlp": 1.01686275, + "epoch": 0.9178415752292199, + "flos": 24204864268800.0, + "grad_norm": 1.7260164746603792, + "language_loss": 0.73903036, + "learning_rate": 7.034100596037306e-08, + "loss": 0.76001179, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 15266, + "time_per_iteration": 2.392383575439453 + }, + { + "auxiliary_loss_clip": 0.01051576, + "auxiliary_loss_mlp": 0.01037125, + "balance_loss_clip": 1.01467764, + "balance_loss_mlp": 1.01566005, + "epoch": 0.9179016984818879, + "flos": 20041192546560.0, + "grad_norm": 1.8065032559076106, + "language_loss": 0.78267169, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80355871, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 15267, + "time_per_iteration": 2.365565299987793 + }, + { + "auxiliary_loss_clip": 0.0100708, + "auxiliary_loss_mlp": 0.01001765, + "balance_loss_clip": 0.99983358, + "balance_loss_mlp": 1.00066292, + "epoch": 0.9179618217345559, + "flos": 65552291032320.0, + "grad_norm": 0.7383519271513952, + "language_loss": 0.56457776, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58466619, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06396484, + "step": 15268, + "time_per_iteration": 3.0208370685577393 + }, + { + "auxiliary_loss_clip": 0.01052799, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.01335454, + "balance_loss_mlp": 1.0158658, + "epoch": 0.9180219449872238, + "flos": 21323552117760.0, + "grad_norm": 1.7405659648040406, + "language_loss": 0.78532946, + "learning_rate": 7.0034194312526e-08, + "loss": 0.80623841, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36914062, + "step": 15269, + "time_per_iteration": 2.3636474609375 + }, + { + "auxiliary_loss_clip": 0.01051704, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01299644, + "balance_loss_mlp": 1.01574564, + "epoch": 0.9180820682398918, + "flos": 41058487284480.0, + "grad_norm": 1.5870044711452655, + "language_loss": 0.7324419, + "learning_rate": 6.993207012706936e-08, + "loss": 0.75331229, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.359375, + "step": 15270, + "time_per_iteration": 2.5401723384857178 + }, + { + "auxiliary_loss_clip": 0.01048887, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.01451802, + "balance_loss_mlp": 1.01451027, + "epoch": 0.9181421914925597, + "flos": 28071704678400.0, + "grad_norm": 1.5757264107320872, + "language_loss": 0.8024317, + "learning_rate": 6.98300191299821e-08, + "loss": 0.82329535, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34375, + "step": 15271, + "time_per_iteration": 2.4134488105773926 + }, + { + "auxiliary_loss_clip": 0.01052152, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.01043904, + "balance_loss_mlp": 1.01573801, + "epoch": 0.9182023147452277, + "flos": 29168117015040.0, + "grad_norm": 1.9990973914925392, + "language_loss": 0.73252815, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75339031, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 15272, + "time_per_iteration": 2.4453237056732178 + }, + { + "auxiliary_loss_clip": 0.01052511, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.01610947, + "balance_loss_mlp": 1.01613569, + "epoch": 0.9182624379978956, + "flos": 24059695368960.0, + "grad_norm": 1.92891695593503, + "language_loss": 0.74088061, + "learning_rate": 6.962613671639105e-08, + "loss": 0.76178604, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 15273, + "time_per_iteration": 2.378659725189209 + }, + { + "auxiliary_loss_clip": 0.01047062, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.01308155, + "balance_loss_mlp": 1.01418996, + "epoch": 0.9183225612505637, + "flos": 23292444130560.0, + "grad_norm": 1.5794345015356184, + "language_loss": 0.74840057, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76920599, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.328125, + "step": 15274, + "time_per_iteration": 2.3989408016204834 + }, + { + "auxiliary_loss_clip": 0.01053043, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.01712608, + "balance_loss_mlp": 1.01682711, + "epoch": 0.9183826845032316, + "flos": 19608234376320.0, + "grad_norm": 1.5936112647414051, + "language_loss": 0.70140064, + "learning_rate": 6.942254710267902e-08, + "loss": 0.72231889, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36132812, + "step": 15275, + "time_per_iteration": 2.3592021465301514 + }, + { + "auxiliary_loss_clip": 0.01051346, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01225209, + "balance_loss_mlp": 1.01600087, + "epoch": 0.9184428077558996, + "flos": 18478060888320.0, + "grad_norm": 1.830073675741051, + "language_loss": 0.73375267, + "learning_rate": 6.932086210542953e-08, + "loss": 0.75461829, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35351562, + "step": 15276, + "time_per_iteration": 2.4170949459075928 + }, + { + "auxiliary_loss_clip": 0.01051312, + "auxiliary_loss_mlp": 0.01037683, + "balance_loss_clip": 1.0154984, + "balance_loss_mlp": 1.01572561, + "epoch": 0.9185029310085676, + "flos": 20739978875520.0, + "grad_norm": 1.6356857540517489, + "language_loss": 0.74622548, + "learning_rate": 6.921925031972642e-08, + "loss": 0.76711535, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35546875, + "step": 15277, + "time_per_iteration": 2.3617658615112305 + }, + { + "auxiliary_loss_clip": 0.0100726, + "auxiliary_loss_mlp": 0.01001969, + "balance_loss_clip": 0.9999308, + "balance_loss_mlp": 1.00092649, + "epoch": 0.9185630542612355, + "flos": 68205830273280.0, + "grad_norm": 0.7275460290626877, + "language_loss": 0.5924778, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61257005, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06347656, + "step": 15278, + "time_per_iteration": 3.1182339191436768 + }, + { + "auxiliary_loss_clip": 0.01048221, + "auxiliary_loss_mlp": 0.01032895, + "balance_loss_clip": 1.01216459, + "balance_loss_mlp": 1.01399267, + "epoch": 0.9186231775139035, + "flos": 12238662792960.0, + "grad_norm": 1.6309028821447207, + "language_loss": 0.65563726, + "learning_rate": 6.901624639836879e-08, + "loss": 0.67644846, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34179688, + "step": 15279, + "time_per_iteration": 2.3303134441375732 + }, + { + "auxiliary_loss_clip": 0.01007034, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00012052, + "balance_loss_mlp": 1.00072289, + "epoch": 0.9186833007665715, + "flos": 63935987506560.0, + "grad_norm": 0.8433206327216874, + "language_loss": 0.6018424, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62193531, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.06298828, + "step": 15280, + "time_per_iteration": 3.025491952896118 + }, + { + "auxiliary_loss_clip": 0.01052778, + "auxiliary_loss_mlp": 0.01041543, + "balance_loss_clip": 1.01681876, + "balance_loss_mlp": 1.01607466, + "epoch": 0.9187434240192395, + "flos": 19973670243840.0, + "grad_norm": 2.108827596187499, + "language_loss": 0.71289408, + "learning_rate": 6.881353536939815e-08, + "loss": 0.73383725, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 15281, + "time_per_iteration": 3.613032817840576 + }, + { + "auxiliary_loss_clip": 0.01052689, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.01381183, + "balance_loss_mlp": 1.01629186, + "epoch": 0.9188035472719074, + "flos": 25226667296640.0, + "grad_norm": 1.5974526872980797, + "language_loss": 0.85265988, + "learning_rate": 6.871228969916831e-08, + "loss": 0.87357259, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 15282, + "time_per_iteration": 2.41317081451416 + }, + { + "auxiliary_loss_clip": 0.01050066, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.01589489, + "balance_loss_mlp": 1.01554477, + "epoch": 0.9188636705245754, + "flos": 18404568743040.0, + "grad_norm": 1.9219534366074067, + "language_loss": 0.61284769, + "learning_rate": 6.861111726356194e-08, + "loss": 0.63374805, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.34570312, + "step": 15283, + "time_per_iteration": 2.3454651832580566 + }, + { + "auxiliary_loss_clip": 0.01053449, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.01439619, + "balance_loss_mlp": 1.01696181, + "epoch": 0.9189237937772433, + "flos": 23767996026240.0, + "grad_norm": 1.6135398165589792, + "language_loss": 0.66462094, + "learning_rate": 6.851001806641554e-08, + "loss": 0.68553293, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 15284, + "time_per_iteration": 2.3801820278167725 + }, + { + "auxiliary_loss_clip": 0.01050614, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.01635361, + "balance_loss_mlp": 1.01484609, + "epoch": 0.9189839170299113, + "flos": 21213575735040.0, + "grad_norm": 1.763723098132223, + "language_loss": 0.74865544, + "learning_rate": 6.840899211156292e-08, + "loss": 0.76955557, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 15285, + "time_per_iteration": 2.3547353744506836 + }, + { + "auxiliary_loss_clip": 0.01050291, + "auxiliary_loss_mlp": 0.01034874, + "balance_loss_clip": 1.01135445, + "balance_loss_mlp": 1.01528859, + "epoch": 0.9190440402825792, + "flos": 16726433466240.0, + "grad_norm": 1.974189813925432, + "language_loss": 0.73408329, + "learning_rate": 6.830803940283458e-08, + "loss": 0.75493503, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34960938, + "step": 15286, + "time_per_iteration": 2.342820644378662 + }, + { + "auxiliary_loss_clip": 0.01052646, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.01267147, + "balance_loss_mlp": 1.01683235, + "epoch": 0.9191041635352473, + "flos": 23440056825600.0, + "grad_norm": 2.0070472797051098, + "language_loss": 0.74270666, + "learning_rate": 6.820715994405945e-08, + "loss": 0.76360416, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 15287, + "time_per_iteration": 2.372950315475464 + }, + { + "auxiliary_loss_clip": 0.01052645, + "auxiliary_loss_mlp": 0.0103932, + "balance_loss_clip": 1.01472759, + "balance_loss_mlp": 1.01659977, + "epoch": 0.9191642867879152, + "flos": 18806523759360.0, + "grad_norm": 2.0237227907227604, + "language_loss": 0.66972691, + "learning_rate": 6.810635373906226e-08, + "loss": 0.69064653, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 15288, + "time_per_iteration": 2.40139102935791 + }, + { + "auxiliary_loss_clip": 0.01052598, + "auxiliary_loss_mlp": 0.01039518, + "balance_loss_clip": 1.01521134, + "balance_loss_mlp": 1.01697397, + "epoch": 0.9192244100405832, + "flos": 32159580105600.0, + "grad_norm": 1.879502953299506, + "language_loss": 0.72312105, + "learning_rate": 6.800562079166549e-08, + "loss": 0.74404216, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 15289, + "time_per_iteration": 2.4893312454223633 + }, + { + "auxiliary_loss_clip": 0.01053745, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.01720238, + "balance_loss_mlp": 1.01656973, + "epoch": 0.9192845332932512, + "flos": 16356878058240.0, + "grad_norm": 1.873494844662835, + "language_loss": 0.75443017, + "learning_rate": 6.790496110568921e-08, + "loss": 0.77539355, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 15290, + "time_per_iteration": 2.4691734313964844 + }, + { + "auxiliary_loss_clip": 0.0104936, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.01285267, + "balance_loss_mlp": 1.01502323, + "epoch": 0.9193446565459191, + "flos": 26613277787520.0, + "grad_norm": 2.465253955568107, + "language_loss": 0.73246622, + "learning_rate": 6.78043746849506e-08, + "loss": 0.75329947, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34375, + "step": 15291, + "time_per_iteration": 2.559516429901123 + }, + { + "auxiliary_loss_clip": 0.01051007, + "auxiliary_loss_mlp": 0.0103756, + "balance_loss_clip": 1.01502991, + "balance_loss_mlp": 1.01576185, + "epoch": 0.9194047797985871, + "flos": 22491082627200.0, + "grad_norm": 1.5509596894912097, + "language_loss": 0.7150085, + "learning_rate": 6.770386153326346e-08, + "loss": 0.7358942, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 15292, + "time_per_iteration": 5.320828914642334 + }, + { + "auxiliary_loss_clip": 0.01051571, + "auxiliary_loss_mlp": 0.0104019, + "balance_loss_clip": 1.01729012, + "balance_loss_mlp": 1.0156914, + "epoch": 0.9194649030512551, + "flos": 25077727969920.0, + "grad_norm": 1.8413530852057134, + "language_loss": 0.74215144, + "learning_rate": 6.760342165443988e-08, + "loss": 0.76306903, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 15293, + "time_per_iteration": 2.3922340869903564 + }, + { + "auxiliary_loss_clip": 0.01050881, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.0143398, + "balance_loss_mlp": 1.01596856, + "epoch": 0.9195250263039231, + "flos": 11910339567360.0, + "grad_norm": 1.8723844564887773, + "language_loss": 0.79759997, + "learning_rate": 6.750305505228837e-08, + "loss": 0.81847274, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 15294, + "time_per_iteration": 2.3481392860412598 + }, + { + "auxiliary_loss_clip": 0.0105429, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.01601624, + "balance_loss_mlp": 1.01672316, + "epoch": 0.919585149556591, + "flos": 21833109544320.0, + "grad_norm": 1.723259489123444, + "language_loss": 0.78771496, + "learning_rate": 6.74027617306141e-08, + "loss": 0.80866075, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 15295, + "time_per_iteration": 2.4275336265563965 + }, + { + "auxiliary_loss_clip": 0.0104948, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.00987089, + "balance_loss_mlp": 1.0157311, + "epoch": 0.919645272809259, + "flos": 28182798224640.0, + "grad_norm": 3.60942058032205, + "language_loss": 0.72662544, + "learning_rate": 6.730254169322114e-08, + "loss": 0.74743319, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33789062, + "step": 15296, + "time_per_iteration": 2.4143970012664795 + }, + { + "auxiliary_loss_clip": 0.01051782, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.02093291, + "balance_loss_mlp": 1.01674604, + "epoch": 0.9197053960619269, + "flos": 18331844647680.0, + "grad_norm": 1.8299357898403117, + "language_loss": 0.75976562, + "learning_rate": 6.720239494390912e-08, + "loss": 0.78073001, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3515625, + "step": 15297, + "time_per_iteration": 2.3450939655303955 + }, + { + "auxiliary_loss_clip": 0.01051488, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.01403725, + "balance_loss_mlp": 1.01561511, + "epoch": 0.9197655193145949, + "flos": 28182204731520.0, + "grad_norm": 1.63356895723524, + "language_loss": 0.75385404, + "learning_rate": 6.710232148647676e-08, + "loss": 0.77474028, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 15298, + "time_per_iteration": 2.4090311527252197 + }, + { + "auxiliary_loss_clip": 0.01050957, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.01201034, + "balance_loss_mlp": 1.01586318, + "epoch": 0.9198256425672628, + "flos": 17305503143040.0, + "grad_norm": 6.216534683858197, + "language_loss": 0.80912948, + "learning_rate": 6.70023213247175e-08, + "loss": 0.8299911, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 15299, + "time_per_iteration": 2.3465356826782227 + }, + { + "auxiliary_loss_clip": 0.01052068, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.01099634, + "balance_loss_mlp": 1.01646221, + "epoch": 0.9198857658199309, + "flos": 17857549560960.0, + "grad_norm": 1.9703583735498467, + "language_loss": 0.64639342, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66723543, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35546875, + "step": 15300, + "time_per_iteration": 2.3372607231140137 + }, + { + "auxiliary_loss_clip": 0.01049307, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.01534486, + "balance_loss_mlp": 1.01638234, + "epoch": 0.9199458890725988, + "flos": 22126449720960.0, + "grad_norm": 1.797686180890295, + "language_loss": 0.70885837, + "learning_rate": 6.680254090338545e-08, + "loss": 0.7296949, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.33007812, + "step": 15301, + "time_per_iteration": 2.374493360519409 + }, + { + "auxiliary_loss_clip": 0.01052035, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.01673269, + "balance_loss_mlp": 1.01566672, + "epoch": 0.9200060123252668, + "flos": 16033128220800.0, + "grad_norm": 1.6682718127751661, + "language_loss": 0.72404087, + "learning_rate": 6.670276065138814e-08, + "loss": 0.74496996, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 15302, + "time_per_iteration": 2.335052490234375 + }, + { + "auxiliary_loss_clip": 0.01052219, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.01068878, + "balance_loss_mlp": 1.01590323, + "epoch": 0.9200661355779348, + "flos": 26863465656960.0, + "grad_norm": 2.6262876880974155, + "language_loss": 0.77786958, + "learning_rate": 6.660305371021579e-08, + "loss": 0.79872429, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 15303, + "time_per_iteration": 2.4144067764282227 + }, + { + "auxiliary_loss_clip": 0.01050995, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.01564014, + "balance_loss_mlp": 1.01609623, + "epoch": 0.9201262588306027, + "flos": 12785926354560.0, + "grad_norm": 2.6962140865571143, + "language_loss": 0.89245665, + "learning_rate": 6.650342008365006e-08, + "loss": 0.91334391, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 15304, + "time_per_iteration": 3.746840000152588 + }, + { + "auxiliary_loss_clip": 0.01054015, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_clip": 1.0175271, + "balance_loss_mlp": 1.01649559, + "epoch": 0.9201863820832707, + "flos": 20630561074560.0, + "grad_norm": 1.9571833720204719, + "language_loss": 0.79490423, + "learning_rate": 6.64038597754677e-08, + "loss": 0.81588125, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 15305, + "time_per_iteration": 2.3608803749084473 + }, + { + "auxiliary_loss_clip": 0.01051459, + "auxiliary_loss_mlp": 0.01036735, + "balance_loss_clip": 1.014395, + "balance_loss_mlp": 1.01576447, + "epoch": 0.9202465053359387, + "flos": 26394616742400.0, + "grad_norm": 1.9166731704319537, + "language_loss": 0.82485986, + "learning_rate": 6.630437278944501e-08, + "loss": 0.84574175, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 15306, + "time_per_iteration": 2.395981788635254 + }, + { + "auxiliary_loss_clip": 0.01049118, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.01438355, + "balance_loss_mlp": 1.01557326, + "epoch": 0.9203066285886067, + "flos": 10487419395840.0, + "grad_norm": 1.8900045578673272, + "language_loss": 0.72907501, + "learning_rate": 6.62049591293541e-08, + "loss": 0.7499184, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.3359375, + "step": 15307, + "time_per_iteration": 2.35368013381958 + }, + { + "auxiliary_loss_clip": 0.01053224, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.01242089, + "balance_loss_mlp": 1.01614857, + "epoch": 0.9203667518412746, + "flos": 19389712976640.0, + "grad_norm": 2.258704994485348, + "language_loss": 0.80049157, + "learning_rate": 6.610561879896526e-08, + "loss": 0.82139683, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 15308, + "time_per_iteration": 2.3615097999572754 + }, + { + "auxiliary_loss_clip": 0.01050987, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.0113709, + "balance_loss_mlp": 1.01553202, + "epoch": 0.9204268750939426, + "flos": 15924059533440.0, + "grad_norm": 1.8194231728877157, + "language_loss": 0.79118979, + "learning_rate": 6.600635180204484e-08, + "loss": 0.81203789, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 15309, + "time_per_iteration": 2.3475453853607178 + }, + { + "auxiliary_loss_clip": 0.01050483, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.01240683, + "balance_loss_mlp": 1.01572657, + "epoch": 0.9204869983466105, + "flos": 16470834336000.0, + "grad_norm": 1.799228500385183, + "language_loss": 0.67624885, + "learning_rate": 6.590715814235781e-08, + "loss": 0.69710708, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 15310, + "time_per_iteration": 2.417417526245117 + }, + { + "auxiliary_loss_clip": 0.01051132, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.01299286, + "balance_loss_mlp": 1.01477921, + "epoch": 0.9205471215992785, + "flos": 21538268179200.0, + "grad_norm": 1.751596950252876, + "language_loss": 0.66627264, + "learning_rate": 6.580803782366495e-08, + "loss": 0.6871345, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.36328125, + "step": 15311, + "time_per_iteration": 2.3843047618865967 + }, + { + "auxiliary_loss_clip": 0.01051535, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.01274586, + "balance_loss_mlp": 1.01554275, + "epoch": 0.9206072448519464, + "flos": 25004829317760.0, + "grad_norm": 1.7512596315776432, + "language_loss": 0.77436137, + "learning_rate": 6.570899084972503e-08, + "loss": 0.79524648, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 15312, + "time_per_iteration": 2.4000730514526367 + }, + { + "auxiliary_loss_clip": 0.01050621, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.01610637, + "balance_loss_mlp": 1.01571012, + "epoch": 0.9206673681046145, + "flos": 20521597121280.0, + "grad_norm": 1.580020815520702, + "language_loss": 0.79669106, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81758082, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 15313, + "time_per_iteration": 2.4180285930633545 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.0183481, + "balance_loss_mlp": 1.01601386, + "epoch": 0.9207274913572824, + "flos": 20882494512000.0, + "grad_norm": 1.8765613728212753, + "language_loss": 0.79558635, + "learning_rate": 6.55111169511251e-08, + "loss": 0.8165406, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.37304688, + "step": 15314, + "time_per_iteration": 2.3652477264404297 + }, + { + "auxiliary_loss_clip": 0.01055573, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.01802778, + "balance_loss_mlp": 1.01750255, + "epoch": 0.9207876146099504, + "flos": 22707230054400.0, + "grad_norm": 1.8485933282579343, + "language_loss": 0.81111407, + "learning_rate": 6.541229003396864e-08, + "loss": 0.83213437, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.38085938, + "step": 15315, + "time_per_iteration": 2.3768415451049805 + }, + { + "auxiliary_loss_clip": 0.01054981, + "auxiliary_loss_mlp": 0.01040961, + "balance_loss_clip": 1.0164752, + "balance_loss_mlp": 1.01621199, + "epoch": 0.9208477378626184, + "flos": 18506585335680.0, + "grad_norm": 1.7669144757812616, + "language_loss": 0.77411795, + "learning_rate": 6.531353647657156e-08, + "loss": 0.79507732, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.38671875, + "step": 15316, + "time_per_iteration": 2.3411781787872314 + }, + { + "auxiliary_loss_clip": 0.01051114, + "auxiliary_loss_mlp": 0.01043049, + "balance_loss_clip": 1.0196836, + "balance_loss_mlp": 1.01509333, + "epoch": 0.9209078611152863, + "flos": 22998615194880.0, + "grad_norm": 1.6804548106026098, + "language_loss": 0.70384544, + "learning_rate": 6.521485628267931e-08, + "loss": 0.72478706, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 15317, + "time_per_iteration": 2.41685152053833 + }, + { + "auxiliary_loss_clip": 0.0105159, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.0092274, + "balance_loss_mlp": 1.01577187, + "epoch": 0.9209679843679544, + "flos": 24060358684800.0, + "grad_norm": 1.6749085059370903, + "language_loss": 0.84575999, + "learning_rate": 6.511624945603378e-08, + "loss": 0.86659598, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 15318, + "time_per_iteration": 2.406773567199707 + }, + { + "auxiliary_loss_clip": 0.01052526, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.0140202, + "balance_loss_mlp": 1.01710606, + "epoch": 0.9210281076206223, + "flos": 13552514277120.0, + "grad_norm": 1.8869178629036363, + "language_loss": 0.87136042, + "learning_rate": 6.501771600037354e-08, + "loss": 0.89226353, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 15319, + "time_per_iteration": 2.345916986465454 + }, + { + "auxiliary_loss_clip": 0.01007147, + "auxiliary_loss_mlp": 0.01002411, + "balance_loss_clip": 1.00048029, + "balance_loss_mlp": 1.00067174, + "epoch": 0.9210882308732903, + "flos": 71422622478720.0, + "grad_norm": 0.7704901841558655, + "language_loss": 0.56232899, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58242458, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06445312, + "step": 15320, + "time_per_iteration": 3.0600101947784424 + }, + { + "auxiliary_loss_clip": 0.01054342, + "auxiliary_loss_mlp": 0.01046369, + "balance_loss_clip": 1.02070367, + "balance_loss_mlp": 1.01670051, + "epoch": 0.9211483541259582, + "flos": 18508295992320.0, + "grad_norm": 2.2140353245947004, + "language_loss": 0.64959502, + "learning_rate": 6.482086921695384e-08, + "loss": 0.6706022, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37695312, + "step": 15321, + "time_per_iteration": 3.603483200073242 + }, + { + "auxiliary_loss_clip": 0.0104752, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.01529944, + "balance_loss_mlp": 1.0154016, + "epoch": 0.9212084773786262, + "flos": 23257111968000.0, + "grad_norm": 1.4774290887337989, + "language_loss": 0.72067875, + "learning_rate": 6.47225558966582e-08, + "loss": 0.74151009, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.3203125, + "step": 15322, + "time_per_iteration": 2.4050114154815674 + }, + { + "auxiliary_loss_clip": 0.01050485, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.01050174, + "balance_loss_mlp": 1.01570344, + "epoch": 0.9212686006312941, + "flos": 16288587705600.0, + "grad_norm": 1.739707953114904, + "language_loss": 0.70902526, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72984368, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 15323, + "time_per_iteration": 2.3520314693450928 + }, + { + "auxiliary_loss_clip": 0.0105355, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.01761806, + "balance_loss_mlp": 1.01601577, + "epoch": 0.9213287238839621, + "flos": 19784930100480.0, + "grad_norm": 2.038797405342826, + "language_loss": 0.75984907, + "learning_rate": 6.452614941753597e-08, + "loss": 0.78082049, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 15324, + "time_per_iteration": 2.3670294284820557 + }, + { + "auxiliary_loss_clip": 0.01051971, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.01833653, + "balance_loss_mlp": 1.01594031, + "epoch": 0.92138884713663, + "flos": 21029408979840.0, + "grad_norm": 2.4698261283053657, + "language_loss": 0.71974444, + "learning_rate": 6.442805626615744e-08, + "loss": 0.74069321, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 15325, + "time_per_iteration": 2.415095567703247 + }, + { + "auxiliary_loss_clip": 0.01050823, + "auxiliary_loss_mlp": 0.01039424, + "balance_loss_clip": 1.01664305, + "balance_loss_mlp": 1.0156951, + "epoch": 0.9214489703892981, + "flos": 28585940227200.0, + "grad_norm": 1.5678292866922723, + "language_loss": 0.78743559, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80833805, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 15326, + "time_per_iteration": 2.4377479553222656 + }, + { + "auxiliary_loss_clip": 0.01053409, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.01435101, + "balance_loss_mlp": 1.01699889, + "epoch": 0.921509093641966, + "flos": 16360578662400.0, + "grad_norm": 2.355235823209294, + "language_loss": 0.71830773, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73922533, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 15327, + "time_per_iteration": 2.3529062271118164 + }, + { + "auxiliary_loss_clip": 0.01054902, + "auxiliary_loss_mlp": 0.01041623, + "balance_loss_clip": 1.01613569, + "balance_loss_mlp": 1.01750219, + "epoch": 0.921569216894634, + "flos": 26829704505600.0, + "grad_norm": 1.9251572698192434, + "language_loss": 0.7833643, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80432951, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 15328, + "time_per_iteration": 2.382625102996826 + }, + { + "auxiliary_loss_clip": 0.01051252, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.01443934, + "balance_loss_mlp": 1.0165441, + "epoch": 0.921629340147302, + "flos": 24643966838400.0, + "grad_norm": 2.307508830439436, + "language_loss": 0.72290993, + "learning_rate": 6.4036417668619e-08, + "loss": 0.74377859, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34765625, + "step": 15329, + "time_per_iteration": 2.416104555130005 + }, + { + "auxiliary_loss_clip": 0.01050805, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.01011729, + "balance_loss_mlp": 1.01536655, + "epoch": 0.9216894633999699, + "flos": 15085585388160.0, + "grad_norm": 1.823204177940141, + "language_loss": 0.87753189, + "learning_rate": 6.393869153979192e-08, + "loss": 0.89834911, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.35546875, + "step": 15330, + "time_per_iteration": 2.3190059661865234 + }, + { + "auxiliary_loss_clip": 0.01050666, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.01219189, + "balance_loss_mlp": 1.01522934, + "epoch": 0.921749586652638, + "flos": 19203626096640.0, + "grad_norm": 2.1657977143042833, + "language_loss": 0.77177596, + "learning_rate": 6.384103882660397e-08, + "loss": 0.79263318, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 15331, + "time_per_iteration": 3.762830972671509 + }, + { + "auxiliary_loss_clip": 0.01051127, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.00995636, + "balance_loss_mlp": 1.01557922, + "epoch": 0.9218097099053059, + "flos": 20521387653120.0, + "grad_norm": 2.4153717349945483, + "language_loss": 0.76285362, + "learning_rate": 6.374345953275794e-08, + "loss": 0.78368789, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 15332, + "time_per_iteration": 3.705519437789917 + }, + { + "auxiliary_loss_clip": 0.01050054, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.01651239, + "balance_loss_mlp": 1.0148015, + "epoch": 0.9218698331579739, + "flos": 17347643020800.0, + "grad_norm": 1.9057010793697213, + "language_loss": 0.76043284, + "learning_rate": 6.364595366195358e-08, + "loss": 0.7813195, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 15333, + "time_per_iteration": 2.340244770050049 + }, + { + "auxiliary_loss_clip": 0.01006952, + "auxiliary_loss_mlp": 0.01004247, + "balance_loss_clip": 1.00208902, + "balance_loss_mlp": 1.00061607, + "epoch": 0.9219299564106418, + "flos": 61955435099520.0, + "grad_norm": 0.8130744726073179, + "language_loss": 0.52974391, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54985595, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06347656, + "step": 15334, + "time_per_iteration": 2.9934167861938477 + }, + { + "auxiliary_loss_clip": 0.0105083, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.01457632, + "balance_loss_mlp": 1.01662803, + "epoch": 0.9219900796633098, + "flos": 15700964745600.0, + "grad_norm": 2.14993344254625, + "language_loss": 0.62770289, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64857185, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 15335, + "time_per_iteration": 2.34134840965271 + }, + { + "auxiliary_loss_clip": 0.01050812, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.01176453, + "balance_loss_mlp": 1.01587188, + "epoch": 0.9220502029159777, + "flos": 24931616463360.0, + "grad_norm": 1.6089924792913122, + "language_loss": 0.72550017, + "learning_rate": 6.335387662475366e-08, + "loss": 0.74635029, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 15336, + "time_per_iteration": 2.43109130859375 + }, + { + "auxiliary_loss_clip": 0.01048206, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.01602566, + "balance_loss_mlp": 1.01452994, + "epoch": 0.9221103261686457, + "flos": 15666365721600.0, + "grad_norm": 1.894267554039299, + "language_loss": 0.72664177, + "learning_rate": 6.325666448306433e-08, + "loss": 0.74748939, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3359375, + "step": 15337, + "time_per_iteration": 2.3358404636383057 + }, + { + "auxiliary_loss_clip": 0.010077, + "auxiliary_loss_mlp": 0.01004081, + "balance_loss_clip": 1.00212562, + "balance_loss_mlp": 1.00128675, + "epoch": 0.9221704494213137, + "flos": 67512909052800.0, + "grad_norm": 0.8863206124434034, + "language_loss": 0.65510052, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67521834, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.06445312, + "step": 15338, + "time_per_iteration": 3.009782314300537 + }, + { + "auxiliary_loss_clip": 0.01052988, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.01354349, + "balance_loss_mlp": 1.01715112, + "epoch": 0.9222305726739817, + "flos": 30225636230400.0, + "grad_norm": 2.788664674333504, + "language_loss": 0.68557107, + "learning_rate": 6.306246052787289e-08, + "loss": 0.70646036, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 15339, + "time_per_iteration": 2.433680534362793 + }, + { + "auxiliary_loss_clip": 0.01051591, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.01456118, + "balance_loss_mlp": 1.0155251, + "epoch": 0.9222906959266496, + "flos": 25336050186240.0, + "grad_norm": 1.8912619281863274, + "language_loss": 0.73080462, + "learning_rate": 6.296546872173513e-08, + "loss": 0.7517097, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36132812, + "step": 15340, + "time_per_iteration": 2.4865005016326904 + }, + { + "auxiliary_loss_clip": 0.01050116, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.01457489, + "balance_loss_mlp": 1.01568377, + "epoch": 0.9223508191793176, + "flos": 27598631489280.0, + "grad_norm": 1.5021259533812021, + "language_loss": 0.71880603, + "learning_rate": 6.286855036814098e-08, + "loss": 0.73968315, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34375, + "step": 15341, + "time_per_iteration": 2.434110641479492 + }, + { + "auxiliary_loss_clip": 0.01048367, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.01080585, + "balance_loss_mlp": 1.01560366, + "epoch": 0.9224109424319856, + "flos": 27306373564800.0, + "grad_norm": 1.6337418967647617, + "language_loss": 0.68549705, + "learning_rate": 6.277170547076571e-08, + "loss": 0.70629179, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.328125, + "step": 15342, + "time_per_iteration": 2.4492218494415283 + }, + { + "auxiliary_loss_clip": 0.01050931, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_clip": 1.01940107, + "balance_loss_mlp": 1.01535106, + "epoch": 0.9224710656846535, + "flos": 48206674736640.0, + "grad_norm": 2.3248456395558996, + "language_loss": 0.70408154, + "learning_rate": 6.26749340332815e-08, + "loss": 0.72500724, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 15343, + "time_per_iteration": 4.058551549911499 + }, + { + "auxiliary_loss_clip": 0.01007308, + "auxiliary_loss_mlp": 0.01003675, + "balance_loss_clip": 1.00168431, + "balance_loss_mlp": 1.00106239, + "epoch": 0.9225311889373216, + "flos": 66718564732800.0, + "grad_norm": 0.7236229551090327, + "language_loss": 0.5205493, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54065919, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.0625, + "step": 15344, + "time_per_iteration": 3.1956093311309814 + }, + { + "auxiliary_loss_clip": 0.01047681, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.01343286, + "balance_loss_mlp": 1.01513958, + "epoch": 0.9225913121899895, + "flos": 22270257077760.0, + "grad_norm": 1.7010264266061874, + "language_loss": 0.71914268, + "learning_rate": 6.248161155266162e-08, + "loss": 0.73994452, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.32617188, + "step": 15345, + "time_per_iteration": 2.37715482711792 + }, + { + "auxiliary_loss_clip": 0.01051053, + "auxiliary_loss_mlp": 0.01050231, + "balance_loss_clip": 1.02797461, + "balance_loss_mlp": 1.01593041, + "epoch": 0.9226514354426575, + "flos": 20081726501760.0, + "grad_norm": 1.7065162245263463, + "language_loss": 0.7838828, + "learning_rate": 6.238506051685677e-08, + "loss": 0.80489564, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 15346, + "time_per_iteration": 2.371446371078491 + }, + { + "auxiliary_loss_clip": 0.01054883, + "auxiliary_loss_mlp": 0.01042539, + "balance_loss_clip": 1.01627755, + "balance_loss_mlp": 1.01697135, + "epoch": 0.9227115586953254, + "flos": 16069926660480.0, + "grad_norm": 1.813688646271003, + "language_loss": 0.77464461, + "learning_rate": 6.228858295560457e-08, + "loss": 0.79561889, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 15347, + "time_per_iteration": 2.3573999404907227 + }, + { + "auxiliary_loss_clip": 0.01048913, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.01505637, + "balance_loss_mlp": 1.01600027, + "epoch": 0.9227716819479934, + "flos": 20445067687680.0, + "grad_norm": 1.4786006510632959, + "language_loss": 0.77499282, + "learning_rate": 6.219217887256367e-08, + "loss": 0.79584414, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.328125, + "step": 15348, + "time_per_iteration": 2.3677122592926025 + }, + { + "auxiliary_loss_clip": 0.01052219, + "auxiliary_loss_mlp": 0.01036504, + "balance_loss_clip": 1.01272202, + "balance_loss_mlp": 1.01529455, + "epoch": 0.9228318052006613, + "flos": 25006295594880.0, + "grad_norm": 2.0562381314431564, + "language_loss": 0.69263792, + "learning_rate": 6.209584827138959e-08, + "loss": 0.71352518, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 15349, + "time_per_iteration": 2.396678924560547 + }, + { + "auxiliary_loss_clip": 0.01051825, + "auxiliary_loss_mlp": 0.01038942, + "balance_loss_clip": 1.01588726, + "balance_loss_mlp": 1.01526797, + "epoch": 0.9228919284533293, + "flos": 12676438730880.0, + "grad_norm": 2.446393579527446, + "language_loss": 0.88381404, + "learning_rate": 6.199959115573495e-08, + "loss": 0.90472174, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 15350, + "time_per_iteration": 2.362734794616699 + }, + { + "auxiliary_loss_clip": 0.0100732, + "auxiliary_loss_mlp": 0.01004325, + "balance_loss_clip": 1.00239372, + "balance_loss_mlp": 1.0009141, + "epoch": 0.9229520517059973, + "flos": 69983014677120.0, + "grad_norm": 0.778199143101153, + "language_loss": 0.60496324, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62507969, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06396484, + "step": 15351, + "time_per_iteration": 2.9814682006835938 + }, + { + "auxiliary_loss_clip": 0.01051959, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.00894189, + "balance_loss_mlp": 1.01529026, + "epoch": 0.9230121749586653, + "flos": 14792943438720.0, + "grad_norm": 1.8404807836767416, + "language_loss": 0.79415345, + "learning_rate": 6.180729739558233e-08, + "loss": 0.8149873, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3671875, + "step": 15352, + "time_per_iteration": 2.350602865219116 + }, + { + "auxiliary_loss_clip": 0.01054986, + "auxiliary_loss_mlp": 0.01044165, + "balance_loss_clip": 1.02028728, + "balance_loss_mlp": 1.01688743, + "epoch": 0.9230722982113332, + "flos": 22966075941120.0, + "grad_norm": 1.9921791678696208, + "language_loss": 0.60964429, + "learning_rate": 6.171126075837585e-08, + "loss": 0.63063586, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38085938, + "step": 15353, + "time_per_iteration": 2.3971407413482666 + }, + { + "auxiliary_loss_clip": 0.01050803, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.01163197, + "balance_loss_mlp": 1.01671386, + "epoch": 0.9231324214640012, + "flos": 18550470781440.0, + "grad_norm": 1.6928846540972822, + "language_loss": 0.75669795, + "learning_rate": 6.161529762127293e-08, + "loss": 0.77754164, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 15354, + "time_per_iteration": 2.360952615737915 + }, + { + "auxiliary_loss_clip": 0.01052666, + "auxiliary_loss_mlp": 0.01043387, + "balance_loss_clip": 1.01642191, + "balance_loss_mlp": 1.01540279, + "epoch": 0.9231925447166691, + "flos": 22081866048000.0, + "grad_norm": 3.672586130919069, + "language_loss": 0.66598517, + "learning_rate": 6.1519407987912e-08, + "loss": 0.68694574, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37304688, + "step": 15355, + "time_per_iteration": 2.3724162578582764 + }, + { + "auxiliary_loss_clip": 0.01051481, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.01546073, + "balance_loss_mlp": 1.01700783, + "epoch": 0.9232526679693371, + "flos": 26539960199040.0, + "grad_norm": 1.942898379941915, + "language_loss": 0.75023985, + "learning_rate": 6.142359186192947e-08, + "loss": 0.77113324, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 15356, + "time_per_iteration": 2.421790838241577 + }, + { + "auxiliary_loss_clip": 0.01053025, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.0127486, + "balance_loss_mlp": 1.01644623, + "epoch": 0.9233127912220052, + "flos": 14755795885440.0, + "grad_norm": 2.067266046126509, + "language_loss": 0.62477243, + "learning_rate": 6.132784924695844e-08, + "loss": 0.64567316, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36523438, + "step": 15357, + "time_per_iteration": 2.3439671993255615 + }, + { + "auxiliary_loss_clip": 0.0105299, + "auxiliary_loss_mlp": 0.01040735, + "balance_loss_clip": 1.01466393, + "balance_loss_mlp": 1.01514912, + "epoch": 0.9233729144746731, + "flos": 25260707738880.0, + "grad_norm": 1.4335072994641604, + "language_loss": 0.70439249, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72532976, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 15358, + "time_per_iteration": 2.4120168685913086 + }, + { + "auxiliary_loss_clip": 0.01052315, + "auxiliary_loss_mlp": 0.0103411, + "balance_loss_clip": 1.01199722, + "balance_loss_mlp": 1.01645374, + "epoch": 0.9234330377273411, + "flos": 27848749536000.0, + "grad_norm": 4.701293079685919, + "language_loss": 0.74734831, + "learning_rate": 6.113658456457104e-08, + "loss": 0.76821256, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 15359, + "time_per_iteration": 2.4119837284088135 + }, + { + "auxiliary_loss_clip": 0.01053301, + "auxiliary_loss_mlp": 0.01040653, + "balance_loss_clip": 1.01766956, + "balance_loss_mlp": 1.01688063, + "epoch": 0.923493160980009, + "flos": 24607203310080.0, + "grad_norm": 2.312420887571466, + "language_loss": 0.66212904, + "learning_rate": 6.104106250440732e-08, + "loss": 0.68306863, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 15360, + "time_per_iteration": 3.6597657203674316 + }, + { + "auxiliary_loss_clip": 0.01006535, + "auxiliary_loss_mlp": 0.01001545, + "balance_loss_clip": 0.99969691, + "balance_loss_mlp": 1.00033927, + "epoch": 0.923553284232677, + "flos": 67697459832960.0, + "grad_norm": 0.7672192305093569, + "language_loss": 0.55288333, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57296419, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.06201172, + "step": 15361, + "time_per_iteration": 2.966583251953125 + }, + { + "auxiliary_loss_clip": 0.01052803, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.00910294, + "balance_loss_mlp": 1.01583314, + "epoch": 0.9236134074853449, + "flos": 18806244468480.0, + "grad_norm": 1.7203801604272766, + "language_loss": 0.70901269, + "learning_rate": 6.085023896425112e-08, + "loss": 0.72986132, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36914062, + "step": 15362, + "time_per_iteration": 2.3445169925689697 + }, + { + "auxiliary_loss_clip": 0.01054248, + "auxiliary_loss_mlp": 0.01040804, + "balance_loss_clip": 1.01454234, + "balance_loss_mlp": 1.01693535, + "epoch": 0.923673530738013, + "flos": 27781122499200.0, + "grad_norm": 1.4217139723969487, + "language_loss": 0.76357347, + "learning_rate": 6.075493749149463e-08, + "loss": 0.78452402, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37304688, + "step": 15363, + "time_per_iteration": 2.4201204776763916 + }, + { + "auxiliary_loss_clip": 0.01050694, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.01429009, + "balance_loss_mlp": 1.01519775, + "epoch": 0.9237336539906809, + "flos": 26795908442880.0, + "grad_norm": 3.375166383724164, + "language_loss": 0.84050894, + "learning_rate": 6.065970955510514e-08, + "loss": 0.86137658, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 15364, + "time_per_iteration": 2.390108346939087 + }, + { + "auxiliary_loss_clip": 0.0105025, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.01244211, + "balance_loss_mlp": 1.01607978, + "epoch": 0.9237937772433489, + "flos": 23586552357120.0, + "grad_norm": 1.4191354794287796, + "language_loss": 0.68537056, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70620048, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34179688, + "step": 15365, + "time_per_iteration": 2.3936142921447754 + }, + { + "auxiliary_loss_clip": 0.0105189, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.01359439, + "balance_loss_mlp": 1.01588392, + "epoch": 0.9238539004960168, + "flos": 26139366725760.0, + "grad_norm": 1.8667158439718592, + "language_loss": 0.64006162, + "learning_rate": 6.046947430586913e-08, + "loss": 0.66094398, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 15366, + "time_per_iteration": 2.4121859073638916 + }, + { + "auxiliary_loss_clip": 0.01051608, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.01407695, + "balance_loss_mlp": 1.01622248, + "epoch": 0.9239140237486848, + "flos": 21066975469440.0, + "grad_norm": 1.436527583184416, + "language_loss": 0.7575044, + "learning_rate": 6.037446700023619e-08, + "loss": 0.77839768, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 15367, + "time_per_iteration": 2.370427131652832 + }, + { + "auxiliary_loss_clip": 0.01049678, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.01048374, + "balance_loss_mlp": 1.01602423, + "epoch": 0.9239741470013527, + "flos": 24606784373760.0, + "grad_norm": 5.659362194928722, + "language_loss": 0.65844148, + "learning_rate": 6.027953324539759e-08, + "loss": 0.67925197, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.3359375, + "step": 15368, + "time_per_iteration": 2.385650157928467 + }, + { + "auxiliary_loss_clip": 0.01054669, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.01262116, + "balance_loss_mlp": 1.01659548, + "epoch": 0.9240342702540207, + "flos": 24717074958720.0, + "grad_norm": 1.9885917727497393, + "language_loss": 0.76530981, + "learning_rate": 6.018467304495401e-08, + "loss": 0.78622127, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.38085938, + "step": 15369, + "time_per_iteration": 2.430224895477295 + }, + { + "auxiliary_loss_clip": 0.0105561, + "auxiliary_loss_mlp": 0.01048351, + "balance_loss_clip": 1.02162457, + "balance_loss_mlp": 1.01739526, + "epoch": 0.9240943935066888, + "flos": 20848942828800.0, + "grad_norm": 1.8007990684839255, + "language_loss": 0.77750194, + "learning_rate": 6.008988640250145e-08, + "loss": 0.79854155, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3828125, + "step": 15370, + "time_per_iteration": 3.796916961669922 + }, + { + "auxiliary_loss_clip": 0.01052338, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.02164507, + "balance_loss_mlp": 1.01709318, + "epoch": 0.9241545167593567, + "flos": 24461161626240.0, + "grad_norm": 2.116223279410189, + "language_loss": 0.67919368, + "learning_rate": 5.999517332163528e-08, + "loss": 0.70015717, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 15371, + "time_per_iteration": 3.719233751296997 + }, + { + "auxiliary_loss_clip": 0.01007398, + "auxiliary_loss_mlp": 0.01002414, + "balance_loss_clip": 1.00036323, + "balance_loss_mlp": 1.00106251, + "epoch": 0.9242146400120247, + "flos": 61823951452800.0, + "grad_norm": 0.7266312487718359, + "language_loss": 0.5779953, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59809339, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06347656, + "step": 15372, + "time_per_iteration": 2.9654109477996826 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.01565123, + "balance_loss_mlp": 1.01654744, + "epoch": 0.9242747632646926, + "flos": 22047476492160.0, + "grad_norm": 2.228563726568278, + "language_loss": 0.71583098, + "learning_rate": 5.98059678590237e-08, + "loss": 0.73670709, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 15373, + "time_per_iteration": 2.3642055988311768 + }, + { + "auxiliary_loss_clip": 0.01051492, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.01821065, + "balance_loss_mlp": 1.01554477, + "epoch": 0.9243348865173606, + "flos": 18477362661120.0, + "grad_norm": 2.311385518942066, + "language_loss": 0.76251829, + "learning_rate": 5.971147548445299e-08, + "loss": 0.78345108, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 15374, + "time_per_iteration": 2.384127378463745 + }, + { + "auxiliary_loss_clip": 0.01052451, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.01348066, + "balance_loss_mlp": 1.01661873, + "epoch": 0.9243950097700285, + "flos": 23257635638400.0, + "grad_norm": 1.8588280215934128, + "language_loss": 0.66339254, + "learning_rate": 5.961705668581784e-08, + "loss": 0.68428171, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 15375, + "time_per_iteration": 2.3879010677337646 + }, + { + "auxiliary_loss_clip": 0.01052167, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.01307702, + "balance_loss_mlp": 1.0171628, + "epoch": 0.9244551330226966, + "flos": 29747884919040.0, + "grad_norm": 1.8533380822029064, + "language_loss": 0.67733037, + "learning_rate": 5.952271146669829e-08, + "loss": 0.69820321, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 15376, + "time_per_iteration": 2.501324415206909 + }, + { + "auxiliary_loss_clip": 0.01007069, + "auxiliary_loss_mlp": 0.01001488, + "balance_loss_clip": 0.99975967, + "balance_loss_mlp": 1.00057745, + "epoch": 0.9245152562753645, + "flos": 68861569029120.0, + "grad_norm": 0.6508527699562395, + "language_loss": 0.61250848, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63259411, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.06494141, + "step": 15377, + "time_per_iteration": 3.0804834365844727 + }, + { + "auxiliary_loss_clip": 0.01051927, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.01499808, + "balance_loss_mlp": 1.01598859, + "epoch": 0.9245753795280325, + "flos": 21578208641280.0, + "grad_norm": 1.9839719171774826, + "language_loss": 0.75093937, + "learning_rate": 5.933424178131341e-08, + "loss": 0.77183867, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 15378, + "time_per_iteration": 2.4066317081451416 + }, + { + "auxiliary_loss_clip": 0.01052139, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.01551628, + "balance_loss_mlp": 1.01612437, + "epoch": 0.9246355027807004, + "flos": 34494641124480.0, + "grad_norm": 2.048759237344714, + "language_loss": 0.63258094, + "learning_rate": 5.924011732219503e-08, + "loss": 0.65349805, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 15379, + "time_per_iteration": 2.5653374195098877 + }, + { + "auxiliary_loss_clip": 0.01051106, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.01742435, + "balance_loss_mlp": 1.01613283, + "epoch": 0.9246956260333684, + "flos": 15953142562560.0, + "grad_norm": 2.142279046918483, + "language_loss": 0.85817474, + "learning_rate": 5.914606645688591e-08, + "loss": 0.8790862, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 15380, + "time_per_iteration": 2.368302345275879 + }, + { + "auxiliary_loss_clip": 0.01053058, + "auxiliary_loss_mlp": 0.010415, + "balance_loss_clip": 1.01577497, + "balance_loss_mlp": 1.0154469, + "epoch": 0.9247557492860363, + "flos": 23367227996160.0, + "grad_norm": 1.6205207070230894, + "language_loss": 0.74485242, + "learning_rate": 5.905208918895233e-08, + "loss": 0.76579797, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 15381, + "time_per_iteration": 2.3691210746765137 + }, + { + "auxiliary_loss_clip": 0.01052675, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.01263928, + "balance_loss_mlp": 1.01701248, + "epoch": 0.9248158725387043, + "flos": 23038730213760.0, + "grad_norm": 1.8799708907307389, + "language_loss": 0.79396981, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.81485969, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 15382, + "time_per_iteration": 2.3785715103149414 + }, + { + "auxiliary_loss_clip": 0.01051708, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.01311588, + "balance_loss_mlp": 1.01570034, + "epoch": 0.9248759957913724, + "flos": 22521492288000.0, + "grad_norm": 1.6668081396769785, + "language_loss": 0.75908482, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77996784, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 15383, + "time_per_iteration": 3.8243911266326904 + }, + { + "auxiliary_loss_clip": 0.01049249, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.01226127, + "balance_loss_mlp": 1.01464558, + "epoch": 0.9249361190440403, + "flos": 25446096391680.0, + "grad_norm": 1.6934675354556872, + "language_loss": 0.76377821, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.78461659, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34570312, + "step": 15384, + "time_per_iteration": 2.408388137817383 + }, + { + "auxiliary_loss_clip": 0.01049118, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.01041365, + "balance_loss_mlp": 1.01541591, + "epoch": 0.9249962422967083, + "flos": 12378001495680.0, + "grad_norm": 1.9464963292939774, + "language_loss": 0.67370623, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.69451678, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33789062, + "step": 15385, + "time_per_iteration": 2.355088710784912 + }, + { + "auxiliary_loss_clip": 0.01049912, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.01550841, + "balance_loss_mlp": 1.01502538, + "epoch": 0.9250563655493762, + "flos": 22928334894720.0, + "grad_norm": 1.8035753415961924, + "language_loss": 0.80839694, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82927763, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34960938, + "step": 15386, + "time_per_iteration": 2.3712542057037354 + }, + { + "auxiliary_loss_clip": 0.01051051, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.01888633, + "balance_loss_mlp": 1.0165894, + "epoch": 0.9251164888020442, + "flos": 18477676863360.0, + "grad_norm": 3.367762621418299, + "language_loss": 0.77017665, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.79109681, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 15387, + "time_per_iteration": 2.3631856441497803 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.01219797, + "balance_loss_mlp": 1.01483572, + "epoch": 0.9251766120547121, + "flos": 33035655651840.0, + "grad_norm": 1.2868056173122322, + "language_loss": 0.70752704, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72833896, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33789062, + "step": 15388, + "time_per_iteration": 2.5433900356292725 + }, + { + "auxiliary_loss_clip": 0.01053419, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.0134176, + "balance_loss_mlp": 1.01689887, + "epoch": 0.9252367353073802, + "flos": 24386796696960.0, + "grad_norm": 1.741957707054808, + "language_loss": 0.82879961, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84970033, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36523438, + "step": 15389, + "time_per_iteration": 2.392090082168579 + }, + { + "auxiliary_loss_clip": 0.01054892, + "auxiliary_loss_mlp": 0.01041737, + "balance_loss_clip": 1.01635766, + "balance_loss_mlp": 1.01716936, + "epoch": 0.9252968585600481, + "flos": 18915836826240.0, + "grad_norm": 1.6914358741267246, + "language_loss": 0.80171055, + "learning_rate": 5.820960624653381e-08, + "loss": 0.82267684, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37890625, + "step": 15390, + "time_per_iteration": 2.4385111331939697 + }, + { + "auxiliary_loss_clip": 0.01052658, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.01450682, + "balance_loss_mlp": 1.01627254, + "epoch": 0.9253569818127161, + "flos": 21724285236480.0, + "grad_norm": 1.8153392875505523, + "language_loss": 0.77316666, + "learning_rate": 5.811636514789597e-08, + "loss": 0.79407406, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 15391, + "time_per_iteration": 2.5704450607299805 + }, + { + "auxiliary_loss_clip": 0.0105109, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.0104351, + "balance_loss_mlp": 1.0149945, + "epoch": 0.925417105065384, + "flos": 34238937260160.0, + "grad_norm": 2.890257705646562, + "language_loss": 0.54029715, + "learning_rate": 5.80231976856802e-08, + "loss": 0.56116194, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 15392, + "time_per_iteration": 2.5047607421875 + }, + { + "auxiliary_loss_clip": 0.01050254, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.01173282, + "balance_loss_mlp": 1.01487362, + "epoch": 0.925477228318052, + "flos": 25958307081600.0, + "grad_norm": 1.851322463352318, + "language_loss": 0.77963686, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.80047506, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 15393, + "time_per_iteration": 2.4171814918518066 + }, + { + "auxiliary_loss_clip": 0.01049875, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.01822543, + "balance_loss_mlp": 1.01510215, + "epoch": 0.9255373515707199, + "flos": 11837440915200.0, + "grad_norm": 1.7486199821786865, + "language_loss": 0.70659971, + "learning_rate": 5.783708368464357e-08, + "loss": 0.72750056, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 15394, + "time_per_iteration": 2.350252389907837 + }, + { + "auxiliary_loss_clip": 0.01052049, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_clip": 1.01796246, + "balance_loss_mlp": 1.01707721, + "epoch": 0.925597474823388, + "flos": 21433249209600.0, + "grad_norm": 5.38686037721917, + "language_loss": 0.7392469, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.76018071, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 15395, + "time_per_iteration": 2.392467498779297 + }, + { + "auxiliary_loss_clip": 0.01048765, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01081681, + "balance_loss_mlp": 1.01443768, + "epoch": 0.925657598076056, + "flos": 22856448672000.0, + "grad_norm": 2.2880539839914182, + "language_loss": 0.73003727, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.75084686, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34375, + "step": 15396, + "time_per_iteration": 2.382709264755249 + }, + { + "auxiliary_loss_clip": 0.01051195, + "auxiliary_loss_mlp": 0.01038829, + "balance_loss_clip": 1.01600039, + "balance_loss_mlp": 1.0159514, + "epoch": 0.9257177213287239, + "flos": 25702812685440.0, + "grad_norm": 1.7709636529205373, + "language_loss": 0.88119841, + "learning_rate": 5.755846504448603e-08, + "loss": 0.90209866, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 15397, + "time_per_iteration": 2.395958662033081 + }, + { + "auxiliary_loss_clip": 0.01006897, + "auxiliary_loss_mlp": 0.01002267, + "balance_loss_clip": 1.00027657, + "balance_loss_mlp": 1.00058401, + "epoch": 0.9257778445813919, + "flos": 59589929508480.0, + "grad_norm": 0.8069971643774221, + "language_loss": 0.55266517, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57275683, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06347656, + "step": 15398, + "time_per_iteration": 2.882063865661621 + }, + { + "auxiliary_loss_clip": 0.01056693, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.01146388, + "balance_loss_mlp": 1.01711762, + "epoch": 0.9258379678340598, + "flos": 27708188935680.0, + "grad_norm": 1.8838102242928636, + "language_loss": 0.77997482, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.80091643, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.39453125, + "step": 15399, + "time_per_iteration": 2.399207592010498 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.01354742, + "balance_loss_mlp": 1.01541042, + "epoch": 0.9258980910867278, + "flos": 24862383504000.0, + "grad_norm": 1.438230453056525, + "language_loss": 0.79200292, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.81284833, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 15400, + "time_per_iteration": 3.648439407348633 + }, + { + "auxiliary_loss_clip": 0.01007185, + "auxiliary_loss_mlp": 0.01003014, + "balance_loss_clip": 1.0010829, + "balance_loss_mlp": 1.00075936, + "epoch": 0.9259582143393957, + "flos": 63131414158080.0, + "grad_norm": 0.7146864517387549, + "language_loss": 0.5129751, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53307712, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06445312, + "step": 15401, + "time_per_iteration": 2.9916491508483887 + }, + { + "auxiliary_loss_clip": 0.01049646, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.01633239, + "balance_loss_mlp": 1.01626635, + "epoch": 0.9260183375920638, + "flos": 24126170330880.0, + "grad_norm": 2.118591962538782, + "language_loss": 0.83353126, + "learning_rate": 5.709557384259378e-08, + "loss": 0.85439849, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33398438, + "step": 15402, + "time_per_iteration": 2.405545711517334 + }, + { + "auxiliary_loss_clip": 0.0100681, + "auxiliary_loss_mlp": 0.01005714, + "balance_loss_clip": 1.00355637, + "balance_loss_mlp": 1.00042844, + "epoch": 0.9260784608447317, + "flos": 63039207657600.0, + "grad_norm": 0.7436769847149317, + "language_loss": 0.51193005, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53205532, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.06396484, + "step": 15403, + "time_per_iteration": 3.1323418617248535 + }, + { + "auxiliary_loss_clip": 0.01006666, + "auxiliary_loss_mlp": 0.01003017, + "balance_loss_clip": 1.00119352, + "balance_loss_mlp": 1.00041938, + "epoch": 0.9261385840973997, + "flos": 70582367854080.0, + "grad_norm": 0.6865232341886973, + "language_loss": 0.58810252, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60819936, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.0625, + "step": 15404, + "time_per_iteration": 3.0514886379241943 + }, + { + "auxiliary_loss_clip": 0.01052362, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.0112859, + "balance_loss_mlp": 1.01547813, + "epoch": 0.9261987073500676, + "flos": 20228885349120.0, + "grad_norm": 2.692854339325013, + "language_loss": 0.73773986, + "learning_rate": 5.681872319494596e-08, + "loss": 0.75862044, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 15405, + "time_per_iteration": 2.371661424636841 + }, + { + "auxiliary_loss_clip": 0.01054052, + "auxiliary_loss_mlp": 0.01047379, + "balance_loss_clip": 1.02294123, + "balance_loss_mlp": 1.01711726, + "epoch": 0.9262588306027356, + "flos": 20953263571200.0, + "grad_norm": 1.8054808332045595, + "language_loss": 0.69289929, + "learning_rate": 5.672658701232458e-08, + "loss": 0.71391356, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36914062, + "step": 15406, + "time_per_iteration": 2.37551212310791 + }, + { + "auxiliary_loss_clip": 0.01051057, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.01755476, + "balance_loss_mlp": 1.01511145, + "epoch": 0.9263189538554035, + "flos": 22157732165760.0, + "grad_norm": 2.610657567832657, + "language_loss": 0.77398837, + "learning_rate": 5.663452451882555e-08, + "loss": 0.79491568, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.359375, + "step": 15407, + "time_per_iteration": 2.3745970726013184 + }, + { + "auxiliary_loss_clip": 0.0105263, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.01649714, + "balance_loss_mlp": 1.01500833, + "epoch": 0.9263790771080715, + "flos": 18186221900160.0, + "grad_norm": 2.3106702256213913, + "language_loss": 0.73375511, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.75470471, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37695312, + "step": 15408, + "time_per_iteration": 2.3496766090393066 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.01317441, + "balance_loss_mlp": 1.0157671, + "epoch": 0.9264392003607396, + "flos": 48176718923520.0, + "grad_norm": 1.7205268593334944, + "language_loss": 0.69789755, + "learning_rate": 5.645062061315675e-08, + "loss": 0.71874136, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34375, + "step": 15409, + "time_per_iteration": 2.6112823486328125 + }, + { + "auxiliary_loss_clip": 0.01053583, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.01286268, + "balance_loss_mlp": 1.01664913, + "epoch": 0.9264993236134075, + "flos": 26388437431680.0, + "grad_norm": 3.9331721856542403, + "language_loss": 0.76717454, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.78810519, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36914062, + "step": 15410, + "time_per_iteration": 5.315945625305176 + }, + { + "auxiliary_loss_clip": 0.01052609, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.01167285, + "balance_loss_mlp": 1.01622713, + "epoch": 0.9265594468660755, + "flos": 20919118394880.0, + "grad_norm": 1.598637739146034, + "language_loss": 0.82640588, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84728587, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36328125, + "step": 15411, + "time_per_iteration": 2.383453607559204 + }, + { + "auxiliary_loss_clip": 0.01053525, + "auxiliary_loss_mlp": 0.01035597, + "balance_loss_clip": 1.01320982, + "balance_loss_mlp": 1.01812458, + "epoch": 0.9266195701187434, + "flos": 17524199099520.0, + "grad_norm": 1.7979583270701456, + "language_loss": 0.762218, + "learning_rate": 5.617531751025728e-08, + "loss": 0.78310925, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 15412, + "time_per_iteration": 2.374799966812134 + }, + { + "auxiliary_loss_clip": 0.01051711, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.01329708, + "balance_loss_mlp": 1.01530325, + "epoch": 0.9266796933714114, + "flos": 33687449424000.0, + "grad_norm": 1.622815615234456, + "language_loss": 0.67649156, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69737196, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 15413, + "time_per_iteration": 2.483412265777588 + }, + { + "auxiliary_loss_clip": 0.01052695, + "auxiliary_loss_mlp": 0.0103893, + "balance_loss_clip": 1.01519489, + "balance_loss_mlp": 1.01579916, + "epoch": 0.9267398166240793, + "flos": 18915522624000.0, + "grad_norm": 1.6259275129236803, + "language_loss": 0.76539111, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78630739, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3671875, + "step": 15414, + "time_per_iteration": 2.3618295192718506 + }, + { + "auxiliary_loss_clip": 0.01052381, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.00990438, + "balance_loss_mlp": 1.01657212, + "epoch": 0.9267999398767474, + "flos": 20478095700480.0, + "grad_norm": 2.2922434583233007, + "language_loss": 0.82255268, + "learning_rate": 5.59006777975819e-08, + "loss": 0.84339368, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35742188, + "step": 15415, + "time_per_iteration": 2.347046375274658 + }, + { + "auxiliary_loss_clip": 0.01052282, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.01394975, + "balance_loss_mlp": 1.01634526, + "epoch": 0.9268600631294153, + "flos": 24788228042880.0, + "grad_norm": 1.575483710711171, + "language_loss": 0.55063933, + "learning_rate": 5.580927866294671e-08, + "loss": 0.57152951, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 15416, + "time_per_iteration": 2.416163921356201 + }, + { + "auxiliary_loss_clip": 0.01050656, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.01168585, + "balance_loss_mlp": 1.01586938, + "epoch": 0.9269201863820833, + "flos": 18696198263040.0, + "grad_norm": 1.9862017910436238, + "language_loss": 0.72507811, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74592674, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 15417, + "time_per_iteration": 2.3417580127716064 + }, + { + "auxiliary_loss_clip": 0.0105163, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.0132103, + "balance_loss_mlp": 1.01630807, + "epoch": 0.9269803096347512, + "flos": 20922923733120.0, + "grad_norm": 2.0874330332415933, + "language_loss": 0.76749027, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.7883662, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 15418, + "time_per_iteration": 2.3812272548675537 + }, + { + "auxiliary_loss_clip": 0.01050027, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.01098108, + "balance_loss_mlp": 1.01563239, + "epoch": 0.9270404328874192, + "flos": 28001424378240.0, + "grad_norm": 1.4662870990666435, + "language_loss": 0.77172709, + "learning_rate": 5.553552361633174e-08, + "loss": 0.79256773, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34375, + "step": 15419, + "time_per_iteration": 2.447937488555908 + }, + { + "auxiliary_loss_clip": 0.01047821, + "auxiliary_loss_mlp": 0.01038446, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.01493824, + "epoch": 0.9271005561400871, + "flos": 25888550451840.0, + "grad_norm": 1.6392138078250358, + "language_loss": 0.76819646, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78905916, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.328125, + "step": 15420, + "time_per_iteration": 2.450503349304199 + }, + { + "auxiliary_loss_clip": 0.01053285, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.01290464, + "balance_loss_mlp": 1.01628911, + "epoch": 0.9271606793927551, + "flos": 27052659648000.0, + "grad_norm": 1.613134989320184, + "language_loss": 0.77285206, + "learning_rate": 5.535338891759389e-08, + "loss": 0.79374957, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 15421, + "time_per_iteration": 2.4176900386810303 + }, + { + "auxiliary_loss_clip": 0.01052022, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.01200438, + "balance_loss_mlp": 1.01577652, + "epoch": 0.9272208026454232, + "flos": 26208774241920.0, + "grad_norm": 2.21728234999961, + "language_loss": 0.73905486, + "learning_rate": 5.526243217829041e-08, + "loss": 0.75991458, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.36328125, + "step": 15422, + "time_per_iteration": 3.849534273147583 + }, + { + "auxiliary_loss_clip": 0.01052487, + "auxiliary_loss_mlp": 0.01043074, + "balance_loss_clip": 1.01906562, + "balance_loss_mlp": 1.01595449, + "epoch": 0.9272809258980911, + "flos": 12457638040320.0, + "grad_norm": 2.1981632777477604, + "language_loss": 0.79276204, + "learning_rate": 5.517154918363065e-08, + "loss": 0.8137176, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 15423, + "time_per_iteration": 2.3573899269104004 + }, + { + "auxiliary_loss_clip": 0.01051631, + "auxiliary_loss_mlp": 0.01040397, + "balance_loss_clip": 1.01644802, + "balance_loss_mlp": 1.01537919, + "epoch": 0.9273410491507591, + "flos": 22855785356160.0, + "grad_norm": 1.732772081427573, + "language_loss": 0.76249588, + "learning_rate": 5.508073993706053e-08, + "loss": 0.78341615, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 15424, + "time_per_iteration": 2.3975846767425537 + }, + { + "auxiliary_loss_clip": 0.01006741, + "auxiliary_loss_mlp": 0.01002494, + "balance_loss_clip": 1.0004437, + "balance_loss_mlp": 1.00037718, + "epoch": 0.927401172403427, + "flos": 47662621153920.0, + "grad_norm": 0.7776832360495015, + "language_loss": 0.60772258, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62781495, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06347656, + "step": 15425, + "time_per_iteration": 2.868772506713867 + }, + { + "auxiliary_loss_clip": 0.01052059, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.01656365, + "balance_loss_mlp": 1.01575065, + "epoch": 0.927461295656095, + "flos": 29971049529600.0, + "grad_norm": 1.400017919067707, + "language_loss": 0.71744585, + "learning_rate": 5.489934270196106e-08, + "loss": 0.73836625, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 15426, + "time_per_iteration": 2.455390691757202 + }, + { + "auxiliary_loss_clip": 0.01051395, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.01120079, + "balance_loss_mlp": 1.01630795, + "epoch": 0.9275214189087629, + "flos": 20374403362560.0, + "grad_norm": 1.9270698068457937, + "language_loss": 0.83923584, + "learning_rate": 5.480875472030977e-08, + "loss": 0.86007482, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 15427, + "time_per_iteration": 2.353458881378174 + }, + { + "auxiliary_loss_clip": 0.01051556, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.01169932, + "balance_loss_mlp": 1.01622701, + "epoch": 0.927581542161431, + "flos": 22382083762560.0, + "grad_norm": 1.4449250408100502, + "language_loss": 0.77626657, + "learning_rate": 5.471824050050555e-08, + "loss": 0.7971307, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35351562, + "step": 15428, + "time_per_iteration": 2.3825225830078125 + }, + { + "auxiliary_loss_clip": 0.01050374, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.01087594, + "balance_loss_mlp": 1.01513839, + "epoch": 0.9276416654140989, + "flos": 23951289997440.0, + "grad_norm": 1.936789431694983, + "language_loss": 0.75383145, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.77467829, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 15429, + "time_per_iteration": 2.3745245933532715 + }, + { + "auxiliary_loss_clip": 0.01049217, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.01180911, + "balance_loss_mlp": 1.01507974, + "epoch": 0.9277017886667669, + "flos": 13916867892480.0, + "grad_norm": 1.9235720600984634, + "language_loss": 0.75799656, + "learning_rate": 5.45374333601647e-08, + "loss": 0.7788204, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34179688, + "step": 15430, + "time_per_iteration": 2.3502449989318848 + }, + { + "auxiliary_loss_clip": 0.01050917, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.01139927, + "balance_loss_mlp": 1.01530695, + "epoch": 0.9277619119194348, + "flos": 35664929631360.0, + "grad_norm": 1.4470820441503982, + "language_loss": 0.77265114, + "learning_rate": 5.444714044648391e-08, + "loss": 0.79351801, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 15431, + "time_per_iteration": 2.498563766479492 + }, + { + "auxiliary_loss_clip": 0.01051247, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.01184237, + "balance_loss_mlp": 1.0165565, + "epoch": 0.9278220351721028, + "flos": 23840126628480.0, + "grad_norm": 1.6577370778793363, + "language_loss": 0.72058022, + "learning_rate": 5.4356921308363e-08, + "loss": 0.74142075, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34765625, + "step": 15432, + "time_per_iteration": 2.4000725746154785 + }, + { + "auxiliary_loss_clip": 0.0105214, + "auxiliary_loss_mlp": 0.01037562, + "balance_loss_clip": 1.01463842, + "balance_loss_mlp": 1.01602364, + "epoch": 0.9278821584247707, + "flos": 15227332974720.0, + "grad_norm": 2.3157342044341527, + "language_loss": 0.84305406, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.86395109, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36132812, + "step": 15433, + "time_per_iteration": 2.3293073177337646 + }, + { + "auxiliary_loss_clip": 0.01049308, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.01110744, + "balance_loss_mlp": 1.01581335, + "epoch": 0.9279422816774388, + "flos": 24680241607680.0, + "grad_norm": 1.8396776033261326, + "language_loss": 0.67429137, + "learning_rate": 5.417670437248056e-08, + "loss": 0.69510651, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33398438, + "step": 15434, + "time_per_iteration": 2.426029682159424 + }, + { + "auxiliary_loss_clip": 0.01047835, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.01221085, + "balance_loss_mlp": 1.01455641, + "epoch": 0.9280024049301068, + "flos": 19168259022720.0, + "grad_norm": 1.691954205271849, + "language_loss": 0.69450068, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71530473, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33203125, + "step": 15435, + "time_per_iteration": 2.3532016277313232 + }, + { + "auxiliary_loss_clip": 0.01052078, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.01176727, + "balance_loss_mlp": 1.01618838, + "epoch": 0.9280625281827747, + "flos": 11393101641600.0, + "grad_norm": 1.8372593274515352, + "language_loss": 0.7405771, + "learning_rate": 5.399678257985263e-08, + "loss": 0.76144767, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 15436, + "time_per_iteration": 2.3422205448150635 + }, + { + "auxiliary_loss_clip": 0.01051998, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.00977874, + "balance_loss_mlp": 1.01671624, + "epoch": 0.9281226514354427, + "flos": 24784597261440.0, + "grad_norm": 3.0381304843181396, + "language_loss": 0.67702329, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69787878, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35351562, + "step": 15437, + "time_per_iteration": 2.427947521209717 + }, + { + "auxiliary_loss_clip": 0.01052774, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.01272464, + "balance_loss_mlp": 1.01633668, + "epoch": 0.9281827746881106, + "flos": 15082303720320.0, + "grad_norm": 2.251650347489714, + "language_loss": 0.72843397, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.74935418, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36328125, + "step": 15438, + "time_per_iteration": 2.3529088497161865 + }, + { + "auxiliary_loss_clip": 0.01051463, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.01072168, + "balance_loss_mlp": 1.0161643, + "epoch": 0.9282428979407786, + "flos": 24133885741440.0, + "grad_norm": 2.5923800753187884, + "language_loss": 0.65576768, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.67661572, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15439, + "time_per_iteration": 3.6783390045166016 + }, + { + "auxiliary_loss_clip": 0.01050968, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0114336, + "balance_loss_mlp": 1.01653361, + "epoch": 0.9283030211934465, + "flos": 24822163751040.0, + "grad_norm": 1.7290096187845199, + "language_loss": 0.71320701, + "learning_rate": 5.363782453347876e-08, + "loss": 0.73406005, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 15440, + "time_per_iteration": 2.4173314571380615 + }, + { + "auxiliary_loss_clip": 0.01052674, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.01304746, + "balance_loss_mlp": 1.01614499, + "epoch": 0.9283631444461146, + "flos": 23980093735680.0, + "grad_norm": 1.6792720650071553, + "language_loss": 0.77824235, + "learning_rate": 5.354826952900682e-08, + "loss": 0.79914767, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 15441, + "time_per_iteration": 2.407503128051758 + }, + { + "auxiliary_loss_clip": 0.01047569, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.01356542, + "balance_loss_mlp": 1.01489997, + "epoch": 0.9284232676987825, + "flos": 22783410374400.0, + "grad_norm": 1.8422604609753301, + "language_loss": 0.64768559, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66849983, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.32617188, + "step": 15442, + "time_per_iteration": 2.394751787185669 + }, + { + "auxiliary_loss_clip": 0.01053161, + "auxiliary_loss_mlp": 0.01040687, + "balance_loss_clip": 1.01833582, + "balance_loss_mlp": 1.01589394, + "epoch": 0.9284833909514505, + "flos": 19499479891200.0, + "grad_norm": 1.9244006890329577, + "language_loss": 0.81939209, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.8403306, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.37109375, + "step": 15443, + "time_per_iteration": 2.3707332611083984 + }, + { + "auxiliary_loss_clip": 0.01053042, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.01221609, + "balance_loss_mlp": 1.0162189, + "epoch": 0.9285435142041184, + "flos": 23184841720320.0, + "grad_norm": 2.4742626385671653, + "language_loss": 0.66415119, + "learning_rate": 5.328004738702896e-08, + "loss": 0.68503714, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 15444, + "time_per_iteration": 2.395737648010254 + }, + { + "auxiliary_loss_clip": 0.0105266, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_clip": 1.0143168, + "balance_loss_mlp": 1.0159744, + "epoch": 0.9286036374567864, + "flos": 17674569792000.0, + "grad_norm": 1.9684637082017338, + "language_loss": 0.7422958, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.76319194, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3671875, + "step": 15445, + "time_per_iteration": 2.3202152252197266 + }, + { + "auxiliary_loss_clip": 0.01051036, + "auxiliary_loss_mlp": 0.01039169, + "balance_loss_clip": 1.01607811, + "balance_loss_mlp": 1.01639783, + "epoch": 0.9286637607094543, + "flos": 20885636534400.0, + "grad_norm": 2.167659046385479, + "language_loss": 0.72436911, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.74527121, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34765625, + "step": 15446, + "time_per_iteration": 2.3781802654266357 + }, + { + "auxiliary_loss_clip": 0.01054213, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.0133462, + "balance_loss_mlp": 1.01685739, + "epoch": 0.9287238839621224, + "flos": 19025010247680.0, + "grad_norm": 1.72471336507201, + "language_loss": 0.70301461, + "learning_rate": 5.301248962337523e-08, + "loss": 0.7239393, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 15447, + "time_per_iteration": 2.3611490726470947 + }, + { + "auxiliary_loss_clip": 0.01048519, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.01128793, + "balance_loss_mlp": 1.0158093, + "epoch": 0.9287840072147904, + "flos": 20556021588480.0, + "grad_norm": 1.6207484922561115, + "language_loss": 0.73703748, + "learning_rate": 5.292345135757403e-08, + "loss": 0.75784272, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.328125, + "step": 15448, + "time_per_iteration": 2.3943426609039307 + }, + { + "auxiliary_loss_clip": 0.01050945, + "auxiliary_loss_mlp": 0.01038403, + "balance_loss_clip": 1.01471615, + "balance_loss_mlp": 1.01533377, + "epoch": 0.9288441304674583, + "flos": 21249780681600.0, + "grad_norm": 1.5729282866943668, + "language_loss": 0.7602458, + "learning_rate": 5.283448692511072e-08, + "loss": 0.78113925, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 15449, + "time_per_iteration": 2.37914776802063 + }, + { + "auxiliary_loss_clip": 0.0105247, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.0116384, + "balance_loss_mlp": 1.01628864, + "epoch": 0.9289042537201263, + "flos": 27668702321280.0, + "grad_norm": 2.012790129958376, + "language_loss": 0.68962002, + "learning_rate": 5.27455963293586e-08, + "loss": 0.71050632, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36328125, + "step": 15450, + "time_per_iteration": 5.244949102401733 + }, + { + "auxiliary_loss_clip": 0.01052047, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.01250684, + "balance_loss_mlp": 1.01602376, + "epoch": 0.9289643769727942, + "flos": 19316744501760.0, + "grad_norm": 1.9220194862768503, + "language_loss": 0.73279798, + "learning_rate": 5.265677957368875e-08, + "loss": 0.75368154, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 15451, + "time_per_iteration": 2.3459155559539795 + }, + { + "auxiliary_loss_clip": 0.01053247, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.01883924, + "balance_loss_mlp": 1.01703644, + "epoch": 0.9290245002254622, + "flos": 14057358670080.0, + "grad_norm": 2.308863464216243, + "language_loss": 0.74242461, + "learning_rate": 5.25680366614687e-08, + "loss": 0.76337183, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 15452, + "time_per_iteration": 2.3372955322265625 + }, + { + "auxiliary_loss_clip": 0.01052553, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.01599765, + "balance_loss_mlp": 1.0175612, + "epoch": 0.9290846234781301, + "flos": 20046115048320.0, + "grad_norm": 1.8377458751352682, + "language_loss": 0.74658811, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76751596, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.34960938, + "step": 15453, + "time_per_iteration": 2.3649237155914307 + }, + { + "auxiliary_loss_clip": 0.01006861, + "auxiliary_loss_mlp": 0.0100231, + "balance_loss_clip": 1.00021207, + "balance_loss_mlp": 1.00043845, + "epoch": 0.9291447467307982, + "flos": 61224668098560.0, + "grad_norm": 0.8204203078681165, + "language_loss": 0.6079185, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62801015, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06445312, + "step": 15454, + "time_per_iteration": 2.9322316646575928 + }, + { + "auxiliary_loss_clip": 0.0105192, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.01224661, + "balance_loss_mlp": 1.01567888, + "epoch": 0.9292048699834661, + "flos": 20552425718400.0, + "grad_norm": 1.8884284224132608, + "language_loss": 0.6967715, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71764594, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 15455, + "time_per_iteration": 2.3759891986846924 + }, + { + "auxiliary_loss_clip": 0.01053089, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.01463628, + "balance_loss_mlp": 1.0171392, + "epoch": 0.9292649932361341, + "flos": 23622512924160.0, + "grad_norm": 2.1821020734168965, + "language_loss": 0.65686917, + "learning_rate": 5.22138035143509e-08, + "loss": 0.67778456, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 15456, + "time_per_iteration": 2.381910562515259 + }, + { + "auxiliary_loss_clip": 0.01049633, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.01147258, + "balance_loss_mlp": 1.015504, + "epoch": 0.929325116488802, + "flos": 15009125777280.0, + "grad_norm": 1.754484817731398, + "language_loss": 0.69709206, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.71793151, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34179688, + "step": 15457, + "time_per_iteration": 2.3438079357147217 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.01422501, + "balance_loss_mlp": 1.01548874, + "epoch": 0.92938523974147, + "flos": 17966408780160.0, + "grad_norm": 2.053932063715907, + "language_loss": 0.81433213, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83522177, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.36328125, + "step": 15458, + "time_per_iteration": 2.3426055908203125 + }, + { + "auxiliary_loss_clip": 0.01050946, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.01631343, + "balance_loss_mlp": 1.01505303, + "epoch": 0.9294453629941379, + "flos": 23001931774080.0, + "grad_norm": 1.6218502594545223, + "language_loss": 0.73112679, + "learning_rate": 5.194890417485065e-08, + "loss": 0.75202674, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 15459, + "time_per_iteration": 2.3789749145507812 + }, + { + "auxiliary_loss_clip": 0.01053102, + "auxiliary_loss_mlp": 0.01039881, + "balance_loss_clip": 1.01593137, + "balance_loss_mlp": 1.01700044, + "epoch": 0.929505486246806, + "flos": 17054302844160.0, + "grad_norm": 2.3491222574455177, + "language_loss": 0.60239172, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.62332153, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36132812, + "step": 15460, + "time_per_iteration": 2.3468258380889893 + }, + { + "auxiliary_loss_clip": 0.0105212, + "auxiliary_loss_mlp": 0.01042159, + "balance_loss_clip": 1.01887751, + "balance_loss_mlp": 1.01622748, + "epoch": 0.9295656094994739, + "flos": 27339296843520.0, + "grad_norm": 1.7414471158311102, + "language_loss": 0.81421041, + "learning_rate": 5.177267396106733e-08, + "loss": 0.8351531, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 15461, + "time_per_iteration": 2.398174524307251 + }, + { + "auxiliary_loss_clip": 0.01050217, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.01376188, + "balance_loss_mlp": 1.01548851, + "epoch": 0.9296257327521419, + "flos": 21469873092480.0, + "grad_norm": 1.771857487705492, + "language_loss": 0.78811288, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80895698, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.34765625, + "step": 15462, + "time_per_iteration": 3.812796115875244 + }, + { + "auxiliary_loss_clip": 0.01051213, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.01995707, + "balance_loss_mlp": 1.0152148, + "epoch": 0.9296858560048099, + "flos": 16361730737280.0, + "grad_norm": 1.9239114206061534, + "language_loss": 0.63584721, + "learning_rate": 5.159673925518282e-08, + "loss": 0.65678644, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 15463, + "time_per_iteration": 2.3739259243011475 + }, + { + "auxiliary_loss_clip": 0.01050156, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.01322591, + "balance_loss_mlp": 1.01522708, + "epoch": 0.9297459792574778, + "flos": 29857407454080.0, + "grad_norm": 2.2071802281479695, + "language_loss": 0.72058141, + "learning_rate": 5.15088827260437e-08, + "loss": 0.74142051, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34960938, + "step": 15464, + "time_per_iteration": 2.450702428817749 + }, + { + "auxiliary_loss_clip": 0.01050896, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.00942338, + "balance_loss_mlp": 1.01548052, + "epoch": 0.9298061025101458, + "flos": 15923919888000.0, + "grad_norm": 1.8907641416665046, + "language_loss": 0.78243697, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.80326664, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15465, + "time_per_iteration": 2.4018173217773438 + }, + { + "auxiliary_loss_clip": 0.01006766, + "auxiliary_loss_mlp": 0.01003059, + "balance_loss_clip": 1.00102043, + "balance_loss_mlp": 1.00034046, + "epoch": 0.9298662257628137, + "flos": 64093410293760.0, + "grad_norm": 0.6994327217301708, + "language_loss": 0.56484145, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58493966, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06445312, + "step": 15466, + "time_per_iteration": 3.115100145339966 + }, + { + "auxiliary_loss_clip": 0.0105174, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.01706612, + "balance_loss_mlp": 1.01554751, + "epoch": 0.9299263490154818, + "flos": 24279054641280.0, + "grad_norm": 1.4324275263939712, + "language_loss": 0.7357024, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.75664008, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36132812, + "step": 15467, + "time_per_iteration": 2.4122750759124756 + }, + { + "auxiliary_loss_clip": 0.01052888, + "auxiliary_loss_mlp": 0.0103995, + "balance_loss_clip": 1.01555967, + "balance_loss_mlp": 1.0161891, + "epoch": 0.9299864722681497, + "flos": 23293247091840.0, + "grad_norm": 1.686358431062476, + "language_loss": 0.72654349, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.74747187, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 15468, + "time_per_iteration": 2.3808610439300537 + }, + { + "auxiliary_loss_clip": 0.01052182, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.01506948, + "balance_loss_mlp": 1.01532805, + "epoch": 0.9300465955208177, + "flos": 21394949581440.0, + "grad_norm": 1.647491070198712, + "language_loss": 0.75796723, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77889043, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 15469, + "time_per_iteration": 2.3883211612701416 + }, + { + "auxiliary_loss_clip": 0.01050792, + "auxiliary_loss_mlp": 0.01038265, + "balance_loss_clip": 1.01491213, + "balance_loss_mlp": 1.01561153, + "epoch": 0.9301067187734856, + "flos": 24570300136320.0, + "grad_norm": 1.9342002794579205, + "language_loss": 0.77367383, + "learning_rate": 5.098329529416379e-08, + "loss": 0.79456437, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3515625, + "step": 15470, + "time_per_iteration": 2.506520986557007 + }, + { + "auxiliary_loss_clip": 0.01051244, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.01247561, + "balance_loss_mlp": 1.01566839, + "epoch": 0.9301668420261536, + "flos": 22195961971200.0, + "grad_norm": 1.570254779238394, + "language_loss": 0.75279051, + "learning_rate": 5.089595604367902e-08, + "loss": 0.77365494, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 15471, + "time_per_iteration": 2.3790082931518555 + }, + { + "auxiliary_loss_clip": 0.0105168, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.01423883, + "balance_loss_mlp": 1.01570725, + "epoch": 0.9302269652788215, + "flos": 17746700394240.0, + "grad_norm": 2.1505279292796176, + "language_loss": 0.70745623, + "learning_rate": 5.080869070341487e-08, + "loss": 0.72834051, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 15472, + "time_per_iteration": 2.3207974433898926 + }, + { + "auxiliary_loss_clip": 0.01048029, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.01176882, + "balance_loss_mlp": 1.01496065, + "epoch": 0.9302870885314896, + "flos": 19389782799360.0, + "grad_norm": 2.000038301420985, + "language_loss": 0.90017295, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.92098874, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.33007812, + "step": 15473, + "time_per_iteration": 2.375072717666626 + }, + { + "auxiliary_loss_clip": 0.01054582, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_clip": 1.01915145, + "balance_loss_mlp": 1.01710236, + "epoch": 0.9303472117841575, + "flos": 21759268285440.0, + "grad_norm": 2.3435289317648103, + "language_loss": 0.66395676, + "learning_rate": 5.063438176678203e-08, + "loss": 0.68494934, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 15474, + "time_per_iteration": 2.37300705909729 + }, + { + "auxiliary_loss_clip": 0.01051597, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.01398849, + "balance_loss_mlp": 1.01606774, + "epoch": 0.9304073350368255, + "flos": 19608723135360.0, + "grad_norm": 1.7745935611833767, + "language_loss": 0.75614184, + "learning_rate": 5.054733817702339e-08, + "loss": 0.77702391, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 15475, + "time_per_iteration": 2.3689987659454346 + }, + { + "auxiliary_loss_clip": 0.010506, + "auxiliary_loss_mlp": 0.01037715, + "balance_loss_clip": 1.01448107, + "balance_loss_mlp": 1.01575041, + "epoch": 0.9304674582894935, + "flos": 30440387203200.0, + "grad_norm": 1.8091250871278772, + "language_loss": 0.68238896, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.7032721, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 15476, + "time_per_iteration": 2.4600582122802734 + }, + { + "auxiliary_loss_clip": 0.01051062, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.015486, + "balance_loss_mlp": 1.01528168, + "epoch": 0.9305275815421614, + "flos": 17784720731520.0, + "grad_norm": 5.591629258092099, + "language_loss": 0.70238221, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.72329056, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35742188, + "step": 15477, + "time_per_iteration": 2.4136195182800293 + }, + { + "auxiliary_loss_clip": 0.01050831, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.0145539, + "balance_loss_mlp": 1.0162127, + "epoch": 0.9305877047948294, + "flos": 25297366533120.0, + "grad_norm": 1.8099178812081842, + "language_loss": 0.59518814, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.61606061, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 15478, + "time_per_iteration": 2.442432165145874 + }, + { + "auxiliary_loss_clip": 0.01055394, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.0130446, + "balance_loss_mlp": 1.01712441, + "epoch": 0.9306478280474973, + "flos": 16976446778880.0, + "grad_norm": 1.907385694348522, + "language_loss": 0.80931556, + "learning_rate": 5.01999030853566e-08, + "loss": 0.83025301, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 15479, + "time_per_iteration": 3.6370887756347656 + }, + { + "auxiliary_loss_clip": 0.01050879, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.01452422, + "balance_loss_mlp": 1.01534796, + "epoch": 0.9307079513001654, + "flos": 35661892343040.0, + "grad_norm": 1.7077334036133218, + "language_loss": 0.6996218, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.72050703, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 15480, + "time_per_iteration": 2.5033771991729736 + }, + { + "auxiliary_loss_clip": 0.01052789, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.01558399, + "balance_loss_mlp": 1.01650691, + "epoch": 0.9307680745528333, + "flos": 19207152144000.0, + "grad_norm": 1.5716652693733677, + "language_loss": 0.68526614, + "learning_rate": 5.002662914604583e-08, + "loss": 0.70618427, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36132812, + "step": 15481, + "time_per_iteration": 2.3965609073638916 + }, + { + "auxiliary_loss_clip": 0.01048885, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.01231027, + "balance_loss_mlp": 1.01491022, + "epoch": 0.9308281978055013, + "flos": 19061634130560.0, + "grad_norm": 3.0315172047708465, + "language_loss": 0.75629687, + "learning_rate": 4.994010308952701e-08, + "loss": 0.77712148, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 15482, + "time_per_iteration": 2.3873937129974365 + }, + { + "auxiliary_loss_clip": 0.01050605, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.0153656, + "balance_loss_mlp": 1.01605952, + "epoch": 0.9308883210581692, + "flos": 20520514869120.0, + "grad_norm": 3.7453203508151405, + "language_loss": 0.81255436, + "learning_rate": 4.985365097947469e-08, + "loss": 0.83342552, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34570312, + "step": 15483, + "time_per_iteration": 2.3916823863983154 + }, + { + "auxiliary_loss_clip": 0.01052425, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_clip": 1.02107823, + "balance_loss_mlp": 1.01626444, + "epoch": 0.9309484443108372, + "flos": 13000712238720.0, + "grad_norm": 1.8868584985550634, + "language_loss": 0.75895619, + "learning_rate": 4.976727281916782e-08, + "loss": 0.77991021, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 15484, + "time_per_iteration": 2.3744494915008545 + }, + { + "auxiliary_loss_clip": 0.01054065, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.01506734, + "balance_loss_mlp": 1.01716113, + "epoch": 0.9310085675635051, + "flos": 12566951107200.0, + "grad_norm": 2.10416124334466, + "language_loss": 0.77949762, + "learning_rate": 4.968096861188087e-08, + "loss": 0.80042869, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36914062, + "step": 15485, + "time_per_iteration": 2.3867828845977783 + }, + { + "auxiliary_loss_clip": 0.01052783, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.01279306, + "balance_loss_mlp": 1.01599753, + "epoch": 0.9310686908161732, + "flos": 23476436328960.0, + "grad_norm": 1.873990051427053, + "language_loss": 0.7936722, + "learning_rate": 4.959473836088723e-08, + "loss": 0.81457865, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 15486, + "time_per_iteration": 2.4315803050994873 + }, + { + "auxiliary_loss_clip": 0.01053486, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.01652527, + "balance_loss_mlp": 1.01726902, + "epoch": 0.9311288140688411, + "flos": 24169148081280.0, + "grad_norm": 1.9010512856590633, + "language_loss": 0.78689325, + "learning_rate": 4.950858206945674e-08, + "loss": 0.80783963, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36328125, + "step": 15487, + "time_per_iteration": 2.393742561340332 + }, + { + "auxiliary_loss_clip": 0.01050485, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.00867748, + "balance_loss_mlp": 1.01531672, + "epoch": 0.9311889373215091, + "flos": 35588749311360.0, + "grad_norm": 2.2632890331940114, + "language_loss": 0.6908502, + "learning_rate": 4.942249974085633e-08, + "loss": 0.71166945, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 15488, + "time_per_iteration": 2.4751522541046143 + }, + { + "auxiliary_loss_clip": 0.0104843, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.01263237, + "balance_loss_mlp": 1.0155654, + "epoch": 0.9312490605741771, + "flos": 20229478842240.0, + "grad_norm": 1.880765877155819, + "language_loss": 0.76407802, + "learning_rate": 4.933649137834983e-08, + "loss": 0.78490722, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.328125, + "step": 15489, + "time_per_iteration": 2.4068539142608643 + }, + { + "auxiliary_loss_clip": 0.01052996, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.01196575, + "balance_loss_mlp": 1.01618361, + "epoch": 0.931309183826845, + "flos": 13949826082560.0, + "grad_norm": 2.1868058784293267, + "language_loss": 0.82385576, + "learning_rate": 4.925055698519931e-08, + "loss": 0.84474099, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 15490, + "time_per_iteration": 5.259932041168213 + }, + { + "auxiliary_loss_clip": 0.010526, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.0122385, + "balance_loss_mlp": 1.01607871, + "epoch": 0.931369307079513, + "flos": 20155707406080.0, + "grad_norm": 1.5225258937874817, + "language_loss": 0.73487014, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.75576341, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36523438, + "step": 15491, + "time_per_iteration": 2.374633312225342 + }, + { + "auxiliary_loss_clip": 0.01048525, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.01320481, + "balance_loss_mlp": 1.01456475, + "epoch": 0.931429430332181, + "flos": 25337376817920.0, + "grad_norm": 2.0594830599494025, + "language_loss": 0.75429034, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.77511287, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33984375, + "step": 15492, + "time_per_iteration": 2.4147138595581055 + }, + { + "auxiliary_loss_clip": 0.01006885, + "auxiliary_loss_mlp": 0.01002831, + "balance_loss_clip": 1.00079286, + "balance_loss_mlp": 1.00066495, + "epoch": 0.931489553584849, + "flos": 71223024902400.0, + "grad_norm": 0.7108712739598405, + "language_loss": 0.5348987, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55499583, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06201172, + "step": 15493, + "time_per_iteration": 2.886294364929199 + }, + { + "auxiliary_loss_clip": 0.01049918, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.01337683, + "balance_loss_mlp": 1.01562214, + "epoch": 0.9315496768375169, + "flos": 14642886948480.0, + "grad_norm": 1.5901055419351053, + "language_loss": 0.7196362, + "learning_rate": 4.890755917128531e-08, + "loss": 0.74049079, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 15494, + "time_per_iteration": 2.3536510467529297 + }, + { + "auxiliary_loss_clip": 0.01053187, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.01169348, + "balance_loss_mlp": 1.01631606, + "epoch": 0.9316098000901849, + "flos": 28328665351680.0, + "grad_norm": 1.6859262488960933, + "language_loss": 0.69579858, + "learning_rate": 4.882199467373671e-08, + "loss": 0.71667773, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 15495, + "time_per_iteration": 2.420713186264038 + }, + { + "auxiliary_loss_clip": 0.01049679, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.01442039, + "balance_loss_mlp": 1.015172, + "epoch": 0.9316699233428528, + "flos": 28511400741120.0, + "grad_norm": 1.8577740677898036, + "language_loss": 0.6315192, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.65238327, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 15496, + "time_per_iteration": 2.437195301055908 + }, + { + "auxiliary_loss_clip": 0.01049935, + "auxiliary_loss_mlp": 0.01040626, + "balance_loss_clip": 1.01679564, + "balance_loss_mlp": 1.01528525, + "epoch": 0.9317300465955208, + "flos": 33691987900800.0, + "grad_norm": 1.5189830896959533, + "language_loss": 0.7789076, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79981327, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34765625, + "step": 15497, + "time_per_iteration": 2.4789657592773438 + }, + { + "auxiliary_loss_clip": 0.01054178, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_clip": 1.01723409, + "balance_loss_mlp": 1.0168128, + "epoch": 0.9317901698481887, + "flos": 23657146859520.0, + "grad_norm": 1.6003078524519532, + "language_loss": 0.67085141, + "learning_rate": 4.856574512724898e-08, + "loss": 0.69181192, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.375, + "step": 15498, + "time_per_iteration": 2.402937650680542 + }, + { + "auxiliary_loss_clip": 0.01053238, + "auxiliary_loss_mlp": 0.01041822, + "balance_loss_clip": 1.01739573, + "balance_loss_mlp": 1.01651382, + "epoch": 0.9318502931008568, + "flos": 20958954122880.0, + "grad_norm": 1.9022369695638224, + "language_loss": 0.80394447, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82489502, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 15499, + "time_per_iteration": 2.389277935028076 + }, + { + "auxiliary_loss_clip": 0.01051307, + "auxiliary_loss_mlp": 0.01038775, + "balance_loss_clip": 1.01440883, + "balance_loss_mlp": 1.01699257, + "epoch": 0.9319104163535247, + "flos": 23439917180160.0, + "grad_norm": 1.6057817272385164, + "language_loss": 0.77962089, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.80052167, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.34179688, + "step": 15500, + "time_per_iteration": 2.416773796081543 + }, + { + "auxiliary_loss_clip": 0.01050187, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.01170552, + "balance_loss_mlp": 1.01541066, + "epoch": 0.9319705396061927, + "flos": 22346297752320.0, + "grad_norm": 1.7899731142937378, + "language_loss": 0.73341179, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.75424248, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34765625, + "step": 15501, + "time_per_iteration": 2.3810460567474365 + }, + { + "auxiliary_loss_clip": 0.01053233, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.01606965, + "balance_loss_mlp": 1.01618791, + "epoch": 0.9320306628588607, + "flos": 20992575628800.0, + "grad_norm": 1.63791757904497, + "language_loss": 0.67448437, + "learning_rate": 4.822511506047666e-08, + "loss": 0.69541049, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37109375, + "step": 15502, + "time_per_iteration": 3.805577278137207 + }, + { + "auxiliary_loss_clip": 0.01051287, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.01323652, + "balance_loss_mlp": 1.01548219, + "epoch": 0.9320907861115286, + "flos": 24537062655360.0, + "grad_norm": 1.4721106437727456, + "language_loss": 0.66344607, + "learning_rate": 4.814014256446586e-08, + "loss": 0.68431985, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.359375, + "step": 15503, + "time_per_iteration": 2.470339298248291 + }, + { + "auxiliary_loss_clip": 0.0105336, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.01432657, + "balance_loss_mlp": 1.01605141, + "epoch": 0.9321509093641966, + "flos": 19784580986880.0, + "grad_norm": 1.665075675259837, + "language_loss": 0.75832868, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77925384, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37304688, + "step": 15504, + "time_per_iteration": 2.3862807750701904 + }, + { + "auxiliary_loss_clip": 0.01051967, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.00923431, + "balance_loss_mlp": 1.01606655, + "epoch": 0.9322110326168646, + "flos": 24971522014080.0, + "grad_norm": 2.4589652644103985, + "language_loss": 0.72537422, + "learning_rate": 4.797041961982762e-08, + "loss": 0.74622762, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 15505, + "time_per_iteration": 2.4808223247528076 + }, + { + "auxiliary_loss_clip": 0.01051096, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.01462579, + "balance_loss_mlp": 1.01584625, + "epoch": 0.9322711558695326, + "flos": 16142720578560.0, + "grad_norm": 1.963995058944008, + "language_loss": 0.76887888, + "learning_rate": 4.788566917763614e-08, + "loss": 0.7897653, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 15506, + "time_per_iteration": 2.362354040145874 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.01147377, + "balance_loss_mlp": 1.01569867, + "epoch": 0.9323312791222005, + "flos": 23731302320640.0, + "grad_norm": 2.004634049680931, + "language_loss": 0.84173101, + "learning_rate": 4.780099275981597e-08, + "loss": 0.86255479, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34375, + "step": 15507, + "time_per_iteration": 2.3755786418914795 + }, + { + "auxiliary_loss_clip": 0.01053227, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.01523519, + "balance_loss_mlp": 1.0168798, + "epoch": 0.9323914023748685, + "flos": 20776847137920.0, + "grad_norm": 1.4955228351924654, + "language_loss": 0.68712223, + "learning_rate": 4.771639036957742e-08, + "loss": 0.7080425, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 15508, + "time_per_iteration": 2.37910795211792 + }, + { + "auxiliary_loss_clip": 0.01051354, + "auxiliary_loss_mlp": 0.0103717, + "balance_loss_clip": 1.01294637, + "balance_loss_mlp": 1.01580012, + "epoch": 0.9324515256275364, + "flos": 23914037710080.0, + "grad_norm": 1.7157071134316018, + "language_loss": 0.73326474, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.75414997, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 15509, + "time_per_iteration": 2.3757073879241943 + }, + { + "auxiliary_loss_clip": 0.01050919, + "auxiliary_loss_mlp": 0.01040381, + "balance_loss_clip": 1.01674223, + "balance_loss_mlp": 1.01554048, + "epoch": 0.9325116488802044, + "flos": 18004219649280.0, + "grad_norm": 1.9666653871366206, + "language_loss": 0.75361347, + "learning_rate": 4.754740768467624e-08, + "loss": 0.77452648, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 15510, + "time_per_iteration": 2.3623502254486084 + }, + { + "auxiliary_loss_clip": 0.01052284, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.01568007, + "balance_loss_mlp": 1.01618016, + "epoch": 0.9325717721328723, + "flos": 29020364674560.0, + "grad_norm": 1.5050547531539282, + "language_loss": 0.71341455, + "learning_rate": 4.746302739642161e-08, + "loss": 0.73431861, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36132812, + "step": 15511, + "time_per_iteration": 2.424105405807495 + }, + { + "auxiliary_loss_clip": 0.01052704, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.01680422, + "balance_loss_mlp": 1.01672244, + "epoch": 0.9326318953855404, + "flos": 21645451653120.0, + "grad_norm": 1.760351069032224, + "language_loss": 0.79069889, + "learning_rate": 4.737872114856412e-08, + "loss": 0.81163031, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.359375, + "step": 15512, + "time_per_iteration": 2.366345167160034 + }, + { + "auxiliary_loss_clip": 0.01050272, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.01208866, + "balance_loss_mlp": 1.01513243, + "epoch": 0.9326920186382083, + "flos": 26064582860160.0, + "grad_norm": 1.4886806138162374, + "language_loss": 0.80672336, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82757711, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 15513, + "time_per_iteration": 2.39361572265625 + }, + { + "auxiliary_loss_clip": 0.01055304, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.01254892, + "balance_loss_mlp": 1.01711917, + "epoch": 0.9327521418908763, + "flos": 12056311428480.0, + "grad_norm": 2.9338870594714495, + "language_loss": 0.82046336, + "learning_rate": 4.721033078682768e-08, + "loss": 0.84138888, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3828125, + "step": 15514, + "time_per_iteration": 2.3663597106933594 + }, + { + "auxiliary_loss_clip": 0.01049804, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.01844764, + "balance_loss_mlp": 1.01550579, + "epoch": 0.9328122651435443, + "flos": 43832755607040.0, + "grad_norm": 1.8656055489022059, + "language_loss": 0.72386193, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.74476331, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 15515, + "time_per_iteration": 2.567047119140625 + }, + { + "auxiliary_loss_clip": 0.01054262, + "auxiliary_loss_mlp": 0.01038949, + "balance_loss_clip": 1.01415396, + "balance_loss_mlp": 1.01671958, + "epoch": 0.9328723883962122, + "flos": 15194060582400.0, + "grad_norm": 2.6484929458080817, + "language_loss": 0.82226193, + "learning_rate": 4.704223662500806e-08, + "loss": 0.84319407, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 15516, + "time_per_iteration": 2.3804430961608887 + }, + { + "auxiliary_loss_clip": 0.01052946, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.01398492, + "balance_loss_mlp": 1.01698196, + "epoch": 0.9329325116488802, + "flos": 20260866021120.0, + "grad_norm": 1.7154006003846705, + "language_loss": 0.81991786, + "learning_rate": 4.695830062703643e-08, + "loss": 0.84081793, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 15517, + "time_per_iteration": 2.362855911254883 + }, + { + "auxiliary_loss_clip": 0.01052782, + "auxiliary_loss_mlp": 0.01041317, + "balance_loss_clip": 1.01591372, + "balance_loss_mlp": 1.01624691, + "epoch": 0.9329926349015482, + "flos": 13114179757440.0, + "grad_norm": 2.251732142920275, + "language_loss": 0.7544601, + "learning_rate": 4.687443868860219e-08, + "loss": 0.77540112, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36523438, + "step": 15518, + "time_per_iteration": 2.3753061294555664 + }, + { + "auxiliary_loss_clip": 0.01052642, + "auxiliary_loss_mlp": 0.01045483, + "balance_loss_clip": 1.02022207, + "balance_loss_mlp": 1.01628697, + "epoch": 0.9330527581542162, + "flos": 23039114238720.0, + "grad_norm": 1.9775999176521728, + "language_loss": 0.77191019, + "learning_rate": 4.679065081288458e-08, + "loss": 0.79289138, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 15519, + "time_per_iteration": 3.9107143878936768 + }, + { + "auxiliary_loss_clip": 0.01051291, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.01643968, + "balance_loss_mlp": 1.01552701, + "epoch": 0.9331128814068841, + "flos": 15558728400000.0, + "grad_norm": 2.150252313294838, + "language_loss": 0.84228456, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.86319828, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 15520, + "time_per_iteration": 2.5652806758880615 + }, + { + "auxiliary_loss_clip": 0.01050801, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.01269233, + "balance_loss_mlp": 1.01583517, + "epoch": 0.9331730046595521, + "flos": 22270710925440.0, + "grad_norm": 1.5882863882652158, + "language_loss": 0.77148998, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.79236102, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34960938, + "step": 15521, + "time_per_iteration": 2.447887420654297 + }, + { + "auxiliary_loss_clip": 0.01051841, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.0138731, + "balance_loss_mlp": 1.01583099, + "epoch": 0.93323312791222, + "flos": 15776761040640.0, + "grad_norm": 1.7817591789809348, + "language_loss": 0.78855592, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.80943584, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 15522, + "time_per_iteration": 2.436081647872925 + }, + { + "auxiliary_loss_clip": 0.01050919, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.00999236, + "balance_loss_mlp": 1.01530552, + "epoch": 0.933293251164888, + "flos": 22010084559360.0, + "grad_norm": 5.938100882867834, + "language_loss": 0.64308828, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.66392517, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 15523, + "time_per_iteration": 2.364863872528076 + }, + { + "auxiliary_loss_clip": 0.01051446, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.01796281, + "balance_loss_mlp": 1.01626945, + "epoch": 0.933353374417556, + "flos": 26030158392960.0, + "grad_norm": 1.643196156131243, + "language_loss": 0.69006878, + "learning_rate": 4.63728224861577e-08, + "loss": 0.71098763, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 15524, + "time_per_iteration": 2.4553122520446777 + }, + { + "auxiliary_loss_clip": 0.01052232, + "auxiliary_loss_mlp": 0.01041241, + "balance_loss_clip": 1.01722002, + "balance_loss_mlp": 1.01624274, + "epoch": 0.933413497670224, + "flos": 24898937564160.0, + "grad_norm": 1.4956553818234406, + "language_loss": 0.74851155, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76944625, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 15525, + "time_per_iteration": 2.4126312732696533 + }, + { + "auxiliary_loss_clip": 0.01049915, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_clip": 1.02069092, + "balance_loss_mlp": 1.01444101, + "epoch": 0.9334736209228919, + "flos": 23687765988480.0, + "grad_norm": 1.8530707204079593, + "language_loss": 0.85078549, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.87174308, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.35546875, + "step": 15526, + "time_per_iteration": 2.406351089477539 + }, + { + "auxiliary_loss_clip": 0.01051717, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.01451683, + "balance_loss_mlp": 1.0160259, + "epoch": 0.9335337441755599, + "flos": 15376446858240.0, + "grad_norm": 2.062333979843498, + "language_loss": 0.70179081, + "learning_rate": 4.61230144456366e-08, + "loss": 0.72268248, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35742188, + "step": 15527, + "time_per_iteration": 2.3730995655059814 + }, + { + "auxiliary_loss_clip": 0.01053478, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.01373315, + "balance_loss_mlp": 1.01641512, + "epoch": 0.9335938674282279, + "flos": 16105817404800.0, + "grad_norm": 2.016365469937635, + "language_loss": 0.66432559, + "learning_rate": 4.603989327701141e-08, + "loss": 0.68525088, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 15528, + "time_per_iteration": 2.3671157360076904 + }, + { + "auxiliary_loss_clip": 0.01053836, + "auxiliary_loss_mlp": 0.01042168, + "balance_loss_clip": 1.01651406, + "balance_loss_mlp": 1.01645744, + "epoch": 0.9336539906808958, + "flos": 18951902127360.0, + "grad_norm": 1.7734269730704326, + "language_loss": 0.76493645, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.78589648, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 15529, + "time_per_iteration": 3.7705845832824707 + }, + { + "auxiliary_loss_clip": 0.01049846, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.01732254, + "balance_loss_mlp": 1.01533389, + "epoch": 0.9337141139335638, + "flos": 18108261100800.0, + "grad_norm": 1.6845630981912458, + "language_loss": 0.63670456, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65760124, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34570312, + "step": 15530, + "time_per_iteration": 3.7735142707824707 + }, + { + "auxiliary_loss_clip": 0.01048721, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01114821, + "balance_loss_mlp": 1.01446831, + "epoch": 0.9337742371862318, + "flos": 17344815200640.0, + "grad_norm": 1.7219876857610978, + "language_loss": 0.73910606, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.75991154, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34179688, + "step": 15531, + "time_per_iteration": 2.4998130798339844 + }, + { + "auxiliary_loss_clip": 0.01049873, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.01146293, + "balance_loss_mlp": 1.01518452, + "epoch": 0.9338343604388998, + "flos": 29057721696000.0, + "grad_norm": 2.1656148153810313, + "language_loss": 0.71994406, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.74078274, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 15532, + "time_per_iteration": 2.437512159347534 + }, + { + "auxiliary_loss_clip": 0.01052776, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.01227343, + "balance_loss_mlp": 1.01672912, + "epoch": 0.9338944836915677, + "flos": 18659923493760.0, + "grad_norm": 1.6990174671551157, + "language_loss": 0.73429704, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75516969, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 15533, + "time_per_iteration": 2.4290270805358887 + }, + { + "auxiliary_loss_clip": 0.01050666, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.01073074, + "balance_loss_mlp": 1.01620579, + "epoch": 0.9339546069442357, + "flos": 16616806197120.0, + "grad_norm": 1.7199163351046083, + "language_loss": 0.80961418, + "learning_rate": 4.554272235700507e-08, + "loss": 0.83044302, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34375, + "step": 15534, + "time_per_iteration": 2.4293696880340576 + }, + { + "auxiliary_loss_clip": 0.01047448, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.01037621, + "balance_loss_mlp": 1.01508141, + "epoch": 0.9340147301969036, + "flos": 23692060085760.0, + "grad_norm": 1.8014677556288161, + "language_loss": 0.75052774, + "learning_rate": 4.546011991495513e-08, + "loss": 0.77130443, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.32421875, + "step": 15535, + "time_per_iteration": 2.4058406352996826 + }, + { + "auxiliary_loss_clip": 0.0105211, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.0099535, + "balance_loss_mlp": 1.01596856, + "epoch": 0.9340748534495716, + "flos": 28653287973120.0, + "grad_norm": 2.3912791878297925, + "language_loss": 0.79233843, + "learning_rate": 4.537759158925292e-08, + "loss": 0.81319129, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 15536, + "time_per_iteration": 2.4327948093414307 + }, + { + "auxiliary_loss_clip": 0.01050537, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.01129007, + "balance_loss_mlp": 1.01534402, + "epoch": 0.9341349767022396, + "flos": 24898483716480.0, + "grad_norm": 1.4934101780873912, + "language_loss": 0.81487447, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.83571637, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 15537, + "time_per_iteration": 2.4371771812438965 + }, + { + "auxiliary_loss_clip": 0.01052126, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.01306152, + "balance_loss_mlp": 1.015715, + "epoch": 0.9341950999549076, + "flos": 29058245366400.0, + "grad_norm": 1.6105032699440676, + "language_loss": 0.79115015, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.81204367, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 15538, + "time_per_iteration": 2.501819133758545 + }, + { + "auxiliary_loss_clip": 0.01049591, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.01273382, + "balance_loss_mlp": 1.01521754, + "epoch": 0.9342552232075755, + "flos": 23585923952640.0, + "grad_norm": 1.4921908511811355, + "language_loss": 0.7388128, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75964868, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 15539, + "time_per_iteration": 2.4543988704681396 + }, + { + "auxiliary_loss_clip": 0.01050255, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.01390338, + "balance_loss_mlp": 1.01543021, + "epoch": 0.9343153464602435, + "flos": 36719900317440.0, + "grad_norm": 1.4775618498312006, + "language_loss": 0.65639466, + "learning_rate": 4.504821951247373e-08, + "loss": 0.67725217, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34765625, + "step": 15540, + "time_per_iteration": 2.5121264457702637 + }, + { + "auxiliary_loss_clip": 0.01050641, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.01430106, + "balance_loss_mlp": 1.01535273, + "epoch": 0.9343754697129115, + "flos": 22235413674240.0, + "grad_norm": 1.5801377283995806, + "language_loss": 0.7680003, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78886968, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 15541, + "time_per_iteration": 3.891711950302124 + }, + { + "auxiliary_loss_clip": 0.0105253, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.01551294, + "balance_loss_mlp": 1.01737189, + "epoch": 0.9344355929655794, + "flos": 29709201265920.0, + "grad_norm": 1.9209625870424736, + "language_loss": 0.68309653, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.70402694, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3515625, + "step": 15542, + "time_per_iteration": 2.4518649578094482 + }, + { + "auxiliary_loss_clip": 0.0105323, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.01118135, + "balance_loss_mlp": 1.01696873, + "epoch": 0.9344957162182475, + "flos": 18879387500160.0, + "grad_norm": 1.6890937794815168, + "language_loss": 0.70507216, + "learning_rate": 4.480196882960907e-08, + "loss": 0.72594893, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36328125, + "step": 15543, + "time_per_iteration": 2.378181219100952 + }, + { + "auxiliary_loss_clip": 0.01051993, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.01396608, + "balance_loss_mlp": 1.01545167, + "epoch": 0.9345558394709154, + "flos": 27416524504320.0, + "grad_norm": 1.9760616671413913, + "language_loss": 0.7127353, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.73365653, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36523438, + "step": 15544, + "time_per_iteration": 2.481067419052124 + }, + { + "auxiliary_loss_clip": 0.01052823, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.01244056, + "balance_loss_mlp": 1.01634419, + "epoch": 0.9346159627235834, + "flos": 20740223255040.0, + "grad_norm": 1.6791138003123842, + "language_loss": 0.78262335, + "learning_rate": 4.463817240903789e-08, + "loss": 0.80352223, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 15545, + "time_per_iteration": 2.4423398971557617 + }, + { + "auxiliary_loss_clip": 0.01053562, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.013237, + "balance_loss_mlp": 1.01636732, + "epoch": 0.9346760859762513, + "flos": 21068162455680.0, + "grad_norm": 3.102317163373488, + "language_loss": 0.7023989, + "learning_rate": 4.455638541847495e-08, + "loss": 0.72329354, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.37109375, + "step": 15546, + "time_per_iteration": 2.398581027984619 + }, + { + "auxiliary_loss_clip": 0.01049172, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.01028168, + "balance_loss_mlp": 1.01535106, + "epoch": 0.9347362092289193, + "flos": 29203658645760.0, + "grad_norm": 1.7656543840902614, + "language_loss": 0.8303746, + "learning_rate": 4.447467257852966e-08, + "loss": 0.85119581, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.33789062, + "step": 15547, + "time_per_iteration": 2.501020669937134 + }, + { + "auxiliary_loss_clip": 0.01049563, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.01484776, + "balance_loss_mlp": 1.01490784, + "epoch": 0.9347963324815872, + "flos": 19426336859520.0, + "grad_norm": 1.753886332321221, + "language_loss": 0.8488313, + "learning_rate": 4.439303389230087e-08, + "loss": 0.86969036, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34765625, + "step": 15548, + "time_per_iteration": 2.361800193786621 + }, + { + "auxiliary_loss_clip": 0.01053141, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.01574767, + "balance_loss_mlp": 1.01556945, + "epoch": 0.9348564557342552, + "flos": 36900401379840.0, + "grad_norm": 1.8807040185281136, + "language_loss": 0.66739476, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.68833578, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37695312, + "step": 15549, + "time_per_iteration": 2.5184123516082764 + }, + { + "auxiliary_loss_clip": 0.0105302, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_clip": 1.01917195, + "balance_loss_mlp": 1.01626611, + "epoch": 0.9349165789869232, + "flos": 21689022896640.0, + "grad_norm": 1.7685635089134109, + "language_loss": 0.81532145, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.83629322, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.3671875, + "step": 15550, + "time_per_iteration": 2.3723294734954834 + }, + { + "auxiliary_loss_clip": 0.01051615, + "auxiliary_loss_mlp": 0.01036049, + "balance_loss_clip": 1.01310122, + "balance_loss_mlp": 1.01650167, + "epoch": 0.9349767022395912, + "flos": 18843042908160.0, + "grad_norm": 1.7073711170403898, + "language_loss": 0.76802647, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.78890312, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 15551, + "time_per_iteration": 2.3926734924316406 + }, + { + "auxiliary_loss_clip": 0.01049472, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.00932717, + "balance_loss_mlp": 1.01652658, + "epoch": 0.9350368254922591, + "flos": 24972255152640.0, + "grad_norm": 1.7169125501637814, + "language_loss": 0.74450666, + "learning_rate": 4.406722074642255e-08, + "loss": 0.76529038, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.33007812, + "step": 15552, + "time_per_iteration": 2.413543701171875 + }, + { + "auxiliary_loss_clip": 0.01051346, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.01151335, + "balance_loss_mlp": 1.01579607, + "epoch": 0.9350969487449271, + "flos": 23069628633600.0, + "grad_norm": 1.7447277076997598, + "language_loss": 0.78058887, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.80144525, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 15553, + "time_per_iteration": 2.4090209007263184 + }, + { + "auxiliary_loss_clip": 0.01053715, + "auxiliary_loss_mlp": 0.01040906, + "balance_loss_clip": 1.01675439, + "balance_loss_mlp": 1.01676154, + "epoch": 0.9351570719975951, + "flos": 18624172394880.0, + "grad_norm": 2.0901306220400495, + "language_loss": 0.78745627, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80840248, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 15554, + "time_per_iteration": 2.353708028793335 + }, + { + "auxiliary_loss_clip": 0.01048126, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.01665556, + "balance_loss_mlp": 1.01482177, + "epoch": 0.935217195250263, + "flos": 15887435650560.0, + "grad_norm": 1.5482913580332578, + "language_loss": 0.70198011, + "learning_rate": 4.382363965244695e-08, + "loss": 0.72282624, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.33398438, + "step": 15555, + "time_per_iteration": 2.336634874343872 + }, + { + "auxiliary_loss_clip": 0.01050893, + "auxiliary_loss_mlp": 0.0103896, + "balance_loss_clip": 1.01568997, + "balance_loss_mlp": 1.0154748, + "epoch": 0.935277318502931, + "flos": 24389135758080.0, + "grad_norm": 1.5727893435886218, + "language_loss": 0.76900572, + "learning_rate": 4.374259430715965e-08, + "loss": 0.78990424, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 15556, + "time_per_iteration": 2.4207804203033447 + }, + { + "auxiliary_loss_clip": 0.01050389, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.01425505, + "balance_loss_mlp": 1.01481462, + "epoch": 0.935337441755599, + "flos": 27599015514240.0, + "grad_norm": 1.5368566585951444, + "language_loss": 0.737553, + "learning_rate": 4.366162314334953e-08, + "loss": 0.75841749, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.35546875, + "step": 15557, + "time_per_iteration": 2.441277027130127 + }, + { + "auxiliary_loss_clip": 0.01052197, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.01544201, + "balance_loss_mlp": 1.01558065, + "epoch": 0.935397565008267, + "flos": 20481901038720.0, + "grad_norm": 1.584859323079968, + "language_loss": 0.63808179, + "learning_rate": 4.358072616408681e-08, + "loss": 0.6589973, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3671875, + "step": 15558, + "time_per_iteration": 2.3788363933563232 + }, + { + "auxiliary_loss_clip": 0.01050305, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.01269555, + "balance_loss_mlp": 1.01587427, + "epoch": 0.9354576882609349, + "flos": 23653411344000.0, + "grad_norm": 2.0974987036033528, + "language_loss": 0.74405056, + "learning_rate": 4.34999033724388e-08, + "loss": 0.76490915, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34570312, + "step": 15559, + "time_per_iteration": 3.709498405456543 + }, + { + "auxiliary_loss_clip": 0.01049851, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.01129746, + "balance_loss_mlp": 1.01559949, + "epoch": 0.9355178115136029, + "flos": 36683416080000.0, + "grad_norm": 1.7827282120971069, + "language_loss": 0.65058064, + "learning_rate": 4.341915477147062e-08, + "loss": 0.6714015, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34179688, + "step": 15560, + "time_per_iteration": 2.5275275707244873 + }, + { + "auxiliary_loss_clip": 0.01057068, + "auxiliary_loss_mlp": 0.01042155, + "balance_loss_clip": 1.01533294, + "balance_loss_mlp": 1.0170716, + "epoch": 0.9355779347662708, + "flos": 14459662800000.0, + "grad_norm": 2.1712758872841054, + "language_loss": 0.65523863, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.67623091, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.40039062, + "step": 15561, + "time_per_iteration": 2.3613178730010986 + }, + { + "auxiliary_loss_clip": 0.01052856, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.01241302, + "balance_loss_mlp": 1.01741934, + "epoch": 0.9356380580189388, + "flos": 23184841720320.0, + "grad_norm": 1.6979583288386324, + "language_loss": 0.76780564, + "learning_rate": 4.325788015381859e-08, + "loss": 0.78869152, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 15562, + "time_per_iteration": 2.3847169876098633 + }, + { + "auxiliary_loss_clip": 0.01006906, + "auxiliary_loss_mlp": 0.01003834, + "balance_loss_clip": 1.00164044, + "balance_loss_mlp": 1.00076199, + "epoch": 0.9356981812716068, + "flos": 67468465025280.0, + "grad_norm": 0.9468820190188354, + "language_loss": 0.62479985, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64490724, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.06152344, + "step": 15563, + "time_per_iteration": 2.921511173248291 + }, + { + "auxiliary_loss_clip": 0.01049837, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.01560855, + "balance_loss_mlp": 1.01568794, + "epoch": 0.9357583045242748, + "flos": 24680451075840.0, + "grad_norm": 1.5634408105583546, + "language_loss": 0.79206073, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.81294274, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34179688, + "step": 15564, + "time_per_iteration": 2.3827171325683594 + }, + { + "auxiliary_loss_clip": 0.01053187, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_clip": 1.0129416, + "balance_loss_mlp": 1.01621163, + "epoch": 0.9358184277769427, + "flos": 19462716362880.0, + "grad_norm": 1.7866624577755008, + "language_loss": 0.79249668, + "learning_rate": 4.301652473389694e-08, + "loss": 0.81339782, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 15565, + "time_per_iteration": 2.405205488204956 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.01121569, + "balance_loss_mlp": 1.01543212, + "epoch": 0.9358785510296107, + "flos": 18915976471680.0, + "grad_norm": 1.8874216853412216, + "language_loss": 0.73369557, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.75449395, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.33984375, + "step": 15566, + "time_per_iteration": 2.419931650161743 + }, + { + "auxiliary_loss_clip": 0.01051649, + "auxiliary_loss_mlp": 0.01036014, + "balance_loss_clip": 1.01345944, + "balance_loss_mlp": 1.01531863, + "epoch": 0.9359386742822787, + "flos": 23440126648320.0, + "grad_norm": 2.776499698610212, + "language_loss": 0.68979996, + "learning_rate": 4.285599216057889e-08, + "loss": 0.71067655, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.36328125, + "step": 15567, + "time_per_iteration": 2.4416613578796387 + }, + { + "auxiliary_loss_clip": 0.01051533, + "auxiliary_loss_mlp": 0.01039935, + "balance_loss_clip": 1.01692784, + "balance_loss_mlp": 1.01603258, + "epoch": 0.9359987975349466, + "flos": 32742699500160.0, + "grad_norm": 1.9974722752151701, + "language_loss": 0.63864136, + "learning_rate": 4.277583719504418e-08, + "loss": 0.65955603, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 15568, + "time_per_iteration": 3.8917109966278076 + }, + { + "auxiliary_loss_clip": 0.01050358, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.01732647, + "balance_loss_mlp": 1.01456785, + "epoch": 0.9360589207876147, + "flos": 22818567980160.0, + "grad_norm": 1.5864017510804298, + "language_loss": 0.79737961, + "learning_rate": 4.269575644764556e-08, + "loss": 0.81829387, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35742188, + "step": 15569, + "time_per_iteration": 3.7031047344207764 + }, + { + "auxiliary_loss_clip": 0.01052084, + "auxiliary_loss_mlp": 0.0104172, + "balance_loss_clip": 1.01812875, + "balance_loss_mlp": 1.0160985, + "epoch": 0.9361190440402826, + "flos": 20884240080000.0, + "grad_norm": 2.251479837732253, + "language_loss": 0.703655, + "learning_rate": 4.261574992142014e-08, + "loss": 0.72459304, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36132812, + "step": 15570, + "time_per_iteration": 2.429901123046875 + }, + { + "auxiliary_loss_clip": 0.01052927, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.01596177, + "balance_loss_mlp": 1.0164851, + "epoch": 0.9361791672929506, + "flos": 19316814324480.0, + "grad_norm": 2.1003143530186668, + "language_loss": 0.80161887, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.82254291, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 15571, + "time_per_iteration": 2.431450128555298 + }, + { + "auxiliary_loss_clip": 0.01051405, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.01629663, + "balance_loss_mlp": 1.01616037, + "epoch": 0.9362392905456185, + "flos": 15157297054080.0, + "grad_norm": 2.7293620941017362, + "language_loss": 0.77614141, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79703748, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 15572, + "time_per_iteration": 2.3560757637023926 + }, + { + "auxiliary_loss_clip": 0.01052315, + "auxiliary_loss_mlp": 0.01045964, + "balance_loss_clip": 1.02382636, + "balance_loss_mlp": 1.01695943, + "epoch": 0.9362994137982865, + "flos": 22084938247680.0, + "grad_norm": 1.9255784168349535, + "language_loss": 0.78612989, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80711269, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.35351562, + "step": 15573, + "time_per_iteration": 2.3574860095977783 + }, + { + "auxiliary_loss_clip": 0.01049686, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.0151751, + "balance_loss_mlp": 1.01617265, + "epoch": 0.9363595370509544, + "flos": 23511174998400.0, + "grad_norm": 1.5973179538976112, + "language_loss": 0.74797916, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76883328, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3359375, + "step": 15574, + "time_per_iteration": 2.4065310955047607 + }, + { + "auxiliary_loss_clip": 0.01049791, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.01587391, + "balance_loss_mlp": 1.01477861, + "epoch": 0.9364196603036224, + "flos": 27122311543680.0, + "grad_norm": 2.8972055986516367, + "language_loss": 0.69574404, + "learning_rate": 4.221683071397564e-08, + "loss": 0.71663249, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34960938, + "step": 15575, + "time_per_iteration": 2.4790120124816895 + }, + { + "auxiliary_loss_clip": 0.01048439, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.01552081, + "balance_loss_mlp": 1.01499379, + "epoch": 0.9364797835562904, + "flos": 18478060888320.0, + "grad_norm": 1.4745489527685491, + "language_loss": 0.66078651, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.68163979, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33398438, + "step": 15576, + "time_per_iteration": 2.3426690101623535 + }, + { + "auxiliary_loss_clip": 0.01051606, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.01222277, + "balance_loss_mlp": 1.01492107, + "epoch": 0.9365399068089584, + "flos": 13004727045120.0, + "grad_norm": 2.271328244474399, + "language_loss": 0.77750683, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.79837579, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3671875, + "step": 15577, + "time_per_iteration": 2.313136100769043 + }, + { + "auxiliary_loss_clip": 0.01051105, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.01358438, + "balance_loss_mlp": 1.0153296, + "epoch": 0.9366000300616263, + "flos": 25665246195840.0, + "grad_norm": 1.998717747670019, + "language_loss": 0.53937387, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.5602659, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.35742188, + "step": 15578, + "time_per_iteration": 2.426535129547119 + }, + { + "auxiliary_loss_clip": 0.01049605, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.01449549, + "balance_loss_mlp": 1.01579392, + "epoch": 0.9366601533142943, + "flos": 21432306602880.0, + "grad_norm": 1.6289305271890389, + "language_loss": 0.70872164, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72956586, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33789062, + "step": 15579, + "time_per_iteration": 2.3681886196136475 + }, + { + "auxiliary_loss_clip": 0.01050336, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.00929463, + "balance_loss_mlp": 1.0149951, + "epoch": 0.9367202765669622, + "flos": 24092199711360.0, + "grad_norm": 1.8961419215694968, + "language_loss": 0.77220267, + "learning_rate": 4.181976748973959e-08, + "loss": 0.79302609, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15580, + "time_per_iteration": 2.399958372116089 + }, + { + "auxiliary_loss_clip": 0.01054158, + "auxiliary_loss_mlp": 0.01038272, + "balance_loss_clip": 1.01470482, + "balance_loss_mlp": 1.01689887, + "epoch": 0.9367803998196302, + "flos": 20887731216000.0, + "grad_norm": 1.951086176792541, + "language_loss": 0.67388529, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.69480962, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37304688, + "step": 15581, + "time_per_iteration": 3.831249237060547 + }, + { + "auxiliary_loss_clip": 0.01052717, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.01207578, + "balance_loss_mlp": 1.01710343, + "epoch": 0.9368405230722983, + "flos": 22563283052160.0, + "grad_norm": 1.7416279329174358, + "language_loss": 0.77385497, + "learning_rate": 4.166146195972042e-08, + "loss": 0.79473484, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35546875, + "step": 15582, + "time_per_iteration": 2.4045424461364746 + }, + { + "auxiliary_loss_clip": 0.01049623, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.01225781, + "balance_loss_mlp": 1.01547658, + "epoch": 0.9369006463249662, + "flos": 18879212943360.0, + "grad_norm": 1.6580390865097596, + "language_loss": 0.75148284, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.77230942, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34179688, + "step": 15583, + "time_per_iteration": 2.3852062225341797 + }, + { + "auxiliary_loss_clip": 0.01054603, + "auxiliary_loss_mlp": 0.01040805, + "balance_loss_clip": 1.01589012, + "balance_loss_mlp": 1.01714718, + "epoch": 0.9369607695776342, + "flos": 26431310448000.0, + "grad_norm": 2.0264821660251675, + "language_loss": 0.85793573, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.8788898, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 15584, + "time_per_iteration": 2.533287286758423 + }, + { + "auxiliary_loss_clip": 0.01053269, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.01395404, + "balance_loss_mlp": 1.01596427, + "epoch": 0.9370208928303021, + "flos": 39565775571840.0, + "grad_norm": 1.5614612290891945, + "language_loss": 0.73460281, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.75551909, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 15585, + "time_per_iteration": 2.550903558731079 + }, + { + "auxiliary_loss_clip": 0.01048621, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.01091552, + "balance_loss_mlp": 1.0146122, + "epoch": 0.9370810160829701, + "flos": 22962096046080.0, + "grad_norm": 1.8616704486834752, + "language_loss": 0.8097899, + "learning_rate": 4.134574204836316e-08, + "loss": 0.83059967, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33984375, + "step": 15586, + "time_per_iteration": 2.4045028686523438 + }, + { + "auxiliary_loss_clip": 0.01050301, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.01580071, + "balance_loss_mlp": 1.01468492, + "epoch": 0.937141139335638, + "flos": 23073957642240.0, + "grad_norm": 1.5837977022722, + "language_loss": 0.77501911, + "learning_rate": 4.126699774396258e-08, + "loss": 0.79591417, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 15587, + "time_per_iteration": 2.41715931892395 + }, + { + "auxiliary_loss_clip": 0.01053775, + "auxiliary_loss_mlp": 0.01044213, + "balance_loss_clip": 1.01917934, + "balance_loss_mlp": 1.01686704, + "epoch": 0.937201262588306, + "flos": 16355900540160.0, + "grad_norm": 1.8899401678699659, + "language_loss": 0.88800901, + "learning_rate": 4.118832771491387e-08, + "loss": 0.90898889, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 15588, + "time_per_iteration": 2.3943769931793213 + }, + { + "auxiliary_loss_clip": 0.01050048, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01046813, + "balance_loss_mlp": 1.01632404, + "epoch": 0.937261385840974, + "flos": 20193029516160.0, + "grad_norm": 1.8024561798003838, + "language_loss": 0.79297262, + "learning_rate": 4.11097319642002e-08, + "loss": 0.8137747, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.33789062, + "step": 15589, + "time_per_iteration": 2.3895111083984375 + }, + { + "auxiliary_loss_clip": 0.01049883, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.00967169, + "balance_loss_mlp": 1.01571584, + "epoch": 0.937321509093642, + "flos": 18294976385280.0, + "grad_norm": 1.8515327319542423, + "language_loss": 0.78707862, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80790389, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34179688, + "step": 15590, + "time_per_iteration": 2.3717401027679443 + }, + { + "auxiliary_loss_clip": 0.01052546, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.01434672, + "balance_loss_mlp": 1.01539898, + "epoch": 0.9373816323463099, + "flos": 25883488304640.0, + "grad_norm": 2.2184565379397836, + "language_loss": 0.72379827, + "learning_rate": 4.095276330969577e-08, + "loss": 0.74471879, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 15591, + "time_per_iteration": 2.400038957595825 + }, + { + "auxiliary_loss_clip": 0.01054472, + "auxiliary_loss_mlp": 0.01043542, + "balance_loss_clip": 1.0170176, + "balance_loss_mlp": 1.01644588, + "epoch": 0.9374417555989779, + "flos": 27197619079680.0, + "grad_norm": 2.0746597792349135, + "language_loss": 0.55443549, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.57541555, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 15592, + "time_per_iteration": 2.419501304626465 + }, + { + "auxiliary_loss_clip": 0.01050203, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.01466358, + "balance_loss_mlp": 1.01537585, + "epoch": 0.9375018788516458, + "flos": 23620173863040.0, + "grad_norm": 2.11019556028233, + "language_loss": 0.68094552, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.70180714, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34765625, + "step": 15593, + "time_per_iteration": 2.3870997428894043 + }, + { + "auxiliary_loss_clip": 0.01052142, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.01170659, + "balance_loss_mlp": 1.01678848, + "epoch": 0.9375620021043138, + "flos": 22677553532160.0, + "grad_norm": 1.513755228932916, + "language_loss": 0.7463029, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76716125, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 15594, + "time_per_iteration": 2.4158999919891357 + }, + { + "auxiliary_loss_clip": 0.01048534, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.01109004, + "balance_loss_mlp": 1.01495469, + "epoch": 0.9376221253569819, + "flos": 27559109963520.0, + "grad_norm": 1.5810671433110417, + "language_loss": 0.74595338, + "learning_rate": 4.063971747165351e-08, + "loss": 0.76675802, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.3359375, + "step": 15595, + "time_per_iteration": 2.395942211151123 + }, + { + "auxiliary_loss_clip": 0.01052412, + "auxiliary_loss_mlp": 0.0104005, + "balance_loss_clip": 1.01644695, + "balance_loss_mlp": 1.01574457, + "epoch": 0.9376822486096498, + "flos": 24128858505600.0, + "grad_norm": 1.7864084192890564, + "language_loss": 0.77138156, + "learning_rate": 4.056164175257626e-08, + "loss": 0.79230618, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 15596, + "time_per_iteration": 2.3851065635681152 + }, + { + "auxiliary_loss_clip": 0.01051647, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.01274729, + "balance_loss_mlp": 1.01596093, + "epoch": 0.9377423718623178, + "flos": 22782921615360.0, + "grad_norm": 1.6596710233968117, + "language_loss": 0.79474044, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.81560516, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35546875, + "step": 15597, + "time_per_iteration": 2.361423969268799 + }, + { + "auxiliary_loss_clip": 0.0105341, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.01387048, + "balance_loss_mlp": 1.01632965, + "epoch": 0.9378024951149857, + "flos": 19167979731840.0, + "grad_norm": 1.7393717774963604, + "language_loss": 0.82109153, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.84200603, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37109375, + "step": 15598, + "time_per_iteration": 2.4837722778320312 + }, + { + "auxiliary_loss_clip": 0.01055479, + "auxiliary_loss_mlp": 0.01041007, + "balance_loss_clip": 1.01660454, + "balance_loss_mlp": 1.01678848, + "epoch": 0.9378626183676537, + "flos": 23504611662720.0, + "grad_norm": 1.781353264866782, + "language_loss": 0.64918667, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.67015153, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38671875, + "step": 15599, + "time_per_iteration": 3.8161299228668213 + }, + { + "auxiliary_loss_clip": 0.01052001, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.01025343, + "balance_loss_mlp": 1.01577878, + "epoch": 0.9379227416203216, + "flos": 18404673477120.0, + "grad_norm": 1.6985585116493827, + "language_loss": 0.73912603, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75998676, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 15600, + "time_per_iteration": 2.3504292964935303 + }, + { + "auxiliary_loss_clip": 0.01050348, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.01332998, + "balance_loss_mlp": 1.01586628, + "epoch": 0.9379828648729897, + "flos": 17820890766720.0, + "grad_norm": 1.7431432996334848, + "language_loss": 0.70345283, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.72431135, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34570312, + "step": 15601, + "time_per_iteration": 2.337918758392334 + }, + { + "auxiliary_loss_clip": 0.01007252, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 1.00014901, + "balance_loss_mlp": 1.0009501, + "epoch": 0.9380429881256576, + "flos": 68021070024960.0, + "grad_norm": 0.745403438262729, + "language_loss": 0.58097732, + "learning_rate": 4.009474788561573e-08, + "loss": 0.601071, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.06298828, + "step": 15602, + "time_per_iteration": 3.1973063945770264 + }, + { + "auxiliary_loss_clip": 0.01051202, + "auxiliary_loss_mlp": 0.01036041, + "balance_loss_clip": 1.01324844, + "balance_loss_mlp": 1.01503873, + "epoch": 0.9381031113783256, + "flos": 20775939442560.0, + "grad_norm": 1.8366003076466202, + "language_loss": 0.73026192, + "learning_rate": 4.001719234324663e-08, + "loss": 0.7511344, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 15603, + "time_per_iteration": 2.3533098697662354 + }, + { + "auxiliary_loss_clip": 0.01047823, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.01699793, + "balance_loss_mlp": 1.01526999, + "epoch": 0.9381632346309935, + "flos": 19024102552320.0, + "grad_norm": 1.6617385875961301, + "language_loss": 0.77022874, + "learning_rate": 3.993971112362171e-08, + "loss": 0.79108393, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.32617188, + "step": 15604, + "time_per_iteration": 2.38677978515625 + }, + { + "auxiliary_loss_clip": 0.01051474, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.01017809, + "balance_loss_mlp": 1.01586866, + "epoch": 0.9382233578836615, + "flos": 23512711098240.0, + "grad_norm": 2.5094016145645064, + "language_loss": 0.65928507, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.68014371, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 15605, + "time_per_iteration": 2.384951591491699 + }, + { + "auxiliary_loss_clip": 0.01053159, + "auxiliary_loss_mlp": 0.01041183, + "balance_loss_clip": 1.01650691, + "balance_loss_mlp": 1.01605725, + "epoch": 0.9382834811363294, + "flos": 43066272418560.0, + "grad_norm": 1.5887821012328822, + "language_loss": 0.6840356, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.70497894, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 15606, + "time_per_iteration": 2.569531202316284 + }, + { + "auxiliary_loss_clip": 0.0104853, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.00893962, + "balance_loss_mlp": 1.0146997, + "epoch": 0.9383436043889974, + "flos": 16435292705280.0, + "grad_norm": 1.7581833613091775, + "language_loss": 0.7855317, + "learning_rate": 3.970771343058166e-08, + "loss": 0.80631459, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33789062, + "step": 15607, + "time_per_iteration": 2.3436005115509033 + }, + { + "auxiliary_loss_clip": 0.0105219, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.01285481, + "balance_loss_mlp": 1.01583648, + "epoch": 0.9384037276416655, + "flos": 20739559939200.0, + "grad_norm": 1.7385882696859578, + "language_loss": 0.83887893, + "learning_rate": 3.963052953128776e-08, + "loss": 0.85974401, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.36328125, + "step": 15608, + "time_per_iteration": 3.8705523014068604 + }, + { + "auxiliary_loss_clip": 0.01051808, + "auxiliary_loss_mlp": 0.01038887, + "balance_loss_clip": 1.01586795, + "balance_loss_mlp": 1.01657891, + "epoch": 0.9384638508943334, + "flos": 19061669041920.0, + "grad_norm": 1.7194690411885467, + "language_loss": 0.70325053, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.72415751, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.3515625, + "step": 15609, + "time_per_iteration": 2.373213052749634 + }, + { + "auxiliary_loss_clip": 0.01054589, + "auxiliary_loss_mlp": 0.01030688, + "balance_loss_clip": 1.00731099, + "balance_loss_mlp": 1.01669431, + "epoch": 0.9385239741470014, + "flos": 23403747144960.0, + "grad_norm": 1.8771172444693556, + "language_loss": 0.76327538, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.78412819, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37890625, + "step": 15610, + "time_per_iteration": 2.3753364086151123 + }, + { + "auxiliary_loss_clip": 0.01051717, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.01290715, + "balance_loss_mlp": 1.01558685, + "epoch": 0.9385840973996693, + "flos": 12823876869120.0, + "grad_norm": 2.3283247290559217, + "language_loss": 0.76196337, + "learning_rate": 3.939942386953987e-08, + "loss": 0.78282791, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 15611, + "time_per_iteration": 2.355422258377075 + }, + { + "auxiliary_loss_clip": 0.01050453, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.01375628, + "balance_loss_mlp": 1.01603007, + "epoch": 0.9386442206523373, + "flos": 15486074127360.0, + "grad_norm": 1.884335815512752, + "language_loss": 0.66472042, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68557525, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 15612, + "time_per_iteration": 2.366431951522827 + }, + { + "auxiliary_loss_clip": 0.01049728, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.01573551, + "balance_loss_mlp": 1.01526213, + "epoch": 0.9387043439050052, + "flos": 21177754813440.0, + "grad_norm": 2.4585437693066, + "language_loss": 0.58972031, + "learning_rate": 3.924572515435742e-08, + "loss": 0.61060345, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 15613, + "time_per_iteration": 2.353928565979004 + }, + { + "auxiliary_loss_clip": 0.0105163, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.01394999, + "balance_loss_mlp": 1.01600778, + "epoch": 0.9387644671576733, + "flos": 27667166221440.0, + "grad_norm": 2.4397508487518955, + "language_loss": 0.71952766, + "learning_rate": 3.916898732330764e-08, + "loss": 0.74040961, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 15614, + "time_per_iteration": 2.4126012325286865 + }, + { + "auxiliary_loss_clip": 0.01055202, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.01222432, + "balance_loss_mlp": 1.01763582, + "epoch": 0.9388245904103412, + "flos": 18835536965760.0, + "grad_norm": 4.5926146757850965, + "language_loss": 0.82168424, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.84261811, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.375, + "step": 15615, + "time_per_iteration": 2.3452248573303223 + }, + { + "auxiliary_loss_clip": 0.01049954, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.01466441, + "balance_loss_mlp": 1.01573849, + "epoch": 0.9388847136630092, + "flos": 25482650451840.0, + "grad_norm": 1.5671611850862652, + "language_loss": 0.72773451, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74860108, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34179688, + "step": 15616, + "time_per_iteration": 2.389707326889038 + }, + { + "auxiliary_loss_clip": 0.01052034, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.01335001, + "balance_loss_mlp": 1.01559722, + "epoch": 0.9389448369156771, + "flos": 18733974220800.0, + "grad_norm": 1.6666321402409459, + "language_loss": 0.67212701, + "learning_rate": 3.89392199712355e-08, + "loss": 0.6930058, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36328125, + "step": 15617, + "time_per_iteration": 2.3517935276031494 + }, + { + "auxiliary_loss_clip": 0.01052735, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.01619065, + "balance_loss_mlp": 1.01641524, + "epoch": 0.9390049601683451, + "flos": 21716988762240.0, + "grad_norm": 2.12089900316259, + "language_loss": 0.74135602, + "learning_rate": 3.886277957725092e-08, + "loss": 0.76230311, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36328125, + "step": 15618, + "time_per_iteration": 2.356299638748169 + }, + { + "auxiliary_loss_clip": 0.01054512, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.01274705, + "balance_loss_mlp": 1.01646602, + "epoch": 0.939065083421013, + "flos": 19390201735680.0, + "grad_norm": 2.529720748924497, + "language_loss": 0.71133423, + "learning_rate": 3.878641354978662e-08, + "loss": 0.73226029, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 15619, + "time_per_iteration": 2.361297130584717 + }, + { + "auxiliary_loss_clip": 0.01052408, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.01588428, + "balance_loss_mlp": 1.01592553, + "epoch": 0.939125206673681, + "flos": 24680346341760.0, + "grad_norm": 1.827646240630847, + "language_loss": 0.78627193, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.80720437, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36328125, + "step": 15620, + "time_per_iteration": 2.411228895187378 + }, + { + "auxiliary_loss_clip": 0.01048773, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.0159272, + "balance_loss_mlp": 1.01462555, + "epoch": 0.9391853299263491, + "flos": 16325037031680.0, + "grad_norm": 1.9328035895493718, + "language_loss": 0.74847609, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.76933861, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.33984375, + "step": 15621, + "time_per_iteration": 3.7611842155456543 + }, + { + "auxiliary_loss_clip": 0.01054903, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.01577878, + "balance_loss_mlp": 1.01684976, + "epoch": 0.939245453179017, + "flos": 11654984816640.0, + "grad_norm": 2.2239452061147644, + "language_loss": 0.68314373, + "learning_rate": 3.855776169545688e-08, + "loss": 0.70409817, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.38085938, + "step": 15622, + "time_per_iteration": 2.3479156494140625 + }, + { + "auxiliary_loss_clip": 0.01049851, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.01704431, + "balance_loss_mlp": 1.01540875, + "epoch": 0.939305576431685, + "flos": 23147589432960.0, + "grad_norm": 1.547412555081906, + "language_loss": 0.72117376, + "learning_rate": 3.848169316300209e-08, + "loss": 0.74207062, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 15623, + "time_per_iteration": 2.3616764545440674 + }, + { + "auxiliary_loss_clip": 0.01055158, + "auxiliary_loss_mlp": 0.01039366, + "balance_loss_clip": 1.01387942, + "balance_loss_mlp": 1.01698554, + "epoch": 0.9393656996843529, + "flos": 33286506837120.0, + "grad_norm": 1.7947239603423966, + "language_loss": 0.73918319, + "learning_rate": 3.84056990115178e-08, + "loss": 0.76012844, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38085938, + "step": 15624, + "time_per_iteration": 2.537196636199951 + }, + { + "auxiliary_loss_clip": 0.01050962, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01170444, + "balance_loss_mlp": 1.01600659, + "epoch": 0.9394258229370209, + "flos": 21688359580800.0, + "grad_norm": 1.8667039152642837, + "language_loss": 0.90371323, + "learning_rate": 3.832977924388614e-08, + "loss": 0.92456597, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34960938, + "step": 15625, + "time_per_iteration": 2.38071870803833 + }, + { + "auxiliary_loss_clip": 0.01050524, + "auxiliary_loss_mlp": 0.01037537, + "balance_loss_clip": 1.01495862, + "balance_loss_mlp": 1.01536763, + "epoch": 0.9394859461896888, + "flos": 23873189552640.0, + "grad_norm": 1.726840184102843, + "language_loss": 0.85184729, + "learning_rate": 3.825393386298592e-08, + "loss": 0.87272787, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15626, + "time_per_iteration": 2.4309463500976562 + }, + { + "auxiliary_loss_clip": 0.01007084, + "auxiliary_loss_mlp": 0.01002278, + "balance_loss_clip": 1.00039434, + "balance_loss_mlp": 1.00091851, + "epoch": 0.9395460694423569, + "flos": 61562975973120.0, + "grad_norm": 0.7766354257421524, + "language_loss": 0.56118798, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58128154, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.06152344, + "step": 15627, + "time_per_iteration": 2.995896816253662 + }, + { + "auxiliary_loss_clip": 0.01049938, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.01432228, + "balance_loss_mlp": 1.01594329, + "epoch": 0.9396061926950248, + "flos": 20994670310400.0, + "grad_norm": 2.6657700825245874, + "language_loss": 0.71020567, + "learning_rate": 3.810246627288105e-08, + "loss": 0.73105758, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 15628, + "time_per_iteration": 2.4003913402557373 + }, + { + "auxiliary_loss_clip": 0.01050343, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.01623869, + "balance_loss_mlp": 1.01540077, + "epoch": 0.9396663159476928, + "flos": 27486630247680.0, + "grad_norm": 1.5218562726447324, + "language_loss": 0.76275909, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.78364629, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 15629, + "time_per_iteration": 2.4130239486694336 + }, + { + "auxiliary_loss_clip": 0.0104716, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.01724887, + "balance_loss_mlp": 1.01443696, + "epoch": 0.9397264392003607, + "flos": 19426441593600.0, + "grad_norm": 1.8050609228224286, + "language_loss": 0.74520159, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76607221, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.328125, + "step": 15630, + "time_per_iteration": 2.3671884536743164 + }, + { + "auxiliary_loss_clip": 0.0104937, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.01370311, + "balance_loss_mlp": 1.01601601, + "epoch": 0.9397865624530287, + "flos": 18003835624320.0, + "grad_norm": 2.1230261040379355, + "language_loss": 0.70099765, + "learning_rate": 3.787582286001845e-08, + "loss": 0.72183359, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33398438, + "step": 15631, + "time_per_iteration": 2.341913938522339 + }, + { + "auxiliary_loss_clip": 0.01049859, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.01213157, + "balance_loss_mlp": 1.0155158, + "epoch": 0.9398466857056966, + "flos": 22563527431680.0, + "grad_norm": 2.943020981027574, + "language_loss": 0.75766695, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77850372, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 15632, + "time_per_iteration": 2.4089062213897705 + }, + { + "auxiliary_loss_clip": 0.0105361, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.01556051, + "balance_loss_mlp": 1.01674008, + "epoch": 0.9399068089583646, + "flos": 24534514126080.0, + "grad_norm": 1.5282250589363653, + "language_loss": 0.75462735, + "learning_rate": 3.772509926639622e-08, + "loss": 0.77556998, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36914062, + "step": 15633, + "time_per_iteration": 2.4162654876708984 + }, + { + "auxiliary_loss_clip": 0.01053285, + "auxiliary_loss_mlp": 0.01040393, + "balance_loss_clip": 1.01703966, + "balance_loss_mlp": 1.01650786, + "epoch": 0.9399669322110327, + "flos": 25629145983360.0, + "grad_norm": 1.8188463854194714, + "language_loss": 0.73983693, + "learning_rate": 3.764984908264823e-08, + "loss": 0.76077366, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.3671875, + "step": 15634, + "time_per_iteration": 2.428926706314087 + }, + { + "auxiliary_loss_clip": 0.0105199, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.01712179, + "balance_loss_mlp": 1.01544857, + "epoch": 0.9400270554637006, + "flos": 17088517843200.0, + "grad_norm": 3.5849532096124332, + "language_loss": 0.70200169, + "learning_rate": 3.75746733114144e-08, + "loss": 0.72293115, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 15635, + "time_per_iteration": 2.3613803386688232 + }, + { + "auxiliary_loss_clip": 0.01049839, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.01445651, + "balance_loss_mlp": 1.0155822, + "epoch": 0.9400871787163686, + "flos": 22054004916480.0, + "grad_norm": 1.6583025740230928, + "language_loss": 0.74934673, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.77020442, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 15636, + "time_per_iteration": 2.414755344390869 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.01038196, + "balance_loss_clip": 1.01508129, + "balance_loss_mlp": 1.01648653, + "epoch": 0.9401473019690365, + "flos": 16981823128320.0, + "grad_norm": 2.6993919560570356, + "language_loss": 0.84831071, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.86921787, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 15637, + "time_per_iteration": 2.3242437839508057 + }, + { + "auxiliary_loss_clip": 0.01051117, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.01124883, + "balance_loss_mlp": 1.0155251, + "epoch": 0.9402074252217045, + "flos": 19680958471680.0, + "grad_norm": 2.1122304911285528, + "language_loss": 0.69991219, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.72076297, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 15638, + "time_per_iteration": 3.7071399688720703 + }, + { + "auxiliary_loss_clip": 0.01049102, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.01746535, + "balance_loss_mlp": 1.01575935, + "epoch": 0.9402675484743724, + "flos": 24753140259840.0, + "grad_norm": 1.6515562797776364, + "language_loss": 0.85843271, + "learning_rate": 3.727471440859498e-08, + "loss": 0.8793062, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33398438, + "step": 15639, + "time_per_iteration": 2.449061632156372 + }, + { + "auxiliary_loss_clip": 0.01050394, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.01253772, + "balance_loss_mlp": 1.01496971, + "epoch": 0.9403276717270405, + "flos": 25557399406080.0, + "grad_norm": 1.4716663322677248, + "language_loss": 0.78826511, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80911106, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35351562, + "step": 15640, + "time_per_iteration": 2.4462640285491943 + }, + { + "auxiliary_loss_clip": 0.01052352, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.01279557, + "balance_loss_mlp": 1.01564741, + "epoch": 0.9403877949797084, + "flos": 26688585323520.0, + "grad_norm": 1.5373280305275707, + "language_loss": 0.7471444, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76803195, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 15641, + "time_per_iteration": 2.4562628269195557 + }, + { + "auxiliary_loss_clip": 0.0105437, + "auxiliary_loss_mlp": 0.01045137, + "balance_loss_clip": 1.01980555, + "balance_loss_mlp": 1.0158093, + "epoch": 0.9404479182323764, + "flos": 15010801522560.0, + "grad_norm": 2.052496067904438, + "language_loss": 0.83522308, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.8562181, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 15642, + "time_per_iteration": 2.3289942741394043 + }, + { + "auxiliary_loss_clip": 0.01049849, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.0084244, + "balance_loss_mlp": 1.01527965, + "epoch": 0.9405080414850443, + "flos": 24972394798080.0, + "grad_norm": 1.815302472596836, + "language_loss": 0.69430149, + "learning_rate": 3.697594633355084e-08, + "loss": 0.71510756, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34570312, + "step": 15643, + "time_per_iteration": 2.3960869312286377 + }, + { + "auxiliary_loss_clip": 0.01053862, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.0140053, + "balance_loss_mlp": 1.01636064, + "epoch": 0.9405681647377123, + "flos": 20843845770240.0, + "grad_norm": 1.758146964285999, + "language_loss": 0.77498543, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.79592198, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 15644, + "time_per_iteration": 2.3765246868133545 + }, + { + "auxiliary_loss_clip": 0.01049296, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.01252961, + "balance_loss_mlp": 1.01558161, + "epoch": 0.9406282879903802, + "flos": 23804445352320.0, + "grad_norm": 1.5520697880724856, + "language_loss": 0.68292248, + "learning_rate": 3.682700891311974e-08, + "loss": 0.70375365, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3359375, + "step": 15645, + "time_per_iteration": 2.3781869411468506 + }, + { + "auxiliary_loss_clip": 0.01048767, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.01603127, + "balance_loss_mlp": 1.01489544, + "epoch": 0.9406884112430483, + "flos": 27673659734400.0, + "grad_norm": 1.457525108360432, + "language_loss": 0.70921159, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.73008716, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.33984375, + "step": 15646, + "time_per_iteration": 2.451057195663452 + }, + { + "auxiliary_loss_clip": 0.01049848, + "auxiliary_loss_mlp": 0.0103723, + "balance_loss_clip": 1.01521206, + "balance_loss_mlp": 1.01590347, + "epoch": 0.9407485344957163, + "flos": 23073957642240.0, + "grad_norm": 1.5293283113259832, + "language_loss": 0.74949479, + "learning_rate": 3.667836926755208e-08, + "loss": 0.7703656, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33984375, + "step": 15647, + "time_per_iteration": 2.3760108947753906 + }, + { + "auxiliary_loss_clip": 0.01006811, + "auxiliary_loss_mlp": 0.01002289, + "balance_loss_clip": 1.00028658, + "balance_loss_mlp": 1.00059032, + "epoch": 0.9408086577483842, + "flos": 71010682813440.0, + "grad_norm": 0.8918114597104883, + "language_loss": 0.63546491, + "learning_rate": 3.660416111738907e-08, + "loss": 0.6555559, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06201172, + "step": 15648, + "time_per_iteration": 5.926705598831177 + }, + { + "auxiliary_loss_clip": 0.01049273, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.0107789, + "balance_loss_mlp": 1.01572967, + "epoch": 0.9408687810010522, + "flos": 23729870954880.0, + "grad_norm": 1.4922902923425476, + "language_loss": 0.66908103, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68988144, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.3359375, + "step": 15649, + "time_per_iteration": 2.4111316204071045 + }, + { + "auxiliary_loss_clip": 0.01051639, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.01616514, + "balance_loss_mlp": 1.01548266, + "epoch": 0.9409289042537201, + "flos": 18368328885120.0, + "grad_norm": 1.9097197622058568, + "language_loss": 0.78417587, + "learning_rate": 3.645596817637586e-08, + "loss": 0.80508173, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 15650, + "time_per_iteration": 2.3677749633789062 + }, + { + "auxiliary_loss_clip": 0.0105252, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.01982808, + "balance_loss_mlp": 1.01682079, + "epoch": 0.9409890275063881, + "flos": 23877204359040.0, + "grad_norm": 2.1846560446648167, + "language_loss": 0.75485516, + "learning_rate": 3.638198339114451e-08, + "loss": 0.77580899, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 15651, + "time_per_iteration": 2.4164466857910156 + }, + { + "auxiliary_loss_clip": 0.01051233, + "auxiliary_loss_mlp": 0.01038098, + "balance_loss_clip": 1.01565099, + "balance_loss_mlp": 1.01557755, + "epoch": 0.941049150759056, + "flos": 16544151924480.0, + "grad_norm": 1.948662377684444, + "language_loss": 0.73065972, + "learning_rate": 3.630807306650507e-08, + "loss": 0.751553, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 15652, + "time_per_iteration": 2.3763630390167236 + }, + { + "auxiliary_loss_clip": 0.01055023, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.01537049, + "balance_loss_mlp": 1.01655626, + "epoch": 0.9411092740117241, + "flos": 25117249495680.0, + "grad_norm": 1.7394381525351137, + "language_loss": 0.67121506, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.69216812, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.38476562, + "step": 15653, + "time_per_iteration": 2.4335546493530273 + }, + { + "auxiliary_loss_clip": 0.01051008, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.01342392, + "balance_loss_mlp": 1.01579666, + "epoch": 0.941169397264392, + "flos": 21141200753280.0, + "grad_norm": 1.976442858887634, + "language_loss": 0.78803641, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.80890775, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15654, + "time_per_iteration": 2.3758723735809326 + }, + { + "auxiliary_loss_clip": 0.01052959, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.01341128, + "balance_loss_mlp": 1.01576376, + "epoch": 0.94122952051706, + "flos": 38507383572480.0, + "grad_norm": 1.4673606203937788, + "language_loss": 0.70814204, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72903073, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.37109375, + "step": 15655, + "time_per_iteration": 2.5172908306121826 + }, + { + "auxiliary_loss_clip": 0.01051158, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.01624298, + "balance_loss_mlp": 1.01568627, + "epoch": 0.9412896437697279, + "flos": 18368224151040.0, + "grad_norm": 1.7879057157047091, + "language_loss": 0.73356426, + "learning_rate": 3.601317642987944e-08, + "loss": 0.75448567, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.35546875, + "step": 15656, + "time_per_iteration": 2.3691539764404297 + }, + { + "auxiliary_loss_clip": 0.01049888, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.01427591, + "balance_loss_mlp": 1.01588106, + "epoch": 0.9413497670223959, + "flos": 25883767595520.0, + "grad_norm": 1.8691132579598406, + "language_loss": 0.79492581, + "learning_rate": 3.593963845018377e-08, + "loss": 0.81578088, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33984375, + "step": 15657, + "time_per_iteration": 2.4129810333251953 + }, + { + "auxiliary_loss_clip": 0.0105052, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.0159148, + "balance_loss_mlp": 1.0150702, + "epoch": 0.9414098902750638, + "flos": 16617364778880.0, + "grad_norm": 1.902449610708662, + "language_loss": 0.85148299, + "learning_rate": 3.586617494785371e-08, + "loss": 0.87236726, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35546875, + "step": 15658, + "time_per_iteration": 2.3510165214538574 + }, + { + "auxiliary_loss_clip": 0.01054305, + "auxiliary_loss_mlp": 0.01038494, + "balance_loss_clip": 1.01241136, + "balance_loss_mlp": 1.016559, + "epoch": 0.9414700135277319, + "flos": 18624032749440.0, + "grad_norm": 1.7775656004559017, + "language_loss": 0.72003543, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.7409634, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37890625, + "step": 15659, + "time_per_iteration": 2.377020835876465 + }, + { + "auxiliary_loss_clip": 0.0104931, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.01646733, + "balance_loss_mlp": 1.01525259, + "epoch": 0.9415301367803999, + "flos": 26279124364800.0, + "grad_norm": 1.598641710190227, + "language_loss": 0.801265, + "learning_rate": 3.571947138643172e-08, + "loss": 0.82214659, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 15660, + "time_per_iteration": 3.8955111503601074 + }, + { + "auxiliary_loss_clip": 0.01048704, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.01259542, + "balance_loss_mlp": 1.01517761, + "epoch": 0.9415902600330678, + "flos": 23260184167680.0, + "grad_norm": 1.4392962721244393, + "language_loss": 0.68352115, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70434821, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.3359375, + "step": 15661, + "time_per_iteration": 2.4307644367218018 + }, + { + "auxiliary_loss_clip": 0.01051345, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.01357913, + "balance_loss_mlp": 1.01589549, + "epoch": 0.9416503832857358, + "flos": 14718299218560.0, + "grad_norm": 2.112032573062945, + "language_loss": 0.6746403, + "learning_rate": 3.557306576786434e-08, + "loss": 0.69551229, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 15662, + "time_per_iteration": 2.3466787338256836 + }, + { + "auxiliary_loss_clip": 0.01007344, + "auxiliary_loss_mlp": 0.01002646, + "balance_loss_clip": 1.0006671, + "balance_loss_mlp": 1.00096118, + "epoch": 0.9417105065384037, + "flos": 70309592334720.0, + "grad_norm": 0.766958982859779, + "language_loss": 0.59388822, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61398816, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06347656, + "step": 15663, + "time_per_iteration": 3.0594472885131836 + }, + { + "auxiliary_loss_clip": 0.01055156, + "auxiliary_loss_mlp": 0.01039283, + "balance_loss_clip": 1.01370049, + "balance_loss_mlp": 1.01660132, + "epoch": 0.9417706297910717, + "flos": 34056481161600.0, + "grad_norm": 1.7615910425832437, + "language_loss": 0.68008703, + "learning_rate": 3.542695811435914e-08, + "loss": 0.70103145, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38476562, + "step": 15664, + "time_per_iteration": 2.4801340103149414 + }, + { + "auxiliary_loss_clip": 0.01052926, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.0136354, + "balance_loss_mlp": 1.01700747, + "epoch": 0.9418307530437396, + "flos": 16470694690560.0, + "grad_norm": 2.222533073935684, + "language_loss": 0.74297833, + "learning_rate": 3.535401603143207e-08, + "loss": 0.76387417, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 15665, + "time_per_iteration": 2.370307207107544 + }, + { + "auxiliary_loss_clip": 0.01049788, + "auxiliary_loss_mlp": 0.0103685, + "balance_loss_clip": 1.01548719, + "balance_loss_mlp": 1.01631618, + "epoch": 0.9418908762964077, + "flos": 11252785420800.0, + "grad_norm": 2.95733737200462, + "language_loss": 0.64406824, + "learning_rate": 3.528114844807773e-08, + "loss": 0.66493464, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33398438, + "step": 15666, + "time_per_iteration": 2.344013214111328 + }, + { + "auxiliary_loss_clip": 0.01051705, + "auxiliary_loss_mlp": 0.01036317, + "balance_loss_clip": 1.0135361, + "balance_loss_mlp": 1.01601267, + "epoch": 0.9419509995490756, + "flos": 18437945869440.0, + "grad_norm": 1.7151042841992807, + "language_loss": 0.79952621, + "learning_rate": 3.520835536705902e-08, + "loss": 0.82040638, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35742188, + "step": 15667, + "time_per_iteration": 2.3620805740356445 + }, + { + "auxiliary_loss_clip": 0.01049746, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.01246452, + "balance_loss_mlp": 1.0151757, + "epoch": 0.9420111228017436, + "flos": 20736976498560.0, + "grad_norm": 1.678955583754132, + "language_loss": 0.76715958, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.7879886, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34570312, + "step": 15668, + "time_per_iteration": 2.3686819076538086 + }, + { + "auxiliary_loss_clip": 0.01052544, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.01409018, + "balance_loss_mlp": 1.01543689, + "epoch": 0.9420712460544115, + "flos": 21140886551040.0, + "grad_norm": 4.611883980605143, + "language_loss": 0.60261154, + "learning_rate": 3.506299272306723e-08, + "loss": 0.62352771, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37109375, + "step": 15669, + "time_per_iteration": 2.3777499198913574 + }, + { + "auxiliary_loss_clip": 0.01048029, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.01060319, + "balance_loss_mlp": 1.01487875, + "epoch": 0.9421313693070795, + "flos": 15850846679040.0, + "grad_norm": 1.4650272705966911, + "language_loss": 0.78346336, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.80424809, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33203125, + "step": 15670, + "time_per_iteration": 2.3679869174957275 + }, + { + "auxiliary_loss_clip": 0.01050984, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.01413357, + "balance_loss_mlp": 1.01619506, + "epoch": 0.9421914925597474, + "flos": 32414550831360.0, + "grad_norm": 2.1348573545554252, + "language_loss": 0.66222078, + "learning_rate": 3.491792812150574e-08, + "loss": 0.68309009, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34765625, + "step": 15671, + "time_per_iteration": 2.45574951171875 + }, + { + "auxiliary_loss_clip": 0.01051075, + "auxiliary_loss_mlp": 0.01036603, + "balance_loss_clip": 1.01297569, + "balance_loss_mlp": 1.01554179, + "epoch": 0.9422516158124155, + "flos": 19717512531840.0, + "grad_norm": 1.686994194948593, + "language_loss": 0.80618894, + "learning_rate": 3.48455075935139e-08, + "loss": 0.82706571, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 15672, + "time_per_iteration": 2.4345691204071045 + }, + { + "auxiliary_loss_clip": 0.01054495, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.01406705, + "balance_loss_mlp": 1.01584673, + "epoch": 0.9423117390650835, + "flos": 16252347847680.0, + "grad_norm": 2.957850114322184, + "language_loss": 0.74753499, + "learning_rate": 3.47731615843776e-08, + "loss": 0.76847643, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38671875, + "step": 15673, + "time_per_iteration": 2.3533878326416016 + }, + { + "auxiliary_loss_clip": 0.01050621, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.00982451, + "balance_loss_mlp": 1.0150578, + "epoch": 0.9423718623177514, + "flos": 31795191578880.0, + "grad_norm": 1.4225851448468043, + "language_loss": 0.7104491, + "learning_rate": 3.470089009683974e-08, + "loss": 0.73128909, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35546875, + "step": 15674, + "time_per_iteration": 2.4670565128326416 + }, + { + "auxiliary_loss_clip": 0.01050825, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.00898933, + "balance_loss_mlp": 1.01511526, + "epoch": 0.9424319855704194, + "flos": 23330673936000.0, + "grad_norm": 1.8188527376441446, + "language_loss": 0.81617695, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83699024, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35742188, + "step": 15675, + "time_per_iteration": 2.4221954345703125 + }, + { + "auxiliary_loss_clip": 0.01051244, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.01633072, + "balance_loss_mlp": 1.01587164, + "epoch": 0.9424921088230873, + "flos": 20776567847040.0, + "grad_norm": 1.592897648999295, + "language_loss": 0.63371134, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.65460342, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35351562, + "step": 15676, + "time_per_iteration": 2.3891806602478027 + }, + { + "auxiliary_loss_clip": 0.0105139, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.01234555, + "balance_loss_mlp": 1.01531136, + "epoch": 0.9425522320757553, + "flos": 19025638652160.0, + "grad_norm": 1.7811726629502391, + "language_loss": 0.6793493, + "learning_rate": 3.448452279120984e-08, + "loss": 0.700234, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.359375, + "step": 15677, + "time_per_iteration": 2.360773801803589 + }, + { + "auxiliary_loss_clip": 0.0105241, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.01072443, + "balance_loss_mlp": 1.01515222, + "epoch": 0.9426123553284232, + "flos": 25154187580800.0, + "grad_norm": 1.7110955319939203, + "language_loss": 0.65850306, + "learning_rate": 3.441254941744387e-08, + "loss": 0.67938113, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 15678, + "time_per_iteration": 3.680584192276001 + }, + { + "auxiliary_loss_clip": 0.01051204, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.01131546, + "balance_loss_mlp": 1.01634359, + "epoch": 0.9426724785810913, + "flos": 21178278483840.0, + "grad_norm": 1.865312280566845, + "language_loss": 0.75618827, + "learning_rate": 3.434065057895097e-08, + "loss": 0.77704561, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 15679, + "time_per_iteration": 2.3730101585388184 + }, + { + "auxiliary_loss_clip": 0.01053749, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.01333642, + "balance_loss_mlp": 1.01644516, + "epoch": 0.9427326018337592, + "flos": 14756040264960.0, + "grad_norm": 2.1160773064386933, + "language_loss": 0.78637087, + "learning_rate": 3.426882627845762e-08, + "loss": 0.8072741, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.37304688, + "step": 15680, + "time_per_iteration": 2.351349115371704 + }, + { + "auxiliary_loss_clip": 0.01050805, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.01395762, + "balance_loss_mlp": 1.01575518, + "epoch": 0.9427927250864272, + "flos": 20922574619520.0, + "grad_norm": 1.9274873690265357, + "language_loss": 0.76353133, + "learning_rate": 3.419707651868742e-08, + "loss": 0.7844162, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34960938, + "step": 15681, + "time_per_iteration": 2.358640193939209 + }, + { + "auxiliary_loss_clip": 0.01050909, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.01546264, + "balance_loss_mlp": 1.01551008, + "epoch": 0.9428528483390951, + "flos": 19751587885440.0, + "grad_norm": 2.7010765932653196, + "language_loss": 0.67387354, + "learning_rate": 3.412540130236086e-08, + "loss": 0.69477642, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.35351562, + "step": 15682, + "time_per_iteration": 2.3876149654388428 + }, + { + "auxiliary_loss_clip": 0.01050769, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.01330996, + "balance_loss_mlp": 1.01536489, + "epoch": 0.9429129715917631, + "flos": 24533850810240.0, + "grad_norm": 6.4492329066291765, + "language_loss": 0.77862942, + "learning_rate": 3.405380063219665e-08, + "loss": 0.79951686, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35351562, + "step": 15683, + "time_per_iteration": 2.392002582550049 + }, + { + "auxiliary_loss_clip": 0.01054369, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_clip": 1.01803744, + "balance_loss_mlp": 1.01616371, + "epoch": 0.942973094844431, + "flos": 17959077394560.0, + "grad_norm": 2.5979596334955066, + "language_loss": 0.76860291, + "learning_rate": 3.398227451090885e-08, + "loss": 0.78958887, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.3828125, + "step": 15684, + "time_per_iteration": 2.338413953781128 + }, + { + "auxiliary_loss_clip": 0.01048615, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.0097611, + "balance_loss_mlp": 1.01498389, + "epoch": 0.9430332180970991, + "flos": 26136573816960.0, + "grad_norm": 1.513125249916925, + "language_loss": 0.78003323, + "learning_rate": 3.391082294121017e-08, + "loss": 0.80081022, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.3359375, + "step": 15685, + "time_per_iteration": 2.408259391784668 + }, + { + "auxiliary_loss_clip": 0.01049306, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.01672387, + "balance_loss_mlp": 1.01500905, + "epoch": 0.943093341349767, + "flos": 23950242656640.0, + "grad_norm": 1.833602543248329, + "language_loss": 0.76786196, + "learning_rate": 3.383944592581023e-08, + "loss": 0.78871697, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.34375, + "step": 15686, + "time_per_iteration": 2.3895561695098877 + }, + { + "auxiliary_loss_clip": 0.01052849, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.01787424, + "balance_loss_mlp": 1.01557934, + "epoch": 0.943153464602435, + "flos": 17967421209600.0, + "grad_norm": 1.7689822182921808, + "language_loss": 0.81614923, + "learning_rate": 3.376814346741575e-08, + "loss": 0.83709776, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37304688, + "step": 15687, + "time_per_iteration": 2.354069471359253 + }, + { + "auxiliary_loss_clip": 0.01053648, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.0126996, + "balance_loss_mlp": 1.01639271, + "epoch": 0.943213587855103, + "flos": 14500650602880.0, + "grad_norm": 2.0626068598587595, + "language_loss": 0.77690166, + "learning_rate": 3.369691556873011e-08, + "loss": 0.79782426, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37109375, + "step": 15688, + "time_per_iteration": 5.177797794342041 + }, + { + "auxiliary_loss_clip": 0.01047647, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.00877619, + "balance_loss_mlp": 1.01445019, + "epoch": 0.9432737111077709, + "flos": 28985137246080.0, + "grad_norm": 1.5647809383099627, + "language_loss": 0.69460744, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.71538591, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33203125, + "step": 15689, + "time_per_iteration": 2.430363893508911 + }, + { + "auxiliary_loss_clip": 0.01049501, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.01503158, + "balance_loss_mlp": 1.01619172, + "epoch": 0.9433338343604389, + "flos": 21608199365760.0, + "grad_norm": 1.7651282329268918, + "language_loss": 0.81571043, + "learning_rate": 3.35546834612872e-08, + "loss": 0.83655214, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.33203125, + "step": 15690, + "time_per_iteration": 2.3944578170776367 + }, + { + "auxiliary_loss_clip": 0.01049292, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.01541638, + "balance_loss_mlp": 1.01492918, + "epoch": 0.9433939576131068, + "flos": 33180894374400.0, + "grad_norm": 1.8153269164989358, + "language_loss": 0.61306536, + "learning_rate": 3.348367925792317e-08, + "loss": 0.63392818, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 15691, + "time_per_iteration": 2.452047824859619 + }, + { + "auxiliary_loss_clip": 0.01052879, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.01712489, + "balance_loss_mlp": 1.01677012, + "epoch": 0.9434540808657749, + "flos": 20485322352000.0, + "grad_norm": 1.5130579082667968, + "language_loss": 0.67191529, + "learning_rate": 3.341274962505514e-08, + "loss": 0.69283402, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36132812, + "step": 15692, + "time_per_iteration": 2.38089919090271 + }, + { + "auxiliary_loss_clip": 0.01052184, + "auxiliary_loss_mlp": 0.01036872, + "balance_loss_clip": 1.01270819, + "balance_loss_mlp": 1.01661599, + "epoch": 0.9435142041184428, + "flos": 21541898960640.0, + "grad_norm": 2.093818173948926, + "language_loss": 0.75675499, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77764553, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 15693, + "time_per_iteration": 2.36436128616333 + }, + { + "auxiliary_loss_clip": 0.01051585, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.01151359, + "balance_loss_mlp": 1.01612198, + "epoch": 0.9435743273711108, + "flos": 25007936428800.0, + "grad_norm": 1.6107327433275025, + "language_loss": 0.73632777, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75719404, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 15694, + "time_per_iteration": 2.40400767326355 + }, + { + "auxiliary_loss_clip": 0.01006817, + "auxiliary_loss_mlp": 0.01002314, + "balance_loss_clip": 0.99997795, + "balance_loss_mlp": 1.00051272, + "epoch": 0.9436344506237787, + "flos": 60155172420480.0, + "grad_norm": 0.6997556146662767, + "language_loss": 0.50689393, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52698529, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.06298828, + "step": 15695, + "time_per_iteration": 3.0540308952331543 + }, + { + "auxiliary_loss_clip": 0.01047909, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.01387215, + "balance_loss_mlp": 1.01517987, + "epoch": 0.9436945738764467, + "flos": 22236146812800.0, + "grad_norm": 1.6622802311362894, + "language_loss": 0.66174775, + "learning_rate": 3.312977685229335e-08, + "loss": 0.68256307, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.328125, + "step": 15696, + "time_per_iteration": 2.4006612300872803 + }, + { + "auxiliary_loss_clip": 0.01051873, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.01107192, + "balance_loss_mlp": 1.01652789, + "epoch": 0.9437546971291146, + "flos": 25044036641280.0, + "grad_norm": 1.7213576577931613, + "language_loss": 0.67571533, + "learning_rate": 3.305922011219353e-08, + "loss": 0.69655919, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35351562, + "step": 15697, + "time_per_iteration": 2.407270908355713 + }, + { + "auxiliary_loss_clip": 0.01007196, + "auxiliary_loss_mlp": 0.0100259, + "balance_loss_clip": 1.00050378, + "balance_loss_mlp": 1.00077844, + "epoch": 0.9438148203817827, + "flos": 56787660408960.0, + "grad_norm": 0.8506090074101591, + "language_loss": 0.63308966, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65318751, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06445312, + "step": 15698, + "time_per_iteration": 2.9413440227508545 + }, + { + "auxiliary_loss_clip": 0.01054237, + "auxiliary_loss_mlp": 0.01042373, + "balance_loss_clip": 1.01774395, + "balance_loss_mlp": 1.01646614, + "epoch": 0.9438749436344506, + "flos": 22345285322880.0, + "grad_norm": 1.7582276677596744, + "language_loss": 0.70387596, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72484207, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37890625, + "step": 15699, + "time_per_iteration": 2.395869493484497 + }, + { + "auxiliary_loss_clip": 0.0104836, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.01330101, + "balance_loss_mlp": 1.01444578, + "epoch": 0.9439350668871186, + "flos": 13369953444480.0, + "grad_norm": 3.2685026257879897, + "language_loss": 0.77191794, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.79275179, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 15700, + "time_per_iteration": 3.8112454414367676 + }, + { + "auxiliary_loss_clip": 0.0105067, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.01487017, + "balance_loss_mlp": 1.01568592, + "epoch": 0.9439951901397866, + "flos": 17784371617920.0, + "grad_norm": 1.538161794301533, + "language_loss": 0.71721721, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.73807549, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.34960938, + "step": 15701, + "time_per_iteration": 2.369601011276245 + }, + { + "auxiliary_loss_clip": 0.01054447, + "auxiliary_loss_mlp": 0.01036548, + "balance_loss_clip": 1.01269388, + "balance_loss_mlp": 1.01606381, + "epoch": 0.9440553133924545, + "flos": 18878584538880.0, + "grad_norm": 1.9926690784734284, + "language_loss": 0.78800839, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80891836, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3828125, + "step": 15702, + "time_per_iteration": 2.3600189685821533 + }, + { + "auxiliary_loss_clip": 0.01052109, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.01435804, + "balance_loss_mlp": 1.01584864, + "epoch": 0.9441154366451225, + "flos": 19571959607040.0, + "grad_norm": 1.7766818579195918, + "language_loss": 0.67802179, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.69891316, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 15703, + "time_per_iteration": 2.3622708320617676 + }, + { + "auxiliary_loss_clip": 0.01052455, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.01492, + "balance_loss_mlp": 1.01562691, + "epoch": 0.9441755598977905, + "flos": 30293821848960.0, + "grad_norm": 1.8644875813867392, + "language_loss": 0.74283695, + "learning_rate": 3.256741150552833e-08, + "loss": 0.7637521, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 15704, + "time_per_iteration": 2.4171149730682373 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.01472533, + "balance_loss_mlp": 1.01542127, + "epoch": 0.9442356831504585, + "flos": 20666835843840.0, + "grad_norm": 1.7897907650679277, + "language_loss": 0.75566, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.7765404, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15705, + "time_per_iteration": 2.3887364864349365 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.01545739, + "balance_loss_mlp": 1.01553643, + "epoch": 0.9442958064031264, + "flos": 16106341075200.0, + "grad_norm": 1.9089173177989442, + "language_loss": 0.7800808, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.8009572, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34765625, + "step": 15706, + "time_per_iteration": 2.3486855030059814 + }, + { + "auxiliary_loss_clip": 0.01048838, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.01181996, + "balance_loss_mlp": 1.01579976, + "epoch": 0.9443559296557944, + "flos": 20446394319360.0, + "grad_norm": 1.4893752781675078, + "language_loss": 0.69977427, + "learning_rate": 3.23577554137866e-08, + "loss": 0.72059029, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33007812, + "step": 15707, + "time_per_iteration": 2.3724913597106934 + }, + { + "auxiliary_loss_clip": 0.01046779, + "auxiliary_loss_mlp": 0.01035346, + "balance_loss_clip": 1.01529455, + "balance_loss_mlp": 1.01394308, + "epoch": 0.9444160529084623, + "flos": 21609979845120.0, + "grad_norm": 2.0935765545694203, + "language_loss": 0.70179212, + "learning_rate": 3.22880192727244e-08, + "loss": 0.72261339, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.328125, + "step": 15708, + "time_per_iteration": 2.3791630268096924 + }, + { + "auxiliary_loss_clip": 0.01048398, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.0125246, + "balance_loss_mlp": 1.0153892, + "epoch": 0.9444761761611303, + "flos": 18440808600960.0, + "grad_norm": 2.6958870654620233, + "language_loss": 0.71644783, + "learning_rate": 3.221835774749748e-08, + "loss": 0.7372694, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.33007812, + "step": 15709, + "time_per_iteration": 2.35504412651062 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.01274133, + "balance_loss_mlp": 1.01697791, + "epoch": 0.9445362994137982, + "flos": 20956161214080.0, + "grad_norm": 2.1278301484370585, + "language_loss": 0.86089122, + "learning_rate": 3.214877084074774e-08, + "loss": 0.88175809, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 15710, + "time_per_iteration": 2.35038685798645 + }, + { + "auxiliary_loss_clip": 0.01054751, + "auxiliary_loss_mlp": 0.01039843, + "balance_loss_clip": 1.0151906, + "balance_loss_mlp": 1.01669383, + "epoch": 0.9445964226664663, + "flos": 20302237848960.0, + "grad_norm": 1.5593689613397428, + "language_loss": 0.72561204, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.74655801, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.38085938, + "step": 15711, + "time_per_iteration": 2.368347644805908 + }, + { + "auxiliary_loss_clip": 0.01054099, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.01296139, + "balance_loss_mlp": 1.01839876, + "epoch": 0.9446565459191342, + "flos": 26394826210560.0, + "grad_norm": 1.7496243806983809, + "language_loss": 0.70933217, + "learning_rate": 3.200982089323179e-08, + "loss": 0.73022807, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35742188, + "step": 15712, + "time_per_iteration": 2.4209465980529785 + }, + { + "auxiliary_loss_clip": 0.01053939, + "auxiliary_loss_mlp": 0.01038729, + "balance_loss_clip": 1.01383829, + "balance_loss_mlp": 1.0175941, + "epoch": 0.9447166691718022, + "flos": 16543837722240.0, + "grad_norm": 2.429246153867456, + "language_loss": 0.71340889, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73433554, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36328125, + "step": 15713, + "time_per_iteration": 2.3387863636016846 + }, + { + "auxiliary_loss_clip": 0.01050294, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.01237118, + "balance_loss_mlp": 1.01571119, + "epoch": 0.9447767924244702, + "flos": 29163473804160.0, + "grad_norm": 1.5251877969228511, + "language_loss": 0.77621722, + "learning_rate": 3.187116945125212e-08, + "loss": 0.79707259, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34570312, + "step": 15714, + "time_per_iteration": 2.4268229007720947 + }, + { + "auxiliary_loss_clip": 0.01051506, + "auxiliary_loss_mlp": 0.01039961, + "balance_loss_clip": 1.01620245, + "balance_loss_mlp": 1.0155549, + "epoch": 0.9448369156771381, + "flos": 19274080953600.0, + "grad_norm": 1.948182136061874, + "language_loss": 0.68240595, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.70332062, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.359375, + "step": 15715, + "time_per_iteration": 2.340466260910034 + }, + { + "auxiliary_loss_clip": 0.01051631, + "auxiliary_loss_mlp": 0.01041643, + "balance_loss_clip": 1.0182662, + "balance_loss_mlp": 1.0156678, + "epoch": 0.9448970389298061, + "flos": 23840056805760.0, + "grad_norm": 1.9675959950097006, + "language_loss": 0.76481324, + "learning_rate": 3.173281653583948e-08, + "loss": 0.78574598, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 15716, + "time_per_iteration": 2.421613931655884 + }, + { + "auxiliary_loss_clip": 0.01052931, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.01407206, + "balance_loss_mlp": 1.01658416, + "epoch": 0.944957162182474, + "flos": 22381176067200.0, + "grad_norm": 1.7591417026581748, + "language_loss": 0.63288689, + "learning_rate": 3.166375203215565e-08, + "loss": 0.65379584, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 15717, + "time_per_iteration": 3.7301392555236816 + }, + { + "auxiliary_loss_clip": 0.01051169, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.01769888, + "balance_loss_mlp": 1.01597261, + "epoch": 0.9450172854351421, + "flos": 17382940272000.0, + "grad_norm": 1.5609500568491539, + "language_loss": 0.79615843, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81706494, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.3515625, + "step": 15718, + "time_per_iteration": 2.338719606399536 + }, + { + "auxiliary_loss_clip": 0.01007053, + "auxiliary_loss_mlp": 0.010024, + "balance_loss_clip": 1.00036168, + "balance_loss_mlp": 1.00079274, + "epoch": 0.94507740868781, + "flos": 68462895680640.0, + "grad_norm": 0.7020023369103464, + "language_loss": 0.57897186, + "learning_rate": 3.152584694592719e-08, + "loss": 0.5990665, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.0625, + "step": 15719, + "time_per_iteration": 3.036783218383789 + }, + { + "auxiliary_loss_clip": 0.01052792, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.01394355, + "balance_loss_mlp": 1.01689458, + "epoch": 0.945137531940478, + "flos": 21141410221440.0, + "grad_norm": 1.808888907378482, + "language_loss": 0.77094722, + "learning_rate": 3.145700636861193e-08, + "loss": 0.79185271, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 15720, + "time_per_iteration": 2.409707546234131 + }, + { + "auxiliary_loss_clip": 0.01050407, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.0109508, + "balance_loss_mlp": 1.01543927, + "epoch": 0.9451976551931459, + "flos": 24532803469440.0, + "grad_norm": 1.8868901117773662, + "language_loss": 0.73210549, + "learning_rate": 3.138824043864452e-08, + "loss": 0.75292528, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.34960938, + "step": 15721, + "time_per_iteration": 2.4238338470458984 + }, + { + "auxiliary_loss_clip": 0.01051869, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.01854873, + "balance_loss_mlp": 1.01627827, + "epoch": 0.9452577784458139, + "flos": 23439463332480.0, + "grad_norm": 1.7299126540210101, + "language_loss": 0.86341059, + "learning_rate": 3.131954915863244e-08, + "loss": 0.88434231, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 15722, + "time_per_iteration": 2.388282060623169 + }, + { + "auxiliary_loss_clip": 0.01006695, + "auxiliary_loss_mlp": 0.01001614, + "balance_loss_clip": 0.99962324, + "balance_loss_mlp": 1.00048327, + "epoch": 0.9453179016984818, + "flos": 52014509349120.0, + "grad_norm": 0.8969533625200669, + "language_loss": 0.64559221, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66567528, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06201172, + "step": 15723, + "time_per_iteration": 2.966252565383911 + }, + { + "auxiliary_loss_clip": 0.01054788, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.01330924, + "balance_loss_mlp": 1.01765704, + "epoch": 0.9453780249511499, + "flos": 13472354062080.0, + "grad_norm": 1.9908249120846178, + "language_loss": 0.74402249, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.76495135, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 15724, + "time_per_iteration": 2.353783369064331 + }, + { + "auxiliary_loss_clip": 0.0105029, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.01427817, + "balance_loss_mlp": 1.01510239, + "epoch": 0.9454381482038178, + "flos": 23257391258880.0, + "grad_norm": 2.6483226999533422, + "language_loss": 0.86021805, + "learning_rate": 3.111392324436024e-08, + "loss": 0.88107908, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 15725, + "time_per_iteration": 2.413386344909668 + }, + { + "auxiliary_loss_clip": 0.01051607, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.0082643, + "balance_loss_mlp": 1.01587057, + "epoch": 0.9454982714564858, + "flos": 19495709464320.0, + "grad_norm": 2.393209428691187, + "language_loss": 0.72035265, + "learning_rate": 3.104553059018822e-08, + "loss": 0.74117178, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35742188, + "step": 15726, + "time_per_iteration": 2.3790717124938965 + }, + { + "auxiliary_loss_clip": 0.01053726, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.01768899, + "epoch": 0.9455583947091538, + "flos": 23257007233920.0, + "grad_norm": 1.8265232387882848, + "language_loss": 0.62331235, + "learning_rate": 3.097721259896735e-08, + "loss": 0.64424533, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 15727, + "time_per_iteration": 3.783712863922119 + }, + { + "auxiliary_loss_clip": 0.01048854, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.0161705, + "balance_loss_mlp": 1.01505744, + "epoch": 0.9456185179618217, + "flos": 17672160908160.0, + "grad_norm": 2.0313236856982266, + "language_loss": 0.83083779, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.85169792, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33789062, + "step": 15728, + "time_per_iteration": 2.349702835083008 + }, + { + "auxiliary_loss_clip": 0.01006989, + "auxiliary_loss_mlp": 0.01004247, + "balance_loss_clip": 1.00225627, + "balance_loss_mlp": 1.00068545, + "epoch": 0.9456786412144897, + "flos": 61412046698880.0, + "grad_norm": 0.7274375207276249, + "language_loss": 0.59050035, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61061275, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06298828, + "step": 15729, + "time_per_iteration": 2.9529356956481934 + }, + { + "auxiliary_loss_clip": 0.01049143, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.01321554, + "balance_loss_mlp": 1.01421511, + "epoch": 0.9457387644671577, + "flos": 18587373955200.0, + "grad_norm": 2.1071409430480292, + "language_loss": 0.77911067, + "learning_rate": 3.077270662890052e-08, + "loss": 0.79997301, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34960938, + "step": 15730, + "time_per_iteration": 2.3394429683685303 + }, + { + "auxiliary_loss_clip": 0.01050876, + "auxiliary_loss_mlp": 0.01038493, + "balance_loss_clip": 1.01347136, + "balance_loss_mlp": 1.0153923, + "epoch": 0.9457988877198257, + "flos": 21107404690560.0, + "grad_norm": 1.573660215432561, + "language_loss": 0.63896251, + "learning_rate": 3.070468731536047e-08, + "loss": 0.6598562, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35546875, + "step": 15731, + "time_per_iteration": 2.3992385864257812 + }, + { + "auxiliary_loss_clip": 0.01053233, + "auxiliary_loss_mlp": 0.0103945, + "balance_loss_clip": 1.0147264, + "balance_loss_mlp": 1.01643026, + "epoch": 0.9458590109724936, + "flos": 26687153957760.0, + "grad_norm": 1.7031635192331982, + "language_loss": 0.64893425, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66986114, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 15732, + "time_per_iteration": 2.4150149822235107 + }, + { + "auxiliary_loss_clip": 0.01052734, + "auxiliary_loss_mlp": 0.01039815, + "balance_loss_clip": 1.01410115, + "balance_loss_mlp": 1.01579356, + "epoch": 0.9459191342251616, + "flos": 18660586809600.0, + "grad_norm": 1.8639573177589215, + "language_loss": 0.85138643, + "learning_rate": 3.056887271848363e-08, + "loss": 0.87231195, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36914062, + "step": 15733, + "time_per_iteration": 2.3545610904693604 + }, + { + "auxiliary_loss_clip": 0.01050259, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.01372635, + "balance_loss_mlp": 1.01625395, + "epoch": 0.9459792574778295, + "flos": 23397498011520.0, + "grad_norm": 1.633419121414046, + "language_loss": 0.73101872, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.75185525, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.33984375, + "step": 15734, + "time_per_iteration": 2.3664653301239014 + }, + { + "auxiliary_loss_clip": 0.01047515, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.01427782, + "balance_loss_mlp": 1.01481378, + "epoch": 0.9460393807304975, + "flos": 24391719198720.0, + "grad_norm": 1.8496724832755427, + "language_loss": 0.87283492, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89364183, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.328125, + "step": 15735, + "time_per_iteration": 2.4172842502593994 + }, + { + "auxiliary_loss_clip": 0.01051713, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.0124222, + "balance_loss_mlp": 1.01610923, + "epoch": 0.9460995039831654, + "flos": 21938477627520.0, + "grad_norm": 1.8817584183315856, + "language_loss": 0.68113804, + "learning_rate": 3.036571093728102e-08, + "loss": 0.70200282, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 15736, + "time_per_iteration": 2.3443260192871094 + }, + { + "auxiliary_loss_clip": 0.01007251, + "auxiliary_loss_mlp": 0.01004608, + "balance_loss_clip": 1.00260508, + "balance_loss_mlp": 1.00084352, + "epoch": 0.9461596272358335, + "flos": 70319088224640.0, + "grad_norm": 3.345535156247148, + "language_loss": 0.65383285, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67395145, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06445312, + "step": 15737, + "time_per_iteration": 3.0105857849121094 + }, + { + "auxiliary_loss_clip": 0.01006807, + "auxiliary_loss_mlp": 0.01003425, + "balance_loss_clip": 1.00149429, + "balance_loss_mlp": 1.00062609, + "epoch": 0.9462197504885014, + "flos": 58587711753600.0, + "grad_norm": 0.8032576440295021, + "language_loss": 0.58887529, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60897762, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06201172, + "step": 15738, + "time_per_iteration": 3.0291783809661865 + }, + { + "auxiliary_loss_clip": 0.01049333, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.01648343, + "balance_loss_mlp": 1.0156709, + "epoch": 0.9462798737411694, + "flos": 23432830174080.0, + "grad_norm": 1.7100755897649076, + "language_loss": 0.72265196, + "learning_rate": 3.016322135462834e-08, + "loss": 0.74351811, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.3359375, + "step": 15739, + "time_per_iteration": 3.7920031547546387 + }, + { + "auxiliary_loss_clip": 0.01051422, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.01459348, + "balance_loss_mlp": 1.0161258, + "epoch": 0.9463399969938374, + "flos": 25044909425280.0, + "grad_norm": 2.311977274133704, + "language_loss": 0.65706706, + "learning_rate": 3.009587421648363e-08, + "loss": 0.67798686, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.35351562, + "step": 15740, + "time_per_iteration": 2.4056015014648438 + }, + { + "auxiliary_loss_clip": 0.01050302, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01148176, + "balance_loss_mlp": 1.01619697, + "epoch": 0.9464001202465053, + "flos": 24351464534400.0, + "grad_norm": 1.7601092745326523, + "language_loss": 0.67770946, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.69854569, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 15741, + "time_per_iteration": 2.4002838134765625 + }, + { + "auxiliary_loss_clip": 0.01051897, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.01295495, + "balance_loss_mlp": 1.01601577, + "epoch": 0.9464602434991733, + "flos": 17164488695040.0, + "grad_norm": 1.8318941130114585, + "language_loss": 0.76413423, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.7850132, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 15742, + "time_per_iteration": 2.3537046909332275 + }, + { + "auxiliary_loss_clip": 0.01049763, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.01157522, + "balance_loss_mlp": 1.01582313, + "epoch": 0.9465203667518413, + "flos": 19937081272320.0, + "grad_norm": 1.701882020607607, + "language_loss": 0.73186368, + "learning_rate": 2.989428100602187e-08, + "loss": 0.75267851, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 15743, + "time_per_iteration": 2.356045961380005 + }, + { + "auxiliary_loss_clip": 0.01052126, + "auxiliary_loss_mlp": 0.01036346, + "balance_loss_clip": 1.01178908, + "balance_loss_mlp": 1.01549971, + "epoch": 0.9465804900045093, + "flos": 20119292991360.0, + "grad_norm": 1.5473158208878013, + "language_loss": 0.81053233, + "learning_rate": 2.982723267901943e-08, + "loss": 0.83141708, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 15744, + "time_per_iteration": 2.3845858573913574 + }, + { + "auxiliary_loss_clip": 0.01053372, + "auxiliary_loss_mlp": 0.01040665, + "balance_loss_clip": 1.01684713, + "balance_loss_mlp": 1.01688266, + "epoch": 0.9466406132571772, + "flos": 23910581485440.0, + "grad_norm": 1.6986603999036616, + "language_loss": 0.79273266, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.81367308, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 15745, + "time_per_iteration": 2.3632452487945557 + }, + { + "auxiliary_loss_clip": 0.01052721, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02128458, + "balance_loss_mlp": 1.01596212, + "epoch": 0.9467007365098452, + "flos": 19932333327360.0, + "grad_norm": 1.54415685289877, + "language_loss": 0.70964062, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.73062372, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3671875, + "step": 15746, + "time_per_iteration": 2.3761560916900635 + }, + { + "auxiliary_loss_clip": 0.0105129, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.0136658, + "balance_loss_mlp": 1.01568663, + "epoch": 0.9467608597625131, + "flos": 19309692407040.0, + "grad_norm": 2.1451746300962546, + "language_loss": 0.57907569, + "learning_rate": 2.962653596305964e-08, + "loss": 0.59995824, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 15747, + "time_per_iteration": 2.3392863273620605 + }, + { + "auxiliary_loss_clip": 0.01006986, + "auxiliary_loss_mlp": 0.01002343, + "balance_loss_clip": 1.00037611, + "balance_loss_mlp": 1.00056291, + "epoch": 0.9468209830151811, + "flos": 69626865231360.0, + "grad_norm": 0.6611829450844628, + "language_loss": 0.53334635, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55343962, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.06445312, + "step": 15748, + "time_per_iteration": 3.1864516735076904 + }, + { + "auxiliary_loss_clip": 0.01053096, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.01681805, + "balance_loss_mlp": 1.01724553, + "epoch": 0.946881106267849, + "flos": 27015407360640.0, + "grad_norm": 1.645128205066435, + "language_loss": 0.6795038, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.70043314, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35742188, + "step": 15749, + "time_per_iteration": 2.413214921951294 + }, + { + "auxiliary_loss_clip": 0.01053437, + "auxiliary_loss_mlp": 0.01039913, + "balance_loss_clip": 1.01417553, + "balance_loss_mlp": 1.01645637, + "epoch": 0.9469412295205171, + "flos": 20189154355200.0, + "grad_norm": 2.032378175357304, + "language_loss": 0.7736485, + "learning_rate": 2.942651169791621e-08, + "loss": 0.79458201, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36914062, + "step": 15750, + "time_per_iteration": 2.366022825241089 + }, + { + "auxiliary_loss_clip": 0.01049806, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.01484275, + "balance_loss_mlp": 1.01507854, + "epoch": 0.947001352773185, + "flos": 21323831408640.0, + "grad_norm": 1.6304530114950968, + "language_loss": 0.68132639, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70219821, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 15751, + "time_per_iteration": 2.359426975250244 + }, + { + "auxiliary_loss_clip": 0.01051278, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.01390803, + "balance_loss_mlp": 1.01581454, + "epoch": 0.947061476025853, + "flos": 21942981192960.0, + "grad_norm": 1.6053485425333964, + "language_loss": 0.65992463, + "learning_rate": 2.929353580532723e-08, + "loss": 0.68079329, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 15752, + "time_per_iteration": 2.389608144760132 + }, + { + "auxiliary_loss_clip": 0.01051341, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.02047551, + "balance_loss_mlp": 1.01596475, + "epoch": 0.947121599278521, + "flos": 21393727683840.0, + "grad_norm": 1.649038898526814, + "language_loss": 0.72704154, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.74799395, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 15753, + "time_per_iteration": 2.3667352199554443 + }, + { + "auxiliary_loss_clip": 0.01053558, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.01375103, + "balance_loss_mlp": 1.01605284, + "epoch": 0.9471817225311889, + "flos": 23074620958080.0, + "grad_norm": 3.2016480156782885, + "language_loss": 0.72156245, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.74250913, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.375, + "step": 15754, + "time_per_iteration": 2.3784995079040527 + }, + { + "auxiliary_loss_clip": 0.01052847, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.01383758, + "balance_loss_mlp": 1.01563597, + "epoch": 0.947241845783857, + "flos": 11909955542400.0, + "grad_norm": 2.1257624029497078, + "language_loss": 0.80202901, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.82292974, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.37109375, + "step": 15755, + "time_per_iteration": 2.322429656982422 + }, + { + "auxiliary_loss_clip": 0.01056105, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.01423943, + "balance_loss_mlp": 1.01645529, + "epoch": 0.9473019690365249, + "flos": 20739629761920.0, + "grad_norm": 2.4589946703026953, + "language_loss": 0.76531327, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.78628355, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.39648438, + "step": 15756, + "time_per_iteration": 2.4107859134674072 + }, + { + "auxiliary_loss_clip": 0.01052551, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.01306951, + "balance_loss_mlp": 1.01684213, + "epoch": 0.9473620922891929, + "flos": 17638923427200.0, + "grad_norm": 2.0544760427845774, + "language_loss": 0.76171911, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.78259164, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35742188, + "step": 15757, + "time_per_iteration": 3.5786876678466797 + }, + { + "auxiliary_loss_clip": 0.01053403, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.01428819, + "balance_loss_mlp": 1.01627028, + "epoch": 0.9474222155418608, + "flos": 23548881133440.0, + "grad_norm": 2.4125512415762604, + "language_loss": 0.80745155, + "learning_rate": 2.889640171327512e-08, + "loss": 0.82836914, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 15758, + "time_per_iteration": 2.39438533782959 + }, + { + "auxiliary_loss_clip": 0.01049845, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.0154866, + "balance_loss_mlp": 1.01568794, + "epoch": 0.9474823387945288, + "flos": 27088515480960.0, + "grad_norm": 1.436839291673015, + "language_loss": 0.72752243, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.7483902, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 15759, + "time_per_iteration": 2.39927077293396 + }, + { + "auxiliary_loss_clip": 0.01048563, + "auxiliary_loss_mlp": 0.01033289, + "balance_loss_clip": 1.01410866, + "balance_loss_mlp": 1.01566064, + "epoch": 0.9475424620471967, + "flos": 22965412625280.0, + "grad_norm": 1.4276200640636403, + "language_loss": 0.76750857, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.7883271, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.328125, + "step": 15760, + "time_per_iteration": 2.4052064418792725 + }, + { + "auxiliary_loss_clip": 0.01051921, + "auxiliary_loss_mlp": 0.01033266, + "balance_loss_clip": 1.01254773, + "balance_loss_mlp": 1.0167259, + "epoch": 0.9476025852998647, + "flos": 20046638718720.0, + "grad_norm": 1.831280554667467, + "language_loss": 0.7392031, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.76005495, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3515625, + "step": 15761, + "time_per_iteration": 2.3365721702575684 + }, + { + "auxiliary_loss_clip": 0.01051774, + "auxiliary_loss_mlp": 0.01038277, + "balance_loss_clip": 1.01637864, + "balance_loss_mlp": 1.01695716, + "epoch": 0.9476627085525327, + "flos": 14974596576000.0, + "grad_norm": 2.0373545266415047, + "language_loss": 0.73546183, + "learning_rate": 2.863314050734722e-08, + "loss": 0.75636244, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 15762, + "time_per_iteration": 2.3458595275878906 + }, + { + "auxiliary_loss_clip": 0.01052827, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.01399112, + "balance_loss_mlp": 1.01517081, + "epoch": 0.9477228318052007, + "flos": 18696791756160.0, + "grad_norm": 1.8549977343844324, + "language_loss": 0.68747044, + "learning_rate": 2.856751208570518e-08, + "loss": 0.70838565, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.375, + "step": 15763, + "time_per_iteration": 2.343414783477783 + }, + { + "auxiliary_loss_clip": 0.01051421, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.01375699, + "balance_loss_mlp": 1.01553822, + "epoch": 0.9477829550578686, + "flos": 23874027425280.0, + "grad_norm": 1.7249302035119354, + "language_loss": 0.71828848, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.73916018, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.359375, + "step": 15764, + "time_per_iteration": 2.377281665802002 + }, + { + "auxiliary_loss_clip": 0.01049942, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.01291132, + "balance_loss_mlp": 1.01787353, + "epoch": 0.9478430783105366, + "flos": 22561851686400.0, + "grad_norm": 1.8978506820146028, + "language_loss": 0.72125614, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.7420789, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.3203125, + "step": 15765, + "time_per_iteration": 2.3711142539978027 + }, + { + "auxiliary_loss_clip": 0.01006984, + "auxiliary_loss_mlp": 0.01002558, + "balance_loss_clip": 1.00059092, + "balance_loss_mlp": 1.00059164, + "epoch": 0.9479032015632046, + "flos": 60855182513280.0, + "grad_norm": 0.812818388173387, + "language_loss": 0.59277248, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61286789, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.06396484, + "step": 15766, + "time_per_iteration": 2.7828454971313477 + }, + { + "auxiliary_loss_clip": 0.01051696, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.01502442, + "balance_loss_mlp": 1.01631093, + "epoch": 0.9479633248158725, + "flos": 14683001967360.0, + "grad_norm": 1.9345226492966823, + "language_loss": 0.76072192, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.78161216, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 15767, + "time_per_iteration": 5.166979074478149 + }, + { + "auxiliary_loss_clip": 0.01053683, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.01594377, + "balance_loss_mlp": 1.01641369, + "epoch": 0.9480234480685406, + "flos": 20332996623360.0, + "grad_norm": 2.439916371343903, + "language_loss": 0.744923, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.76585042, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.37304688, + "step": 15768, + "time_per_iteration": 2.331920862197876 + }, + { + "auxiliary_loss_clip": 0.01006985, + "auxiliary_loss_mlp": 0.01002668, + "balance_loss_clip": 1.00055826, + "balance_loss_mlp": 1.00066161, + "epoch": 0.9480835713212085, + "flos": 70289516436480.0, + "grad_norm": 0.7373441220008005, + "language_loss": 0.55396569, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57406223, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06347656, + "step": 15769, + "time_per_iteration": 3.0290675163269043 + }, + { + "auxiliary_loss_clip": 0.01050376, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.01279151, + "balance_loss_mlp": 1.01531935, + "epoch": 0.9481436945738765, + "flos": 25448505275520.0, + "grad_norm": 1.3188865045926519, + "language_loss": 0.7798844, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.80073708, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 15770, + "time_per_iteration": 2.4093711376190186 + }, + { + "auxiliary_loss_clip": 0.01053756, + "auxiliary_loss_mlp": 0.01039875, + "balance_loss_clip": 1.0146389, + "balance_loss_mlp": 1.01718915, + "epoch": 0.9482038178265444, + "flos": 26978678743680.0, + "grad_norm": 1.8548990544039272, + "language_loss": 0.80541909, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.8263554, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 15771, + "time_per_iteration": 2.410914659500122 + }, + { + "auxiliary_loss_clip": 0.01051825, + "auxiliary_loss_mlp": 0.01040182, + "balance_loss_clip": 1.01775885, + "balance_loss_mlp": 1.0162394, + "epoch": 0.9482639410792124, + "flos": 17784476352000.0, + "grad_norm": 1.91729281179819, + "language_loss": 0.71073043, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.73165053, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 15772, + "time_per_iteration": 2.3288848400115967 + }, + { + "auxiliary_loss_clip": 0.01051808, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.01109636, + "balance_loss_mlp": 1.01643836, + "epoch": 0.9483240643318803, + "flos": 20995612917120.0, + "grad_norm": 1.5189649135938619, + "language_loss": 0.74679327, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.76763541, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35351562, + "step": 15773, + "time_per_iteration": 2.3830127716064453 + }, + { + "auxiliary_loss_clip": 0.01051614, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_clip": 1.020051, + "balance_loss_mlp": 1.015378, + "epoch": 0.9483841875845483, + "flos": 20082285083520.0, + "grad_norm": 2.1608065594412027, + "language_loss": 0.63782197, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.65879023, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 15774, + "time_per_iteration": 2.342543601989746 + }, + { + "auxiliary_loss_clip": 0.01051416, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.01305795, + "balance_loss_mlp": 1.01532567, + "epoch": 0.9484443108372163, + "flos": 20812598236800.0, + "grad_norm": 1.836315133660154, + "language_loss": 0.6033777, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.62425613, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36132812, + "step": 15775, + "time_per_iteration": 2.3637495040893555 + }, + { + "auxiliary_loss_clip": 0.01052092, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.01695418, + "balance_loss_mlp": 1.01555753, + "epoch": 0.9485044340898843, + "flos": 36427712215680.0, + "grad_norm": 1.5475583933942398, + "language_loss": 0.63015133, + "learning_rate": 2.772114638584555e-08, + "loss": 0.6510734, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36523438, + "step": 15776, + "time_per_iteration": 2.479140281677246 + }, + { + "auxiliary_loss_clip": 0.01051124, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.01497483, + "balance_loss_mlp": 1.01573062, + "epoch": 0.9485645573425522, + "flos": 22601408123520.0, + "grad_norm": 1.713684047541088, + "language_loss": 0.74484086, + "learning_rate": 2.765656478622458e-08, + "loss": 0.76573515, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35351562, + "step": 15777, + "time_per_iteration": 2.39367938041687 + }, + { + "auxiliary_loss_clip": 0.01056242, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.01395464, + "balance_loss_mlp": 1.01673436, + "epoch": 0.9486246805952202, + "flos": 22016682806400.0, + "grad_norm": 2.366852912526445, + "language_loss": 0.73864162, + "learning_rate": 2.759205797806441e-08, + "loss": 0.75961173, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.39453125, + "step": 15778, + "time_per_iteration": 2.4085848331451416 + }, + { + "auxiliary_loss_clip": 0.01048108, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.01165605, + "balance_loss_mlp": 1.01622701, + "epoch": 0.9486848038478882, + "flos": 16507737509760.0, + "grad_norm": 1.874035045407014, + "language_loss": 0.70931852, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.73011273, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.31835938, + "step": 15779, + "time_per_iteration": 3.8160581588745117 + }, + { + "auxiliary_loss_clip": 0.01051313, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.01617515, + "balance_loss_mlp": 1.01591027, + "epoch": 0.9487449271005561, + "flos": 19243392001920.0, + "grad_norm": 2.1594258132610964, + "language_loss": 0.79951775, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.82043672, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 15780, + "time_per_iteration": 2.405877113342285 + }, + { + "auxiliary_loss_clip": 0.01052869, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.01052999, + "balance_loss_mlp": 1.01689065, + "epoch": 0.9488050503532242, + "flos": 21761607346560.0, + "grad_norm": 1.824461358554288, + "language_loss": 0.66746843, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68832469, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 15781, + "time_per_iteration": 2.396348476409912 + }, + { + "auxiliary_loss_clip": 0.01050663, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.01505208, + "balance_loss_mlp": 1.01588678, + "epoch": 0.9488651736058921, + "flos": 18367944860160.0, + "grad_norm": 1.9328750435883062, + "language_loss": 0.80601978, + "learning_rate": 2.733477870890999e-08, + "loss": 0.82691932, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.34765625, + "step": 15782, + "time_per_iteration": 2.347132682800293 + }, + { + "auxiliary_loss_clip": 0.01006951, + "auxiliary_loss_mlp": 0.01003248, + "balance_loss_clip": 1.00120986, + "balance_loss_mlp": 1.0007534, + "epoch": 0.9489252968585601, + "flos": 70080211635840.0, + "grad_norm": 0.7177658861799657, + "language_loss": 0.59844077, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61854273, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.06201172, + "step": 15783, + "time_per_iteration": 3.133420705795288 + }, + { + "auxiliary_loss_clip": 0.0105305, + "auxiliary_loss_mlp": 0.01041193, + "balance_loss_clip": 1.01663589, + "balance_loss_mlp": 1.01687169, + "epoch": 0.948985420111228, + "flos": 27854195708160.0, + "grad_norm": 1.6353976976184954, + "language_loss": 0.74750841, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76845086, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36132812, + "step": 15784, + "time_per_iteration": 2.414520263671875 + }, + { + "auxiliary_loss_clip": 0.01051914, + "auxiliary_loss_mlp": 0.01038284, + "balance_loss_clip": 1.01249862, + "balance_loss_mlp": 1.01556623, + "epoch": 0.949045543363896, + "flos": 24314910474240.0, + "grad_norm": 1.9744259501028116, + "language_loss": 0.7099762, + "learning_rate": 2.714260468695806e-08, + "loss": 0.73087811, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36328125, + "step": 15785, + "time_per_iteration": 2.407527446746826 + }, + { + "auxiliary_loss_clip": 0.01051385, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.01070333, + "balance_loss_mlp": 1.01540065, + "epoch": 0.9491056666165639, + "flos": 24240580456320.0, + "grad_norm": 1.4429142743868508, + "language_loss": 0.77154326, + "learning_rate": 2.707869629830495e-08, + "loss": 0.79240382, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 15786, + "time_per_iteration": 2.378446578979492 + }, + { + "auxiliary_loss_clip": 0.01051486, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.01831484, + "balance_loss_mlp": 1.01598048, + "epoch": 0.949165789869232, + "flos": 24530220028800.0, + "grad_norm": 1.7446507128857964, + "language_loss": 0.7962755, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81719244, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35546875, + "step": 15787, + "time_per_iteration": 2.4200310707092285 + }, + { + "auxiliary_loss_clip": 0.01051004, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.0144614, + "balance_loss_mlp": 1.01679325, + "epoch": 0.9492259131218999, + "flos": 22234296510720.0, + "grad_norm": 1.4995088344008733, + "language_loss": 0.77073526, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.79161632, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34179688, + "step": 15788, + "time_per_iteration": 2.369018793106079 + }, + { + "auxiliary_loss_clip": 0.01051984, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.01674962, + "balance_loss_mlp": 1.0156194, + "epoch": 0.9492860363745679, + "flos": 22965203157120.0, + "grad_norm": 1.8210914559542963, + "language_loss": 0.7247588, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.74568349, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36328125, + "step": 15789, + "time_per_iteration": 2.4013288021087646 + }, + { + "auxiliary_loss_clip": 0.01050087, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.01257348, + "balance_loss_mlp": 1.0156306, + "epoch": 0.9493461596272358, + "flos": 18369271491840.0, + "grad_norm": 1.9197003646407504, + "language_loss": 0.74717146, + "learning_rate": 2.682381090161989e-08, + "loss": 0.76804769, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.34375, + "step": 15790, + "time_per_iteration": 2.3436598777770996 + }, + { + "auxiliary_loss_clip": 0.01052926, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.0160892, + "balance_loss_mlp": 1.015625, + "epoch": 0.9494062828799038, + "flos": 20010678151680.0, + "grad_norm": 2.0222143829363985, + "language_loss": 0.79554653, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.81648904, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37304688, + "step": 15791, + "time_per_iteration": 2.3818650245666504 + }, + { + "auxiliary_loss_clip": 0.01054235, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.01416469, + "balance_loss_mlp": 1.01602292, + "epoch": 0.9494664061325718, + "flos": 27227539981440.0, + "grad_norm": 1.979923922052732, + "language_loss": 0.75001168, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.77094185, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3828125, + "step": 15792, + "time_per_iteration": 2.4420721530914307 + }, + { + "auxiliary_loss_clip": 0.01051171, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.01235151, + "balance_loss_mlp": 1.01564395, + "epoch": 0.9495265293852397, + "flos": 18368817644160.0, + "grad_norm": 2.9060022726019583, + "language_loss": 0.79395521, + "learning_rate": 2.663343248754679e-08, + "loss": 0.81481302, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 15793, + "time_per_iteration": 2.3762784004211426 + }, + { + "auxiliary_loss_clip": 0.01050113, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.01739216, + "balance_loss_mlp": 1.01555562, + "epoch": 0.9495866526379078, + "flos": 23074655869440.0, + "grad_norm": 1.6289336463375503, + "language_loss": 0.78482169, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.8057211, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 15794, + "time_per_iteration": 2.3643834590911865 + }, + { + "auxiliary_loss_clip": 0.01053192, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.01452279, + "balance_loss_mlp": 1.01622009, + "epoch": 0.9496467758905757, + "flos": 17528947044480.0, + "grad_norm": 1.8333361013214984, + "language_loss": 0.61554813, + "learning_rate": 2.650688769211107e-08, + "loss": 0.63648069, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37109375, + "step": 15795, + "time_per_iteration": 2.3806416988372803 + }, + { + "auxiliary_loss_clip": 0.01050474, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.01111174, + "balance_loss_mlp": 1.01591456, + "epoch": 0.9497068991432437, + "flos": 24132768577920.0, + "grad_norm": 1.8133118896862435, + "language_loss": 0.80635405, + "learning_rate": 2.644372754577895e-08, + "loss": 0.82719582, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34570312, + "step": 15796, + "time_per_iteration": 3.691746234893799 + }, + { + "auxiliary_loss_clip": 0.01052817, + "auxiliary_loss_mlp": 0.0104038, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.01675487, + "epoch": 0.9497670223959116, + "flos": 20302272760320.0, + "grad_norm": 1.85903592389378, + "language_loss": 0.76504666, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.78597862, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.359375, + "step": 15797, + "time_per_iteration": 2.4051671028137207 + }, + { + "auxiliary_loss_clip": 0.01053172, + "auxiliary_loss_mlp": 0.01042131, + "balance_loss_clip": 1.01940966, + "balance_loss_mlp": 1.01714063, + "epoch": 0.9498271456485796, + "flos": 13698067201920.0, + "grad_norm": 2.0128197947402846, + "language_loss": 0.66863424, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.68958724, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.359375, + "step": 15798, + "time_per_iteration": 2.3455209732055664 + }, + { + "auxiliary_loss_clip": 0.01053339, + "auxiliary_loss_mlp": 0.01038318, + "balance_loss_clip": 1.01612127, + "balance_loss_mlp": 1.01687527, + "epoch": 0.9498872689012475, + "flos": 20812947350400.0, + "grad_norm": 2.658792667723561, + "language_loss": 0.77899957, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79991609, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.36523438, + "step": 15799, + "time_per_iteration": 2.431056261062622 + }, + { + "auxiliary_loss_clip": 0.0105031, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.01260281, + "balance_loss_mlp": 1.01577735, + "epoch": 0.9499473921539155, + "flos": 21031643306880.0, + "grad_norm": 1.6386593781683665, + "language_loss": 0.72319591, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.7440356, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 15800, + "time_per_iteration": 2.358853816986084 + }, + { + "auxiliary_loss_clip": 0.01050057, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.0119381, + "balance_loss_mlp": 1.01505017, + "epoch": 0.9500075154065835, + "flos": 20997567953280.0, + "grad_norm": 1.741097787848468, + "language_loss": 0.7292456, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.75008857, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 15801, + "time_per_iteration": 2.4055848121643066 + }, + { + "auxiliary_loss_clip": 0.01052263, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.01132059, + "balance_loss_mlp": 1.01650167, + "epoch": 0.9500676386592515, + "flos": 25120705720320.0, + "grad_norm": 1.5642727877451794, + "language_loss": 0.82085252, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.8417123, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 15802, + "time_per_iteration": 2.410964250564575 + }, + { + "auxiliary_loss_clip": 0.01053156, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.01771951, + "balance_loss_mlp": 1.01748276, + "epoch": 0.9501277619119194, + "flos": 27522486080640.0, + "grad_norm": 1.8014040037056822, + "language_loss": 0.700629, + "learning_rate": 2.60037021038646e-08, + "loss": 0.72157431, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 15803, + "time_per_iteration": 2.4151597023010254 + }, + { + "auxiliary_loss_clip": 0.01050508, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.01274633, + "balance_loss_mlp": 1.01532125, + "epoch": 0.9501878851645874, + "flos": 20812912439040.0, + "grad_norm": 1.60386733006486, + "language_loss": 0.76881194, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78967363, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 15804, + "time_per_iteration": 2.353442430496216 + }, + { + "auxiliary_loss_clip": 0.01053483, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.01130688, + "balance_loss_mlp": 1.01724744, + "epoch": 0.9502480084172553, + "flos": 18368398707840.0, + "grad_norm": 1.7102479561068251, + "language_loss": 0.74626267, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.76714504, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 15805, + "time_per_iteration": 2.3542933464050293 + }, + { + "auxiliary_loss_clip": 0.01053945, + "auxiliary_loss_mlp": 0.01044383, + "balance_loss_clip": 1.01875293, + "balance_loss_mlp": 1.01737309, + "epoch": 0.9503081316699233, + "flos": 23548497108480.0, + "grad_norm": 1.41784939736946, + "language_loss": 0.81602895, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.83701217, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36523438, + "step": 15806, + "time_per_iteration": 3.779156446456909 + }, + { + "auxiliary_loss_clip": 0.01053679, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.01321054, + "balance_loss_mlp": 1.01606154, + "epoch": 0.9503682549225914, + "flos": 18039481989120.0, + "grad_norm": 2.006503519273143, + "language_loss": 0.83730197, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.85819864, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.375, + "step": 15807, + "time_per_iteration": 3.8365230560302734 + }, + { + "auxiliary_loss_clip": 0.01049858, + "auxiliary_loss_mlp": 0.01031757, + "balance_loss_clip": 1.01125288, + "balance_loss_mlp": 1.01537561, + "epoch": 0.9504283781752593, + "flos": 25884919670400.0, + "grad_norm": 1.8949183599474633, + "language_loss": 0.73129475, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.7521109, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 15808, + "time_per_iteration": 2.409778356552124 + }, + { + "auxiliary_loss_clip": 0.01051121, + "auxiliary_loss_mlp": 0.01037303, + "balance_loss_clip": 1.01445079, + "balance_loss_mlp": 1.01613343, + "epoch": 0.9504885014279273, + "flos": 22123028407680.0, + "grad_norm": 1.5375428449142265, + "language_loss": 0.70733684, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72822106, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 15809, + "time_per_iteration": 2.367258071899414 + }, + { + "auxiliary_loss_clip": 0.0105088, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.01018357, + "balance_loss_mlp": 1.01569915, + "epoch": 0.9505486246805952, + "flos": 21614902346880.0, + "grad_norm": 1.5464392002674792, + "language_loss": 0.76276183, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.78358924, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 15810, + "time_per_iteration": 2.3793888092041016 + }, + { + "auxiliary_loss_clip": 0.01051733, + "auxiliary_loss_mlp": 0.01046664, + "balance_loss_clip": 1.02085519, + "balance_loss_mlp": 1.01553297, + "epoch": 0.9506087479332632, + "flos": 22527147928320.0, + "grad_norm": 1.389929406750289, + "language_loss": 0.80940664, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.83039057, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36328125, + "step": 15811, + "time_per_iteration": 2.3715226650238037 + }, + { + "auxiliary_loss_clip": 0.0105169, + "auxiliary_loss_mlp": 0.01037332, + "balance_loss_clip": 1.01501656, + "balance_loss_mlp": 1.01580882, + "epoch": 0.9506688711859311, + "flos": 27526116862080.0, + "grad_norm": 2.080237516477039, + "language_loss": 0.713368, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.73425829, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.359375, + "step": 15812, + "time_per_iteration": 2.421966791152954 + }, + { + "auxiliary_loss_clip": 0.01052612, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.01526988, + "balance_loss_mlp": 1.0161047, + "epoch": 0.9507289944385992, + "flos": 19864741201920.0, + "grad_norm": 4.107636292967543, + "language_loss": 0.66446549, + "learning_rate": 2.538145713158446e-08, + "loss": 0.6854012, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36523438, + "step": 15813, + "time_per_iteration": 2.3678653240203857 + }, + { + "auxiliary_loss_clip": 0.01053066, + "auxiliary_loss_mlp": 0.01042999, + "balance_loss_clip": 1.01883543, + "balance_loss_mlp": 1.01618659, + "epoch": 0.9507891176912671, + "flos": 25192068272640.0, + "grad_norm": 1.3700365067867208, + "language_loss": 0.71002227, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.73098296, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36914062, + "step": 15814, + "time_per_iteration": 2.414485454559326 + }, + { + "auxiliary_loss_clip": 0.01051045, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.01535678, + "balance_loss_mlp": 1.01659274, + "epoch": 0.9508492409439351, + "flos": 24898413893760.0, + "grad_norm": 2.187335771931001, + "language_loss": 0.64592427, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.66679323, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34375, + "step": 15815, + "time_per_iteration": 2.4007251262664795 + }, + { + "auxiliary_loss_clip": 0.01050195, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.00998282, + "balance_loss_mlp": 1.01477575, + "epoch": 0.950909364196603, + "flos": 29782937790720.0, + "grad_norm": 2.733884323610963, + "language_loss": 0.59094274, + "learning_rate": 2.519624364862061e-08, + "loss": 0.61176795, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 15816, + "time_per_iteration": 2.435983180999756 + }, + { + "auxiliary_loss_clip": 0.01050976, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.02038479, + "balance_loss_mlp": 1.01588655, + "epoch": 0.950969487449271, + "flos": 24716621111040.0, + "grad_norm": 1.602794943518448, + "language_loss": 0.74385679, + "learning_rate": 2.513465558735994e-08, + "loss": 0.76478922, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 15817, + "time_per_iteration": 2.4036643505096436 + }, + { + "auxiliary_loss_clip": 0.01053335, + "auxiliary_loss_mlp": 0.01042289, + "balance_loss_clip": 1.01448905, + "balance_loss_mlp": 1.01636279, + "epoch": 0.9510296107019389, + "flos": 13698311581440.0, + "grad_norm": 1.689703042979272, + "language_loss": 0.61688828, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.63784456, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.36914062, + "step": 15818, + "time_per_iteration": 2.342528820037842 + }, + { + "auxiliary_loss_clip": 0.01052762, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.01464748, + "balance_loss_mlp": 1.01618862, + "epoch": 0.9510897339546069, + "flos": 17310879492480.0, + "grad_norm": 1.7897669373789604, + "language_loss": 0.69764018, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71855074, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36523438, + "step": 15819, + "time_per_iteration": 3.8512356281280518 + }, + { + "auxiliary_loss_clip": 0.01054041, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.0114603, + "balance_loss_mlp": 1.01720726, + "epoch": 0.951149857207275, + "flos": 14793048172800.0, + "grad_norm": 1.6010985429242954, + "language_loss": 0.75245774, + "learning_rate": 2.49503407354561e-08, + "loss": 0.7733587, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 15820, + "time_per_iteration": 2.3487792015075684 + }, + { + "auxiliary_loss_clip": 0.01053228, + "auxiliary_loss_mlp": 0.01042734, + "balance_loss_clip": 1.02006054, + "balance_loss_mlp": 1.01717699, + "epoch": 0.9512099804599429, + "flos": 19390131912960.0, + "grad_norm": 10.470473736383695, + "language_loss": 0.79290527, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.81386489, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 15821, + "time_per_iteration": 2.352457046508789 + }, + { + "auxiliary_loss_clip": 0.01051558, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.00825179, + "balance_loss_mlp": 1.01621509, + "epoch": 0.9512701037126109, + "flos": 36756384554880.0, + "grad_norm": 2.4159442541923624, + "language_loss": 0.71920443, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.74005121, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.35546875, + "step": 15822, + "time_per_iteration": 2.4984288215637207 + }, + { + "auxiliary_loss_clip": 0.01051611, + "auxiliary_loss_mlp": 0.01038288, + "balance_loss_clip": 1.01691365, + "balance_loss_mlp": 1.0165062, + "epoch": 0.9513302269652788, + "flos": 22637159222400.0, + "grad_norm": 1.568934040627301, + "language_loss": 0.67450416, + "learning_rate": 2.47666999302647e-08, + "loss": 0.6954031, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.3515625, + "step": 15823, + "time_per_iteration": 2.361361026763916 + }, + { + "auxiliary_loss_clip": 0.01049632, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.0179565, + "balance_loss_mlp": 1.01669717, + "epoch": 0.9513903502179468, + "flos": 22892129948160.0, + "grad_norm": 1.9296844376432272, + "language_loss": 0.78269511, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.80357593, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33007812, + "step": 15824, + "time_per_iteration": 2.361347198486328 + }, + { + "auxiliary_loss_clip": 0.01054125, + "auxiliary_loss_mlp": 0.01036686, + "balance_loss_clip": 1.01050806, + "balance_loss_mlp": 1.01672745, + "epoch": 0.9514504734706147, + "flos": 27927373651200.0, + "grad_norm": 2.4146700649466246, + "language_loss": 0.74586093, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.76676905, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 15825, + "time_per_iteration": 2.4211010932922363 + }, + { + "auxiliary_loss_clip": 0.01006731, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 1.00182772, + "balance_loss_mlp": 1.00045443, + "epoch": 0.9515105967232828, + "flos": 67363480967040.0, + "grad_norm": 0.8149926611730905, + "language_loss": 0.53477883, + "learning_rate": 2.458373323445806e-08, + "loss": 0.5548861, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.0625, + "step": 15826, + "time_per_iteration": 2.928389310836792 + }, + { + "auxiliary_loss_clip": 0.01050741, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.02028763, + "balance_loss_mlp": 1.01530159, + "epoch": 0.9515707199759507, + "flos": 25845398144640.0, + "grad_norm": 2.0156449637533647, + "language_loss": 0.7388829, + "learning_rate": 2.452289414874076e-08, + "loss": 0.7598294, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35351562, + "step": 15827, + "time_per_iteration": 2.421349048614502 + }, + { + "auxiliary_loss_clip": 0.01051122, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.01299596, + "balance_loss_mlp": 1.01551127, + "epoch": 0.9516308432286187, + "flos": 21828082308480.0, + "grad_norm": 1.7447525695548816, + "language_loss": 0.75603145, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.77691096, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 15828, + "time_per_iteration": 2.3925609588623047 + }, + { + "auxiliary_loss_clip": 0.01050457, + "auxiliary_loss_mlp": 0.0103651, + "balance_loss_clip": 1.01660228, + "balance_loss_mlp": 1.01656032, + "epoch": 0.9516909664812866, + "flos": 27268423050240.0, + "grad_norm": 1.4952419255400673, + "language_loss": 0.73961169, + "learning_rate": 2.440144071047978e-08, + "loss": 0.76048136, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33984375, + "step": 15829, + "time_per_iteration": 2.4239468574523926 + }, + { + "auxiliary_loss_clip": 0.0105122, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.01386285, + "balance_loss_mlp": 1.01599991, + "epoch": 0.9517510897339546, + "flos": 21214273962240.0, + "grad_norm": 1.7126526302928866, + "language_loss": 0.62630898, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.6471858, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15830, + "time_per_iteration": 2.3966004848480225 + }, + { + "auxiliary_loss_clip": 0.01053347, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_clip": 1.01823401, + "balance_loss_mlp": 1.01652503, + "epoch": 0.9518112129866225, + "flos": 18732996702720.0, + "grad_norm": 2.311221961994566, + "language_loss": 0.74203545, + "learning_rate": 2.428028693179729e-08, + "loss": 0.76299858, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3671875, + "step": 15831, + "time_per_iteration": 2.3690803050994873 + }, + { + "auxiliary_loss_clip": 0.0104869, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.01158357, + "balance_loss_mlp": 1.01523876, + "epoch": 0.9518713362392905, + "flos": 16762743146880.0, + "grad_norm": 1.7297189926986796, + "language_loss": 0.6618067, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.68260926, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.3359375, + "step": 15832, + "time_per_iteration": 2.351290464401245 + }, + { + "auxiliary_loss_clip": 0.01052217, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.01436472, + "balance_loss_mlp": 1.01769352, + "epoch": 0.9519314594919586, + "flos": 15229776769920.0, + "grad_norm": 1.8320567973791722, + "language_loss": 0.79156792, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.81245732, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34570312, + "step": 15833, + "time_per_iteration": 2.3685672283172607 + }, + { + "auxiliary_loss_clip": 0.01050846, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.01517487, + "balance_loss_mlp": 1.01620221, + "epoch": 0.9519915827446265, + "flos": 19351971930240.0, + "grad_norm": 2.2140118028917724, + "language_loss": 0.75770491, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.77859318, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34765625, + "step": 15834, + "time_per_iteration": 2.3333375453948975 + }, + { + "auxiliary_loss_clip": 0.01055711, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.01565611, + "balance_loss_mlp": 1.01755822, + "epoch": 0.9520517059972945, + "flos": 22265404398720.0, + "grad_norm": 2.449352609370315, + "language_loss": 0.76886183, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78982943, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 15835, + "time_per_iteration": 2.3772284984588623 + }, + { + "auxiliary_loss_clip": 0.01052477, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.01876569, + "balance_loss_mlp": 1.01587021, + "epoch": 0.9521118292499624, + "flos": 14861547993600.0, + "grad_norm": 1.9487383823420492, + "language_loss": 0.67290878, + "learning_rate": 2.397871361623238e-08, + "loss": 0.69387043, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36523438, + "step": 15836, + "time_per_iteration": 3.6042189598083496 + }, + { + "auxiliary_loss_clip": 0.01049524, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.01203966, + "balance_loss_mlp": 1.01539505, + "epoch": 0.9521719525026304, + "flos": 23507823507840.0, + "grad_norm": 1.6634554968979125, + "language_loss": 0.71155626, + "learning_rate": 2.391862373676057e-08, + "loss": 0.73239052, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34179688, + "step": 15837, + "time_per_iteration": 2.3780572414398193 + }, + { + "auxiliary_loss_clip": 0.01052756, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.01445174, + "balance_loss_mlp": 1.0157752, + "epoch": 0.9522320757552983, + "flos": 19714021395840.0, + "grad_norm": 2.283966890035632, + "language_loss": 0.74443007, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.76534033, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36914062, + "step": 15838, + "time_per_iteration": 2.357729911804199 + }, + { + "auxiliary_loss_clip": 0.0105227, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.01407444, + "balance_loss_mlp": 1.0158546, + "epoch": 0.9522921990079664, + "flos": 25920112187520.0, + "grad_norm": 1.7633834837664393, + "language_loss": 0.78806812, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80897057, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 15839, + "time_per_iteration": 2.4131290912628174 + }, + { + "auxiliary_loss_clip": 0.01052377, + "auxiliary_loss_mlp": 0.01038785, + "balance_loss_clip": 1.016397, + "balance_loss_mlp": 1.01626158, + "epoch": 0.9523523222606343, + "flos": 19207117232640.0, + "grad_norm": 1.6684109871244057, + "language_loss": 0.81014609, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.83105773, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36132812, + "step": 15840, + "time_per_iteration": 2.3694798946380615 + }, + { + "auxiliary_loss_clip": 0.01048504, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.01082134, + "balance_loss_mlp": 1.01561677, + "epoch": 0.9524124455133023, + "flos": 20920270469760.0, + "grad_norm": 1.918041691759174, + "language_loss": 0.74608397, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.76687586, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.328125, + "step": 15841, + "time_per_iteration": 2.4067153930664062 + }, + { + "auxiliary_loss_clip": 0.01046933, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.01398456, + "balance_loss_mlp": 1.01415849, + "epoch": 0.9524725687659702, + "flos": 18842554149120.0, + "grad_norm": 1.7775436571592924, + "language_loss": 0.79999256, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.82081079, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.328125, + "step": 15842, + "time_per_iteration": 2.3869404792785645 + }, + { + "auxiliary_loss_clip": 0.01051934, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.01174259, + "balance_loss_mlp": 1.01716566, + "epoch": 0.9525326920186382, + "flos": 22673573637120.0, + "grad_norm": 1.6348633690873502, + "language_loss": 0.73576659, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.75662738, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 15843, + "time_per_iteration": 2.3867440223693848 + }, + { + "auxiliary_loss_clip": 0.01052376, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.00728488, + "balance_loss_mlp": 1.01637542, + "epoch": 0.9525928152713061, + "flos": 22085671386240.0, + "grad_norm": 1.7341239183037538, + "language_loss": 0.79630184, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.81714934, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.359375, + "step": 15844, + "time_per_iteration": 2.4296345710754395 + }, + { + "auxiliary_loss_clip": 0.01054904, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.01525569, + "balance_loss_mlp": 1.01689696, + "epoch": 0.9526529385239741, + "flos": 20703669194880.0, + "grad_norm": 2.1980339555288304, + "language_loss": 0.71574736, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.73671365, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37890625, + "step": 15845, + "time_per_iteration": 2.3789660930633545 + }, + { + "auxiliary_loss_clip": 0.0105134, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.01340818, + "balance_loss_mlp": 1.01513517, + "epoch": 0.9527130617766422, + "flos": 23366913793920.0, + "grad_norm": 1.4963006906385692, + "language_loss": 0.76256394, + "learning_rate": 2.338118708818282e-08, + "loss": 0.78344679, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 15846, + "time_per_iteration": 3.7917609214782715 + }, + { + "auxiliary_loss_clip": 0.01051112, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.01392066, + "balance_loss_mlp": 1.01583636, + "epoch": 0.9527731850293101, + "flos": 18985034874240.0, + "grad_norm": 1.7818698103887793, + "language_loss": 0.79906321, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.8199414, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 15847, + "time_per_iteration": 3.664309501647949 + }, + { + "auxiliary_loss_clip": 0.01049506, + "auxiliary_loss_mlp": 0.01039582, + "balance_loss_clip": 1.0183028, + "balance_loss_mlp": 1.01571226, + "epoch": 0.9528333082819781, + "flos": 19317023792640.0, + "grad_norm": 1.6472156535618263, + "language_loss": 0.78919411, + "learning_rate": 2.326258115328672e-08, + "loss": 0.81008506, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 15848, + "time_per_iteration": 2.370346784591675 + }, + { + "auxiliary_loss_clip": 0.01054869, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.01789594, + "balance_loss_mlp": 1.01667953, + "epoch": 0.952893431534646, + "flos": 23950207745280.0, + "grad_norm": 1.6719529551263836, + "language_loss": 0.7348671, + "learning_rate": 2.320339062183674e-08, + "loss": 0.75583923, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 15849, + "time_per_iteration": 2.3756625652313232 + }, + { + "auxiliary_loss_clip": 0.01056851, + "auxiliary_loss_mlp": 0.01041592, + "balance_loss_clip": 1.01568818, + "balance_loss_mlp": 1.01810658, + "epoch": 0.952953554787314, + "flos": 21029548625280.0, + "grad_norm": 1.700642233437923, + "language_loss": 0.75860387, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77958834, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38671875, + "step": 15850, + "time_per_iteration": 2.380350351333618 + }, + { + "auxiliary_loss_clip": 0.01050927, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.0159452, + "balance_loss_mlp": 1.01546395, + "epoch": 0.9530136780399819, + "flos": 22381769560320.0, + "grad_norm": 2.0573234674398813, + "language_loss": 0.7430864, + "learning_rate": 2.308523444215482e-08, + "loss": 0.76398402, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 15851, + "time_per_iteration": 2.3749444484710693 + }, + { + "auxiliary_loss_clip": 0.01050695, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.0106523, + "balance_loss_mlp": 1.01563227, + "epoch": 0.95307380129265, + "flos": 22158639861120.0, + "grad_norm": 1.7663350672242917, + "language_loss": 0.80397546, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.82481819, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 15852, + "time_per_iteration": 2.4308059215545654 + }, + { + "auxiliary_loss_clip": 0.01052324, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.01565027, + "balance_loss_mlp": 1.01668894, + "epoch": 0.9531339245453179, + "flos": 44020937168640.0, + "grad_norm": 2.105120781690506, + "language_loss": 0.60533524, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.62624705, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 15853, + "time_per_iteration": 2.608398199081421 + }, + { + "auxiliary_loss_clip": 0.01048171, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.01290965, + "balance_loss_mlp": 1.01466072, + "epoch": 0.9531940477979859, + "flos": 20265648877440.0, + "grad_norm": 2.2540896926893272, + "language_loss": 0.73172218, + "learning_rate": 2.290856241425998e-08, + "loss": 0.75254744, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3359375, + "step": 15854, + "time_per_iteration": 2.3858566284179688 + }, + { + "auxiliary_loss_clip": 0.01051467, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00931501, + "balance_loss_mlp": 1.01512969, + "epoch": 0.9532541710506538, + "flos": 25334618820480.0, + "grad_norm": 2.09699223134901, + "language_loss": 0.69564271, + "learning_rate": 2.284982167833127e-08, + "loss": 0.71648461, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 15855, + "time_per_iteration": 2.390559673309326 + }, + { + "auxiliary_loss_clip": 0.01051303, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.01539469, + "balance_loss_mlp": 1.0155952, + "epoch": 0.9533142943033218, + "flos": 26468073976320.0, + "grad_norm": 1.6867898340582466, + "language_loss": 0.78068388, + "learning_rate": 2.279115591613556e-08, + "loss": 0.80157053, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35742188, + "step": 15856, + "time_per_iteration": 2.4246580600738525 + }, + { + "auxiliary_loss_clip": 0.01050282, + "auxiliary_loss_mlp": 0.01038129, + "balance_loss_clip": 1.01631379, + "balance_loss_mlp": 1.0150317, + "epoch": 0.9533744175559897, + "flos": 23655890050560.0, + "grad_norm": 1.6115890183929678, + "language_loss": 0.78910303, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80998719, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 15857, + "time_per_iteration": 2.404137134552002 + }, + { + "auxiliary_loss_clip": 0.01007056, + "auxiliary_loss_mlp": 0.010019, + "balance_loss_clip": 0.9997896, + "balance_loss_mlp": 1.00078034, + "epoch": 0.9534345408086577, + "flos": 61049019715200.0, + "grad_norm": 0.7071440029564339, + "language_loss": 0.62705445, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64714402, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.0625, + "step": 15858, + "time_per_iteration": 2.9848430156707764 + }, + { + "auxiliary_loss_clip": 0.01051617, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.01619053, + "balance_loss_mlp": 1.01629174, + "epoch": 0.9534946640613258, + "flos": 18950715141120.0, + "grad_norm": 1.4766298835809097, + "language_loss": 0.57757145, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.59847665, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35351562, + "step": 15859, + "time_per_iteration": 3.788269281387329 + }, + { + "auxiliary_loss_clip": 0.01049415, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.01302826, + "balance_loss_mlp": 1.01534092, + "epoch": 0.9535547873139937, + "flos": 16653360257280.0, + "grad_norm": 2.1585969325443366, + "language_loss": 0.82529777, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84612995, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34179688, + "step": 15860, + "time_per_iteration": 2.335883617401123 + }, + { + "auxiliary_loss_clip": 0.01050509, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.01234651, + "balance_loss_mlp": 1.01495755, + "epoch": 0.9536149105666617, + "flos": 20666731109760.0, + "grad_norm": 1.7768978343944564, + "language_loss": 0.67905796, + "learning_rate": 2.249895178891159e-08, + "loss": 0.69990337, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 15861, + "time_per_iteration": 2.4390482902526855 + }, + { + "auxiliary_loss_clip": 0.01052147, + "auxiliary_loss_mlp": 0.01038488, + "balance_loss_clip": 1.01465797, + "balance_loss_mlp": 1.01635587, + "epoch": 0.9536750338193296, + "flos": 30699198178560.0, + "grad_norm": 1.7814212858081262, + "language_loss": 0.66205788, + "learning_rate": 2.244073591573037e-08, + "loss": 0.68296427, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 15862, + "time_per_iteration": 2.4228644371032715 + }, + { + "auxiliary_loss_clip": 0.01049802, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.01601672, + "balance_loss_mlp": 1.01562989, + "epoch": 0.9537351570719976, + "flos": 20404638466560.0, + "grad_norm": 1.5509038057676798, + "language_loss": 0.68582076, + "learning_rate": 2.238259503179485e-08, + "loss": 0.70669031, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 15863, + "time_per_iteration": 2.375710964202881 + }, + { + "auxiliary_loss_clip": 0.01050876, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.01386476, + "balance_loss_mlp": 1.01590776, + "epoch": 0.9537952803246655, + "flos": 29928106690560.0, + "grad_norm": 1.9378125784158517, + "language_loss": 0.79251802, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.81338972, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 15864, + "time_per_iteration": 2.429142475128174 + }, + { + "auxiliary_loss_clip": 0.01051006, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.00838685, + "balance_loss_mlp": 1.01665926, + "epoch": 0.9538554035773336, + "flos": 20520375223680.0, + "grad_norm": 3.627429013368778, + "language_loss": 0.60524046, + "learning_rate": 2.226653824047586e-08, + "loss": 0.62604839, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 15865, + "time_per_iteration": 2.367243528366089 + }, + { + "auxiliary_loss_clip": 0.01051119, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.00982928, + "balance_loss_mlp": 1.01503944, + "epoch": 0.9539155268300015, + "flos": 18405511349760.0, + "grad_norm": 1.8743952159203336, + "language_loss": 0.70323241, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72407293, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 15866, + "time_per_iteration": 2.3556034564971924 + }, + { + "auxiliary_loss_clip": 0.01051253, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.01076066, + "balance_loss_mlp": 1.01553202, + "epoch": 0.9539756500826695, + "flos": 26212090821120.0, + "grad_norm": 2.236255204699346, + "language_loss": 0.86048293, + "learning_rate": 2.215078143255855e-08, + "loss": 0.88133669, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 15867, + "time_per_iteration": 2.497847557067871 + }, + { + "auxiliary_loss_clip": 0.01007209, + "auxiliary_loss_mlp": 0.01004126, + "balance_loss_clip": 1.00211143, + "balance_loss_mlp": 1.00093174, + "epoch": 0.9540357733353374, + "flos": 68285501729280.0, + "grad_norm": 0.827242637067941, + "language_loss": 0.61838388, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63849723, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.0625, + "step": 15868, + "time_per_iteration": 3.088291883468628 + }, + { + "auxiliary_loss_clip": 0.01050596, + "auxiliary_loss_mlp": 0.01035674, + "balance_loss_clip": 1.01156998, + "balance_loss_mlp": 1.01523328, + "epoch": 0.9540958965880054, + "flos": 21287207525760.0, + "grad_norm": 1.8164930112341908, + "language_loss": 0.60558134, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62644404, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35351562, + "step": 15869, + "time_per_iteration": 2.3758113384246826 + }, + { + "auxiliary_loss_clip": 0.01050131, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.01788545, + "balance_loss_mlp": 1.01558471, + "epoch": 0.9541560198406733, + "flos": 19750505633280.0, + "grad_norm": 4.109882110151423, + "language_loss": 0.72463799, + "learning_rate": 2.197770872795579e-08, + "loss": 0.7455281, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34765625, + "step": 15870, + "time_per_iteration": 2.381094455718994 + }, + { + "auxiliary_loss_clip": 0.01050168, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.01499224, + "balance_loss_mlp": 1.01566541, + "epoch": 0.9542161430933414, + "flos": 24714526429440.0, + "grad_norm": 1.9469304657613002, + "language_loss": 0.77361041, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.79448676, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 15871, + "time_per_iteration": 2.430687427520752 + }, + { + "auxiliary_loss_clip": 0.01051785, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.01430714, + "balance_loss_mlp": 1.01586151, + "epoch": 0.9542762663460094, + "flos": 31064494400640.0, + "grad_norm": 1.6688153019976901, + "language_loss": 0.5975728, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.61847717, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 15872, + "time_per_iteration": 2.4542057514190674 + }, + { + "auxiliary_loss_clip": 0.01053551, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.01431513, + "balance_loss_mlp": 1.01605821, + "epoch": 0.9543363895986773, + "flos": 20775695063040.0, + "grad_norm": 2.2779787489912864, + "language_loss": 0.75900149, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.77993363, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 15873, + "time_per_iteration": 2.3682265281677246 + }, + { + "auxiliary_loss_clip": 0.01053063, + "auxiliary_loss_mlp": 0.01043599, + "balance_loss_clip": 1.01911318, + "balance_loss_mlp": 1.01632762, + "epoch": 0.9543965128513453, + "flos": 24461580562560.0, + "grad_norm": 2.091703211191758, + "language_loss": 0.62973905, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.65070564, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 15874, + "time_per_iteration": 2.4145166873931885 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.01629865, + "balance_loss_mlp": 1.01620173, + "epoch": 0.9544566361040132, + "flos": 15260814835200.0, + "grad_norm": 1.872339062188966, + "language_loss": 0.90211391, + "learning_rate": 2.169075438538104e-08, + "loss": 0.92301178, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 15875, + "time_per_iteration": 3.57841420173645 + }, + { + "auxiliary_loss_clip": 0.01053631, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.01503134, + "balance_loss_mlp": 1.01596415, + "epoch": 0.9545167593566812, + "flos": 25917668392320.0, + "grad_norm": 1.640787690358709, + "language_loss": 0.68467742, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.70560515, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37695312, + "step": 15876, + "time_per_iteration": 2.382108449935913 + }, + { + "auxiliary_loss_clip": 0.01052565, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.01304626, + "balance_loss_mlp": 1.01559424, + "epoch": 0.9545768826093491, + "flos": 25627051301760.0, + "grad_norm": 1.974313649921627, + "language_loss": 0.70749456, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.72838658, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 15877, + "time_per_iteration": 2.4322316646575928 + }, + { + "auxiliary_loss_clip": 0.01052651, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.01319194, + "balance_loss_mlp": 1.01628089, + "epoch": 0.9546370058620172, + "flos": 22490419311360.0, + "grad_norm": 1.6528747113632685, + "language_loss": 0.71948212, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.7403928, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36328125, + "step": 15878, + "time_per_iteration": 2.359132766723633 + }, + { + "auxiliary_loss_clip": 0.010509, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.0099628, + "balance_loss_mlp": 1.0163722, + "epoch": 0.9546971291146851, + "flos": 24608390296320.0, + "grad_norm": 1.3400905069472893, + "language_loss": 0.69370234, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.71453786, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34570312, + "step": 15879, + "time_per_iteration": 2.413098096847534 + }, + { + "auxiliary_loss_clip": 0.01049995, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.01723576, + "balance_loss_mlp": 1.01540184, + "epoch": 0.9547572523673531, + "flos": 28656499818240.0, + "grad_norm": 1.9153211130218188, + "language_loss": 0.86591649, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.88681495, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34570312, + "step": 15880, + "time_per_iteration": 2.412125825881958 + }, + { + "auxiliary_loss_clip": 0.01052488, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.01181471, + "balance_loss_mlp": 1.01627576, + "epoch": 0.954817375620021, + "flos": 33801196233600.0, + "grad_norm": 1.7438157496885023, + "language_loss": 0.73170936, + "learning_rate": 2.134888478151753e-08, + "loss": 0.75259924, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 15881, + "time_per_iteration": 2.4881432056427 + }, + { + "auxiliary_loss_clip": 0.01050988, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.01355815, + "balance_loss_mlp": 1.01630282, + "epoch": 0.954877498872689, + "flos": 14427367925760.0, + "grad_norm": 1.9005336511055235, + "language_loss": 0.7247647, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.74564254, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 15882, + "time_per_iteration": 2.3470170497894287 + }, + { + "auxiliary_loss_clip": 0.01051172, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.01663792, + "balance_loss_mlp": 1.0159651, + "epoch": 0.9549376221253569, + "flos": 59267333612160.0, + "grad_norm": 1.7395896759439178, + "language_loss": 0.67185152, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.69275498, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 15883, + "time_per_iteration": 2.714733123779297 + }, + { + "auxiliary_loss_clip": 0.01053042, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.01124454, + "balance_loss_mlp": 1.01611388, + "epoch": 0.954997745378025, + "flos": 17273452648320.0, + "grad_norm": 2.269980442516136, + "language_loss": 0.79954946, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.82043636, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36914062, + "step": 15884, + "time_per_iteration": 2.31618332862854 + }, + { + "auxiliary_loss_clip": 0.01052871, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.0142169, + "balance_loss_mlp": 1.01632714, + "epoch": 0.955057868630693, + "flos": 13005530006400.0, + "grad_norm": 1.740780334758101, + "language_loss": 0.78332984, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.80424309, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 15885, + "time_per_iteration": 3.740532398223877 + }, + { + "auxiliary_loss_clip": 0.01052777, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.01513064, + "balance_loss_mlp": 1.01587021, + "epoch": 0.9551179918833609, + "flos": 22636600640640.0, + "grad_norm": 2.732244072917136, + "language_loss": 0.71443319, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.73534334, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36914062, + "step": 15886, + "time_per_iteration": 3.740164041519165 + }, + { + "auxiliary_loss_clip": 0.01055447, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.01505983, + "balance_loss_mlp": 1.01716113, + "epoch": 0.9551781151360289, + "flos": 21541759315200.0, + "grad_norm": 5.2915517038807005, + "language_loss": 0.74242061, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.76339769, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.3828125, + "step": 15887, + "time_per_iteration": 2.363351583480835 + }, + { + "auxiliary_loss_clip": 0.01050116, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.01685786, + "balance_loss_mlp": 1.01554072, + "epoch": 0.9552382383886968, + "flos": 20701260311040.0, + "grad_norm": 1.8998533847813919, + "language_loss": 0.57885987, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59975183, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 15888, + "time_per_iteration": 2.383162498474121 + }, + { + "auxiliary_loss_clip": 0.01006771, + "auxiliary_loss_mlp": 0.01002216, + "balance_loss_clip": 1.00033236, + "balance_loss_mlp": 1.00059724, + "epoch": 0.9552983616413648, + "flos": 67766448412800.0, + "grad_norm": 0.7135347869678657, + "language_loss": 0.57983088, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59992075, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.06176758, + "step": 15889, + "time_per_iteration": 3.0720791816711426 + }, + { + "auxiliary_loss_clip": 0.01050742, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.011621, + "balance_loss_mlp": 1.01476526, + "epoch": 0.9553584848940327, + "flos": 21578906868480.0, + "grad_norm": 1.3655724251462567, + "language_loss": 0.68234438, + "learning_rate": 2.084114508877466e-08, + "loss": 0.70319152, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 15890, + "time_per_iteration": 2.4154703617095947 + }, + { + "auxiliary_loss_clip": 0.01051472, + "auxiliary_loss_mlp": 0.01039797, + "balance_loss_clip": 1.01715899, + "balance_loss_mlp": 1.01634371, + "epoch": 0.9554186081467008, + "flos": 24206993861760.0, + "grad_norm": 1.7197643165885403, + "language_loss": 0.74944818, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.77036089, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15891, + "time_per_iteration": 2.3969712257385254 + }, + { + "auxiliary_loss_clip": 0.01048941, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01639247, + "balance_loss_mlp": 1.01502037, + "epoch": 0.9554787313993687, + "flos": 16250672102400.0, + "grad_norm": 1.754978039042566, + "language_loss": 0.78418338, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80503249, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.33984375, + "step": 15892, + "time_per_iteration": 2.3675899505615234 + }, + { + "auxiliary_loss_clip": 0.01050566, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.01384306, + "balance_loss_mlp": 1.01579142, + "epoch": 0.9555388546520367, + "flos": 23403014006400.0, + "grad_norm": 1.508050145875867, + "language_loss": 0.71407998, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.73495042, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 15893, + "time_per_iteration": 2.412060022354126 + }, + { + "auxiliary_loss_clip": 0.01052385, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.01179743, + "balance_loss_mlp": 1.01741362, + "epoch": 0.9555989779047046, + "flos": 14793152906880.0, + "grad_norm": 1.78658891002791, + "language_loss": 0.66864103, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.68951416, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34960938, + "step": 15894, + "time_per_iteration": 2.416110038757324 + }, + { + "auxiliary_loss_clip": 0.01051455, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.01320648, + "balance_loss_mlp": 1.01555932, + "epoch": 0.9556591011573726, + "flos": 22235658053760.0, + "grad_norm": 1.6616463448796988, + "language_loss": 0.82594383, + "learning_rate": 2.056169412853581e-08, + "loss": 0.84682989, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.359375, + "step": 15895, + "time_per_iteration": 2.421842575073242 + }, + { + "auxiliary_loss_clip": 0.01050108, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.01219833, + "balance_loss_mlp": 1.014727, + "epoch": 0.9557192244100405, + "flos": 27854056062720.0, + "grad_norm": 1.4569598703773616, + "language_loss": 0.72959042, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.75044513, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35351562, + "step": 15896, + "time_per_iteration": 2.454200267791748 + }, + { + "auxiliary_loss_clip": 0.01050851, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.01547527, + "balance_loss_mlp": 1.01580548, + "epoch": 0.9557793476627086, + "flos": 17601845696640.0, + "grad_norm": 3.3124609656397728, + "language_loss": 0.80606771, + "learning_rate": 2.045043915311706e-08, + "loss": 0.82693875, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34960938, + "step": 15897, + "time_per_iteration": 2.360569715499878 + }, + { + "auxiliary_loss_clip": 0.0105087, + "auxiliary_loss_mlp": 0.01038863, + "balance_loss_clip": 1.01461589, + "balance_loss_mlp": 1.01547599, + "epoch": 0.9558394709153766, + "flos": 23874446361600.0, + "grad_norm": 1.9297708239050082, + "language_loss": 0.73269582, + "learning_rate": 2.03949242614303e-08, + "loss": 0.75359315, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35546875, + "step": 15898, + "time_per_iteration": 3.8735599517822266 + }, + { + "auxiliary_loss_clip": 0.01007142, + "auxiliary_loss_mlp": 0.01002435, + "balance_loss_clip": 1.0003252, + "balance_loss_mlp": 1.00077939, + "epoch": 0.9558995941680445, + "flos": 53679168840960.0, + "grad_norm": 0.8772295032498897, + "language_loss": 0.52358651, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54368234, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06347656, + "step": 15899, + "time_per_iteration": 2.938342809677124 + }, + { + "auxiliary_loss_clip": 0.01053535, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.01321983, + "balance_loss_mlp": 1.0155921, + "epoch": 0.9559597174207125, + "flos": 13763843936640.0, + "grad_norm": 2.6856510945573433, + "language_loss": 0.68965912, + "learning_rate": 2.028411968062782e-08, + "loss": 0.71057606, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37890625, + "step": 15900, + "time_per_iteration": 2.3671762943267822 + }, + { + "auxiliary_loss_clip": 0.01053072, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.01358485, + "balance_loss_mlp": 1.01708114, + "epoch": 0.9560198406733804, + "flos": 19935370615680.0, + "grad_norm": 1.985604744309505, + "language_loss": 0.83391601, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85482228, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 15901, + "time_per_iteration": 2.369601011276245 + }, + { + "auxiliary_loss_clip": 0.01006973, + "auxiliary_loss_mlp": 0.01002165, + "balance_loss_clip": 1.00006747, + "balance_loss_mlp": 1.00059128, + "epoch": 0.9560799639260484, + "flos": 57286744427520.0, + "grad_norm": 0.7068364703246808, + "language_loss": 0.5440585, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56414992, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06347656, + "step": 15902, + "time_per_iteration": 3.09403395652771 + }, + { + "auxiliary_loss_clip": 0.01047413, + "auxiliary_loss_mlp": 0.01036106, + "balance_loss_clip": 1.01604271, + "balance_loss_mlp": 1.01498163, + "epoch": 0.9561400871787163, + "flos": 18916151028480.0, + "grad_norm": 1.7899687690076742, + "language_loss": 0.86226821, + "learning_rate": 2.01184758473425e-08, + "loss": 0.88310337, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.32421875, + "step": 15903, + "time_per_iteration": 2.3524932861328125 + }, + { + "auxiliary_loss_clip": 0.01050646, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.01141727, + "balance_loss_mlp": 1.01626682, + "epoch": 0.9562002104313844, + "flos": 18037666598400.0, + "grad_norm": 1.7770060462807518, + "language_loss": 0.81366104, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.83448821, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.34375, + "step": 15904, + "time_per_iteration": 2.348975419998169 + }, + { + "auxiliary_loss_clip": 0.01053555, + "auxiliary_loss_mlp": 0.01043021, + "balance_loss_clip": 1.016747, + "balance_loss_mlp": 1.01714206, + "epoch": 0.9562603336840523, + "flos": 24716516376960.0, + "grad_norm": 3.290622354023181, + "language_loss": 0.61336327, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.63432908, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.36328125, + "step": 15905, + "time_per_iteration": 2.383650064468384 + }, + { + "auxiliary_loss_clip": 0.0105063, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.01349592, + "balance_loss_mlp": 1.01562405, + "epoch": 0.9563204569367203, + "flos": 21176183802240.0, + "grad_norm": 1.930597514132045, + "language_loss": 0.71148258, + "learning_rate": 1.995350770979254e-08, + "loss": 0.73234165, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34960938, + "step": 15906, + "time_per_iteration": 2.3577215671539307 + }, + { + "auxiliary_loss_clip": 0.01053736, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.01797128, + "balance_loss_mlp": 1.01717734, + "epoch": 0.9563805801893882, + "flos": 20228710792320.0, + "grad_norm": 1.660225473608005, + "language_loss": 0.72437561, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.74533904, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 15907, + "time_per_iteration": 2.3680317401885986 + }, + { + "auxiliary_loss_clip": 0.0105002, + "auxiliary_loss_mlp": 0.01037134, + "balance_loss_clip": 1.01450849, + "balance_loss_mlp": 1.01557803, + "epoch": 0.9564407034420562, + "flos": 25409821622400.0, + "grad_norm": 1.8339795724962484, + "language_loss": 0.72196662, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.74283814, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34570312, + "step": 15908, + "time_per_iteration": 2.430176258087158 + }, + { + "auxiliary_loss_clip": 0.01051539, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.01411939, + "balance_loss_mlp": 1.01660371, + "epoch": 0.9565008266947241, + "flos": 18622915585920.0, + "grad_norm": 1.7920929162231434, + "language_loss": 0.83762777, + "learning_rate": 1.978921532427802e-08, + "loss": 0.85851181, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34960938, + "step": 15909, + "time_per_iteration": 2.332280397415161 + }, + { + "auxiliary_loss_clip": 0.01050104, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.01499104, + "balance_loss_mlp": 1.01504803, + "epoch": 0.9565609499473922, + "flos": 24861021960960.0, + "grad_norm": 1.835490870738085, + "language_loss": 0.68661487, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.70749146, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 15910, + "time_per_iteration": 2.413201093673706 + }, + { + "auxiliary_loss_clip": 0.01053175, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.01497245, + "balance_loss_mlp": 1.01637292, + "epoch": 0.9566210732000601, + "flos": 21797393356800.0, + "grad_norm": 1.6790425612849469, + "language_loss": 0.75496143, + "learning_rate": 1.968006251276444e-08, + "loss": 0.77588946, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 15911, + "time_per_iteration": 2.383683443069458 + }, + { + "auxiliary_loss_clip": 0.01051193, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.00830543, + "balance_loss_mlp": 1.01583552, + "epoch": 0.9566811964527281, + "flos": 18696617199360.0, + "grad_norm": 2.2272979818333396, + "language_loss": 0.70965379, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.73048174, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 15912, + "time_per_iteration": 2.3850502967834473 + }, + { + "auxiliary_loss_clip": 0.01053419, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.01786613, + "balance_loss_mlp": 1.01678073, + "epoch": 0.9567413197053961, + "flos": 12999944188800.0, + "grad_norm": 3.1665744236526407, + "language_loss": 0.72576618, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74672699, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 15913, + "time_per_iteration": 2.334707260131836 + }, + { + "auxiliary_loss_clip": 0.01051966, + "auxiliary_loss_mlp": 0.01031529, + "balance_loss_clip": 1.01021397, + "balance_loss_mlp": 1.01667702, + "epoch": 0.956801442958064, + "flos": 19718245670400.0, + "grad_norm": 1.8521800438790648, + "language_loss": 0.74427569, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.76511061, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35351562, + "step": 15914, + "time_per_iteration": 2.463934898376465 + }, + { + "auxiliary_loss_clip": 0.01050996, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.01070821, + "balance_loss_mlp": 1.01619768, + "epoch": 0.956861566210732, + "flos": 18221868264960.0, + "grad_norm": 1.4411332682011917, + "language_loss": 0.68151999, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.70236146, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 15915, + "time_per_iteration": 3.6021080017089844 + }, + { + "auxiliary_loss_clip": 0.01050737, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.01531625, + "balance_loss_mlp": 1.01572609, + "epoch": 0.9569216894634, + "flos": 22195961971200.0, + "grad_norm": 1.849582597038872, + "language_loss": 0.6504885, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.67137861, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34960938, + "step": 15916, + "time_per_iteration": 2.39572811126709 + }, + { + "auxiliary_loss_clip": 0.01047291, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.01304674, + "balance_loss_mlp": 1.01505637, + "epoch": 0.956981812716068, + "flos": 21688219935360.0, + "grad_norm": 2.0580137084885535, + "language_loss": 0.81759721, + "learning_rate": 1.935440639853536e-08, + "loss": 0.83839095, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.32226562, + "step": 15917, + "time_per_iteration": 2.3666906356811523 + }, + { + "auxiliary_loss_clip": 0.01051644, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.0148772, + "balance_loss_mlp": 1.01652825, + "epoch": 0.9570419359687359, + "flos": 13990045835520.0, + "grad_norm": 2.920233987316388, + "language_loss": 0.74378479, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.76467609, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 15918, + "time_per_iteration": 2.3647429943084717 + }, + { + "auxiliary_loss_clip": 0.01007138, + "auxiliary_loss_mlp": 0.01004545, + "balance_loss_clip": 1.00261354, + "balance_loss_mlp": 1.0008328, + "epoch": 0.9571020592214039, + "flos": 65193174120960.0, + "grad_norm": 0.6294961240596305, + "language_loss": 0.53108186, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55119872, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06298828, + "step": 15919, + "time_per_iteration": 3.116783618927002 + }, + { + "auxiliary_loss_clip": 0.0105434, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.01638556, + "balance_loss_mlp": 1.01752639, + "epoch": 0.9571621824740718, + "flos": 17383114828800.0, + "grad_norm": 2.693770931813411, + "language_loss": 0.7764731, + "learning_rate": 1.919259224843972e-08, + "loss": 0.797418, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 15920, + "time_per_iteration": 2.365687608718872 + }, + { + "auxiliary_loss_clip": 0.01053216, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.01218522, + "balance_loss_mlp": 1.01653087, + "epoch": 0.9572223057267398, + "flos": 14537309397120.0, + "grad_norm": 1.888686197659497, + "language_loss": 0.80151856, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.82241309, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36523438, + "step": 15921, + "time_per_iteration": 2.3548638820648193 + }, + { + "auxiliary_loss_clip": 0.01053835, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.01142108, + "balance_loss_mlp": 1.01647007, + "epoch": 0.9572824289794077, + "flos": 33946400044800.0, + "grad_norm": 1.9294571009496784, + "language_loss": 0.52243096, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.54333913, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 15922, + "time_per_iteration": 2.4977126121520996 + }, + { + "auxiliary_loss_clip": 0.0105315, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.01556504, + "balance_loss_mlp": 1.01613259, + "epoch": 0.9573425522320758, + "flos": 18693998847360.0, + "grad_norm": 2.081881025938299, + "language_loss": 0.84550434, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86643124, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 15923, + "time_per_iteration": 2.3350837230682373 + }, + { + "auxiliary_loss_clip": 0.01049759, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.01330519, + "balance_loss_mlp": 1.01547003, + "epoch": 0.9574026754847437, + "flos": 28509096591360.0, + "grad_norm": 1.6304001788483007, + "language_loss": 0.76547694, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.78633004, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 15924, + "time_per_iteration": 2.421373128890991 + }, + { + "auxiliary_loss_clip": 0.01051638, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.01627111, + "balance_loss_mlp": 1.01517534, + "epoch": 0.9574627987374117, + "flos": 24351255066240.0, + "grad_norm": 2.1533653983512173, + "language_loss": 0.86841249, + "learning_rate": 1.892440427371711e-08, + "loss": 0.8893224, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 15925, + "time_per_iteration": 3.9438793659210205 + }, + { + "auxiliary_loss_clip": 0.01055051, + "auxiliary_loss_mlp": 0.01038953, + "balance_loss_clip": 1.01426458, + "balance_loss_mlp": 1.01644588, + "epoch": 0.9575229219900797, + "flos": 23509638898560.0, + "grad_norm": 1.7783878069482557, + "language_loss": 0.76711237, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.78805244, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.38671875, + "step": 15926, + "time_per_iteration": 3.7876646518707275 + }, + { + "auxiliary_loss_clip": 0.01051966, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.01465511, + "balance_loss_mlp": 1.01705217, + "epoch": 0.9575830452427476, + "flos": 22673713282560.0, + "grad_norm": 1.5737730451521168, + "language_loss": 0.78740114, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.80826402, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.34765625, + "step": 15927, + "time_per_iteration": 2.3834445476531982 + }, + { + "auxiliary_loss_clip": 0.01052637, + "auxiliary_loss_mlp": 0.01039926, + "balance_loss_clip": 1.01404595, + "balance_loss_mlp": 1.01573634, + "epoch": 0.9576431684954156, + "flos": 30483853712640.0, + "grad_norm": 1.7859286553252403, + "language_loss": 0.70555919, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.72648478, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.36914062, + "step": 15928, + "time_per_iteration": 2.4634289741516113 + }, + { + "auxiliary_loss_clip": 0.01052144, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.01154673, + "balance_loss_mlp": 1.01676917, + "epoch": 0.9577032917480836, + "flos": 21686369633280.0, + "grad_norm": 1.6838387529054268, + "language_loss": 0.83342946, + "learning_rate": 1.871120608822485e-08, + "loss": 0.85430968, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 15929, + "time_per_iteration": 2.368654489517212 + }, + { + "auxiliary_loss_clip": 0.01055361, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.01652849, + "balance_loss_mlp": 1.01708698, + "epoch": 0.9577634150007516, + "flos": 29021865863040.0, + "grad_norm": 1.6004715597043562, + "language_loss": 0.72961676, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.75058293, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.3828125, + "step": 15930, + "time_per_iteration": 2.4118032455444336 + }, + { + "auxiliary_loss_clip": 0.01050395, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.01435447, + "balance_loss_mlp": 1.01579487, + "epoch": 0.9578235382534195, + "flos": 19281202871040.0, + "grad_norm": 1.568409989377175, + "language_loss": 0.63862371, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.65948528, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34570312, + "step": 15931, + "time_per_iteration": 2.364921808242798 + }, + { + "auxiliary_loss_clip": 0.01049849, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.01120806, + "balance_loss_mlp": 1.01584554, + "epoch": 0.9578836615060875, + "flos": 13698416315520.0, + "grad_norm": 2.4273404826043192, + "language_loss": 0.70954621, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.73037404, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.33984375, + "step": 15932, + "time_per_iteration": 2.333998203277588 + }, + { + "auxiliary_loss_clip": 0.01054839, + "auxiliary_loss_mlp": 0.01052303, + "balance_loss_clip": 1.02663684, + "balance_loss_mlp": 1.01668859, + "epoch": 0.9579437847587554, + "flos": 17053604616960.0, + "grad_norm": 1.8148009488218682, + "language_loss": 0.76566553, + "learning_rate": 1.849920999338961e-08, + "loss": 0.78673697, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.38085938, + "step": 15933, + "time_per_iteration": 2.353330612182617 + }, + { + "auxiliary_loss_clip": 0.01006971, + "auxiliary_loss_mlp": 0.01003413, + "balance_loss_clip": 1.00131488, + "balance_loss_mlp": 1.00058162, + "epoch": 0.9580039080114234, + "flos": 60568056558720.0, + "grad_norm": 0.7049301204979743, + "language_loss": 0.57366931, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59377325, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06396484, + "step": 15934, + "time_per_iteration": 3.1124107837677 + }, + { + "auxiliary_loss_clip": 0.01007134, + "auxiliary_loss_mlp": 0.01003703, + "balance_loss_clip": 1.00161636, + "balance_loss_mlp": 1.00074077, + "epoch": 0.9580640312640913, + "flos": 66232120492800.0, + "grad_norm": 0.9177390137783973, + "language_loss": 0.66021359, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.68032193, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06396484, + "step": 15935, + "time_per_iteration": 2.957770586013794 + }, + { + "auxiliary_loss_clip": 0.0100736, + "auxiliary_loss_mlp": 0.01006069, + "balance_loss_clip": 1.00407791, + "balance_loss_mlp": 1.00093269, + "epoch": 0.9581241545167594, + "flos": 62214979213440.0, + "grad_norm": 0.7920324677844052, + "language_loss": 0.57112312, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59125733, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06445312, + "step": 15936, + "time_per_iteration": 3.054417848587036 + }, + { + "auxiliary_loss_clip": 0.0105218, + "auxiliary_loss_mlp": 0.01039895, + "balance_loss_clip": 1.01608849, + "balance_loss_mlp": 1.01592326, + "epoch": 0.9581842777694273, + "flos": 23766983596800.0, + "grad_norm": 1.904979880042566, + "language_loss": 0.78604603, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80696678, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 15937, + "time_per_iteration": 3.957155227661133 + }, + { + "auxiliary_loss_clip": 0.01051572, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.01561546, + "balance_loss_mlp": 1.01545262, + "epoch": 0.9582444010220953, + "flos": 21212074546560.0, + "grad_norm": 2.090757908501192, + "language_loss": 0.68886894, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70979077, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 15938, + "time_per_iteration": 2.3920342922210693 + }, + { + "auxiliary_loss_clip": 0.01051042, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.01233506, + "balance_loss_mlp": 1.01504302, + "epoch": 0.9583045242747633, + "flos": 23804026416000.0, + "grad_norm": 2.4568010263776046, + "language_loss": 0.66985846, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.6907146, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 15939, + "time_per_iteration": 2.4049181938171387 + }, + { + "auxiliary_loss_clip": 0.01050584, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.01638138, + "balance_loss_mlp": 1.01546407, + "epoch": 0.9583646475274312, + "flos": 24130394605440.0, + "grad_norm": 1.5703089510582406, + "language_loss": 0.74659812, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.76748997, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 15940, + "time_per_iteration": 2.452683210372925 + }, + { + "auxiliary_loss_clip": 0.0105125, + "auxiliary_loss_mlp": 0.01037302, + "balance_loss_clip": 1.01160002, + "balance_loss_mlp": 1.01574183, + "epoch": 0.9584247707800992, + "flos": 20885601623040.0, + "grad_norm": 1.6876320732495553, + "language_loss": 0.73961294, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.7604984, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.35546875, + "step": 15941, + "time_per_iteration": 2.3460428714752197 + }, + { + "auxiliary_loss_clip": 0.01050393, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.01375437, + "balance_loss_mlp": 1.01579976, + "epoch": 0.9584848940327672, + "flos": 26066398250880.0, + "grad_norm": 1.658207708548181, + "language_loss": 0.7314378, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.75230521, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 15942, + "time_per_iteration": 2.428291082382202 + }, + { + "auxiliary_loss_clip": 0.01052062, + "auxiliary_loss_mlp": 0.01042453, + "balance_loss_clip": 1.0185039, + "balance_loss_mlp": 1.01551652, + "epoch": 0.9585450172854352, + "flos": 34491638747520.0, + "grad_norm": 1.6117292182477234, + "language_loss": 0.72885191, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74979699, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36523438, + "step": 15943, + "time_per_iteration": 2.4837348461151123 + }, + { + "auxiliary_loss_clip": 0.01052959, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.0155108, + "balance_loss_mlp": 1.01549745, + "epoch": 0.9586051405381031, + "flos": 23109673829760.0, + "grad_norm": 1.7088793273147398, + "language_loss": 0.69376874, + "learning_rate": 1.792242006001965e-08, + "loss": 0.71469307, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 15944, + "time_per_iteration": 2.4655656814575195 + }, + { + "auxiliary_loss_clip": 0.01051063, + "auxiliary_loss_mlp": 0.01039708, + "balance_loss_clip": 1.01640248, + "balance_loss_mlp": 1.01521587, + "epoch": 0.9586652637907711, + "flos": 19603137317760.0, + "grad_norm": 1.9724868568027907, + "language_loss": 0.67324072, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.69414842, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 15945, + "time_per_iteration": 2.3966352939605713 + }, + { + "auxiliary_loss_clip": 0.01006967, + "auxiliary_loss_mlp": 0.01001957, + "balance_loss_clip": 0.99985874, + "balance_loss_mlp": 1.00052619, + "epoch": 0.958725387043439, + "flos": 72069912685440.0, + "grad_norm": 0.7461275071718755, + "language_loss": 0.6191324, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63922161, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06445312, + "step": 15946, + "time_per_iteration": 3.1014492511749268 + }, + { + "auxiliary_loss_clip": 0.01050497, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.0148623, + "balance_loss_mlp": 1.01594818, + "epoch": 0.958785510296107, + "flos": 28910702494080.0, + "grad_norm": 1.7485730872887275, + "language_loss": 0.7618624, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.7827282, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34570312, + "step": 15947, + "time_per_iteration": 2.423203945159912 + }, + { + "auxiliary_loss_clip": 0.01049966, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.01067472, + "balance_loss_mlp": 1.01532507, + "epoch": 0.958845633548775, + "flos": 18476245497600.0, + "grad_norm": 2.168199441327553, + "language_loss": 0.70417219, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72499579, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 15948, + "time_per_iteration": 2.436776876449585 + }, + { + "auxiliary_loss_clip": 0.01050702, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.01679587, + "balance_loss_mlp": 1.01601624, + "epoch": 0.958905756801443, + "flos": 24205771964160.0, + "grad_norm": 2.026471604495809, + "language_loss": 0.79469323, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.8155998, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 15949, + "time_per_iteration": 2.4446756839752197 + }, + { + "auxiliary_loss_clip": 0.0105316, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.01692128, + "balance_loss_mlp": 1.01724553, + "epoch": 0.9589658800541109, + "flos": 25006819265280.0, + "grad_norm": 1.9971695290664462, + "language_loss": 0.69398749, + "learning_rate": 1.761164038992602e-08, + "loss": 0.71493554, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.359375, + "step": 15950, + "time_per_iteration": 2.411465644836426 + }, + { + "auxiliary_loss_clip": 0.01050533, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.01484787, + "balance_loss_mlp": 1.0153451, + "epoch": 0.9590260033067789, + "flos": 23513409325440.0, + "grad_norm": 1.7868211951469781, + "language_loss": 0.86944973, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.89030051, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.3515625, + "step": 15951, + "time_per_iteration": 2.4062821865081787 + }, + { + "auxiliary_loss_clip": 0.0105461, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.01578999, + "balance_loss_mlp": 1.01729059, + "epoch": 0.9590861265594469, + "flos": 25519169600640.0, + "grad_norm": 2.3563562045716773, + "language_loss": 0.81316125, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.83411592, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.375, + "step": 15952, + "time_per_iteration": 2.4176366329193115 + }, + { + "auxiliary_loss_clip": 0.01052199, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.0133791, + "balance_loss_mlp": 1.01638043, + "epoch": 0.9591462498121148, + "flos": 21178243572480.0, + "grad_norm": 1.6970389467113942, + "language_loss": 0.70143539, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.72233826, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.35742188, + "step": 15953, + "time_per_iteration": 2.3623769283294678 + }, + { + "auxiliary_loss_clip": 0.01053496, + "auxiliary_loss_mlp": 0.0104353, + "balance_loss_clip": 1.01816213, + "balance_loss_mlp": 1.0164423, + "epoch": 0.9592063730647828, + "flos": 21722050909440.0, + "grad_norm": 2.9526635702808113, + "language_loss": 0.59713745, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.61810774, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 15954, + "time_per_iteration": 2.372628927230835 + }, + { + "auxiliary_loss_clip": 0.0105205, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.01260376, + "balance_loss_mlp": 1.01585698, + "epoch": 0.9592664963174508, + "flos": 29890295821440.0, + "grad_norm": 2.412637640972294, + "language_loss": 0.74248457, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.76336676, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 15955, + "time_per_iteration": 3.7577335834503174 + }, + { + "auxiliary_loss_clip": 0.01051604, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.01451159, + "balance_loss_mlp": 1.01549995, + "epoch": 0.9593266195701188, + "flos": 17998773477120.0, + "grad_norm": 1.7924351932755835, + "language_loss": 0.63233501, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.65323222, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 15956, + "time_per_iteration": 2.35461688041687 + }, + { + "auxiliary_loss_clip": 0.01052179, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.01150346, + "balance_loss_mlp": 1.01649356, + "epoch": 0.9593867428227867, + "flos": 18837422179200.0, + "grad_norm": 2.552329354762303, + "language_loss": 0.61143756, + "learning_rate": 1.725248447997507e-08, + "loss": 0.6323083, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 15957, + "time_per_iteration": 2.319568157196045 + }, + { + "auxiliary_loss_clip": 0.01052039, + "auxiliary_loss_mlp": 0.01042529, + "balance_loss_clip": 1.01824629, + "balance_loss_mlp": 1.01590097, + "epoch": 0.9594468660754547, + "flos": 29565847756800.0, + "grad_norm": 1.929686725357339, + "language_loss": 0.75101221, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.77195787, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 15958, + "time_per_iteration": 2.4654414653778076 + }, + { + "auxiliary_loss_clip": 0.01051059, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.01029754, + "balance_loss_mlp": 1.01536083, + "epoch": 0.9595069893281226, + "flos": 20702237829120.0, + "grad_norm": 1.7612367540372689, + "language_loss": 0.75364256, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.77447963, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35742188, + "step": 15959, + "time_per_iteration": 2.3680059909820557 + }, + { + "auxiliary_loss_clip": 0.01052616, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.01156378, + "balance_loss_mlp": 1.01603055, + "epoch": 0.9595671125807906, + "flos": 22452573530880.0, + "grad_norm": 2.0743246643747812, + "language_loss": 0.66044897, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.68133211, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36523438, + "step": 15960, + "time_per_iteration": 2.3895580768585205 + }, + { + "auxiliary_loss_clip": 0.01049824, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.01314116, + "balance_loss_mlp": 1.0156914, + "epoch": 0.9596272358334585, + "flos": 23914072621440.0, + "grad_norm": 1.6142745323316712, + "language_loss": 0.79132092, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.81218088, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34179688, + "step": 15961, + "time_per_iteration": 2.393799066543579 + }, + { + "auxiliary_loss_clip": 0.01049245, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.01061714, + "balance_loss_mlp": 1.01521397, + "epoch": 0.9596873590861266, + "flos": 17671672149120.0, + "grad_norm": 1.7608118932645207, + "language_loss": 0.76524007, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78605199, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 15962, + "time_per_iteration": 2.372875928878784 + }, + { + "auxiliary_loss_clip": 0.01052752, + "auxiliary_loss_mlp": 0.01043685, + "balance_loss_clip": 1.01810241, + "balance_loss_mlp": 1.01592052, + "epoch": 0.9597474823387945, + "flos": 25807447630080.0, + "grad_norm": 2.1363531846559987, + "language_loss": 0.73042309, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.75138736, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 15963, + "time_per_iteration": 2.4120633602142334 + }, + { + "auxiliary_loss_clip": 0.01048917, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.01636434, + "balance_loss_mlp": 1.01514459, + "epoch": 0.9598076055914625, + "flos": 23767402533120.0, + "grad_norm": 1.4476882599768301, + "language_loss": 0.74559075, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76646554, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33789062, + "step": 15964, + "time_per_iteration": 2.4225552082061768 + }, + { + "auxiliary_loss_clip": 0.01006988, + "auxiliary_loss_mlp": 0.01003498, + "balance_loss_clip": 1.00157881, + "balance_loss_mlp": 1.00050068, + "epoch": 0.9598677288441305, + "flos": 56513383701120.0, + "grad_norm": 0.869836305975847, + "language_loss": 0.5764432, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59654808, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.06445312, + "step": 15965, + "time_per_iteration": 5.703410625457764 + }, + { + "auxiliary_loss_clip": 0.01051412, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.0126003, + "balance_loss_mlp": 1.015607, + "epoch": 0.9599278520967984, + "flos": 22996520513280.0, + "grad_norm": 1.6885341250110466, + "language_loss": 0.79604876, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81689924, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.359375, + "step": 15966, + "time_per_iteration": 2.369694709777832 + }, + { + "auxiliary_loss_clip": 0.01049367, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.0097723, + "balance_loss_mlp": 1.01490414, + "epoch": 0.9599879753494664, + "flos": 23038555656960.0, + "grad_norm": 1.594428318537607, + "language_loss": 0.80313581, + "learning_rate": 1.674579558025102e-08, + "loss": 0.82395178, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 15967, + "time_per_iteration": 2.3785345554351807 + }, + { + "auxiliary_loss_clip": 0.01053268, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.01256251, + "balance_loss_mlp": 1.01647484, + "epoch": 0.9600480986021344, + "flos": 16391546904960.0, + "grad_norm": 2.119692171994962, + "language_loss": 0.81605113, + "learning_rate": 1.669554028728348e-08, + "loss": 0.83695048, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.3671875, + "step": 15968, + "time_per_iteration": 2.322484016418457 + }, + { + "auxiliary_loss_clip": 0.01056192, + "auxiliary_loss_mlp": 0.01045829, + "balance_loss_clip": 1.01671791, + "balance_loss_mlp": 1.017488, + "epoch": 0.9601082218548024, + "flos": 24275388948480.0, + "grad_norm": 4.0975063788178385, + "language_loss": 0.69170868, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.71272898, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.38671875, + "step": 15969, + "time_per_iteration": 2.3945462703704834 + }, + { + "auxiliary_loss_clip": 0.01049316, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.01504469, + "balance_loss_mlp": 1.01488876, + "epoch": 0.9601683451074703, + "flos": 19608967514880.0, + "grad_norm": 3.019085633373059, + "language_loss": 0.82089639, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.84176493, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 15970, + "time_per_iteration": 2.369249105453491 + }, + { + "auxiliary_loss_clip": 0.01050357, + "auxiliary_loss_mlp": 0.01037161, + "balance_loss_clip": 1.01378393, + "balance_loss_mlp": 1.0161171, + "epoch": 0.9602284683601383, + "flos": 26649901670400.0, + "grad_norm": 1.516527873753173, + "language_loss": 0.78257477, + "learning_rate": 1.654522565861316e-08, + "loss": 0.80344999, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34179688, + "step": 15971, + "time_per_iteration": 2.4472031593322754 + }, + { + "auxiliary_loss_clip": 0.01053519, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.01198149, + "balance_loss_mlp": 1.0167737, + "epoch": 0.9602885916128062, + "flos": 15553352050560.0, + "grad_norm": 2.153835417099332, + "language_loss": 0.6855886, + "learning_rate": 1.64952712054669e-08, + "loss": 0.70648998, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3671875, + "step": 15972, + "time_per_iteration": 2.346890926361084 + }, + { + "auxiliary_loss_clip": 0.01050275, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.01110125, + "balance_loss_mlp": 1.01544368, + "epoch": 0.9603487148654742, + "flos": 16501593110400.0, + "grad_norm": 2.0494287060031238, + "language_loss": 0.76786613, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78870702, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 15973, + "time_per_iteration": 2.330226182937622 + }, + { + "auxiliary_loss_clip": 0.0105283, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.01596713, + "balance_loss_mlp": 1.01708269, + "epoch": 0.9604088381181421, + "flos": 20844439263360.0, + "grad_norm": 1.6948545178165422, + "language_loss": 0.69432819, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71523446, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35742188, + "step": 15974, + "time_per_iteration": 2.3609750270843506 + }, + { + "auxiliary_loss_clip": 0.01051535, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.01405931, + "balance_loss_mlp": 1.01526833, + "epoch": 0.9604689613708102, + "flos": 19682075635200.0, + "grad_norm": 1.6907352343888187, + "language_loss": 0.69477212, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.71566111, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36328125, + "step": 15975, + "time_per_iteration": 2.412550687789917 + }, + { + "auxiliary_loss_clip": 0.01050493, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.00985169, + "balance_loss_mlp": 1.01603985, + "epoch": 0.9605290846234781, + "flos": 24096423985920.0, + "grad_norm": 2.120382154347477, + "language_loss": 0.57783139, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.5986439, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34375, + "step": 15976, + "time_per_iteration": 2.391308307647705 + }, + { + "auxiliary_loss_clip": 0.01050667, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.01224804, + "balance_loss_mlp": 1.01512885, + "epoch": 0.9605892078761461, + "flos": 27121438759680.0, + "grad_norm": 1.9646584074681954, + "language_loss": 0.69094861, + "learning_rate": 1.624662719799219e-08, + "loss": 0.7117945, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 15977, + "time_per_iteration": 3.8302674293518066 + }, + { + "auxiliary_loss_clip": 0.01051767, + "auxiliary_loss_mlp": 0.01042408, + "balance_loss_clip": 1.0195322, + "balance_loss_mlp": 1.01643729, + "epoch": 0.9606493311288141, + "flos": 14136052608000.0, + "grad_norm": 3.0811973166756212, + "language_loss": 0.83131748, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.85225928, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 15978, + "time_per_iteration": 2.4253768920898438 + }, + { + "auxiliary_loss_clip": 0.01053842, + "auxiliary_loss_mlp": 0.01045657, + "balance_loss_clip": 1.02015853, + "balance_loss_mlp": 1.01634765, + "epoch": 0.960709454381482, + "flos": 15812477228160.0, + "grad_norm": 2.141447003186882, + "language_loss": 0.83917761, + "learning_rate": 1.614769615070921e-08, + "loss": 0.86017257, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.375, + "step": 15979, + "time_per_iteration": 2.3394477367401123 + }, + { + "auxiliary_loss_clip": 0.01051659, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.01471233, + "balance_loss_mlp": 1.0155946, + "epoch": 0.96076957763415, + "flos": 22564295481600.0, + "grad_norm": 1.6380020480426722, + "language_loss": 0.80350435, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82439417, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 15980, + "time_per_iteration": 2.371248483657837 + }, + { + "auxiliary_loss_clip": 0.01052359, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.01023149, + "balance_loss_mlp": 1.0157671, + "epoch": 0.960829700886818, + "flos": 24680101962240.0, + "grad_norm": 1.8129794790741613, + "language_loss": 0.6898396, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.71070045, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3671875, + "step": 15981, + "time_per_iteration": 2.421250104904175 + }, + { + "auxiliary_loss_clip": 0.01049998, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.01093197, + "balance_loss_mlp": 1.01539731, + "epoch": 0.960889824139486, + "flos": 26541007539840.0, + "grad_norm": 1.4124070843841663, + "language_loss": 0.70551586, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.72633356, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34570312, + "step": 15982, + "time_per_iteration": 2.4169585704803467 + }, + { + "auxiliary_loss_clip": 0.01007031, + "auxiliary_loss_mlp": 0.01003281, + "balance_loss_clip": 1.00104034, + "balance_loss_mlp": 1.00065887, + "epoch": 0.9609499473921539, + "flos": 71111826622080.0, + "grad_norm": 0.6810248858391218, + "language_loss": 0.53341031, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55351341, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.06347656, + "step": 15983, + "time_per_iteration": 3.120314598083496 + }, + { + "auxiliary_loss_clip": 0.01050478, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.01489997, + "balance_loss_mlp": 1.01558781, + "epoch": 0.9610100706448219, + "flos": 20551587845760.0, + "grad_norm": 1.931127767024186, + "language_loss": 0.69094241, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.71181816, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34960938, + "step": 15984, + "time_per_iteration": 2.3552684783935547 + }, + { + "auxiliary_loss_clip": 0.01050555, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.01442778, + "balance_loss_mlp": 1.01644397, + "epoch": 0.9610701938974898, + "flos": 14063328512640.0, + "grad_norm": 2.0570162471749938, + "language_loss": 0.68679589, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.70765829, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 15985, + "time_per_iteration": 2.3653926849365234 + }, + { + "auxiliary_loss_clip": 0.01050884, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.0151453, + "balance_loss_mlp": 1.01556051, + "epoch": 0.9611303171501578, + "flos": 20228955171840.0, + "grad_norm": 2.032793942845237, + "language_loss": 0.7983613, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81923568, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3515625, + "step": 15986, + "time_per_iteration": 2.3483502864837646 + }, + { + "auxiliary_loss_clip": 0.01052236, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.01871681, + "balance_loss_mlp": 1.01617587, + "epoch": 0.9611904404028258, + "flos": 20950261194240.0, + "grad_norm": 2.3177499323643684, + "language_loss": 0.65122116, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.67217743, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.359375, + "step": 15987, + "time_per_iteration": 2.3649208545684814 + }, + { + "auxiliary_loss_clip": 0.01049717, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.01594257, + "balance_loss_mlp": 1.01570582, + "epoch": 0.9612505636554938, + "flos": 24826562582400.0, + "grad_norm": 2.340572917743407, + "language_loss": 0.6806004, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.70145828, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 15988, + "time_per_iteration": 2.4407198429107666 + }, + { + "auxiliary_loss_clip": 0.01050894, + "auxiliary_loss_mlp": 0.01041913, + "balance_loss_clip": 1.02006233, + "balance_loss_mlp": 1.01587963, + "epoch": 0.9613106869081617, + "flos": 17164034847360.0, + "grad_norm": 2.991099221301518, + "language_loss": 0.75382102, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.77474916, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 15989, + "time_per_iteration": 2.361701250076294 + }, + { + "auxiliary_loss_clip": 0.01006674, + "auxiliary_loss_mlp": 0.01001742, + "balance_loss_clip": 0.99957198, + "balance_loss_mlp": 1.0002985, + "epoch": 0.9613708101608297, + "flos": 61560741646080.0, + "grad_norm": 0.8241164784891363, + "language_loss": 0.6313653, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.6514495, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.02172852, + "router_z_loss_mlp": 0.06347656, + "step": 15990, + "time_per_iteration": 2.904916286468506 + }, + { + "auxiliary_loss_clip": 0.01051041, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.01472831, + "balance_loss_mlp": 1.01543188, + "epoch": 0.9614309334134977, + "flos": 27416664149760.0, + "grad_norm": 1.9128738380460268, + "language_loss": 0.7902748, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.81116319, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 15991, + "time_per_iteration": 2.428565263748169 + }, + { + "auxiliary_loss_clip": 0.01055494, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_clip": 1.01953924, + "balance_loss_mlp": 1.01680899, + "epoch": 0.9614910566661656, + "flos": 22818079221120.0, + "grad_norm": 2.4417071326628474, + "language_loss": 0.86077636, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.88177013, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38671875, + "step": 15992, + "time_per_iteration": 2.3566465377807617 + }, + { + "auxiliary_loss_clip": 0.01050924, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.01333976, + "balance_loss_mlp": 1.01589024, + "epoch": 0.9615511799188337, + "flos": 20666766021120.0, + "grad_norm": 2.076943338727949, + "language_loss": 0.73948586, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.76036882, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 15993, + "time_per_iteration": 2.3684961795806885 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01042023, + "balance_loss_clip": 1.01797891, + "balance_loss_mlp": 1.01562655, + "epoch": 0.9616113031715016, + "flos": 33147726716160.0, + "grad_norm": 1.423823370090843, + "language_loss": 0.69024044, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.71117878, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 15994, + "time_per_iteration": 2.47804856300354 + }, + { + "auxiliary_loss_clip": 0.01050684, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.01161385, + "balance_loss_mlp": 1.01608562, + "epoch": 0.9616714264241696, + "flos": 25008634656000.0, + "grad_norm": 1.683853377432611, + "language_loss": 0.85164827, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.87249953, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34570312, + "step": 15995, + "time_per_iteration": 3.776531219482422 + }, + { + "auxiliary_loss_clip": 0.01052789, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.01698804, + "balance_loss_mlp": 1.01602054, + "epoch": 0.9617315496768375, + "flos": 13546788814080.0, + "grad_norm": 1.7917655461924809, + "language_loss": 0.77197307, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.79291892, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 15996, + "time_per_iteration": 2.3838090896606445 + }, + { + "auxiliary_loss_clip": 0.01051374, + "auxiliary_loss_mlp": 0.01043199, + "balance_loss_clip": 1.01771224, + "balance_loss_mlp": 1.01606226, + "epoch": 0.9617916729295055, + "flos": 11253728027520.0, + "grad_norm": 2.3047691136766546, + "language_loss": 0.78710508, + "learning_rate": 1.52708595287494e-08, + "loss": 0.80805087, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.35351562, + "step": 15997, + "time_per_iteration": 2.3416249752044678 + }, + { + "auxiliary_loss_clip": 0.01048546, + "auxiliary_loss_mlp": 0.01027751, + "balance_loss_clip": 1.0075686, + "balance_loss_mlp": 1.01572919, + "epoch": 0.9618517961821734, + "flos": 22818637802880.0, + "grad_norm": 1.6720711531424477, + "language_loss": 0.67591751, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69668055, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.328125, + "step": 15998, + "time_per_iteration": 2.3645706176757812 + }, + { + "auxiliary_loss_clip": 0.01050649, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.01479328, + "balance_loss_mlp": 1.0156579, + "epoch": 0.9619119194348414, + "flos": 16616422172160.0, + "grad_norm": 1.530060499041349, + "language_loss": 0.73690301, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.75778598, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 15999, + "time_per_iteration": 2.3484597206115723 + }, + { + "auxiliary_loss_clip": 0.0104899, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.0121212, + "balance_loss_mlp": 1.01534975, + "epoch": 0.9619720426875094, + "flos": 24533990455680.0, + "grad_norm": 2.8654942682468505, + "language_loss": 0.66509032, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.68592203, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3359375, + "step": 16000, + "time_per_iteration": 2.395911693572998 + }, + { + "auxiliary_loss_clip": 0.01050933, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.01260185, + "balance_loss_mlp": 1.01547718, + "epoch": 0.9620321659401774, + "flos": 20631154567680.0, + "grad_norm": 1.5610776269498494, + "language_loss": 0.76016855, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.78103054, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 16001, + "time_per_iteration": 2.376692056655884 + }, + { + "auxiliary_loss_clip": 0.01051213, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.0118705, + "balance_loss_mlp": 1.01516843, + "epoch": 0.9620922891928453, + "flos": 18514300746240.0, + "grad_norm": 1.5260259247163617, + "language_loss": 0.69484949, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.71571875, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.359375, + "step": 16002, + "time_per_iteration": 2.362133741378784 + }, + { + "auxiliary_loss_clip": 0.01050777, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.01583958, + "balance_loss_mlp": 1.01628304, + "epoch": 0.9621524124455133, + "flos": 28766127087360.0, + "grad_norm": 1.2952124351001784, + "language_loss": 0.65549731, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.67639565, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34570312, + "step": 16003, + "time_per_iteration": 2.442814350128174 + }, + { + "auxiliary_loss_clip": 0.01052273, + "auxiliary_loss_mlp": 0.01043937, + "balance_loss_clip": 1.02004755, + "balance_loss_mlp": 1.01668859, + "epoch": 0.9622125356981813, + "flos": 19097873988480.0, + "grad_norm": 1.7696233137136212, + "language_loss": 0.76585895, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78682101, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 16004, + "time_per_iteration": 3.855914831161499 + }, + { + "auxiliary_loss_clip": 0.01049639, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.01252079, + "balance_loss_mlp": 1.01556885, + "epoch": 0.9622726589508492, + "flos": 20301784001280.0, + "grad_norm": 1.8782947453947194, + "language_loss": 0.8063792, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.82722294, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33984375, + "step": 16005, + "time_per_iteration": 3.6531317234039307 + }, + { + "auxiliary_loss_clip": 0.01050286, + "auxiliary_loss_mlp": 0.01037772, + "balance_loss_clip": 1.01520586, + "balance_loss_mlp": 1.01555324, + "epoch": 0.9623327822035173, + "flos": 54927699304320.0, + "grad_norm": 3.4277770886657244, + "language_loss": 0.68012023, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.70100081, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 16006, + "time_per_iteration": 2.671592950820923 + }, + { + "auxiliary_loss_clip": 0.01048217, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.01639092, + "balance_loss_mlp": 1.0158757, + "epoch": 0.9623929054561852, + "flos": 21758046387840.0, + "grad_norm": 1.5966904462722127, + "language_loss": 0.78948289, + "learning_rate": 1.479426394188521e-08, + "loss": 0.81033283, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.32421875, + "step": 16007, + "time_per_iteration": 2.3723080158233643 + }, + { + "auxiliary_loss_clip": 0.01052347, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.01677012, + "balance_loss_mlp": 1.01636708, + "epoch": 0.9624530287088532, + "flos": 17930587858560.0, + "grad_norm": 2.277327722848451, + "language_loss": 0.69445264, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.71537673, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 16008, + "time_per_iteration": 2.356457233428955 + }, + { + "auxiliary_loss_clip": 0.01054318, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.01375413, + "balance_loss_mlp": 1.01777267, + "epoch": 0.9625131519615211, + "flos": 23252748048000.0, + "grad_norm": 2.5727544000575415, + "language_loss": 0.74813676, + "learning_rate": 1.469984811730529e-08, + "loss": 0.76906413, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 16009, + "time_per_iteration": 2.4390571117401123 + }, + { + "auxiliary_loss_clip": 0.01050153, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.0132525, + "balance_loss_mlp": 1.0150224, + "epoch": 0.9625732752141891, + "flos": 18915627358080.0, + "grad_norm": 1.7231996586273772, + "language_loss": 0.76032519, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.78117537, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 16010, + "time_per_iteration": 2.3641135692596436 + }, + { + "auxiliary_loss_clip": 0.01055513, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_clip": 1.01605785, + "balance_loss_mlp": 1.01746249, + "epoch": 0.962633398466857, + "flos": 16251998734080.0, + "grad_norm": 1.735356902913667, + "language_loss": 0.71053302, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.731534, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.37890625, + "step": 16011, + "time_per_iteration": 2.3207857608795166 + }, + { + "auxiliary_loss_clip": 0.01050486, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.01527405, + "balance_loss_mlp": 1.0165664, + "epoch": 0.962693521719525, + "flos": 54195466026240.0, + "grad_norm": 2.89073854199058, + "language_loss": 0.69931227, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.72017622, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33984375, + "step": 16012, + "time_per_iteration": 2.685346841812134 + }, + { + "auxiliary_loss_clip": 0.01056065, + "auxiliary_loss_mlp": 0.01043354, + "balance_loss_clip": 1.01650786, + "balance_loss_mlp": 1.01756883, + "epoch": 0.962753644972193, + "flos": 33104504586240.0, + "grad_norm": 1.8620476249568032, + "language_loss": 0.73530102, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.7562952, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.38476562, + "step": 16013, + "time_per_iteration": 2.494292736053467 + }, + { + "auxiliary_loss_clip": 0.01051349, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.01281786, + "balance_loss_mlp": 1.01601541, + "epoch": 0.962813768224861, + "flos": 42229020170880.0, + "grad_norm": 2.8548739377325516, + "language_loss": 0.65165585, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.67255419, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.35351562, + "step": 16014, + "time_per_iteration": 2.596019744873047 + }, + { + "auxiliary_loss_clip": 0.01049492, + "auxiliary_loss_mlp": 0.010379, + "balance_loss_clip": 1.01598954, + "balance_loss_mlp": 1.01598716, + "epoch": 0.9628738914775289, + "flos": 43943011280640.0, + "grad_norm": 1.5518636576123002, + "language_loss": 0.73462719, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.75550115, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3359375, + "step": 16015, + "time_per_iteration": 2.588960886001587 + }, + { + "auxiliary_loss_clip": 0.01051686, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.01830149, + "balance_loss_mlp": 1.01549733, + "epoch": 0.9629340147301969, + "flos": 15595282460160.0, + "grad_norm": 1.921790073642791, + "language_loss": 0.78790063, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.80883086, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 16016, + "time_per_iteration": 3.7822258472442627 + }, + { + "auxiliary_loss_clip": 0.01007553, + "auxiliary_loss_mlp": 0.0100249, + "balance_loss_clip": 1.00051105, + "balance_loss_mlp": 1.00122738, + "epoch": 0.9629941379828649, + "flos": 62947875807360.0, + "grad_norm": 0.8143764327654579, + "language_loss": 0.63251495, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65261543, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06347656, + "step": 16017, + "time_per_iteration": 2.949894428253174 + }, + { + "auxiliary_loss_clip": 0.01053599, + "auxiliary_loss_mlp": 0.01034226, + "balance_loss_clip": 1.01212478, + "balance_loss_mlp": 1.01678205, + "epoch": 0.9630542612355328, + "flos": 29897801763840.0, + "grad_norm": 2.096234912483926, + "language_loss": 0.67619795, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.6970762, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3671875, + "step": 16018, + "time_per_iteration": 2.4287383556365967 + }, + { + "auxiliary_loss_clip": 0.01051831, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.01379585, + "balance_loss_mlp": 1.01621151, + "epoch": 0.9631143844882009, + "flos": 17893614862080.0, + "grad_norm": 2.128348342985032, + "language_loss": 0.80235201, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.82325387, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35742188, + "step": 16019, + "time_per_iteration": 2.354036808013916 + }, + { + "auxiliary_loss_clip": 0.01049211, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.01141322, + "balance_loss_mlp": 1.01519477, + "epoch": 0.9631745077408688, + "flos": 26138005182720.0, + "grad_norm": 1.5100201127152957, + "language_loss": 0.72283399, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.7436415, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 16020, + "time_per_iteration": 2.3910152912139893 + }, + { + "auxiliary_loss_clip": 0.01051698, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.01069403, + "balance_loss_mlp": 1.01636624, + "epoch": 0.9632346309935368, + "flos": 24972464620800.0, + "grad_norm": 1.7591189458215046, + "language_loss": 0.78095269, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.80179995, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 16021, + "time_per_iteration": 2.403712034225464 + }, + { + "auxiliary_loss_clip": 0.01055369, + "auxiliary_loss_mlp": 0.01049203, + "balance_loss_clip": 1.02056909, + "balance_loss_mlp": 1.01644444, + "epoch": 0.9632947542462047, + "flos": 23616263790720.0, + "grad_norm": 1.8300683505847224, + "language_loss": 0.66139209, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.6824379, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 0.38867188, + "step": 16022, + "time_per_iteration": 2.384716272354126 + }, + { + "auxiliary_loss_clip": 0.01049787, + "auxiliary_loss_mlp": 0.01037062, + "balance_loss_clip": 1.01509154, + "balance_loss_mlp": 1.01562238, + "epoch": 0.9633548774988727, + "flos": 26394407274240.0, + "grad_norm": 1.9620299552442224, + "language_loss": 0.73913723, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.76000571, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34179688, + "step": 16023, + "time_per_iteration": 2.4288840293884277 + }, + { + "auxiliary_loss_clip": 0.01051119, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.01157522, + "balance_loss_mlp": 1.01570904, + "epoch": 0.9634150007515406, + "flos": 23766634483200.0, + "grad_norm": 1.4817392733395718, + "language_loss": 0.82273638, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.84359121, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 16024, + "time_per_iteration": 2.370471239089966 + }, + { + "auxiliary_loss_clip": 0.01054888, + "auxiliary_loss_mlp": 0.01041747, + "balance_loss_clip": 1.01715457, + "balance_loss_mlp": 1.01774609, + "epoch": 0.9634751240042086, + "flos": 24134165032320.0, + "grad_norm": 1.5911684270175708, + "language_loss": 0.81675315, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83771944, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37109375, + "step": 16025, + "time_per_iteration": 2.4052963256835938 + }, + { + "auxiliary_loss_clip": 0.01052042, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.01246524, + "balance_loss_mlp": 1.0159291, + "epoch": 0.9635352472568766, + "flos": 24348043221120.0, + "grad_norm": 1.8271223705506954, + "language_loss": 0.77671736, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.79760206, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 16026, + "time_per_iteration": 2.426281213760376 + }, + { + "auxiliary_loss_clip": 0.01050911, + "auxiliary_loss_mlp": 0.01039552, + "balance_loss_clip": 1.01581693, + "balance_loss_mlp": 1.01556015, + "epoch": 0.9635953705095446, + "flos": 23983724517120.0, + "grad_norm": 1.6243477824394057, + "language_loss": 0.63965183, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.66055644, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.35546875, + "step": 16027, + "time_per_iteration": 2.379905939102173 + }, + { + "auxiliary_loss_clip": 0.01053696, + "auxiliary_loss_mlp": 0.01041597, + "balance_loss_clip": 1.01777899, + "balance_loss_mlp": 1.01666403, + "epoch": 0.9636554937622125, + "flos": 19827419091840.0, + "grad_norm": 1.776974235071988, + "language_loss": 0.88229984, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.90325272, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 16028, + "time_per_iteration": 2.3717849254608154 + }, + { + "auxiliary_loss_clip": 0.01007345, + "auxiliary_loss_mlp": 0.01003455, + "balance_loss_clip": 1.00150049, + "balance_loss_mlp": 1.00090837, + "epoch": 0.9637156170148805, + "flos": 67432329901440.0, + "grad_norm": 0.6823006023065247, + "language_loss": 0.53216922, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55227721, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.06445312, + "step": 16029, + "time_per_iteration": 2.9912073612213135 + }, + { + "auxiliary_loss_clip": 0.01052691, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.01473856, + "balance_loss_mlp": 1.01644135, + "epoch": 0.9637757402675484, + "flos": 20299933699200.0, + "grad_norm": 1.512703744425463, + "language_loss": 0.7481575, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76905823, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 16030, + "time_per_iteration": 2.3820552825927734 + }, + { + "auxiliary_loss_clip": 0.01049642, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.00984061, + "balance_loss_mlp": 1.01558781, + "epoch": 0.9638358635202164, + "flos": 27233335267200.0, + "grad_norm": 2.0130671872605452, + "language_loss": 0.66715676, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68796247, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 16031, + "time_per_iteration": 2.413482189178467 + }, + { + "auxiliary_loss_clip": 0.01007201, + "auxiliary_loss_mlp": 0.01002031, + "balance_loss_clip": 1.00010026, + "balance_loss_mlp": 1.00088799, + "epoch": 0.9638959867728845, + "flos": 70285536541440.0, + "grad_norm": 0.8313907646854893, + "language_loss": 0.60793722, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62802947, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.06347656, + "step": 16032, + "time_per_iteration": 3.0752902030944824 + }, + { + "auxiliary_loss_clip": 0.01048219, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.01608431, + "balance_loss_mlp": 1.01477051, + "epoch": 0.9639561100255524, + "flos": 25406435220480.0, + "grad_norm": 1.8437130517845532, + "language_loss": 0.67502177, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.69587278, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33398438, + "step": 16033, + "time_per_iteration": 2.4047117233276367 + }, + { + "auxiliary_loss_clip": 0.0105127, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.01673937, + "balance_loss_mlp": 1.01643705, + "epoch": 0.9640162332782204, + "flos": 18112904311680.0, + "grad_norm": 1.5722055759385611, + "language_loss": 0.6659829, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.68687969, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34765625, + "step": 16034, + "time_per_iteration": 2.345611810684204 + }, + { + "auxiliary_loss_clip": 0.01051632, + "auxiliary_loss_mlp": 0.01042874, + "balance_loss_clip": 1.01846015, + "balance_loss_mlp": 1.01590931, + "epoch": 0.9640763565308883, + "flos": 23439184041600.0, + "grad_norm": 2.201121793371913, + "language_loss": 0.74668491, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.76762998, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35742188, + "step": 16035, + "time_per_iteration": 3.656064510345459 + }, + { + "auxiliary_loss_clip": 0.01051497, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.01459718, + "balance_loss_mlp": 1.01669717, + "epoch": 0.9641364797835563, + "flos": 22418253797760.0, + "grad_norm": 1.8823427411561207, + "language_loss": 0.84114087, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.86202395, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 16036, + "time_per_iteration": 2.3741323947906494 + }, + { + "auxiliary_loss_clip": 0.01051012, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.01044691, + "balance_loss_mlp": 1.01571178, + "epoch": 0.9641966030362242, + "flos": 30621202467840.0, + "grad_norm": 1.757069841405239, + "language_loss": 0.71011567, + "learning_rate": 1.340965177371789e-08, + "loss": 0.7309593, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 16037, + "time_per_iteration": 2.4310102462768555 + }, + { + "auxiliary_loss_clip": 0.0105111, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.00888431, + "balance_loss_mlp": 1.01588678, + "epoch": 0.9642567262888923, + "flos": 20952251141760.0, + "grad_norm": 1.633194008876519, + "language_loss": 0.63964403, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.66046917, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 16038, + "time_per_iteration": 2.3781497478485107 + }, + { + "auxiliary_loss_clip": 0.01052532, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.01599014, + "balance_loss_mlp": 1.01608419, + "epoch": 0.9643168495415602, + "flos": 22638276385920.0, + "grad_norm": 1.540291062371269, + "language_loss": 0.71683729, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73775971, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 16039, + "time_per_iteration": 2.43440580368042 + }, + { + "auxiliary_loss_clip": 0.01051811, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.01200128, + "balance_loss_mlp": 1.01584184, + "epoch": 0.9643769727942282, + "flos": 20265229941120.0, + "grad_norm": 1.9839222301543957, + "language_loss": 0.74496377, + "learning_rate": 1.327491870605657e-08, + "loss": 0.76585209, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.359375, + "step": 16040, + "time_per_iteration": 2.3355650901794434 + }, + { + "auxiliary_loss_clip": 0.01054155, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.01539433, + "balance_loss_mlp": 1.01674306, + "epoch": 0.9644370960468961, + "flos": 13880977148160.0, + "grad_norm": 2.1546593784901273, + "language_loss": 0.74001008, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.76094556, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 16041, + "time_per_iteration": 2.357393741607666 + }, + { + "auxiliary_loss_clip": 0.01048625, + "auxiliary_loss_mlp": 0.01033329, + "balance_loss_clip": 1.01270628, + "balance_loss_mlp": 1.01528037, + "epoch": 0.9644972192995641, + "flos": 17237247701760.0, + "grad_norm": 1.8825179655395745, + "language_loss": 0.73261982, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.75343937, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.33203125, + "step": 16042, + "time_per_iteration": 2.3535566329956055 + }, + { + "auxiliary_loss_clip": 0.01053299, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.01548338, + "balance_loss_mlp": 1.01620901, + "epoch": 0.964557342552232, + "flos": 23839253844480.0, + "grad_norm": 1.7285551473736833, + "language_loss": 0.81874263, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83965242, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.37109375, + "step": 16043, + "time_per_iteration": 2.393782377243042 + }, + { + "auxiliary_loss_clip": 0.01050575, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.01270103, + "balance_loss_mlp": 1.01605856, + "epoch": 0.9646174658049, + "flos": 21652049900160.0, + "grad_norm": 1.5512683126598588, + "language_loss": 0.7280798, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.74892414, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 16044, + "time_per_iteration": 3.762526512145996 + }, + { + "auxiliary_loss_clip": 0.01048567, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.00790071, + "balance_loss_mlp": 1.01514912, + "epoch": 0.9646775890575681, + "flos": 17128597950720.0, + "grad_norm": 1.7590764712143856, + "language_loss": 0.7147361, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.73552084, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33398438, + "step": 16045, + "time_per_iteration": 3.6612355709075928 + }, + { + "auxiliary_loss_clip": 0.01051988, + "auxiliary_loss_mlp": 0.01040149, + "balance_loss_clip": 1.01488805, + "balance_loss_mlp": 1.01596606, + "epoch": 0.964737712310236, + "flos": 13004901601920.0, + "grad_norm": 2.0231926437475085, + "language_loss": 0.76914036, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.79006171, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.359375, + "step": 16046, + "time_per_iteration": 2.4055087566375732 + }, + { + "auxiliary_loss_clip": 0.01053829, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.01655316, + "epoch": 0.964797835562904, + "flos": 24278112034560.0, + "grad_norm": 1.5048305763435106, + "language_loss": 0.63306832, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.65403593, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 16047, + "time_per_iteration": 2.379857301712036 + }, + { + "auxiliary_loss_clip": 0.0105367, + "auxiliary_loss_mlp": 0.01038581, + "balance_loss_clip": 1.01497722, + "balance_loss_mlp": 1.01733494, + "epoch": 0.9648579588155719, + "flos": 20521632032640.0, + "grad_norm": 1.75590866193559, + "language_loss": 0.69794703, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71886957, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 16048, + "time_per_iteration": 2.3765525817871094 + }, + { + "auxiliary_loss_clip": 0.01053299, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.01559305, + "balance_loss_mlp": 1.01587236, + "epoch": 0.9649180820682399, + "flos": 32153645174400.0, + "grad_norm": 1.840631966533396, + "language_loss": 0.64546549, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66639996, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 16049, + "time_per_iteration": 2.446866035461426 + }, + { + "auxiliary_loss_clip": 0.01053086, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.01098967, + "balance_loss_mlp": 1.01746321, + "epoch": 0.9649782053209078, + "flos": 20521527298560.0, + "grad_norm": 2.1310071579536527, + "language_loss": 0.71648538, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73735309, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 16050, + "time_per_iteration": 2.384705066680908 + }, + { + "auxiliary_loss_clip": 0.01053592, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.0137403, + "balance_loss_mlp": 1.01533866, + "epoch": 0.9650383285735759, + "flos": 43066342241280.0, + "grad_norm": 1.7392375202158004, + "language_loss": 0.70926976, + "learning_rate": 1.278669873970606e-08, + "loss": 0.73019844, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.3828125, + "step": 16051, + "time_per_iteration": 2.5842931270599365 + }, + { + "auxiliary_loss_clip": 0.01007038, + "auxiliary_loss_mlp": 0.01003749, + "balance_loss_clip": 1.00186563, + "balance_loss_mlp": 1.0006038, + "epoch": 0.9650984518262438, + "flos": 61745117869440.0, + "grad_norm": 1.0114743959708241, + "language_loss": 0.59229559, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61240345, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.06445312, + "step": 16052, + "time_per_iteration": 3.048064708709717 + }, + { + "auxiliary_loss_clip": 0.01048864, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.0075326, + "balance_loss_mlp": 1.01469183, + "epoch": 0.9651585750789118, + "flos": 29788034849280.0, + "grad_norm": 1.6254031523227979, + "language_loss": 0.7575264, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.77830744, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34179688, + "step": 16053, + "time_per_iteration": 2.4541800022125244 + }, + { + "auxiliary_loss_clip": 0.01052269, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.01183808, + "balance_loss_mlp": 1.01632297, + "epoch": 0.9652186983315797, + "flos": 16872126036480.0, + "grad_norm": 1.9655886793416415, + "language_loss": 0.69826424, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.71913671, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 16054, + "time_per_iteration": 2.35073184967041 + }, + { + "auxiliary_loss_clip": 0.01051716, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02014792, + "balance_loss_mlp": 1.01631272, + "epoch": 0.9652788215842477, + "flos": 31648416756480.0, + "grad_norm": 1.4650773630131453, + "language_loss": 0.63039398, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.65132505, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.35546875, + "step": 16055, + "time_per_iteration": 2.4677865505218506 + }, + { + "auxiliary_loss_clip": 0.01049691, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.01083601, + "balance_loss_mlp": 1.01641965, + "epoch": 0.9653389448369156, + "flos": 24753105348480.0, + "grad_norm": 2.198092535053469, + "language_loss": 0.7817114, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.80252731, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33203125, + "step": 16056, + "time_per_iteration": 3.8349571228027344 + }, + { + "auxiliary_loss_clip": 0.01050601, + "auxiliary_loss_mlp": 0.01035348, + "balance_loss_clip": 1.01235271, + "balance_loss_mlp": 1.01506138, + "epoch": 0.9653990680895836, + "flos": 20296372740480.0, + "grad_norm": 1.5200186853521038, + "language_loss": 0.72559613, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.74645567, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 16057, + "time_per_iteration": 2.3470540046691895 + }, + { + "auxiliary_loss_clip": 0.01050082, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.01684713, + "balance_loss_mlp": 1.01565528, + "epoch": 0.9654591913422517, + "flos": 22527671598720.0, + "grad_norm": 1.8385092180488218, + "language_loss": 0.72378552, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74467659, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34375, + "step": 16058, + "time_per_iteration": 2.3821005821228027 + }, + { + "auxiliary_loss_clip": 0.01050908, + "auxiliary_loss_mlp": 0.01034618, + "balance_loss_clip": 1.01207519, + "balance_loss_mlp": 1.0157119, + "epoch": 0.9655193145949196, + "flos": 26761728355200.0, + "grad_norm": 1.399011642231527, + "language_loss": 0.74907136, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76992655, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.3515625, + "step": 16059, + "time_per_iteration": 2.4197094440460205 + }, + { + "auxiliary_loss_clip": 0.01052731, + "auxiliary_loss_mlp": 0.01040154, + "balance_loss_clip": 1.01696825, + "balance_loss_mlp": 1.01567423, + "epoch": 0.9655794378475876, + "flos": 41969196766080.0, + "grad_norm": 1.6966623859332075, + "language_loss": 0.74133748, + "learning_rate": 1.239402791721722e-08, + "loss": 0.7622664, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 16060, + "time_per_iteration": 2.555171251296997 + }, + { + "auxiliary_loss_clip": 0.01048996, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.01666069, + "balance_loss_mlp": 1.01586652, + "epoch": 0.9656395611002555, + "flos": 27708188935680.0, + "grad_norm": 1.550753055344959, + "language_loss": 0.77632302, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.79718274, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33203125, + "step": 16061, + "time_per_iteration": 2.422240972518921 + }, + { + "auxiliary_loss_clip": 0.01007594, + "auxiliary_loss_mlp": 0.01001934, + "balance_loss_clip": 0.99990708, + "balance_loss_mlp": 1.00117517, + "epoch": 0.9656996843529235, + "flos": 68965331189760.0, + "grad_norm": 0.7226771919139883, + "language_loss": 0.64287436, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66296965, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.06445312, + "step": 16062, + "time_per_iteration": 3.054680347442627 + }, + { + "auxiliary_loss_clip": 0.01047314, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.01368272, + "balance_loss_mlp": 1.01432729, + "epoch": 0.9657598076055914, + "flos": 20630281783680.0, + "grad_norm": 3.1477779613252572, + "language_loss": 0.94945234, + "learning_rate": 1.226449424760867e-08, + "loss": 0.97027236, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33007812, + "step": 16063, + "time_per_iteration": 2.352570056915283 + }, + { + "auxiliary_loss_clip": 0.01052064, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01225352, + "balance_loss_mlp": 1.01627839, + "epoch": 0.9658199308582595, + "flos": 20447546394240.0, + "grad_norm": 1.7583367649097892, + "language_loss": 0.83459657, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.85547674, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 16064, + "time_per_iteration": 2.386924982070923 + }, + { + "auxiliary_loss_clip": 0.01052331, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.01222122, + "balance_loss_mlp": 1.01759255, + "epoch": 0.9658800541109274, + "flos": 24716865490560.0, + "grad_norm": 1.5521294427621328, + "language_loss": 0.84755284, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86841959, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34765625, + "step": 16065, + "time_per_iteration": 2.3992273807525635 + }, + { + "auxiliary_loss_clip": 0.01050511, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.01421905, + "balance_loss_mlp": 1.01529014, + "epoch": 0.9659401773635954, + "flos": 21609141972480.0, + "grad_norm": 2.4897704004852548, + "language_loss": 0.67904586, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69991714, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.3515625, + "step": 16066, + "time_per_iteration": 2.396071195602417 + }, + { + "auxiliary_loss_clip": 0.01050322, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.01245284, + "balance_loss_mlp": 1.01549292, + "epoch": 0.9660003006162633, + "flos": 20300212990080.0, + "grad_norm": 1.8265606872692268, + "language_loss": 0.82990187, + "learning_rate": 1.209283794752558e-08, + "loss": 0.85074878, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 16067, + "time_per_iteration": 2.3703489303588867 + }, + { + "auxiliary_loss_clip": 0.01050158, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.01008701, + "balance_loss_mlp": 1.01539946, + "epoch": 0.9660604238689313, + "flos": 24460812512640.0, + "grad_norm": 1.9040689180244306, + "language_loss": 0.70806754, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.72889894, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34765625, + "step": 16068, + "time_per_iteration": 2.4282138347625732 + }, + { + "auxiliary_loss_clip": 0.01046489, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.00981414, + "balance_loss_mlp": 1.01469755, + "epoch": 0.9661205471215992, + "flos": 19863030545280.0, + "grad_norm": 1.8175565054097025, + "language_loss": 0.68479562, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70554572, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.31835938, + "step": 16069, + "time_per_iteration": 2.369292736053467 + }, + { + "auxiliary_loss_clip": 0.01050525, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.01280999, + "balance_loss_mlp": 1.01540565, + "epoch": 0.9661806703742672, + "flos": 20556859461120.0, + "grad_norm": 2.2091311201680073, + "language_loss": 0.89868468, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91953123, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 16070, + "time_per_iteration": 2.3826332092285156 + }, + { + "auxiliary_loss_clip": 0.01054125, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.01861978, + "balance_loss_mlp": 1.0182817, + "epoch": 0.9662407936269353, + "flos": 21429967541760.0, + "grad_norm": 1.7780917273529135, + "language_loss": 0.78577352, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.80675173, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.35742188, + "step": 16071, + "time_per_iteration": 2.377195119857788 + }, + { + "auxiliary_loss_clip": 0.01050785, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_clip": 1.01525402, + "balance_loss_mlp": 1.01601005, + "epoch": 0.9663009168796032, + "flos": 14902012126080.0, + "grad_norm": 2.713744059565762, + "language_loss": 0.66328382, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68424273, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.34765625, + "step": 16072, + "time_per_iteration": 2.353283405303955 + }, + { + "auxiliary_loss_clip": 0.01051907, + "auxiliary_loss_mlp": 0.01038177, + "balance_loss_clip": 1.01596892, + "balance_loss_mlp": 1.01599228, + "epoch": 0.9663610401322712, + "flos": 24309883238400.0, + "grad_norm": 1.6084159205770305, + "language_loss": 0.78418231, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.80508316, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 16073, + "time_per_iteration": 2.3780970573425293 + }, + { + "auxiliary_loss_clip": 0.01053711, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.0130899, + "balance_loss_mlp": 1.01692629, + "epoch": 0.9664211633849391, + "flos": 17636933479680.0, + "grad_norm": 2.238024998352046, + "language_loss": 0.76859093, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78949571, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3671875, + "step": 16074, + "time_per_iteration": 2.375025749206543 + }, + { + "auxiliary_loss_clip": 0.01052333, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.01611662, + "balance_loss_mlp": 1.0155549, + "epoch": 0.9664812866376071, + "flos": 29788104672000.0, + "grad_norm": 1.502088676803031, + "language_loss": 0.76369214, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.784621, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 16075, + "time_per_iteration": 3.6689798831939697 + }, + { + "auxiliary_loss_clip": 0.01052629, + "auxiliary_loss_mlp": 0.01039939, + "balance_loss_clip": 1.01672935, + "balance_loss_mlp": 1.0170691, + "epoch": 0.966541409890275, + "flos": 14281465887360.0, + "grad_norm": 3.3117223884037257, + "language_loss": 0.79492158, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81584728, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 16076, + "time_per_iteration": 2.3756070137023926 + }, + { + "auxiliary_loss_clip": 0.01052837, + "auxiliary_loss_mlp": 0.01045178, + "balance_loss_clip": 1.02042997, + "balance_loss_mlp": 1.01663959, + "epoch": 0.9666015331429431, + "flos": 19859539409280.0, + "grad_norm": 1.7562952581655573, + "language_loss": 0.72777057, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74875069, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.36328125, + "step": 16077, + "time_per_iteration": 2.3515450954437256 + }, + { + "auxiliary_loss_clip": 0.01050174, + "auxiliary_loss_mlp": 0.01034835, + "balance_loss_clip": 1.01195884, + "balance_loss_mlp": 1.01516652, + "epoch": 0.966661656395611, + "flos": 27124720427520.0, + "grad_norm": 1.669460946959248, + "language_loss": 0.5999918, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.62084186, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 16078, + "time_per_iteration": 2.417071580886841 + }, + { + "auxiliary_loss_clip": 0.01051986, + "auxiliary_loss_mlp": 0.01043044, + "balance_loss_clip": 1.01860666, + "balance_loss_mlp": 1.01535034, + "epoch": 0.966721779648279, + "flos": 21507125379840.0, + "grad_norm": 2.0126344063170105, + "language_loss": 0.741157, + "learning_rate": 1.158510609718899e-08, + "loss": 0.76210737, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 16079, + "time_per_iteration": 2.370870590209961 + }, + { + "auxiliary_loss_clip": 0.01049481, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.0117383, + "balance_loss_mlp": 1.01554215, + "epoch": 0.9667819029009469, + "flos": 23876994890880.0, + "grad_norm": 2.3234999900659634, + "language_loss": 0.72863531, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74946642, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 16080, + "time_per_iteration": 2.425915002822876 + }, + { + "auxiliary_loss_clip": 0.01050318, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.01326394, + "balance_loss_mlp": 1.01523316, + "epoch": 0.9668420261536149, + "flos": 21506147861760.0, + "grad_norm": 1.8334696050096901, + "language_loss": 0.75770211, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.77855462, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 16081, + "time_per_iteration": 2.3596010208129883 + }, + { + "auxiliary_loss_clip": 0.01049686, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.01097894, + "balance_loss_mlp": 1.01445818, + "epoch": 0.9669021494062828, + "flos": 26686071705600.0, + "grad_norm": 1.6910526854710999, + "language_loss": 0.68616599, + "learning_rate": 1.145986954691236e-08, + "loss": 0.70699102, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.3515625, + "step": 16082, + "time_per_iteration": 2.4299964904785156 + }, + { + "auxiliary_loss_clip": 0.0105175, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.01112008, + "balance_loss_mlp": 1.01714206, + "epoch": 0.9669622726589508, + "flos": 29823751036800.0, + "grad_norm": 1.4783035108407874, + "language_loss": 0.77265406, + "learning_rate": 1.141827483932789e-08, + "loss": 0.7935102, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.34570312, + "step": 16083, + "time_per_iteration": 3.845435380935669 + }, + { + "auxiliary_loss_clip": 0.01052465, + "auxiliary_loss_mlp": 0.01039132, + "balance_loss_clip": 1.01725769, + "balance_loss_mlp": 1.01612091, + "epoch": 0.9670223959116189, + "flos": 22921597002240.0, + "grad_norm": 1.9640091713468955, + "language_loss": 0.80318689, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.82410288, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.36328125, + "step": 16084, + "time_per_iteration": 3.868178129196167 + }, + { + "auxiliary_loss_clip": 0.01052596, + "auxiliary_loss_mlp": 0.01035746, + "balance_loss_clip": 1.01199925, + "balance_loss_mlp": 1.01576781, + "epoch": 0.9670825191642868, + "flos": 18623509079040.0, + "grad_norm": 2.4821422164740206, + "language_loss": 0.7075085, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.72839195, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36914062, + "step": 16085, + "time_per_iteration": 2.360260248184204 + }, + { + "auxiliary_loss_clip": 0.01055189, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.01348829, + "balance_loss_mlp": 1.01694584, + "epoch": 0.9671426424169548, + "flos": 24496807991040.0, + "grad_norm": 1.9547572969234808, + "language_loss": 0.70051897, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.72145712, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 16086, + "time_per_iteration": 2.3938238620758057 + }, + { + "auxiliary_loss_clip": 0.01050817, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.01721704, + "balance_loss_mlp": 1.01656139, + "epoch": 0.9672027656696227, + "flos": 20370283822080.0, + "grad_norm": 8.866218432697908, + "language_loss": 0.791125, + "learning_rate": 1.125265009690235e-08, + "loss": 0.81202376, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34179688, + "step": 16087, + "time_per_iteration": 2.376634120941162 + }, + { + "auxiliary_loss_clip": 0.01049719, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.01236701, + "balance_loss_mlp": 1.01572192, + "epoch": 0.9672628889222907, + "flos": 18879178032000.0, + "grad_norm": 1.9423671171785017, + "language_loss": 0.73270673, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.75355101, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.33984375, + "step": 16088, + "time_per_iteration": 2.3510990142822266 + }, + { + "auxiliary_loss_clip": 0.01049123, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.0093472, + "balance_loss_mlp": 1.01529431, + "epoch": 0.9673230121749586, + "flos": 28693961573760.0, + "grad_norm": 1.801341740391533, + "language_loss": 0.71765542, + "learning_rate": 1.117029020040916e-08, + "loss": 0.73845309, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33789062, + "step": 16089, + "time_per_iteration": 2.4353060722351074 + }, + { + "auxiliary_loss_clip": 0.01053138, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.01483238, + "balance_loss_mlp": 1.0167805, + "epoch": 0.9673831354276267, + "flos": 20483437138560.0, + "grad_norm": 8.766553250990611, + "language_loss": 0.76837862, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.78929251, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.36328125, + "step": 16090, + "time_per_iteration": 2.375403642654419 + }, + { + "auxiliary_loss_clip": 0.01054011, + "auxiliary_loss_mlp": 0.01037949, + "balance_loss_clip": 1.01402402, + "balance_loss_mlp": 1.01607621, + "epoch": 0.9674432586802946, + "flos": 26796327379200.0, + "grad_norm": 1.7329725719005327, + "language_loss": 0.69959539, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.72051501, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.37890625, + "step": 16091, + "time_per_iteration": 2.4481067657470703 + }, + { + "auxiliary_loss_clip": 0.01050303, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.01882422, + "balance_loss_mlp": 1.01560974, + "epoch": 0.9675033819329626, + "flos": 22309813514880.0, + "grad_norm": 1.751207181322269, + "language_loss": 0.7746352, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79556084, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 16092, + "time_per_iteration": 2.398439884185791 + }, + { + "auxiliary_loss_clip": 0.01051331, + "auxiliary_loss_mlp": 0.01033416, + "balance_loss_clip": 1.01239979, + "balance_loss_mlp": 1.01679778, + "epoch": 0.9675635051856305, + "flos": 12674937542400.0, + "grad_norm": 1.7338812311684877, + "language_loss": 0.77624905, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.79709649, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34570312, + "step": 16093, + "time_per_iteration": 2.3603732585906982 + }, + { + "auxiliary_loss_clip": 0.01051144, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.0131619, + "balance_loss_mlp": 1.015908, + "epoch": 0.9676236284382985, + "flos": 24607308044160.0, + "grad_norm": 1.6232732914625876, + "language_loss": 0.70292461, + "learning_rate": 1.096571027726112e-08, + "loss": 0.72380531, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3515625, + "step": 16094, + "time_per_iteration": 2.4123642444610596 + }, + { + "auxiliary_loss_clip": 0.01052977, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.00972712, + "balance_loss_mlp": 1.01621914, + "epoch": 0.9676837516909664, + "flos": 23366041009920.0, + "grad_norm": 1.4725094179332505, + "language_loss": 0.76970899, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.79056466, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3671875, + "step": 16095, + "time_per_iteration": 2.392596960067749 + }, + { + "auxiliary_loss_clip": 0.01054089, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.01428807, + "balance_loss_mlp": 1.01695085, + "epoch": 0.9677438749436345, + "flos": 20485811111040.0, + "grad_norm": 2.5971059770137193, + "language_loss": 0.72513145, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.74605876, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37109375, + "step": 16096, + "time_per_iteration": 3.8083598613739014 + }, + { + "auxiliary_loss_clip": 0.01052905, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.01306963, + "balance_loss_mlp": 1.01641178, + "epoch": 0.9678039981963025, + "flos": 47554147825920.0, + "grad_norm": 1.9611397659488752, + "language_loss": 0.7317754, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.75267303, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 16097, + "time_per_iteration": 2.6593122482299805 + }, + { + "auxiliary_loss_clip": 0.01050673, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.01354647, + "balance_loss_mlp": 1.01544416, + "epoch": 0.9678641214489704, + "flos": 25039463253120.0, + "grad_norm": 1.5444669084516687, + "language_loss": 0.7905584, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.8114295, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3515625, + "step": 16098, + "time_per_iteration": 2.405832529067993 + }, + { + "auxiliary_loss_clip": 0.01050099, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.01427019, + "balance_loss_mlp": 1.01516223, + "epoch": 0.9679242447016384, + "flos": 19240040511360.0, + "grad_norm": 1.9411390966098805, + "language_loss": 0.91722441, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.93808043, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34960938, + "step": 16099, + "time_per_iteration": 2.4160869121551514 + }, + { + "auxiliary_loss_clip": 0.01052689, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.011832, + "balance_loss_mlp": 1.01542008, + "epoch": 0.9679843679543063, + "flos": 33253304267520.0, + "grad_norm": 2.1948511427926136, + "language_loss": 0.67206645, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.69295764, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 16100, + "time_per_iteration": 2.4849400520324707 + }, + { + "auxiliary_loss_clip": 0.01051716, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.01559091, + "balance_loss_mlp": 1.0158267, + "epoch": 0.9680444912069743, + "flos": 22782537590400.0, + "grad_norm": 1.6905209388930482, + "language_loss": 0.74275255, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.76364452, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.359375, + "step": 16101, + "time_per_iteration": 2.405630111694336 + }, + { + "auxiliary_loss_clip": 0.01051557, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.0116322, + "balance_loss_mlp": 1.01616979, + "epoch": 0.9681046144596422, + "flos": 24022966752000.0, + "grad_norm": 1.595957555999901, + "language_loss": 0.74549711, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.7663725, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35351562, + "step": 16102, + "time_per_iteration": 2.381563901901245 + }, + { + "auxiliary_loss_clip": 0.0105364, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.01216507, + "balance_loss_mlp": 1.0170784, + "epoch": 0.9681647377123103, + "flos": 23439882268800.0, + "grad_norm": 1.8087453083679341, + "language_loss": 0.7827543, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.80366838, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36523438, + "step": 16103, + "time_per_iteration": 2.4016358852386475 + }, + { + "auxiliary_loss_clip": 0.01050817, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.01319385, + "balance_loss_mlp": 1.01588869, + "epoch": 0.9682248609649782, + "flos": 22673957662080.0, + "grad_norm": 1.5498623055625385, + "language_loss": 0.81435454, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.83520854, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34960938, + "step": 16104, + "time_per_iteration": 2.3659982681274414 + }, + { + "auxiliary_loss_clip": 0.01047904, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.01043534, + "balance_loss_mlp": 1.01467633, + "epoch": 0.9682849842176462, + "flos": 24427191006720.0, + "grad_norm": 1.5134426534371945, + "language_loss": 0.78478014, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.80556571, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.33203125, + "step": 16105, + "time_per_iteration": 2.418720245361328 + }, + { + "auxiliary_loss_clip": 0.01006695, + "auxiliary_loss_mlp": 0.01002117, + "balance_loss_clip": 1.00020969, + "balance_loss_mlp": 1.00052822, + "epoch": 0.9683451074703141, + "flos": 59992093992960.0, + "grad_norm": 0.8186738148492448, + "language_loss": 0.56804675, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58813483, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.06152344, + "step": 16106, + "time_per_iteration": 3.0642759799957275 + }, + { + "auxiliary_loss_clip": 0.01006872, + "auxiliary_loss_mlp": 0.01002002, + "balance_loss_clip": 0.99988025, + "balance_loss_mlp": 1.00069344, + "epoch": 0.9684052307229821, + "flos": 52693361291520.0, + "grad_norm": 0.8840205396416722, + "language_loss": 0.61826897, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63835776, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.06176758, + "step": 16107, + "time_per_iteration": 2.947842597961426 + }, + { + "auxiliary_loss_clip": 0.01050739, + "auxiliary_loss_mlp": 0.01043276, + "balance_loss_clip": 1.01923215, + "balance_loss_mlp": 1.01540971, + "epoch": 0.96846535397565, + "flos": 22782747058560.0, + "grad_norm": 2.272632826689127, + "language_loss": 0.75265068, + "learning_rate": 1.040291854638875e-08, + "loss": 0.7735908, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3515625, + "step": 16108, + "time_per_iteration": 2.4018356800079346 + }, + { + "auxiliary_loss_clip": 0.01053028, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.01711178, + "balance_loss_mlp": 1.01648057, + "epoch": 0.968525477228318, + "flos": 23323063259520.0, + "grad_norm": 2.142570279997239, + "language_loss": 0.58724678, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.60819709, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 16109, + "time_per_iteration": 2.422017812728882 + }, + { + "auxiliary_loss_clip": 0.01006836, + "auxiliary_loss_mlp": 0.01001568, + "balance_loss_clip": 0.9995774, + "balance_loss_mlp": 1.0006156, + "epoch": 0.9685856004809861, + "flos": 67879496995200.0, + "grad_norm": 0.67232148830515, + "language_loss": 0.54346168, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.5635457, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.06225586, + "step": 16110, + "time_per_iteration": 3.005385637283325 + }, + { + "auxiliary_loss_clip": 0.01054627, + "auxiliary_loss_mlp": 0.01042535, + "balance_loss_clip": 1.01559377, + "balance_loss_mlp": 1.01668239, + "epoch": 0.968645723733654, + "flos": 33946504778880.0, + "grad_norm": 1.866502823363403, + "language_loss": 0.6365602, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.65753186, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.37890625, + "step": 16111, + "time_per_iteration": 2.4785172939300537 + }, + { + "auxiliary_loss_clip": 0.01048246, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.01162708, + "balance_loss_mlp": 1.01478171, + "epoch": 0.968705846986322, + "flos": 18550505692800.0, + "grad_norm": 1.9128776343736997, + "language_loss": 0.75264025, + "learning_rate": 1.024483677309118e-08, + "loss": 0.77345359, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.33398438, + "step": 16112, + "time_per_iteration": 2.3386454582214355 + }, + { + "auxiliary_loss_clip": 0.01049344, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.01105523, + "balance_loss_mlp": 1.01489007, + "epoch": 0.9687659702389899, + "flos": 17419948179840.0, + "grad_norm": 1.7900726865168421, + "language_loss": 0.68051887, + "learning_rate": 1.020550495531558e-08, + "loss": 0.7013377, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34375, + "step": 16113, + "time_per_iteration": 2.376335382461548 + }, + { + "auxiliary_loss_clip": 0.01006947, + "auxiliary_loss_mlp": 0.01004109, + "balance_loss_clip": 1.00232065, + "balance_loss_mlp": 1.00062847, + "epoch": 0.9688260934916579, + "flos": 62043834395520.0, + "grad_norm": 0.6953946158264256, + "language_loss": 0.56659311, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58670366, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.06347656, + "step": 16114, + "time_per_iteration": 4.464321851730347 + }, + { + "auxiliary_loss_clip": 0.01050759, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.01611805, + "balance_loss_mlp": 1.01565456, + "epoch": 0.9688862167443258, + "flos": 15075880030080.0, + "grad_norm": 2.264123125287711, + "language_loss": 0.83493125, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.85583299, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3515625, + "step": 16115, + "time_per_iteration": 2.3177521228790283 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.01267314, + "balance_loss_mlp": 1.01591897, + "epoch": 0.9689463399969939, + "flos": 19937186006400.0, + "grad_norm": 1.5715789022818367, + "language_loss": 0.72941732, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.75024366, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.328125, + "step": 16116, + "time_per_iteration": 2.3699638843536377 + }, + { + "auxiliary_loss_clip": 0.01053209, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.01386034, + "balance_loss_mlp": 1.01635742, + "epoch": 0.9690064632496618, + "flos": 19572029429760.0, + "grad_norm": 3.412943354918438, + "language_loss": 0.76608336, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78698337, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3671875, + "step": 16117, + "time_per_iteration": 2.350480556488037 + }, + { + "auxiliary_loss_clip": 0.01052248, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.01474833, + "balance_loss_mlp": 1.0155921, + "epoch": 0.9690665865023298, + "flos": 21871199704320.0, + "grad_norm": 2.090434030978091, + "language_loss": 0.78962916, + "learning_rate": 1.000997769426548e-08, + "loss": 0.81052756, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 16118, + "time_per_iteration": 2.3771841526031494 + }, + { + "auxiliary_loss_clip": 0.01053667, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.01709867, + "balance_loss_mlp": 1.01672792, + "epoch": 0.9691267097549977, + "flos": 20993483324160.0, + "grad_norm": 1.6951414269843572, + "language_loss": 0.79164743, + "learning_rate": 9.971098618001272e-09, + "loss": 0.81258178, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 16119, + "time_per_iteration": 2.365143299102783 + }, + { + "auxiliary_loss_clip": 0.01049132, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.01351762, + "balance_loss_mlp": 1.01527607, + "epoch": 0.9691868330076657, + "flos": 24278007300480.0, + "grad_norm": 1.3946402113673064, + "language_loss": 0.76491714, + "learning_rate": 9.932295003832747e-09, + "loss": 0.78574085, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.33789062, + "step": 16120, + "time_per_iteration": 2.396498680114746 + }, + { + "auxiliary_loss_clip": 0.01050063, + "auxiliary_loss_mlp": 0.01036456, + "balance_loss_clip": 1.01336515, + "balance_loss_mlp": 1.01560175, + "epoch": 0.9692469562603336, + "flos": 17674744348800.0, + "grad_norm": 1.8071665960932686, + "language_loss": 0.70849288, + "learning_rate": 9.89356685323095e-09, + "loss": 0.72935808, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34375, + "step": 16121, + "time_per_iteration": 2.3384037017822266 + }, + { + "auxiliary_loss_clip": 0.01051415, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.01076639, + "balance_loss_mlp": 1.01618576, + "epoch": 0.9693070795130017, + "flos": 26833160730240.0, + "grad_norm": 1.697101535128724, + "language_loss": 0.70129514, + "learning_rate": 9.854914167664486e-09, + "loss": 0.72215271, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.3515625, + "step": 16122, + "time_per_iteration": 3.795444965362549 + }, + { + "auxiliary_loss_clip": 0.01051468, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.0117985, + "balance_loss_mlp": 1.01622355, + "epoch": 0.9693672027656697, + "flos": 18076315340160.0, + "grad_norm": 1.9201952275766911, + "language_loss": 0.77532089, + "learning_rate": 9.81633694859907e-09, + "loss": 0.79616702, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3515625, + "step": 16123, + "time_per_iteration": 2.3360681533813477 + }, + { + "auxiliary_loss_clip": 0.0105213, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.01455438, + "balance_loss_mlp": 1.01573086, + "epoch": 0.9694273260183376, + "flos": 21761921548800.0, + "grad_norm": 1.4006675433721731, + "language_loss": 0.75026745, + "learning_rate": 9.777835197497753e-09, + "loss": 0.77116477, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 16124, + "time_per_iteration": 2.401848554611206 + }, + { + "auxiliary_loss_clip": 0.01052808, + "auxiliary_loss_mlp": 0.0104215, + "balance_loss_clip": 1.01896358, + "balance_loss_mlp": 1.01614594, + "epoch": 0.9694874492710056, + "flos": 24424956679680.0, + "grad_norm": 2.0781645291887823, + "language_loss": 0.75684053, + "learning_rate": 9.739408915820258e-09, + "loss": 0.77779019, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.3671875, + "step": 16125, + "time_per_iteration": 3.9059674739837646 + }, + { + "auxiliary_loss_clip": 0.01006926, + "auxiliary_loss_mlp": 0.01002081, + "balance_loss_clip": 0.99997139, + "balance_loss_mlp": 1.00054717, + "epoch": 0.9695475725236735, + "flos": 67647674367360.0, + "grad_norm": 0.9176618998559712, + "language_loss": 0.61681956, + "learning_rate": 9.70105810502364e-09, + "loss": 0.6369096, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06396484, + "step": 16126, + "time_per_iteration": 2.9746172428131104 + }, + { + "auxiliary_loss_clip": 0.01050473, + "auxiliary_loss_mlp": 0.01038559, + "balance_loss_clip": 1.01737559, + "balance_loss_mlp": 1.0160681, + "epoch": 0.9696076957763415, + "flos": 19128493117440.0, + "grad_norm": 2.345511807899787, + "language_loss": 0.757061, + "learning_rate": 9.662782766562738e-09, + "loss": 0.7779513, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 16127, + "time_per_iteration": 2.369414806365967 + }, + { + "auxiliary_loss_clip": 0.01052321, + "auxiliary_loss_mlp": 0.01040299, + "balance_loss_clip": 1.01552701, + "balance_loss_mlp": 1.01533747, + "epoch": 0.9696678190290094, + "flos": 15485934481920.0, + "grad_norm": 1.672816788576485, + "language_loss": 0.70155412, + "learning_rate": 9.62458290188839e-09, + "loss": 0.7224803, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 16128, + "time_per_iteration": 2.3544678688049316 + }, + { + "auxiliary_loss_clip": 0.0105246, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_clip": 1.0175159, + "balance_loss_mlp": 1.01684117, + "epoch": 0.9697279422816775, + "flos": 36207270691200.0, + "grad_norm": 1.5857381621694402, + "language_loss": 0.65888321, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67982304, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 16129, + "time_per_iteration": 2.49839186668396 + }, + { + "auxiliary_loss_clip": 0.01053275, + "auxiliary_loss_mlp": 0.01037583, + "balance_loss_clip": 1.01387262, + "balance_loss_mlp": 1.01610339, + "epoch": 0.9697880655343454, + "flos": 25482825008640.0, + "grad_norm": 1.8286364545465266, + "language_loss": 0.64336139, + "learning_rate": 9.548409599691166e-09, + "loss": 0.66426992, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 16130, + "time_per_iteration": 2.401433229446411 + }, + { + "auxiliary_loss_clip": 0.01053804, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.00954056, + "balance_loss_mlp": 1.0168736, + "epoch": 0.9698481887870134, + "flos": 15332840703360.0, + "grad_norm": 3.4301606142630106, + "language_loss": 0.70327669, + "learning_rate": 9.510436165056867e-09, + "loss": 0.72415161, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36914062, + "step": 16131, + "time_per_iteration": 2.346112012863159 + }, + { + "auxiliary_loss_clip": 0.0105341, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.01536393, + "balance_loss_mlp": 1.01602435, + "epoch": 0.9699083120396813, + "flos": 21981141175680.0, + "grad_norm": 1.8956598443797406, + "language_loss": 0.77784586, + "learning_rate": 9.472538209986058e-09, + "loss": 0.79877651, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 16132, + "time_per_iteration": 2.398916721343994 + }, + { + "auxiliary_loss_clip": 0.01052926, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.01702571, + "balance_loss_mlp": 1.01662874, + "epoch": 0.9699684352923493, + "flos": 15663293521920.0, + "grad_norm": 2.5959859054557097, + "language_loss": 0.79790437, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81883991, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 16133, + "time_per_iteration": 2.3285725116729736 + }, + { + "auxiliary_loss_clip": 0.01049987, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.01113248, + "balance_loss_mlp": 1.01573324, + "epoch": 0.9700285585450172, + "flos": 21907299916800.0, + "grad_norm": 1.5877818432649977, + "language_loss": 0.65287036, + "learning_rate": 9.396968744281863e-09, + "loss": 0.67369723, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.34375, + "step": 16134, + "time_per_iteration": 2.39306640625 + }, + { + "auxiliary_loss_clip": 0.01051587, + "auxiliary_loss_mlp": 0.01037657, + "balance_loss_clip": 1.01450682, + "balance_loss_mlp": 1.01614177, + "epoch": 0.9700886817976853, + "flos": 23913758419200.0, + "grad_norm": 2.229991025923408, + "language_loss": 0.81354457, + "learning_rate": 9.359297236513519e-09, + "loss": 0.83443701, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.35351562, + "step": 16135, + "time_per_iteration": 3.8164432048797607 + }, + { + "auxiliary_loss_clip": 0.01053743, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.01738751, + "balance_loss_mlp": 1.01636052, + "epoch": 0.9701488050503532, + "flos": 25446654973440.0, + "grad_norm": 2.7544422009255816, + "language_loss": 0.74125993, + "learning_rate": 9.321701214040079e-09, + "loss": 0.76223731, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.37304688, + "step": 16136, + "time_per_iteration": 2.3956527709960938 + }, + { + "auxiliary_loss_clip": 0.01051004, + "auxiliary_loss_mlp": 0.01036438, + "balance_loss_clip": 1.0152545, + "balance_loss_mlp": 1.01591992, + "epoch": 0.9702089283030212, + "flos": 20589922385280.0, + "grad_norm": 1.5432292279810815, + "language_loss": 0.77085161, + "learning_rate": 9.28418067828729e-09, + "loss": 0.79172599, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.3515625, + "step": 16137, + "time_per_iteration": 2.3935258388519287 + }, + { + "auxiliary_loss_clip": 0.01007122, + "auxiliary_loss_mlp": 0.01001517, + "balance_loss_clip": 0.99956167, + "balance_loss_mlp": 1.00080061, + "epoch": 0.9702690515556892, + "flos": 70648109677440.0, + "grad_norm": 0.7413621738084187, + "language_loss": 0.54991925, + "learning_rate": 9.246735630678015e-09, + "loss": 0.57000566, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.06347656, + "step": 16138, + "time_per_iteration": 3.1011555194854736 + }, + { + "auxiliary_loss_clip": 0.01051403, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.01330459, + "balance_loss_mlp": 1.01543951, + "epoch": 0.9703291748083571, + "flos": 35879994806400.0, + "grad_norm": 1.7578206922818729, + "language_loss": 0.72241372, + "learning_rate": 9.209366072632007e-09, + "loss": 0.74328387, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.359375, + "step": 16139, + "time_per_iteration": 2.506899356842041 + }, + { + "auxiliary_loss_clip": 0.01053948, + "auxiliary_loss_mlp": 0.01041183, + "balance_loss_clip": 1.0170548, + "balance_loss_mlp": 1.01764393, + "epoch": 0.9703892980610251, + "flos": 24315329410560.0, + "grad_norm": 1.597683626736072, + "language_loss": 0.73195803, + "learning_rate": 9.172072005566134e-09, + "loss": 0.7529093, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36328125, + "step": 16140, + "time_per_iteration": 2.4049429893493652 + }, + { + "auxiliary_loss_clip": 0.01055162, + "auxiliary_loss_mlp": 0.0104548, + "balance_loss_clip": 1.02004075, + "balance_loss_mlp": 1.01661134, + "epoch": 0.970449421313693, + "flos": 18002788283520.0, + "grad_norm": 2.214500367311487, + "language_loss": 0.69595134, + "learning_rate": 9.13485343089504e-09, + "loss": 0.71695781, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38476562, + "step": 16141, + "time_per_iteration": 2.3558924198150635 + }, + { + "auxiliary_loss_clip": 0.0105023, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.01210129, + "balance_loss_mlp": 1.0160253, + "epoch": 0.9705095445663611, + "flos": 25336818236160.0, + "grad_norm": 1.8919819184238322, + "language_loss": 0.69370198, + "learning_rate": 9.097710350029597e-09, + "loss": 0.71454883, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34179688, + "step": 16142, + "time_per_iteration": 2.3896596431732178 + }, + { + "auxiliary_loss_clip": 0.01050916, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.01403487, + "balance_loss_mlp": 1.01574326, + "epoch": 0.970569667819029, + "flos": 26832392680320.0, + "grad_norm": 1.9074397670213503, + "language_loss": 0.5668214, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58769691, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.3515625, + "step": 16143, + "time_per_iteration": 2.409198045730591 + }, + { + "auxiliary_loss_clip": 0.01052264, + "auxiliary_loss_mlp": 0.01038981, + "balance_loss_clip": 1.01633096, + "balance_loss_mlp": 1.01652098, + "epoch": 0.970629791071697, + "flos": 25847457914880.0, + "grad_norm": 2.255471583039371, + "language_loss": 0.69186002, + "learning_rate": 9.023650675347382e-09, + "loss": 0.71277249, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 16144, + "time_per_iteration": 2.4054372310638428 + }, + { + "auxiliary_loss_clip": 0.01049663, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.01863611, + "balance_loss_mlp": 1.01549971, + "epoch": 0.9706899143243649, + "flos": 36538666116480.0, + "grad_norm": 1.6402509363437472, + "language_loss": 0.72733116, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74822915, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34179688, + "step": 16145, + "time_per_iteration": 2.517343282699585 + }, + { + "auxiliary_loss_clip": 0.01052384, + "auxiliary_loss_mlp": 0.01040877, + "balance_loss_clip": 1.01775038, + "balance_loss_mlp": 1.01561809, + "epoch": 0.9707500375770329, + "flos": 12267641088000.0, + "grad_norm": 2.873061359689676, + "language_loss": 0.80198228, + "learning_rate": 8.949892992753395e-09, + "loss": 0.8229149, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 16146, + "time_per_iteration": 2.3292410373687744 + }, + { + "auxiliary_loss_clip": 0.0100665, + "auxiliary_loss_mlp": 0.01002509, + "balance_loss_clip": 1.00057745, + "balance_loss_mlp": 1.00038028, + "epoch": 0.9708101608297008, + "flos": 60849455184000.0, + "grad_norm": 0.7652088937700031, + "language_loss": 0.54750621, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56759787, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.0625, + "step": 16147, + "time_per_iteration": 3.0869803428649902 + }, + { + "auxiliary_loss_clip": 0.01052763, + "auxiliary_loss_mlp": 0.01038834, + "balance_loss_clip": 1.01518285, + "balance_loss_mlp": 1.01580977, + "epoch": 0.9708702840823689, + "flos": 27123568352640.0, + "grad_norm": 2.4046142711012086, + "language_loss": 0.61785817, + "learning_rate": 8.876437313434682e-09, + "loss": 0.63877416, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36914062, + "step": 16148, + "time_per_iteration": 2.4354472160339355 + }, + { + "auxiliary_loss_clip": 0.0104984, + "auxiliary_loss_mlp": 0.01036016, + "balance_loss_clip": 1.01370049, + "balance_loss_mlp": 1.01576948, + "epoch": 0.9709304073350368, + "flos": 20776637669760.0, + "grad_norm": 1.661857388144361, + "language_loss": 0.75210702, + "learning_rate": 8.839822728487155e-09, + "loss": 0.77296561, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 16149, + "time_per_iteration": 2.407698154449463 + }, + { + "auxiliary_loss_clip": 0.01051635, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.01475215, + "balance_loss_mlp": 1.01536775, + "epoch": 0.9709905305877048, + "flos": 41933026730880.0, + "grad_norm": 2.411848504536087, + "language_loss": 0.76338774, + "learning_rate": 8.803283648533222e-09, + "loss": 0.78429091, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 16150, + "time_per_iteration": 2.5546629428863525 + }, + { + "auxiliary_loss_clip": 0.01057406, + "auxiliary_loss_mlp": 0.0104263, + "balance_loss_clip": 1.01530743, + "balance_loss_mlp": 1.01875305, + "epoch": 0.9710506538403728, + "flos": 17164628340480.0, + "grad_norm": 2.2277580575847207, + "language_loss": 0.75050211, + "learning_rate": 8.766820074958214e-09, + "loss": 0.77150249, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.38671875, + "step": 16151, + "time_per_iteration": 2.380120038986206 + }, + { + "auxiliary_loss_clip": 0.0105115, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.01233971, + "balance_loss_mlp": 1.01672053, + "epoch": 0.9711107770930407, + "flos": 21172553020800.0, + "grad_norm": 2.3161722621981995, + "language_loss": 0.75693154, + "learning_rate": 8.730432009145027e-09, + "loss": 0.77779782, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34375, + "step": 16152, + "time_per_iteration": 2.365614652633667 + }, + { + "auxiliary_loss_clip": 0.01051112, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.01197624, + "balance_loss_mlp": 1.01573753, + "epoch": 0.9711709003457087, + "flos": 22235972256000.0, + "grad_norm": 2.3392128573934445, + "language_loss": 0.68480581, + "learning_rate": 8.694119452473448e-09, + "loss": 0.70567298, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 16153, + "time_per_iteration": 3.717405080795288 + }, + { + "auxiliary_loss_clip": 0.01050878, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.0101862, + "balance_loss_mlp": 1.01589191, + "epoch": 0.9712310235983767, + "flos": 26212649402880.0, + "grad_norm": 1.6547607325200764, + "language_loss": 0.71603197, + "learning_rate": 8.65788240632037e-09, + "loss": 0.73687065, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34960938, + "step": 16154, + "time_per_iteration": 2.407580614089966 + }, + { + "auxiliary_loss_clip": 0.01054342, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.01586747, + "balance_loss_mlp": 1.01810873, + "epoch": 0.9712911468510447, + "flos": 20667115134720.0, + "grad_norm": 2.447867004065475, + "language_loss": 0.81967646, + "learning_rate": 8.621720872059812e-09, + "loss": 0.84064037, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.36328125, + "step": 16155, + "time_per_iteration": 2.3718838691711426 + }, + { + "auxiliary_loss_clip": 0.01052553, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.01480484, + "balance_loss_mlp": 1.01614833, + "epoch": 0.9713512701037126, + "flos": 13552060429440.0, + "grad_norm": 2.118557384170636, + "language_loss": 0.69541401, + "learning_rate": 8.58563485106334e-09, + "loss": 0.71634066, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36328125, + "step": 16156, + "time_per_iteration": 2.3299098014831543 + }, + { + "auxiliary_loss_clip": 0.0105233, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.01011992, + "balance_loss_mlp": 1.01493609, + "epoch": 0.9714113933563806, + "flos": 25847562648960.0, + "grad_norm": 2.9007005351514095, + "language_loss": 0.93096399, + "learning_rate": 8.54962434469919e-09, + "loss": 0.95182312, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.375, + "step": 16157, + "time_per_iteration": 2.3956804275512695 + }, + { + "auxiliary_loss_clip": 0.01053478, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.01442099, + "balance_loss_mlp": 1.01664543, + "epoch": 0.9714715166090485, + "flos": 12742459845120.0, + "grad_norm": 1.862485926334487, + "language_loss": 0.7415154, + "learning_rate": 8.513689354332721e-09, + "loss": 0.76241463, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3671875, + "step": 16158, + "time_per_iteration": 2.34171462059021 + }, + { + "auxiliary_loss_clip": 0.01051277, + "auxiliary_loss_mlp": 0.01040061, + "balance_loss_clip": 1.0177567, + "balance_loss_mlp": 1.01665223, + "epoch": 0.9715316398617165, + "flos": 18404568743040.0, + "grad_norm": 2.2325049220735442, + "language_loss": 0.61718512, + "learning_rate": 8.477829881326836e-09, + "loss": 0.63809848, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 16159, + "time_per_iteration": 2.3459653854370117 + }, + { + "auxiliary_loss_clip": 0.01048611, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.01356387, + "balance_loss_mlp": 1.01565135, + "epoch": 0.9715917631143844, + "flos": 28912517884800.0, + "grad_norm": 1.648462906946207, + "language_loss": 0.79768419, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81850433, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.33007812, + "step": 16160, + "time_per_iteration": 2.424766778945923 + }, + { + "auxiliary_loss_clip": 0.01006963, + "auxiliary_loss_mlp": 0.0100411, + "balance_loss_clip": 1.00228584, + "balance_loss_mlp": 1.00081182, + "epoch": 0.9716518863670525, + "flos": 65936580900480.0, + "grad_norm": 0.7798448724162302, + "language_loss": 0.54399252, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56410325, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.06152344, + "step": 16161, + "time_per_iteration": 4.519224405288696 + }, + { + "auxiliary_loss_clip": 0.01049667, + "auxiliary_loss_mlp": 0.01037202, + "balance_loss_clip": 1.01564908, + "balance_loss_mlp": 1.01598775, + "epoch": 0.9717120096197204, + "flos": 17711787168000.0, + "grad_norm": 1.6877084677405325, + "language_loss": 0.72703922, + "learning_rate": 8.3707045800554e-09, + "loss": 0.74790794, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3359375, + "step": 16162, + "time_per_iteration": 2.3794913291931152 + }, + { + "auxiliary_loss_clip": 0.01049512, + "auxiliary_loss_mlp": 0.01038097, + "balance_loss_clip": 1.01356411, + "balance_loss_mlp": 1.01546884, + "epoch": 0.9717721328723884, + "flos": 24462243878400.0, + "grad_norm": 1.6324985568051364, + "language_loss": 0.7980094, + "learning_rate": 8.335147190060787e-09, + "loss": 0.81888545, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.33984375, + "step": 16163, + "time_per_iteration": 2.4006128311157227 + }, + { + "auxiliary_loss_clip": 0.01051838, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.01164818, + "balance_loss_mlp": 1.01653624, + "epoch": 0.9718322561250564, + "flos": 20775450683520.0, + "grad_norm": 1.6341594270217512, + "language_loss": 0.73977244, + "learning_rate": 8.299665324196903e-09, + "loss": 0.76064175, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35351562, + "step": 16164, + "time_per_iteration": 3.7066211700439453 + }, + { + "auxiliary_loss_clip": 0.01051238, + "auxiliary_loss_mlp": 0.01042725, + "balance_loss_clip": 1.0185492, + "balance_loss_mlp": 1.01574624, + "epoch": 0.9718923793777243, + "flos": 19024905513600.0, + "grad_norm": 2.148674946162132, + "language_loss": 0.84720397, + "learning_rate": 8.264258983809114e-09, + "loss": 0.86814356, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.35546875, + "step": 16165, + "time_per_iteration": 2.3458411693573 + }, + { + "auxiliary_loss_clip": 0.01050212, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.01334023, + "balance_loss_mlp": 1.0156908, + "epoch": 0.9719525026303923, + "flos": 21870641122560.0, + "grad_norm": 1.4822582080081317, + "language_loss": 0.80025721, + "learning_rate": 8.228928170240345e-09, + "loss": 0.82109243, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.34570312, + "step": 16166, + "time_per_iteration": 2.374640941619873 + }, + { + "auxiliary_loss_clip": 0.01050489, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.01623273, + "balance_loss_mlp": 1.01558137, + "epoch": 0.9720126258830603, + "flos": 14428415266560.0, + "grad_norm": 1.8876310531438771, + "language_loss": 0.72100997, + "learning_rate": 8.193672884830195e-09, + "loss": 0.7418853, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.34765625, + "step": 16167, + "time_per_iteration": 2.3485209941864014 + }, + { + "auxiliary_loss_clip": 0.01052018, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.01563537, + "balance_loss_mlp": 1.01711226, + "epoch": 0.9720727491357283, + "flos": 26249936601600.0, + "grad_norm": 1.7512259462599866, + "language_loss": 0.76867938, + "learning_rate": 8.158493128915812e-09, + "loss": 0.78957689, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34765625, + "step": 16168, + "time_per_iteration": 2.4379918575286865 + }, + { + "auxiliary_loss_clip": 0.0105273, + "auxiliary_loss_mlp": 0.01041721, + "balance_loss_clip": 1.01749754, + "balance_loss_mlp": 1.01621985, + "epoch": 0.9721328723883962, + "flos": 22673015055360.0, + "grad_norm": 2.2990461689838972, + "language_loss": 0.74745542, + "learning_rate": 8.123388903830797e-09, + "loss": 0.76839995, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36523438, + "step": 16169, + "time_per_iteration": 2.3691699504852295 + }, + { + "auxiliary_loss_clip": 0.01053755, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.01221657, + "balance_loss_mlp": 1.01664925, + "epoch": 0.9721929956410642, + "flos": 28072926576000.0, + "grad_norm": 1.8534871323007815, + "language_loss": 0.58696675, + "learning_rate": 8.088360210906309e-09, + "loss": 0.60787117, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.37109375, + "step": 16170, + "time_per_iteration": 2.441551446914673 + }, + { + "auxiliary_loss_clip": 0.01052144, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.01617682, + "balance_loss_mlp": 1.01578546, + "epoch": 0.9722531188937321, + "flos": 20995193980800.0, + "grad_norm": 1.800828649495178, + "language_loss": 0.72098637, + "learning_rate": 8.053407051471062e-09, + "loss": 0.74191481, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36328125, + "step": 16171, + "time_per_iteration": 2.3832414150238037 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.01857519, + "balance_loss_mlp": 1.0159725, + "epoch": 0.9723132421464001, + "flos": 16069682280960.0, + "grad_norm": 1.8064136121888357, + "language_loss": 0.69329232, + "learning_rate": 8.018529426850218e-09, + "loss": 0.71423507, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35546875, + "step": 16172, + "time_per_iteration": 2.358764410018921 + }, + { + "auxiliary_loss_clip": 0.01049608, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.01082993, + "balance_loss_mlp": 1.01530886, + "epoch": 0.972373365399068, + "flos": 27744533527680.0, + "grad_norm": 1.714706018047328, + "language_loss": 0.86992097, + "learning_rate": 7.983727338366274e-09, + "loss": 0.89073765, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 16173, + "time_per_iteration": 2.410062074661255 + }, + { + "auxiliary_loss_clip": 0.01054753, + "auxiliary_loss_mlp": 0.0104089, + "balance_loss_clip": 1.01547432, + "balance_loss_mlp": 1.01615453, + "epoch": 0.9724334886517361, + "flos": 23001827040000.0, + "grad_norm": 2.6187017012202167, + "language_loss": 0.65290916, + "learning_rate": 7.949000787339289e-09, + "loss": 0.67386556, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.38671875, + "step": 16174, + "time_per_iteration": 2.453516960144043 + }, + { + "auxiliary_loss_clip": 0.01049616, + "auxiliary_loss_mlp": 0.01035393, + "balance_loss_clip": 1.01269555, + "balance_loss_mlp": 1.01566195, + "epoch": 0.972493611904404, + "flos": 25445886923520.0, + "grad_norm": 1.4986065621822637, + "language_loss": 0.79103863, + "learning_rate": 7.914349775085538e-09, + "loss": 0.81188869, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.33984375, + "step": 16175, + "time_per_iteration": 2.428025484085083 + }, + { + "auxiliary_loss_clip": 0.01051329, + "auxiliary_loss_mlp": 0.01038362, + "balance_loss_clip": 1.01456785, + "balance_loss_mlp": 1.01612461, + "epoch": 0.972553735157072, + "flos": 16982102419200.0, + "grad_norm": 3.6715683919531346, + "language_loss": 0.58757693, + "learning_rate": 7.879774302919307e-09, + "loss": 0.60847384, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.3515625, + "step": 16176, + "time_per_iteration": 3.7630529403686523 + }, + { + "auxiliary_loss_clip": 0.01052519, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.01329219, + "balance_loss_mlp": 1.01663339, + "epoch": 0.97261385840974, + "flos": 26103441070080.0, + "grad_norm": 2.6996245155512097, + "language_loss": 0.73221099, + "learning_rate": 7.845274372151545e-09, + "loss": 0.75309908, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 16177, + "time_per_iteration": 2.4367587566375732 + }, + { + "auxiliary_loss_clip": 0.01051208, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.0172236, + "balance_loss_mlp": 1.01535988, + "epoch": 0.9726739816624079, + "flos": 25446689884800.0, + "grad_norm": 1.723086761001516, + "language_loss": 0.69110298, + "learning_rate": 7.810849984090984e-09, + "loss": 0.71201485, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 16178, + "time_per_iteration": 2.395634174346924 + }, + { + "auxiliary_loss_clip": 0.01050845, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.01504242, + "balance_loss_mlp": 1.01483214, + "epoch": 0.972734104915076, + "flos": 29013871161600.0, + "grad_norm": 1.962119092450322, + "language_loss": 0.68698204, + "learning_rate": 7.776501140042358e-09, + "loss": 0.70788491, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.359375, + "step": 16179, + "time_per_iteration": 2.4130783081054688 + }, + { + "auxiliary_loss_clip": 0.0105059, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.01342416, + "balance_loss_mlp": 1.01625824, + "epoch": 0.9727942281677439, + "flos": 23436600600960.0, + "grad_norm": 2.346562990958224, + "language_loss": 0.78120065, + "learning_rate": 7.742227841308624e-09, + "loss": 0.80206358, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 16180, + "time_per_iteration": 2.3911776542663574 + }, + { + "auxiliary_loss_clip": 0.01052961, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.00989962, + "balance_loss_mlp": 1.01569223, + "epoch": 0.9728543514204119, + "flos": 31723200622080.0, + "grad_norm": 1.8934816540408783, + "language_loss": 0.77390093, + "learning_rate": 7.708030089189188e-09, + "loss": 0.79477715, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 16181, + "time_per_iteration": 2.4553840160369873 + }, + { + "auxiliary_loss_clip": 0.01050393, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.01477814, + "balance_loss_mlp": 1.01540804, + "epoch": 0.9729144746730798, + "flos": 16288448060160.0, + "grad_norm": 1.4529704504412109, + "language_loss": 0.64404643, + "learning_rate": 7.67390788498079e-09, + "loss": 0.66491199, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34960938, + "step": 16182, + "time_per_iteration": 2.3624930381774902 + }, + { + "auxiliary_loss_clip": 0.01054228, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.0148977, + "balance_loss_mlp": 1.01749313, + "epoch": 0.9729745979257478, + "flos": 25040370948480.0, + "grad_norm": 1.851159641089124, + "language_loss": 0.64062685, + "learning_rate": 7.639861229977507e-09, + "loss": 0.6615603, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 16183, + "time_per_iteration": 2.3939785957336426 + }, + { + "auxiliary_loss_clip": 0.01049728, + "auxiliary_loss_mlp": 0.01038514, + "balance_loss_clip": 1.01623392, + "balance_loss_mlp": 1.01549077, + "epoch": 0.9730347211784157, + "flos": 22637613070080.0, + "grad_norm": 1.6074545949767371, + "language_loss": 0.7949146, + "learning_rate": 7.605890125470527e-09, + "loss": 0.81579709, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 16184, + "time_per_iteration": 2.3971481323242188 + }, + { + "auxiliary_loss_clip": 0.01050436, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.01428974, + "balance_loss_mlp": 1.01566172, + "epoch": 0.9730948444310837, + "flos": 10997849606400.0, + "grad_norm": 2.9480884572587502, + "language_loss": 0.80870569, + "learning_rate": 7.571994572747709e-09, + "loss": 0.82956433, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34765625, + "step": 16185, + "time_per_iteration": 2.3304996490478516 + }, + { + "auxiliary_loss_clip": 0.01051949, + "auxiliary_loss_mlp": 0.0103812, + "balance_loss_clip": 1.01501679, + "balance_loss_mlp": 1.01589656, + "epoch": 0.9731549676837516, + "flos": 16798564068480.0, + "grad_norm": 1.7432419504165366, + "language_loss": 0.79116321, + "learning_rate": 7.538174573094469e-09, + "loss": 0.81206387, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.36132812, + "step": 16186, + "time_per_iteration": 2.356036901473999 + }, + { + "auxiliary_loss_clip": 0.01049042, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.01493073, + "balance_loss_mlp": 1.01494479, + "epoch": 0.9732150909364197, + "flos": 21140711994240.0, + "grad_norm": 1.4758802842171408, + "language_loss": 0.65937459, + "learning_rate": 7.504430127793337e-09, + "loss": 0.68023646, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.33984375, + "step": 16187, + "time_per_iteration": 2.399038791656494 + }, + { + "auxiliary_loss_clip": 0.01049723, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.01319325, + "balance_loss_mlp": 1.01494956, + "epoch": 0.9732752141890876, + "flos": 33725993431680.0, + "grad_norm": 2.1063996919487904, + "language_loss": 0.81314504, + "learning_rate": 7.47076123812418e-09, + "loss": 0.83400619, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34765625, + "step": 16188, + "time_per_iteration": 2.4891457557678223 + }, + { + "auxiliary_loss_clip": 0.01049726, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.01704848, + "balance_loss_mlp": 1.01590788, + "epoch": 0.9733353374417556, + "flos": 23403258385920.0, + "grad_norm": 2.1914613033401396, + "language_loss": 0.79027653, + "learning_rate": 7.437167905363084e-09, + "loss": 0.81114417, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.33789062, + "step": 16189, + "time_per_iteration": 2.3915352821350098 + }, + { + "auxiliary_loss_clip": 0.0105038, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.01064456, + "balance_loss_mlp": 1.01558042, + "epoch": 0.9733954606944236, + "flos": 39164239491840.0, + "grad_norm": 1.9151820174112748, + "language_loss": 0.52202719, + "learning_rate": 7.403650130784367e-09, + "loss": 0.54286408, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 16190, + "time_per_iteration": 2.5662381649017334 + }, + { + "auxiliary_loss_clip": 0.01050982, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.01185834, + "balance_loss_mlp": 1.01588714, + "epoch": 0.9734555839470915, + "flos": 21980722239360.0, + "grad_norm": 2.2680067010341, + "language_loss": 0.81707144, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83793634, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.3515625, + "step": 16191, + "time_per_iteration": 2.369935989379883 + }, + { + "auxiliary_loss_clip": 0.01050095, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.01482165, + "balance_loss_mlp": 1.01559186, + "epoch": 0.9735157071997596, + "flos": 16574771053440.0, + "grad_norm": 1.677298901386076, + "language_loss": 0.83594191, + "learning_rate": 7.336841261255111e-09, + "loss": 0.85681045, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 16192, + "time_per_iteration": 2.3512990474700928 + }, + { + "auxiliary_loss_clip": 0.01052017, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.01157403, + "balance_loss_mlp": 1.01663947, + "epoch": 0.9735758304524275, + "flos": 20222391836160.0, + "grad_norm": 1.8771505551021115, + "language_loss": 0.75771618, + "learning_rate": 7.303550168837658e-09, + "loss": 0.778566, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35546875, + "step": 16193, + "time_per_iteration": 3.60482120513916 + }, + { + "auxiliary_loss_clip": 0.0104972, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.01677442, + "balance_loss_mlp": 1.01553476, + "epoch": 0.9736359537050955, + "flos": 23652398914560.0, + "grad_norm": 1.6325566201344517, + "language_loss": 0.85763252, + "learning_rate": 7.270334639669417e-09, + "loss": 0.8785038, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34179688, + "step": 16194, + "time_per_iteration": 2.395846366882324 + }, + { + "auxiliary_loss_clip": 0.01050753, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.01189423, + "balance_loss_mlp": 1.01620603, + "epoch": 0.9736960769577634, + "flos": 15559286981760.0, + "grad_norm": 1.5618432861107077, + "language_loss": 0.76734579, + "learning_rate": 7.237194675009828e-09, + "loss": 0.7882024, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34570312, + "step": 16195, + "time_per_iteration": 2.3392210006713867 + }, + { + "auxiliary_loss_clip": 0.01007502, + "auxiliary_loss_mlp": 0.01002392, + "balance_loss_clip": 1.00031805, + "balance_loss_mlp": 1.00105977, + "epoch": 0.9737562002104314, + "flos": 65348783383680.0, + "grad_norm": 0.7105143616565754, + "language_loss": 0.52495837, + "learning_rate": 7.204130276115439e-09, + "loss": 0.5450573, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.06445312, + "step": 16196, + "time_per_iteration": 2.983163833618164 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.01398373, + "balance_loss_mlp": 1.01480007, + "epoch": 0.9738163234630993, + "flos": 27195559309440.0, + "grad_norm": 1.8840060786221002, + "language_loss": 0.7685082, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78936541, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 16197, + "time_per_iteration": 2.4125523567199707 + }, + { + "auxiliary_loss_clip": 0.01054312, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.01158714, + "balance_loss_mlp": 1.01704621, + "epoch": 0.9738764467157673, + "flos": 21068302101120.0, + "grad_norm": 2.075231776285573, + "language_loss": 0.68784046, + "learning_rate": 7.13822818063492e-09, + "loss": 0.70873809, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37304688, + "step": 16198, + "time_per_iteration": 2.3904638290405273 + }, + { + "auxiliary_loss_clip": 0.01052549, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.01033163, + "balance_loss_mlp": 1.01581955, + "epoch": 0.9739365699684353, + "flos": 21360141089280.0, + "grad_norm": 1.8872430358537475, + "language_loss": 0.79702455, + "learning_rate": 7.10539048654768e-09, + "loss": 0.81790859, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 16199, + "time_per_iteration": 2.459584951400757 + }, + { + "auxiliary_loss_clip": 0.01051322, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.01042724, + "balance_loss_mlp": 1.01571774, + "epoch": 0.9739966932211033, + "flos": 21901155517440.0, + "grad_norm": 1.8227214887819847, + "language_loss": 0.80409551, + "learning_rate": 7.072628363223865e-09, + "loss": 0.82494664, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35546875, + "step": 16200, + "time_per_iteration": 2.376957893371582 + }, + { + "auxiliary_loss_clip": 0.01054686, + "auxiliary_loss_mlp": 0.01041181, + "balance_loss_clip": 1.01557481, + "balance_loss_mlp": 1.01652122, + "epoch": 0.9740568164737712, + "flos": 24826178557440.0, + "grad_norm": 2.333146056883385, + "language_loss": 0.70364141, + "learning_rate": 7.039941811905592e-09, + "loss": 0.72460002, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 16201, + "time_per_iteration": 3.872481107711792 + }, + { + "auxiliary_loss_clip": 0.01052471, + "auxiliary_loss_mlp": 0.01037851, + "balance_loss_clip": 1.0143311, + "balance_loss_mlp": 1.01665306, + "epoch": 0.9741169397264392, + "flos": 23622443101440.0, + "grad_norm": 1.583599394866779, + "language_loss": 0.73521233, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.75611556, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 16202, + "time_per_iteration": 2.393620252609253 + }, + { + "auxiliary_loss_clip": 0.01052723, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.01227129, + "balance_loss_mlp": 1.01632154, + "epoch": 0.9741770629791072, + "flos": 18840215088000.0, + "grad_norm": 1.7389256299574858, + "language_loss": 0.74189705, + "learning_rate": 6.974795430241265e-09, + "loss": 0.76277101, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36328125, + "step": 16203, + "time_per_iteration": 2.380190849304199 + }, + { + "auxiliary_loss_clip": 0.01051239, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.01833963, + "balance_loss_mlp": 1.01525939, + "epoch": 0.9742371862317751, + "flos": 22345145677440.0, + "grad_norm": 1.8863522968256838, + "language_loss": 0.78701407, + "learning_rate": 6.942335602365235e-09, + "loss": 0.80795717, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 16204, + "time_per_iteration": 3.6815130710601807 + }, + { + "auxiliary_loss_clip": 0.01053232, + "auxiliary_loss_mlp": 0.01042479, + "balance_loss_clip": 1.01810038, + "balance_loss_mlp": 1.0164305, + "epoch": 0.9742973094844432, + "flos": 21761677169280.0, + "grad_norm": 2.387298720403749, + "language_loss": 0.81632292, + "learning_rate": 6.909951351435905e-09, + "loss": 0.83728004, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.3671875, + "step": 16205, + "time_per_iteration": 2.4056804180145264 + }, + { + "auxiliary_loss_clip": 0.01051655, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.01247501, + "balance_loss_mlp": 1.016078, + "epoch": 0.9743574327371111, + "flos": 26247562629120.0, + "grad_norm": 1.7005137301601418, + "language_loss": 0.74873126, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76958883, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35546875, + "step": 16206, + "time_per_iteration": 2.404078483581543 + }, + { + "auxiliary_loss_clip": 0.01051649, + "auxiliary_loss_mlp": 0.01037636, + "balance_loss_clip": 1.01337683, + "balance_loss_mlp": 1.01527238, + "epoch": 0.9744175559897791, + "flos": 12348499530240.0, + "grad_norm": 2.3184214631889537, + "language_loss": 0.85256523, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.87345803, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36328125, + "step": 16207, + "time_per_iteration": 2.356764316558838 + }, + { + "auxiliary_loss_clip": 0.01050808, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.01808453, + "balance_loss_mlp": 1.01601696, + "epoch": 0.974477679242447, + "flos": 28396711324800.0, + "grad_norm": 1.5947055134547456, + "language_loss": 0.71413767, + "learning_rate": 6.813252072591425e-09, + "loss": 0.73504567, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 16208, + "time_per_iteration": 2.4173760414123535 + }, + { + "auxiliary_loss_clip": 0.0104725, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.00980973, + "balance_loss_mlp": 1.01486027, + "epoch": 0.974537802495115, + "flos": 17784406529280.0, + "grad_norm": 1.7918250435749647, + "language_loss": 0.77899724, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79976308, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.32421875, + "step": 16209, + "time_per_iteration": 2.3587779998779297 + }, + { + "auxiliary_loss_clip": 0.01051231, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.01182866, + "balance_loss_mlp": 1.01537454, + "epoch": 0.9745979257477829, + "flos": 23841173969280.0, + "grad_norm": 2.0289641531249427, + "language_loss": 0.80138087, + "learning_rate": 6.749163793864144e-09, + "loss": 0.82223356, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 16210, + "time_per_iteration": 2.384678602218628 + }, + { + "auxiliary_loss_clip": 0.01051502, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.01617992, + "balance_loss_mlp": 1.01635242, + "epoch": 0.9746580490004509, + "flos": 27014953512960.0, + "grad_norm": 4.570296830381062, + "language_loss": 0.79392266, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.81482255, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 16211, + "time_per_iteration": 2.4344282150268555 + }, + { + "auxiliary_loss_clip": 0.01054071, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.0129981, + "balance_loss_mlp": 1.01661289, + "epoch": 0.9747181722531189, + "flos": 19791004677120.0, + "grad_norm": 3.4666580672594107, + "language_loss": 0.79683435, + "learning_rate": 6.685377852219787e-09, + "loss": 0.81774771, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.375, + "step": 16212, + "time_per_iteration": 2.360055923461914 + }, + { + "auxiliary_loss_clip": 0.01049617, + "auxiliary_loss_mlp": 0.01038779, + "balance_loss_clip": 1.01813173, + "balance_loss_mlp": 1.01496208, + "epoch": 0.9747782955057869, + "flos": 31429266952320.0, + "grad_norm": 1.5527170834139825, + "language_loss": 0.81620514, + "learning_rate": 6.653598260829118e-09, + "loss": 0.83708912, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.34570312, + "step": 16213, + "time_per_iteration": 2.446025848388672 + }, + { + "auxiliary_loss_clip": 0.01051541, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.01186705, + "balance_loss_mlp": 1.01567721, + "epoch": 0.9748384187584548, + "flos": 15960369214080.0, + "grad_norm": 1.7876817766041477, + "language_loss": 0.68034029, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.7012012, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.359375, + "step": 16214, + "time_per_iteration": 2.347456216812134 + }, + { + "auxiliary_loss_clip": 0.01053293, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.01040387, + "balance_loss_mlp": 1.0170877, + "epoch": 0.9748985420111228, + "flos": 20557662422400.0, + "grad_norm": 2.783331139475341, + "language_loss": 0.75472248, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.77560699, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36328125, + "step": 16215, + "time_per_iteration": 3.827699661254883 + }, + { + "auxiliary_loss_clip": 0.0105044, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.01281404, + "balance_loss_mlp": 1.01485527, + "epoch": 0.9749586652637908, + "flos": 36720109785600.0, + "grad_norm": 1.7628413465880899, + "language_loss": 0.68541753, + "learning_rate": 6.558713018834483e-09, + "loss": 0.70626831, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35546875, + "step": 16216, + "time_per_iteration": 2.4815425872802734 + }, + { + "auxiliary_loss_clip": 0.01054112, + "auxiliary_loss_mlp": 0.01036027, + "balance_loss_clip": 1.01193523, + "balance_loss_mlp": 1.0172174, + "epoch": 0.9750187885164587, + "flos": 10998443099520.0, + "grad_norm": 1.8190230855563996, + "language_loss": 0.73264134, + "learning_rate": 6.527235786226937e-09, + "loss": 0.75354266, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36914062, + "step": 16217, + "time_per_iteration": 2.3734238147735596 + }, + { + "auxiliary_loss_clip": 0.01051189, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01170433, + "balance_loss_mlp": 1.01558006, + "epoch": 0.9750789117691268, + "flos": 25738319404800.0, + "grad_norm": 1.549035784580979, + "language_loss": 0.7936213, + "learning_rate": 6.495834146306167e-09, + "loss": 0.81448478, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 16218, + "time_per_iteration": 2.3923707008361816 + }, + { + "auxiliary_loss_clip": 0.01050836, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.01284814, + "balance_loss_mlp": 1.01591182, + "epoch": 0.9751390350217947, + "flos": 13333120093440.0, + "grad_norm": 2.0019568554250786, + "language_loss": 0.78253645, + "learning_rate": 6.464508100263222e-09, + "loss": 0.80338871, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 16219, + "time_per_iteration": 2.3584911823272705 + }, + { + "auxiliary_loss_clip": 0.01052651, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.01548433, + "balance_loss_mlp": 1.01613903, + "epoch": 0.9751991582744627, + "flos": 22819510586880.0, + "grad_norm": 1.6178250847642992, + "language_loss": 0.82534528, + "learning_rate": 6.433257649285817e-09, + "loss": 0.84625524, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36523438, + "step": 16220, + "time_per_iteration": 2.3694510459899902 + }, + { + "auxiliary_loss_clip": 0.01049878, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.01689208, + "balance_loss_mlp": 1.01461542, + "epoch": 0.9752592815271306, + "flos": 19645172461440.0, + "grad_norm": 2.0405355070788946, + "language_loss": 0.76253647, + "learning_rate": 6.402082794559227e-09, + "loss": 0.78341961, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.35351562, + "step": 16221, + "time_per_iteration": 2.3701202869415283 + }, + { + "auxiliary_loss_clip": 0.01050159, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.0109992, + "balance_loss_mlp": 1.01578784, + "epoch": 0.9753194047797986, + "flos": 26689178816640.0, + "grad_norm": 1.6448522916296298, + "language_loss": 0.67757177, + "learning_rate": 6.370983537265395e-09, + "loss": 0.69841403, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34375, + "step": 16222, + "time_per_iteration": 2.4214653968811035 + }, + { + "auxiliary_loss_clip": 0.0105087, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.01356363, + "balance_loss_mlp": 1.01563275, + "epoch": 0.9753795280324665, + "flos": 23220662641920.0, + "grad_norm": 1.7912132424316922, + "language_loss": 0.89669716, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.91756582, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.3515625, + "step": 16223, + "time_per_iteration": 2.4053471088409424 + }, + { + "auxiliary_loss_clip": 0.01051198, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.01469827, + "balance_loss_mlp": 1.01624584, + "epoch": 0.9754396512851345, + "flos": 19462821096960.0, + "grad_norm": 1.7807941164052492, + "language_loss": 0.75998187, + "learning_rate": 6.309011819690457e-09, + "loss": 0.7808727, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.34960938, + "step": 16224, + "time_per_iteration": 2.341597080230713 + }, + { + "auxiliary_loss_clip": 0.01006429, + "auxiliary_loss_mlp": 0.0100252, + "balance_loss_clip": 1.00043356, + "balance_loss_mlp": 1.00022578, + "epoch": 0.9754997745378025, + "flos": 68456053054080.0, + "grad_norm": 0.8160232345922002, + "language_loss": 0.59252238, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61261189, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06201172, + "step": 16225, + "time_per_iteration": 2.9823837280273438 + }, + { + "auxiliary_loss_clip": 0.01050673, + "auxiliary_loss_mlp": 0.01040259, + "balance_loss_clip": 1.01926637, + "balance_loss_mlp": 1.01618183, + "epoch": 0.9755598977904705, + "flos": 26393988337920.0, + "grad_norm": 1.8311766101190035, + "language_loss": 0.69350362, + "learning_rate": 6.247342505960818e-09, + "loss": 0.71441299, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34570312, + "step": 16226, + "time_per_iteration": 2.4117002487182617 + }, + { + "auxiliary_loss_clip": 0.01052966, + "auxiliary_loss_mlp": 0.01042636, + "balance_loss_clip": 1.02024865, + "balance_loss_mlp": 1.01640797, + "epoch": 0.9756200210431384, + "flos": 16616736374400.0, + "grad_norm": 1.6178648217701492, + "language_loss": 0.83631498, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85727102, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36523438, + "step": 16227, + "time_per_iteration": 2.366856575012207 + }, + { + "auxiliary_loss_clip": 0.01049799, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.00996602, + "balance_loss_mlp": 1.01520216, + "epoch": 0.9756801442958064, + "flos": 23621919431040.0, + "grad_norm": 1.7705016363024901, + "language_loss": 0.78587818, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80669808, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34570312, + "step": 16228, + "time_per_iteration": 2.385324001312256 + }, + { + "auxiliary_loss_clip": 0.01007015, + "auxiliary_loss_mlp": 0.01003684, + "balance_loss_clip": 1.0017767, + "balance_loss_mlp": 1.00066829, + "epoch": 0.9757402675484744, + "flos": 61623027244800.0, + "grad_norm": 0.8510095176503469, + "language_loss": 0.55889618, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57900321, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.06347656, + "step": 16229, + "time_per_iteration": 2.988348960876465 + }, + { + "auxiliary_loss_clip": 0.01052077, + "auxiliary_loss_mlp": 0.01040415, + "balance_loss_clip": 1.01712108, + "balance_loss_mlp": 1.01616251, + "epoch": 0.9758003908011423, + "flos": 24057879978240.0, + "grad_norm": 1.6668275266788202, + "language_loss": 0.75956839, + "learning_rate": 6.124911127407984e-09, + "loss": 0.78049326, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 16230, + "time_per_iteration": 2.4021689891815186 + }, + { + "auxiliary_loss_clip": 0.01048109, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.00886846, + "balance_loss_mlp": 1.01521313, + "epoch": 0.9758605140538104, + "flos": 17492148604800.0, + "grad_norm": 1.7388834417032808, + "language_loss": 0.73215628, + "learning_rate": 6.094492299733245e-09, + "loss": 0.75293529, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.328125, + "step": 16231, + "time_per_iteration": 2.3673012256622314 + }, + { + "auxiliary_loss_clip": 0.01053425, + "auxiliary_loss_mlp": 0.01041943, + "balance_loss_clip": 1.01663542, + "balance_loss_mlp": 1.01614225, + "epoch": 0.9759206373064783, + "flos": 24825794532480.0, + "grad_norm": 1.8235620278520732, + "language_loss": 0.77730596, + "learning_rate": 6.064149081155267e-09, + "loss": 0.79825962, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37304688, + "step": 16232, + "time_per_iteration": 2.3963732719421387 + }, + { + "auxiliary_loss_clip": 0.01006644, + "auxiliary_loss_mlp": 0.01003715, + "balance_loss_clip": 1.0016644, + "balance_loss_mlp": 1.00032806, + "epoch": 0.9759807605591463, + "flos": 68158104577920.0, + "grad_norm": 0.7401191002746551, + "language_loss": 0.53879398, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55889755, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.06298828, + "step": 16233, + "time_per_iteration": 4.178099870681763 + }, + { + "auxiliary_loss_clip": 0.01051493, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.01690292, + "balance_loss_mlp": 1.01547194, + "epoch": 0.9760408838118142, + "flos": 18988246719360.0, + "grad_norm": 1.6634579810189019, + "language_loss": 0.72442842, + "learning_rate": 6.003689475888807e-09, + "loss": 0.74534452, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 16234, + "time_per_iteration": 2.356457471847534 + }, + { + "auxiliary_loss_clip": 0.01052112, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.01425767, + "balance_loss_mlp": 1.01588154, + "epoch": 0.9761010070644822, + "flos": 17125106814720.0, + "grad_norm": 2.392312083596412, + "language_loss": 0.80336225, + "learning_rate": 5.973573091493156e-09, + "loss": 0.82426178, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 16235, + "time_per_iteration": 2.359485149383545 + }, + { + "auxiliary_loss_clip": 0.0105141, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.01503062, + "balance_loss_mlp": 1.01589191, + "epoch": 0.9761611303171501, + "flos": 22051561121280.0, + "grad_norm": 2.979687788859341, + "language_loss": 0.78097498, + "learning_rate": 5.943532320779265e-09, + "loss": 0.80187887, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.35546875, + "step": 16236, + "time_per_iteration": 2.367854356765747 + }, + { + "auxiliary_loss_clip": 0.01051082, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.01081371, + "balance_loss_mlp": 1.01577806, + "epoch": 0.9762212535698181, + "flos": 21756580110720.0, + "grad_norm": 1.670529251176396, + "language_loss": 0.76093996, + "learning_rate": 5.913567164886446e-09, + "loss": 0.7817688, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.3515625, + "step": 16237, + "time_per_iteration": 2.3767683506011963 + }, + { + "auxiliary_loss_clip": 0.01050481, + "auxiliary_loss_mlp": 0.01038883, + "balance_loss_clip": 1.01644826, + "balance_loss_mlp": 1.01510918, + "epoch": 0.9762813768224861, + "flos": 25920426389760.0, + "grad_norm": 3.586992358348562, + "language_loss": 0.74114096, + "learning_rate": 5.8836776249509e-09, + "loss": 0.76203465, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35351562, + "step": 16238, + "time_per_iteration": 2.404482841491699 + }, + { + "auxiliary_loss_clip": 0.01050684, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.01518929, + "balance_loss_mlp": 1.01557684, + "epoch": 0.9763415000751541, + "flos": 24050967528960.0, + "grad_norm": 2.148630454851276, + "language_loss": 0.84540528, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.86629832, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.3515625, + "step": 16239, + "time_per_iteration": 2.3649039268493652 + }, + { + "auxiliary_loss_clip": 0.010518, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.01319683, + "balance_loss_mlp": 1.0161562, + "epoch": 0.976401623327822, + "flos": 17017748784000.0, + "grad_norm": 2.5703687252853444, + "language_loss": 0.61810488, + "learning_rate": 5.824125397483115e-09, + "loss": 0.63899922, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35546875, + "step": 16240, + "time_per_iteration": 3.7469937801361084 + }, + { + "auxiliary_loss_clip": 0.01051732, + "auxiliary_loss_mlp": 0.01036818, + "balance_loss_clip": 1.01487207, + "balance_loss_mlp": 1.01629782, + "epoch": 0.97646174658049, + "flos": 16106026872960.0, + "grad_norm": 2.1433431566762424, + "language_loss": 0.84030306, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.86118859, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35351562, + "step": 16241, + "time_per_iteration": 2.3535537719726562 + }, + { + "auxiliary_loss_clip": 0.01052288, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.01597977, + "balance_loss_mlp": 1.01741338, + "epoch": 0.9765218698331579, + "flos": 21251630983680.0, + "grad_norm": 1.7464228752804831, + "language_loss": 0.83729768, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85819948, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34765625, + "step": 16242, + "time_per_iteration": 2.3788530826568604 + }, + { + "auxiliary_loss_clip": 0.01050742, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.01363301, + "balance_loss_mlp": 1.01494384, + "epoch": 0.9765819930858259, + "flos": 18587059752960.0, + "grad_norm": 1.8129862370198815, + "language_loss": 0.77020705, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.79109371, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35742188, + "step": 16243, + "time_per_iteration": 3.8230788707733154 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01043321, + "balance_loss_clip": 1.01789403, + "balance_loss_mlp": 1.01602066, + "epoch": 0.976642116338494, + "flos": 20265788522880.0, + "grad_norm": 1.691520732168161, + "language_loss": 0.70978725, + "learning_rate": 5.705928383713754e-09, + "loss": 0.73073769, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.35742188, + "step": 16244, + "time_per_iteration": 2.391317844390869 + }, + { + "auxiliary_loss_clip": 0.01052722, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.0134995, + "balance_loss_mlp": 1.01669192, + "epoch": 0.9767022395911619, + "flos": 25549474527360.0, + "grad_norm": 2.8266185094813228, + "language_loss": 0.85087144, + "learning_rate": 5.676568187055197e-09, + "loss": 0.87178022, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.359375, + "step": 16245, + "time_per_iteration": 2.3948748111724854 + }, + { + "auxiliary_loss_clip": 0.01050279, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.01410353, + "balance_loss_mlp": 1.01552403, + "epoch": 0.9767623628438299, + "flos": 21761572435200.0, + "grad_norm": 1.3178895202551233, + "language_loss": 0.79237139, + "learning_rate": 5.647283615340726e-09, + "loss": 0.81322837, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34765625, + "step": 16246, + "time_per_iteration": 2.3962326049804688 + }, + { + "auxiliary_loss_clip": 0.01047024, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.01285791, + "balance_loss_mlp": 1.01480174, + "epoch": 0.9768224860964978, + "flos": 15850218274560.0, + "grad_norm": 1.4278027624585417, + "language_loss": 0.74900198, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76979405, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.32226562, + "step": 16247, + "time_per_iteration": 2.35406494140625 + }, + { + "auxiliary_loss_clip": 0.01051911, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.01333606, + "balance_loss_mlp": 1.01637161, + "epoch": 0.9768826093491658, + "flos": 25150102951680.0, + "grad_norm": 1.9200062458131466, + "language_loss": 0.81004345, + "learning_rate": 5.58894135118404e-09, + "loss": 0.83092844, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 16248, + "time_per_iteration": 2.4182803630828857 + }, + { + "auxiliary_loss_clip": 0.01055015, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_clip": 1.0178144, + "balance_loss_mlp": 1.01715374, + "epoch": 0.9769427326018337, + "flos": 22965901384320.0, + "grad_norm": 1.8421816659177177, + "language_loss": 0.80745262, + "learning_rate": 5.559883660954278e-09, + "loss": 0.82843971, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.37890625, + "step": 16249, + "time_per_iteration": 2.4294703006744385 + }, + { + "auxiliary_loss_clip": 0.01050378, + "auxiliary_loss_mlp": 0.01033495, + "balance_loss_clip": 1.01086915, + "balance_loss_mlp": 1.01608932, + "epoch": 0.9770028558545018, + "flos": 15263503009920.0, + "grad_norm": 1.9218418797520893, + "language_loss": 0.68108678, + "learning_rate": 5.530901600093507e-09, + "loss": 0.70192546, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34375, + "step": 16250, + "time_per_iteration": 2.3627490997314453 + }, + { + "auxiliary_loss_clip": 0.0100755, + "auxiliary_loss_mlp": 0.01004302, + "balance_loss_clip": 1.0022155, + "balance_loss_mlp": 1.00130868, + "epoch": 0.9770629791071697, + "flos": 71447027385600.0, + "grad_norm": 0.7782993644230678, + "language_loss": 0.60003209, + "learning_rate": 5.501995169700846e-09, + "loss": 0.62015069, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.0625, + "step": 16251, + "time_per_iteration": 3.0625874996185303 + }, + { + "auxiliary_loss_clip": 0.01051318, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.01692677, + "balance_loss_mlp": 1.01599717, + "epoch": 0.9771231023598377, + "flos": 22411934841600.0, + "grad_norm": 3.2244275120166255, + "language_loss": 0.79379612, + "learning_rate": 5.473164370872307e-09, + "loss": 0.81470078, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35351562, + "step": 16252, + "time_per_iteration": 2.406430959701538 + }, + { + "auxiliary_loss_clip": 0.01050615, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.01255584, + "balance_loss_mlp": 1.01541758, + "epoch": 0.9771832256125056, + "flos": 19024940424960.0, + "grad_norm": 5.6127220475732935, + "language_loss": 0.65967035, + "learning_rate": 5.444409204701461e-09, + "loss": 0.68052393, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 16253, + "time_per_iteration": 2.372288465499878 + }, + { + "auxiliary_loss_clip": 0.01054102, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.01825047, + "balance_loss_mlp": 1.01719069, + "epoch": 0.9772433488651736, + "flos": 17821170057600.0, + "grad_norm": 2.6595575531809477, + "language_loss": 0.77415287, + "learning_rate": 5.415729672278324e-09, + "loss": 0.79512894, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 16254, + "time_per_iteration": 2.366090774536133 + }, + { + "auxiliary_loss_clip": 0.01053112, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.01577449, + "balance_loss_mlp": 1.01609945, + "epoch": 0.9773034721178415, + "flos": 37628375472000.0, + "grad_norm": 1.7525651975728227, + "language_loss": 0.64966416, + "learning_rate": 5.387125774690471e-09, + "loss": 0.67059582, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 16255, + "time_per_iteration": 4.062563419342041 + }, + { + "auxiliary_loss_clip": 0.01054601, + "auxiliary_loss_mlp": 0.01044052, + "balance_loss_clip": 1.01602554, + "balance_loss_mlp": 1.01690197, + "epoch": 0.9773635953705095, + "flos": 20301434887680.0, + "grad_norm": 1.6598318259970841, + "language_loss": 0.76730871, + "learning_rate": 5.358597513023033e-09, + "loss": 0.78829527, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.37695312, + "step": 16256, + "time_per_iteration": 2.3865115642547607 + }, + { + "auxiliary_loss_clip": 0.01051787, + "auxiliary_loss_mlp": 0.01039688, + "balance_loss_clip": 1.01318753, + "balance_loss_mlp": 1.01762807, + "epoch": 0.9774237186231776, + "flos": 22308172680960.0, + "grad_norm": 2.1551566595895006, + "language_loss": 0.78750259, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80841732, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.34179688, + "step": 16257, + "time_per_iteration": 2.406813621520996 + }, + { + "auxiliary_loss_clip": 0.01050183, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.01692867, + "balance_loss_mlp": 1.01547432, + "epoch": 0.9774838418758455, + "flos": 24203816928000.0, + "grad_norm": 1.619907992349388, + "language_loss": 0.76701504, + "learning_rate": 5.301767901772391e-09, + "loss": 0.78792918, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.34765625, + "step": 16258, + "time_per_iteration": 2.4205052852630615 + }, + { + "auxiliary_loss_clip": 0.01007159, + "auxiliary_loss_mlp": 0.01001934, + "balance_loss_clip": 0.99995559, + "balance_loss_mlp": 1.00080156, + "epoch": 0.9775439651285135, + "flos": 66353547801600.0, + "grad_norm": 0.6972547420526777, + "language_loss": 0.59889776, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61898869, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.06347656, + "step": 16259, + "time_per_iteration": 3.1085686683654785 + }, + { + "auxiliary_loss_clip": 0.01053666, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.01447225, + "balance_loss_mlp": 1.01645267, + "epoch": 0.9776040883811814, + "flos": 22600465516800.0, + "grad_norm": 1.6646427519766314, + "language_loss": 0.74797744, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.76890361, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 16260, + "time_per_iteration": 2.3954949378967285 + }, + { + "auxiliary_loss_clip": 0.01051541, + "auxiliary_loss_mlp": 0.0104317, + "balance_loss_clip": 1.01882732, + "balance_loss_mlp": 1.01601219, + "epoch": 0.9776642116338494, + "flos": 18441192625920.0, + "grad_norm": 2.0406287847816644, + "language_loss": 0.80482519, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.82577229, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35546875, + "step": 16261, + "time_per_iteration": 2.350165605545044 + }, + { + "auxiliary_loss_clip": 0.01053002, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.00792146, + "balance_loss_mlp": 1.01659012, + "epoch": 0.9777243348865173, + "flos": 22637333779200.0, + "grad_norm": 2.3314653914763173, + "language_loss": 0.74713063, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76797587, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36328125, + "step": 16262, + "time_per_iteration": 2.3669631481170654 + }, + { + "auxiliary_loss_clip": 0.01052011, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.01139879, + "balance_loss_mlp": 1.01585507, + "epoch": 0.9777844581391854, + "flos": 31320058619520.0, + "grad_norm": 1.950560921663072, + "language_loss": 0.71727556, + "learning_rate": 5.16101757762133e-09, + "loss": 0.73816997, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.36132812, + "step": 16263, + "time_per_iteration": 2.4454596042633057 + }, + { + "auxiliary_loss_clip": 0.01052445, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.01458287, + "balance_loss_mlp": 1.01634741, + "epoch": 0.9778445813918533, + "flos": 23037997075200.0, + "grad_norm": 1.7419167589450297, + "language_loss": 0.67526233, + "learning_rate": 5.133094442018038e-09, + "loss": 0.69617164, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 16264, + "time_per_iteration": 2.3725388050079346 + }, + { + "auxiliary_loss_clip": 0.01054636, + "auxiliary_loss_mlp": 0.01041257, + "balance_loss_clip": 1.01383853, + "balance_loss_mlp": 1.01696754, + "epoch": 0.9779047046445213, + "flos": 17565396370560.0, + "grad_norm": 1.9874003721337024, + "language_loss": 0.74033892, + "learning_rate": 5.105246951967679e-09, + "loss": 0.76129782, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.37695312, + "step": 16265, + "time_per_iteration": 2.346679210662842 + }, + { + "auxiliary_loss_clip": 0.01050456, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.01055741, + "balance_loss_mlp": 1.01591182, + "epoch": 0.9779648278971892, + "flos": 20740083609600.0, + "grad_norm": 1.7111586800492788, + "language_loss": 0.69486511, + "learning_rate": 5.077475108526297e-09, + "loss": 0.71570385, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 16266, + "time_per_iteration": 2.3781681060791016 + }, + { + "auxiliary_loss_clip": 0.01050017, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.01397276, + "balance_loss_mlp": 1.0171473, + "epoch": 0.9780249511498572, + "flos": 21026057489280.0, + "grad_norm": 1.7321289426482234, + "language_loss": 0.87310529, + "learning_rate": 5.049778912747049e-09, + "loss": 0.89395428, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.328125, + "step": 16267, + "time_per_iteration": 2.395775318145752 + }, + { + "auxiliary_loss_clip": 0.01052844, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.00988472, + "balance_loss_mlp": 1.01608896, + "epoch": 0.9780850744025251, + "flos": 30772899792000.0, + "grad_norm": 2.143302800586871, + "language_loss": 0.71657932, + "learning_rate": 5.022158365679985e-09, + "loss": 0.73744673, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 16268, + "time_per_iteration": 2.4571402072906494 + }, + { + "auxiliary_loss_clip": 0.01051789, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.00972235, + "balance_loss_mlp": 1.01662111, + "epoch": 0.9781451976551931, + "flos": 20301958558080.0, + "grad_norm": 1.8098205082854755, + "language_loss": 0.74560487, + "learning_rate": 4.994613468372711e-09, + "loss": 0.7664361, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3515625, + "step": 16269, + "time_per_iteration": 2.355454444885254 + }, + { + "auxiliary_loss_clip": 0.0105223, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.01352096, + "balance_loss_mlp": 1.01620686, + "epoch": 0.9782053209078612, + "flos": 24315329410560.0, + "grad_norm": 2.635316342544413, + "language_loss": 0.71931982, + "learning_rate": 4.967144221869501e-09, + "loss": 0.74021691, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.359375, + "step": 16270, + "time_per_iteration": 2.3895928859710693 + }, + { + "auxiliary_loss_clip": 0.01051778, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.01661611, + "balance_loss_mlp": 1.01523995, + "epoch": 0.9782654441605291, + "flos": 32488042976640.0, + "grad_norm": 1.7755135564018276, + "language_loss": 0.65417469, + "learning_rate": 4.939750627212191e-09, + "loss": 0.67508894, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 16271, + "time_per_iteration": 2.468729257583618 + }, + { + "auxiliary_loss_clip": 0.0104924, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.01124215, + "balance_loss_mlp": 1.01540339, + "epoch": 0.9783255674131971, + "flos": 26977875782400.0, + "grad_norm": 1.4054463684249654, + "language_loss": 0.71214747, + "learning_rate": 4.912432685439505e-09, + "loss": 0.73296887, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 16272, + "time_per_iteration": 2.4224414825439453 + }, + { + "auxiliary_loss_clip": 0.01054319, + "auxiliary_loss_mlp": 0.01040644, + "balance_loss_clip": 1.01588404, + "balance_loss_mlp": 1.01703978, + "epoch": 0.978385690665865, + "flos": 23111140106880.0, + "grad_norm": 1.7526729271866786, + "language_loss": 0.67784607, + "learning_rate": 4.88519039758728e-09, + "loss": 0.69879568, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.37109375, + "step": 16273, + "time_per_iteration": 3.6333563327789307 + }, + { + "auxiliary_loss_clip": 0.01051837, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.01164889, + "balance_loss_mlp": 1.0161531, + "epoch": 0.978445813918533, + "flos": 25408529902080.0, + "grad_norm": 1.6228678277336, + "language_loss": 0.74738228, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76825297, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35742188, + "step": 16274, + "time_per_iteration": 2.419017791748047 + }, + { + "auxiliary_loss_clip": 0.01052383, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.01482296, + "balance_loss_mlp": 1.01695609, + "epoch": 0.9785059371712009, + "flos": 23549090601600.0, + "grad_norm": 1.5785907059284108, + "language_loss": 0.77968282, + "learning_rate": 4.830932787773579e-09, + "loss": 0.8005836, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 16275, + "time_per_iteration": 2.4130842685699463 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.0104089, + "balance_loss_clip": 1.016083, + "balance_loss_mlp": 1.01606548, + "epoch": 0.978566060423869, + "flos": 34349716604160.0, + "grad_norm": 1.6159180691657673, + "language_loss": 0.71950769, + "learning_rate": 4.803917467869567e-09, + "loss": 0.74044311, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.36523438, + "step": 16276, + "time_per_iteration": 2.491328477859497 + }, + { + "auxiliary_loss_clip": 0.01047986, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.01341522, + "balance_loss_mlp": 1.01454401, + "epoch": 0.9786261836765369, + "flos": 11618081642880.0, + "grad_norm": 1.8159449400936531, + "language_loss": 0.86534059, + "learning_rate": 4.776977806000726e-09, + "loss": 0.88616145, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3359375, + "step": 16277, + "time_per_iteration": 2.335324764251709 + }, + { + "auxiliary_loss_clip": 0.01049428, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.01221955, + "balance_loss_mlp": 1.01536107, + "epoch": 0.9786863069292049, + "flos": 17419983091200.0, + "grad_norm": 1.8116618765733885, + "language_loss": 0.71823227, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73907351, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.33984375, + "step": 16278, + "time_per_iteration": 2.359335422515869 + }, + { + "auxiliary_loss_clip": 0.01050676, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.01329029, + "balance_loss_mlp": 1.01500022, + "epoch": 0.9787464301818728, + "flos": 20844020327040.0, + "grad_norm": 1.7729552304562761, + "language_loss": 0.8480013, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86887449, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35742188, + "step": 16279, + "time_per_iteration": 2.379136800765991 + }, + { + "auxiliary_loss_clip": 0.0105216, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.01681471, + "balance_loss_mlp": 1.01583052, + "epoch": 0.9788065534345408, + "flos": 18221972999040.0, + "grad_norm": 2.360107763605047, + "language_loss": 0.80136001, + "learning_rate": 4.696612778808395e-09, + "loss": 0.82229024, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 16280, + "time_per_iteration": 3.80787992477417 + }, + { + "auxiliary_loss_clip": 0.01049972, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.01242185, + "balance_loss_mlp": 1.01585007, + "epoch": 0.9788666766872087, + "flos": 21577196211840.0, + "grad_norm": 1.5633876356811685, + "language_loss": 0.80212593, + "learning_rate": 4.669975759268085e-09, + "loss": 0.82296777, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.34179688, + "step": 16281, + "time_per_iteration": 2.4323813915252686 + }, + { + "auxiliary_loss_clip": 0.01052168, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.0114336, + "balance_loss_mlp": 1.01598072, + "epoch": 0.9789267999398767, + "flos": 24899321589120.0, + "grad_norm": 1.801068673361125, + "language_loss": 0.81043297, + "learning_rate": 4.643414402842216e-09, + "loss": 0.83130646, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 16282, + "time_per_iteration": 2.4071526527404785 + }, + { + "auxiliary_loss_clip": 0.01051048, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_clip": 1.0197103, + "balance_loss_mlp": 1.01585007, + "epoch": 0.9789869231925448, + "flos": 19572099252480.0, + "grad_norm": 2.2465563921005476, + "language_loss": 0.84447289, + "learning_rate": 4.616928710538204e-09, + "loss": 0.8654114, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 16283, + "time_per_iteration": 3.738436460494995 + }, + { + "auxiliary_loss_clip": 0.01051413, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.01119757, + "balance_loss_mlp": 1.01588917, + "epoch": 0.9790470464452127, + "flos": 16795806071040.0, + "grad_norm": 1.742790294408586, + "language_loss": 0.73318106, + "learning_rate": 4.590518683360134e-09, + "loss": 0.75403303, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 16284, + "time_per_iteration": 2.349609851837158 + }, + { + "auxiliary_loss_clip": 0.01049571, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.01459718, + "balance_loss_mlp": 1.01593137, + "epoch": 0.9791071696978807, + "flos": 18368363796480.0, + "grad_norm": 1.669754642408173, + "language_loss": 0.652264, + "learning_rate": 4.56418432230965e-09, + "loss": 0.67312634, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3359375, + "step": 16285, + "time_per_iteration": 2.347547769546509 + }, + { + "auxiliary_loss_clip": 0.01051575, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.01776958, + "balance_loss_mlp": 1.01555085, + "epoch": 0.9791672929505486, + "flos": 24169182992640.0, + "grad_norm": 1.5994928084959203, + "language_loss": 0.71672165, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73764777, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.359375, + "step": 16286, + "time_per_iteration": 2.380171775817871 + }, + { + "auxiliary_loss_clip": 0.01050032, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.01218545, + "balance_loss_mlp": 1.01542842, + "epoch": 0.9792274162032166, + "flos": 24352930811520.0, + "grad_norm": 1.36836579799763, + "language_loss": 0.59567952, + "learning_rate": 4.511742602582691e-09, + "loss": 0.61652851, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34570312, + "step": 16287, + "time_per_iteration": 2.4127933979034424 + }, + { + "auxiliary_loss_clip": 0.01052118, + "auxiliary_loss_mlp": 0.01034226, + "balance_loss_clip": 1.01099169, + "balance_loss_mlp": 1.01647866, + "epoch": 0.9792875394558845, + "flos": 26394581831040.0, + "grad_norm": 1.9872444633135307, + "language_loss": 0.82398105, + "learning_rate": 4.485635245894626e-09, + "loss": 0.8448444, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 16288, + "time_per_iteration": 2.4053053855895996 + }, + { + "auxiliary_loss_clip": 0.01051781, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.01176965, + "balance_loss_mlp": 1.01523495, + "epoch": 0.9793476627085526, + "flos": 28146732923520.0, + "grad_norm": 1.4894667424566832, + "language_loss": 0.72917533, + "learning_rate": 4.459603559311631e-09, + "loss": 0.7500484, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36523438, + "step": 16289, + "time_per_iteration": 2.435150623321533 + }, + { + "auxiliary_loss_clip": 0.01050878, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.01566732, + "balance_loss_mlp": 1.01630354, + "epoch": 0.9794077859612205, + "flos": 16762987526400.0, + "grad_norm": 2.6668417755022964, + "language_loss": 0.76404607, + "learning_rate": 4.43364754382003e-09, + "loss": 0.78494614, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34570312, + "step": 16290, + "time_per_iteration": 2.3375704288482666 + }, + { + "auxiliary_loss_clip": 0.01052494, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.01545405, + "balance_loss_mlp": 1.01542783, + "epoch": 0.9794679092138885, + "flos": 19279841328000.0, + "grad_norm": 1.5387307888874768, + "language_loss": 0.68080485, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.70171571, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37109375, + "step": 16291, + "time_per_iteration": 2.3815112113952637 + }, + { + "auxiliary_loss_clip": 0.0105395, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.01040256, + "balance_loss_mlp": 1.01636255, + "epoch": 0.9795280324665564, + "flos": 32155390742400.0, + "grad_norm": 2.2290795879219054, + "language_loss": 0.62703609, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64792681, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 16292, + "time_per_iteration": 2.437255620956421 + }, + { + "auxiliary_loss_clip": 0.01051094, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.01476693, + "balance_loss_mlp": 1.01568174, + "epoch": 0.9795881557192244, + "flos": 19059993296640.0, + "grad_norm": 1.6038583088690632, + "language_loss": 0.74417138, + "learning_rate": 4.356233533724829e-09, + "loss": 0.76506269, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 16293, + "time_per_iteration": 2.370506525039673 + }, + { + "auxiliary_loss_clip": 0.01052429, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.01264143, + "balance_loss_mlp": 1.0156126, + "epoch": 0.9796482789718923, + "flos": 28328665351680.0, + "grad_norm": 2.1385163730804613, + "language_loss": 0.84564602, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86652446, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3671875, + "step": 16294, + "time_per_iteration": 3.8501548767089844 + }, + { + "auxiliary_loss_clip": 0.01048888, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.01240373, + "balance_loss_mlp": 1.01577413, + "epoch": 0.9797084022245603, + "flos": 17967002273280.0, + "grad_norm": 2.081096309108261, + "language_loss": 0.73562968, + "learning_rate": 4.305002567088767e-09, + "loss": 0.75646216, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33203125, + "step": 16295, + "time_per_iteration": 2.368640661239624 + }, + { + "auxiliary_loss_clip": 0.010541, + "auxiliary_loss_mlp": 0.01041585, + "balance_loss_clip": 1.0173614, + "balance_loss_mlp": 1.01682496, + "epoch": 0.9797685254772284, + "flos": 20265683788800.0, + "grad_norm": 1.903534382346113, + "language_loss": 0.81335032, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.83430719, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.37304688, + "step": 16296, + "time_per_iteration": 2.369436025619507 + }, + { + "auxiliary_loss_clip": 0.01051945, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01581728, + "balance_loss_mlp": 1.01654851, + "epoch": 0.9798286487298963, + "flos": 26905919736960.0, + "grad_norm": 1.9038925107441067, + "language_loss": 0.76223636, + "learning_rate": 4.254074308266853e-09, + "loss": 0.78313941, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 16297, + "time_per_iteration": 2.426706075668335 + }, + { + "auxiliary_loss_clip": 0.01053771, + "auxiliary_loss_mlp": 0.01045316, + "balance_loss_clip": 1.02179551, + "balance_loss_mlp": 1.01663387, + "epoch": 0.9798887719825643, + "flos": 27159878033280.0, + "grad_norm": 1.576993630612611, + "language_loss": 0.78714371, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80813456, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.37109375, + "step": 16298, + "time_per_iteration": 2.43342661857605 + }, + { + "auxiliary_loss_clip": 0.01047826, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01325762, + "balance_loss_mlp": 1.01481533, + "epoch": 0.9799488952352322, + "flos": 20667359514240.0, + "grad_norm": 1.5043192394048588, + "language_loss": 0.73935461, + "learning_rate": 4.203448764984019e-09, + "loss": 0.76018232, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33007812, + "step": 16299, + "time_per_iteration": 2.372545003890991 + }, + { + "auxiliary_loss_clip": 0.01052235, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.01729071, + "balance_loss_mlp": 1.01578879, + "epoch": 0.9800090184879002, + "flos": 21980128746240.0, + "grad_norm": 1.9358624560935058, + "language_loss": 0.90422457, + "learning_rate": 4.178249514071419e-09, + "loss": 0.92515063, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36523438, + "step": 16300, + "time_per_iteration": 2.3983559608459473 + }, + { + "auxiliary_loss_clip": 0.01052413, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.01268387, + "balance_loss_mlp": 1.01551521, + "epoch": 0.9800691417405681, + "flos": 21287347171200.0, + "grad_norm": 2.128496337870732, + "language_loss": 0.79383481, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.81471264, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36914062, + "step": 16301, + "time_per_iteration": 2.3979315757751465 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.0110414, + "balance_loss_mlp": 1.01460326, + "epoch": 0.9801292649932362, + "flos": 18438958298880.0, + "grad_norm": 1.9545706333687962, + "language_loss": 0.76325989, + "learning_rate": 4.128078058480921e-09, + "loss": 0.78413093, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.359375, + "step": 16302, + "time_per_iteration": 2.3920507431030273 + }, + { + "auxiliary_loss_clip": 0.01050627, + "auxiliary_loss_mlp": 0.01040017, + "balance_loss_clip": 1.01674771, + "balance_loss_mlp": 1.01532149, + "epoch": 0.9801893882459041, + "flos": 25045188716160.0, + "grad_norm": 1.7627906205551664, + "language_loss": 0.80796307, + "learning_rate": 4.103105855705724e-09, + "loss": 0.82886952, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35351562, + "step": 16303, + "time_per_iteration": 2.418084144592285 + }, + { + "auxiliary_loss_clip": 0.01053973, + "auxiliary_loss_mlp": 0.01037262, + "balance_loss_clip": 1.01252615, + "balance_loss_mlp": 1.01634848, + "epoch": 0.9802495114985721, + "flos": 18510600142080.0, + "grad_norm": 1.8643372158584641, + "language_loss": 0.8410511, + "learning_rate": 4.078209337540883e-09, + "loss": 0.86196345, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 16304, + "time_per_iteration": 2.364396572113037 + }, + { + "auxiliary_loss_clip": 0.01047321, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01345515, + "balance_loss_mlp": 1.01457667, + "epoch": 0.98030963475124, + "flos": 21468790840320.0, + "grad_norm": 1.9962320865016747, + "language_loss": 0.71741819, + "learning_rate": 4.053388504930089e-09, + "loss": 0.73821688, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.328125, + "step": 16305, + "time_per_iteration": 2.413853645324707 + }, + { + "auxiliary_loss_clip": 0.01052406, + "auxiliary_loss_mlp": 0.01041729, + "balance_loss_clip": 1.01866174, + "balance_loss_mlp": 1.01642942, + "epoch": 0.980369758003908, + "flos": 20411236713600.0, + "grad_norm": 1.8516876462046858, + "language_loss": 0.73463619, + "learning_rate": 4.028643358815032e-09, + "loss": 0.75557756, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.359375, + "step": 16306, + "time_per_iteration": 2.3753771781921387 + }, + { + "auxiliary_loss_clip": 0.01049483, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.01496041, + "balance_loss_mlp": 1.01566863, + "epoch": 0.9804298812565759, + "flos": 23398196238720.0, + "grad_norm": 2.0379638759101057, + "language_loss": 0.74983633, + "learning_rate": 4.00397390013385e-09, + "loss": 0.77070022, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33789062, + "step": 16307, + "time_per_iteration": 2.4346423149108887 + }, + { + "auxiliary_loss_clip": 0.01047408, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.0102483, + "balance_loss_mlp": 1.01538658, + "epoch": 0.980490004509244, + "flos": 23291606257920.0, + "grad_norm": 1.6155750704564833, + "language_loss": 0.75853771, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77929974, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.3203125, + "step": 16308, + "time_per_iteration": 2.41501784324646 + }, + { + "auxiliary_loss_clip": 0.01006726, + "auxiliary_loss_mlp": 0.01003774, + "balance_loss_clip": 1.00189042, + "balance_loss_mlp": 1.0003922, + "epoch": 0.980550127761912, + "flos": 56048200479360.0, + "grad_norm": 0.7610824099977884, + "language_loss": 0.57824188, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59834689, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.06347656, + "step": 16309, + "time_per_iteration": 2.909175395965576 + }, + { + "auxiliary_loss_clip": 0.01052031, + "auxiliary_loss_mlp": 0.01042467, + "balance_loss_clip": 1.01869702, + "balance_loss_mlp": 1.01564407, + "epoch": 0.9806102510145799, + "flos": 25332244848000.0, + "grad_norm": 1.828554166883765, + "language_loss": 0.67729443, + "learning_rate": 3.930419658033646e-09, + "loss": 0.69823939, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 16310, + "time_per_iteration": 2.418511390686035 + }, + { + "auxiliary_loss_clip": 0.0100654, + "auxiliary_loss_mlp": 0.01006595, + "balance_loss_clip": 1.00460434, + "balance_loss_mlp": 1.00031316, + "epoch": 0.9806703742672479, + "flos": 67271064998400.0, + "grad_norm": 0.8269730407302652, + "language_loss": 0.54613614, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56626749, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.0625, + "step": 16311, + "time_per_iteration": 3.1059730052948 + }, + { + "auxiliary_loss_clip": 0.01051054, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.01365995, + "balance_loss_mlp": 1.01642013, + "epoch": 0.9807304975199158, + "flos": 25227784460160.0, + "grad_norm": 2.191380492024427, + "language_loss": 0.80553001, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82640225, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 16312, + "time_per_iteration": 3.6441431045532227 + }, + { + "auxiliary_loss_clip": 0.01050211, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.01486015, + "balance_loss_mlp": 1.01615763, + "epoch": 0.9807906207725838, + "flos": 17455454899200.0, + "grad_norm": 1.86700925168628, + "language_loss": 0.64590198, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.66676295, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 16313, + "time_per_iteration": 2.3372106552124023 + }, + { + "auxiliary_loss_clip": 0.01052855, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.01612067, + "balance_loss_mlp": 1.01648092, + "epoch": 0.9808507440252517, + "flos": 21031364016000.0, + "grad_norm": 1.8742319919235488, + "language_loss": 0.74606657, + "learning_rate": 3.833407015731316e-09, + "loss": 0.76699531, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 16314, + "time_per_iteration": 2.4005391597747803 + }, + { + "auxiliary_loss_clip": 0.01006612, + "auxiliary_loss_mlp": 0.01003886, + "balance_loss_clip": 1.00188375, + "balance_loss_mlp": 1.00040197, + "epoch": 0.9809108672779198, + "flos": 64041355589760.0, + "grad_norm": 0.6868386528969249, + "language_loss": 0.51750958, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53761458, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06201172, + "step": 16315, + "time_per_iteration": 3.021420955657959 + }, + { + "auxiliary_loss_clip": 0.01048719, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.01419151, + "balance_loss_mlp": 1.01465309, + "epoch": 0.9809709905305877, + "flos": 22779605036160.0, + "grad_norm": 1.2819991976477334, + "language_loss": 0.70196968, + "learning_rate": 3.785354859932033e-09, + "loss": 0.72282064, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34179688, + "step": 16316, + "time_per_iteration": 2.4467756748199463 + }, + { + "auxiliary_loss_clip": 0.01052262, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.01670289, + "balance_loss_mlp": 1.01617038, + "epoch": 0.9810311137832557, + "flos": 37011913862400.0, + "grad_norm": 1.8679204417348145, + "language_loss": 0.56294227, + "learning_rate": 3.76144232656661e-09, + "loss": 0.58385563, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.36132812, + "step": 16317, + "time_per_iteration": 2.4888687133789062 + }, + { + "auxiliary_loss_clip": 0.0105089, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.01532817, + "balance_loss_mlp": 1.01575983, + "epoch": 0.9810912370359236, + "flos": 18915313155840.0, + "grad_norm": 1.5739996012222122, + "language_loss": 0.74428546, + "learning_rate": 3.737605490767404e-09, + "loss": 0.76516843, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 16318, + "time_per_iteration": 2.408968925476074 + }, + { + "auxiliary_loss_clip": 0.01049467, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.01258898, + "balance_loss_mlp": 1.01564896, + "epoch": 0.9811513602885916, + "flos": 18440634044160.0, + "grad_norm": 2.112073027753667, + "language_loss": 0.83150452, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.85234457, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33789062, + "step": 16319, + "time_per_iteration": 2.354438066482544 + }, + { + "auxiliary_loss_clip": 0.01006576, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.00009441, + "balance_loss_mlp": 1.00030351, + "epoch": 0.9812114835412595, + "flos": 68055669048960.0, + "grad_norm": 0.7186234152472349, + "language_loss": 0.53690332, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55698824, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.0625, + "step": 16320, + "time_per_iteration": 4.356598377227783 + }, + { + "auxiliary_loss_clip": 0.01051474, + "auxiliary_loss_mlp": 0.01038612, + "balance_loss_clip": 1.01528263, + "balance_loss_mlp": 1.01536965, + "epoch": 0.9812716067939276, + "flos": 25371871107840.0, + "grad_norm": 1.697664362211634, + "language_loss": 0.74585223, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.76675308, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 16321, + "time_per_iteration": 2.404913902282715 + }, + { + "auxiliary_loss_clip": 0.01051909, + "auxiliary_loss_mlp": 0.01035697, + "balance_loss_clip": 1.01398873, + "balance_loss_mlp": 1.01790237, + "epoch": 0.9813317300465956, + "flos": 22855715533440.0, + "grad_norm": 1.794032487660216, + "language_loss": 0.80031633, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.82119238, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.33984375, + "step": 16322, + "time_per_iteration": 2.4366705417633057 + }, + { + "auxiliary_loss_clip": 0.0105036, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.01282549, + "balance_loss_mlp": 1.01555061, + "epoch": 0.9813918532992635, + "flos": 23585819218560.0, + "grad_norm": 1.8611670388025916, + "language_loss": 0.81730074, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83815181, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34765625, + "step": 16323, + "time_per_iteration": 2.421527147293091 + }, + { + "auxiliary_loss_clip": 0.01052486, + "auxiliary_loss_mlp": 0.01037605, + "balance_loss_clip": 1.01383495, + "balance_loss_mlp": 1.01604724, + "epoch": 0.9814519765519315, + "flos": 19605406556160.0, + "grad_norm": 2.20197819573889, + "language_loss": 0.85819775, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87909865, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36328125, + "step": 16324, + "time_per_iteration": 3.7363414764404297 + }, + { + "auxiliary_loss_clip": 0.01051588, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.01130199, + "balance_loss_mlp": 1.01570761, + "epoch": 0.9815120998045994, + "flos": 33944933767680.0, + "grad_norm": 1.6607739776269461, + "language_loss": 0.75312394, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.77397728, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.359375, + "step": 16325, + "time_per_iteration": 2.4911468029022217 + }, + { + "auxiliary_loss_clip": 0.01049488, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.02024114, + "balance_loss_mlp": 1.01593375, + "epoch": 0.9815722230572674, + "flos": 20848977740160.0, + "grad_norm": 1.661901723457258, + "language_loss": 0.77186984, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.79277158, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.3359375, + "step": 16326, + "time_per_iteration": 2.3968594074249268 + }, + { + "auxiliary_loss_clip": 0.01051957, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.01720965, + "balance_loss_mlp": 1.01639569, + "epoch": 0.9816323463099353, + "flos": 22893386757120.0, + "grad_norm": 1.8822983097358847, + "language_loss": 0.68451631, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.70544267, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 16327, + "time_per_iteration": 2.384472608566284 + }, + { + "auxiliary_loss_clip": 0.01055862, + "auxiliary_loss_mlp": 0.01042867, + "balance_loss_clip": 1.01625907, + "balance_loss_mlp": 1.01776254, + "epoch": 0.9816924695626034, + "flos": 31538126171520.0, + "grad_norm": 1.4197552973288343, + "language_loss": 0.74265397, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.76364124, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.38085938, + "step": 16328, + "time_per_iteration": 2.467512845993042 + }, + { + "auxiliary_loss_clip": 0.01056171, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.01617372, + "balance_loss_mlp": 1.0168891, + "epoch": 0.9817525928152713, + "flos": 21505833659520.0, + "grad_norm": 2.0093214443929313, + "language_loss": 0.82760537, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.84858441, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.390625, + "step": 16329, + "time_per_iteration": 2.4182605743408203 + }, + { + "auxiliary_loss_clip": 0.01053431, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.01250672, + "balance_loss_mlp": 1.01616454, + "epoch": 0.9818127160679393, + "flos": 25549509438720.0, + "grad_norm": 1.901969208113868, + "language_loss": 0.77092868, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.79183346, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.37304688, + "step": 16330, + "time_per_iteration": 2.462303638458252 + }, + { + "auxiliary_loss_clip": 0.01056039, + "auxiliary_loss_mlp": 0.01044888, + "balance_loss_clip": 1.01555049, + "balance_loss_mlp": 1.0166446, + "epoch": 0.9818728393206072, + "flos": 28802401856640.0, + "grad_norm": 2.267107463902833, + "language_loss": 0.68386292, + "learning_rate": 3.434615511252126e-09, + "loss": 0.70487213, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.39453125, + "step": 16331, + "time_per_iteration": 2.457507371902466 + }, + { + "auxiliary_loss_clip": 0.0105052, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.01300478, + "balance_loss_mlp": 1.01530433, + "epoch": 0.9819329625732752, + "flos": 23221116489600.0, + "grad_norm": 1.6782099954030898, + "language_loss": 0.74765062, + "learning_rate": 3.411838534981948e-09, + "loss": 0.76850879, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 16332, + "time_per_iteration": 2.4048938751220703 + }, + { + "auxiliary_loss_clip": 0.01051252, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.01140499, + "balance_loss_mlp": 1.01644313, + "epoch": 0.9819930858259431, + "flos": 17529470714880.0, + "grad_norm": 1.623199087691056, + "language_loss": 0.77490056, + "learning_rate": 3.389137269534936e-09, + "loss": 0.7957325, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34765625, + "step": 16333, + "time_per_iteration": 2.3599488735198975 + }, + { + "auxiliary_loss_clip": 0.01049926, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.01477146, + "balance_loss_mlp": 1.01555395, + "epoch": 0.9820532090786112, + "flos": 12529140238080.0, + "grad_norm": 2.128241156479475, + "language_loss": 0.74847984, + "learning_rate": 3.366511715771958e-09, + "loss": 0.7693435, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 16334, + "time_per_iteration": 3.781158685684204 + }, + { + "auxiliary_loss_clip": 0.01052799, + "auxiliary_loss_mlp": 0.01042669, + "balance_loss_clip": 1.01750374, + "balance_loss_mlp": 1.01616991, + "epoch": 0.9821133323312792, + "flos": 18839272481280.0, + "grad_norm": 1.8091069030298181, + "language_loss": 0.7942704, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.81522512, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3671875, + "step": 16335, + "time_per_iteration": 2.37719988822937 + }, + { + "auxiliary_loss_clip": 0.01053932, + "auxiliary_loss_mlp": 0.01048432, + "balance_loss_clip": 1.02219439, + "balance_loss_mlp": 1.01653314, + "epoch": 0.9821734555839471, + "flos": 34822580325120.0, + "grad_norm": 2.425592967701404, + "language_loss": 0.659935, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.68095863, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 16336, + "time_per_iteration": 2.5023622512817383 + }, + { + "auxiliary_loss_clip": 0.01054364, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.01877618, + "balance_loss_mlp": 1.0168879, + "epoch": 0.9822335788366151, + "flos": 17127236407680.0, + "grad_norm": 1.8065807538027445, + "language_loss": 0.74699605, + "learning_rate": 3.299089333152372e-09, + "loss": 0.76798958, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 16337, + "time_per_iteration": 2.4047181606292725 + }, + { + "auxiliary_loss_clip": 0.01051256, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.01244664, + "balance_loss_mlp": 1.01498878, + "epoch": 0.982293702089283, + "flos": 20812214211840.0, + "grad_norm": 1.6305318392751946, + "language_loss": 0.74286151, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.76373136, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 16338, + "time_per_iteration": 2.4317257404327393 + }, + { + "auxiliary_loss_clip": 0.01049906, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.01102781, + "balance_loss_mlp": 1.01463807, + "epoch": 0.982353825341951, + "flos": 24679683025920.0, + "grad_norm": 1.628800820731604, + "language_loss": 0.82452738, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.84538251, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.35351562, + "step": 16339, + "time_per_iteration": 2.4313089847564697 + }, + { + "auxiliary_loss_clip": 0.01049615, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.01598871, + "balance_loss_mlp": 1.01561904, + "epoch": 0.982413948594619, + "flos": 20849431587840.0, + "grad_norm": 1.9065594793568583, + "language_loss": 0.63436842, + "learning_rate": 3.232348386403405e-09, + "loss": 0.65523779, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 16340, + "time_per_iteration": 2.4013330936431885 + }, + { + "auxiliary_loss_clip": 0.0105317, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.01270425, + "balance_loss_mlp": 1.0162518, + "epoch": 0.982474071847287, + "flos": 15376481769600.0, + "grad_norm": 2.0251203735115415, + "language_loss": 0.86770135, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88860536, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.36914062, + "step": 16341, + "time_per_iteration": 2.36991024017334 + }, + { + "auxiliary_loss_clip": 0.01048899, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.01275373, + "balance_loss_mlp": 1.01584196, + "epoch": 0.9825341950999549, + "flos": 23773442198400.0, + "grad_norm": 1.943545147746063, + "language_loss": 0.67592835, + "learning_rate": 3.188233008645014e-09, + "loss": 0.6967485, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.33007812, + "step": 16342, + "time_per_iteration": 2.457547426223755 + }, + { + "auxiliary_loss_clip": 0.01052202, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.01356721, + "balance_loss_mlp": 1.01610136, + "epoch": 0.9825943183526229, + "flos": 22745215480320.0, + "grad_norm": 1.528696695250381, + "language_loss": 0.77857774, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79946291, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36132812, + "step": 16343, + "time_per_iteration": 2.409417152404785 + }, + { + "auxiliary_loss_clip": 0.01051048, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.01381874, + "balance_loss_mlp": 1.01587558, + "epoch": 0.9826544416052908, + "flos": 27708712606080.0, + "grad_norm": 1.5934468345004658, + "language_loss": 0.75953948, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.78040731, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.3515625, + "step": 16344, + "time_per_iteration": 2.4772963523864746 + }, + { + "auxiliary_loss_clip": 0.01051949, + "auxiliary_loss_mlp": 0.01042672, + "balance_loss_clip": 1.01972413, + "balance_loss_mlp": 1.01660037, + "epoch": 0.9827145648579588, + "flos": 26940483849600.0, + "grad_norm": 2.098973843192174, + "language_loss": 0.6844663, + "learning_rate": 3.122627838848313e-09, + "loss": 0.70541251, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 16345, + "time_per_iteration": 2.463862180709839 + }, + { + "auxiliary_loss_clip": 0.01048697, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.0129652, + "balance_loss_mlp": 1.01544106, + "epoch": 0.9827746881106267, + "flos": 21865613886720.0, + "grad_norm": 1.4070859122924622, + "language_loss": 0.79759431, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81840873, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.33203125, + "step": 16346, + "time_per_iteration": 2.371816396713257 + }, + { + "auxiliary_loss_clip": 0.01053324, + "auxiliary_loss_mlp": 0.01044732, + "balance_loss_clip": 1.02081871, + "balance_loss_mlp": 1.01584303, + "epoch": 0.9828348113632948, + "flos": 20849710878720.0, + "grad_norm": 2.010777699848973, + "language_loss": 0.76546705, + "learning_rate": 3.079269666552031e-09, + "loss": 0.78644758, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.375, + "step": 16347, + "time_per_iteration": 2.388453960418701 + }, + { + "auxiliary_loss_clip": 0.01049945, + "auxiliary_loss_mlp": 0.0104172, + "balance_loss_clip": 1.01997566, + "balance_loss_mlp": 1.01553833, + "epoch": 0.9828949346159628, + "flos": 34567784156160.0, + "grad_norm": 1.5649975228841562, + "language_loss": 0.67696249, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.69787908, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34375, + "step": 16348, + "time_per_iteration": 2.4815850257873535 + }, + { + "auxiliary_loss_clip": 0.01051285, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.01287544, + "balance_loss_mlp": 1.01590323, + "epoch": 0.9829550578686307, + "flos": 24456448592640.0, + "grad_norm": 1.7787945419996758, + "language_loss": 0.70110482, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.72198403, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35351562, + "step": 16349, + "time_per_iteration": 2.401388645172119 + }, + { + "auxiliary_loss_clip": 0.01047332, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.01076579, + "balance_loss_mlp": 1.01488185, + "epoch": 0.9830151811212987, + "flos": 16909133944320.0, + "grad_norm": 1.8976733584348868, + "language_loss": 0.76967633, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.79046416, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.32421875, + "step": 16350, + "time_per_iteration": 2.318695545196533 + }, + { + "auxiliary_loss_clip": 0.01052541, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.01292872, + "balance_loss_mlp": 1.01647234, + "epoch": 0.9830753043739666, + "flos": 21287242437120.0, + "grad_norm": 1.9837902232221079, + "language_loss": 0.85340846, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.87430036, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.359375, + "step": 16351, + "time_per_iteration": 2.428532600402832 + }, + { + "auxiliary_loss_clip": 0.01051862, + "auxiliary_loss_mlp": 0.01033266, + "balance_loss_clip": 1.01072359, + "balance_loss_mlp": 1.01640344, + "epoch": 0.9831354276266346, + "flos": 31722153281280.0, + "grad_norm": 1.7998351366716587, + "language_loss": 0.69635701, + "learning_rate": 2.972199410170795e-09, + "loss": 0.71720827, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35351562, + "step": 16352, + "time_per_iteration": 3.73988938331604 + }, + { + "auxiliary_loss_clip": 0.01049963, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.01881731, + "balance_loss_mlp": 1.01599264, + "epoch": 0.9831955508793025, + "flos": 21617904723840.0, + "grad_norm": 1.3999214156838855, + "language_loss": 0.66915941, + "learning_rate": 2.951012538143782e-09, + "loss": 0.69007051, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 16353, + "time_per_iteration": 2.402040481567383 + }, + { + "auxiliary_loss_clip": 0.01048279, + "auxiliary_loss_mlp": 0.01037541, + "balance_loss_clip": 1.01596367, + "balance_loss_mlp": 1.014691, + "epoch": 0.9832556741319706, + "flos": 22967053459200.0, + "grad_norm": 1.587405341781174, + "language_loss": 0.75480109, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.77565926, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.3359375, + "step": 16354, + "time_per_iteration": 2.3968896865844727 + }, + { + "auxiliary_loss_clip": 0.01051115, + "auxiliary_loss_mlp": 0.01036232, + "balance_loss_clip": 1.01446426, + "balance_loss_mlp": 1.01566374, + "epoch": 0.9833157973846385, + "flos": 21322853890560.0, + "grad_norm": 1.9923948189419034, + "language_loss": 0.78895617, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.80982959, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 16355, + "time_per_iteration": 2.390308141708374 + }, + { + "auxiliary_loss_clip": 0.01051042, + "auxiliary_loss_mlp": 0.01036099, + "balance_loss_clip": 1.01402175, + "balance_loss_mlp": 1.01607752, + "epoch": 0.9833759206373065, + "flos": 21067673696640.0, + "grad_norm": 5.691180774739733, + "language_loss": 0.74567938, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.76655078, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34960938, + "step": 16356, + "time_per_iteration": 2.362691879272461 + }, + { + "auxiliary_loss_clip": 0.01050198, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.01431906, + "balance_loss_mlp": 1.01598334, + "epoch": 0.9834360438899744, + "flos": 18696337908480.0, + "grad_norm": 1.6630162489642106, + "language_loss": 0.77205515, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.79293394, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34179688, + "step": 16357, + "time_per_iteration": 2.366004467010498 + }, + { + "auxiliary_loss_clip": 0.01050562, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.01125503, + "balance_loss_mlp": 1.01585782, + "epoch": 0.9834961671426424, + "flos": 21104192845440.0, + "grad_norm": 2.7101708347111306, + "language_loss": 0.81360042, + "learning_rate": 2.846214118442436e-09, + "loss": 0.83444208, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34765625, + "step": 16358, + "time_per_iteration": 2.3793528079986572 + }, + { + "auxiliary_loss_clip": 0.01049981, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.01194942, + "balance_loss_mlp": 1.01493049, + "epoch": 0.9835562903953103, + "flos": 26686420819200.0, + "grad_norm": 2.246732026108116, + "language_loss": 0.69060469, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.71143985, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.3515625, + "step": 16359, + "time_per_iteration": 3.789628505706787 + }, + { + "auxiliary_loss_clip": 0.01049272, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.01056838, + "balance_loss_mlp": 1.01567852, + "epoch": 0.9836164136479784, + "flos": 22089092699520.0, + "grad_norm": 1.5189103504553005, + "language_loss": 0.70524323, + "learning_rate": 2.804824870920264e-09, + "loss": 0.72606087, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3359375, + "step": 16360, + "time_per_iteration": 2.368866443634033 + }, + { + "auxiliary_loss_clip": 0.01052786, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_clip": 1.02009153, + "balance_loss_mlp": 1.01654315, + "epoch": 0.9836765369006463, + "flos": 23877274181760.0, + "grad_norm": 2.1360631863796304, + "language_loss": 0.85343736, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.87439275, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36328125, + "step": 16361, + "time_per_iteration": 2.395916700363159 + }, + { + "auxiliary_loss_clip": 0.01051072, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.01395011, + "balance_loss_mlp": 1.01601064, + "epoch": 0.9837366601533143, + "flos": 25843931867520.0, + "grad_norm": 1.7913811570868579, + "language_loss": 0.76969475, + "learning_rate": 2.76373855876022e-09, + "loss": 0.7905674, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 16362, + "time_per_iteration": 2.3901352882385254 + }, + { + "auxiliary_loss_clip": 0.01051585, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.01342356, + "balance_loss_mlp": 1.01649702, + "epoch": 0.9837967834059823, + "flos": 21357033978240.0, + "grad_norm": 1.6598741600626963, + "language_loss": 0.72194862, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.74281961, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3515625, + "step": 16363, + "time_per_iteration": 2.387979745864868 + }, + { + "auxiliary_loss_clip": 0.01050433, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.01429558, + "balance_loss_mlp": 1.01623356, + "epoch": 0.9838569066586502, + "flos": 18514789505280.0, + "grad_norm": 1.6462241306423049, + "language_loss": 0.6392808, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.66013223, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.34179688, + "step": 16364, + "time_per_iteration": 3.7885820865631104 + }, + { + "auxiliary_loss_clip": 0.01052301, + "auxiliary_loss_mlp": 0.01041543, + "balance_loss_clip": 1.01985943, + "balance_loss_mlp": 1.01662302, + "epoch": 0.9839170299113182, + "flos": 22451386544640.0, + "grad_norm": 1.6526162809597875, + "language_loss": 0.75870448, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77964294, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.35546875, + "step": 16365, + "time_per_iteration": 2.3752334117889404 + }, + { + "auxiliary_loss_clip": 0.01049171, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.01313686, + "balance_loss_mlp": 1.01480484, + "epoch": 0.9839771531639862, + "flos": 27891063970560.0, + "grad_norm": 1.925007329654407, + "language_loss": 0.77271682, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.79356349, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 16366, + "time_per_iteration": 2.4409008026123047 + }, + { + "auxiliary_loss_clip": 0.0104982, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.01020932, + "balance_loss_mlp": 1.01564491, + "epoch": 0.9840372764166542, + "flos": 28213417353600.0, + "grad_norm": 1.5772827560837581, + "language_loss": 0.77588344, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79670358, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34179688, + "step": 16367, + "time_per_iteration": 2.4201433658599854 + }, + { + "auxiliary_loss_clip": 0.01051186, + "auxiliary_loss_mlp": 0.01034337, + "balance_loss_clip": 1.01090002, + "balance_loss_mlp": 1.01638412, + "epoch": 0.9840973996693221, + "flos": 23402874360960.0, + "grad_norm": 1.4342141464422853, + "language_loss": 0.61606389, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63691914, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34765625, + "step": 16368, + "time_per_iteration": 2.3948118686676025 + }, + { + "auxiliary_loss_clip": 0.01047682, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.01494074, + "balance_loss_mlp": 1.01528692, + "epoch": 0.9841575229219901, + "flos": 21394844847360.0, + "grad_norm": 1.5397183860523675, + "language_loss": 0.66948199, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.69031137, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.32421875, + "step": 16369, + "time_per_iteration": 2.387934684753418 + }, + { + "auxiliary_loss_clip": 0.01052292, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.01497483, + "balance_loss_mlp": 1.01544714, + "epoch": 0.984217646174658, + "flos": 24462872282880.0, + "grad_norm": 1.6489402260598014, + "language_loss": 0.69834191, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.7192533, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3671875, + "step": 16370, + "time_per_iteration": 2.391291379928589 + }, + { + "auxiliary_loss_clip": 0.010523, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.01388597, + "balance_loss_mlp": 1.0158987, + "epoch": 0.984277769427326, + "flos": 16434140630400.0, + "grad_norm": 1.6945437524693494, + "language_loss": 0.74585634, + "learning_rate": 2.582599145159792e-09, + "loss": 0.76674664, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 16371, + "time_per_iteration": 2.3580658435821533 + }, + { + "auxiliary_loss_clip": 0.0100671, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00120878, + "balance_loss_mlp": 1.00046444, + "epoch": 0.9843378926799939, + "flos": 64527136513920.0, + "grad_norm": 0.7804969087272537, + "language_loss": 0.65293753, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67303753, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.0625, + "step": 16372, + "time_per_iteration": 3.062891960144043 + }, + { + "auxiliary_loss_clip": 0.0104958, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.01163876, + "balance_loss_mlp": 1.0155468, + "epoch": 0.984398015932662, + "flos": 17381892931200.0, + "grad_norm": 3.197183451187209, + "language_loss": 0.71823049, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.73905069, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.33984375, + "step": 16373, + "time_per_iteration": 3.8125150203704834 + }, + { + "auxiliary_loss_clip": 0.01051274, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.01388347, + "balance_loss_mlp": 1.01677942, + "epoch": 0.9844581391853299, + "flos": 23877937497600.0, + "grad_norm": 1.8119811309082339, + "language_loss": 0.82228935, + "learning_rate": 2.523582674173186e-09, + "loss": 0.84315515, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34570312, + "step": 16374, + "time_per_iteration": 2.4055557250976562 + }, + { + "auxiliary_loss_clip": 0.01053227, + "auxiliary_loss_mlp": 0.01038252, + "balance_loss_clip": 1.01668704, + "balance_loss_mlp": 1.01687217, + "epoch": 0.9845182624379979, + "flos": 19864322265600.0, + "grad_norm": 1.6948332393450802, + "language_loss": 0.70059633, + "learning_rate": 2.504062005197927e-09, + "loss": 0.72151113, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.36328125, + "step": 16375, + "time_per_iteration": 2.3628551959991455 + }, + { + "auxiliary_loss_clip": 0.01053874, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.01545906, + "balance_loss_mlp": 1.01687169, + "epoch": 0.9845783856906659, + "flos": 28253462549760.0, + "grad_norm": 1.8038506376845742, + "language_loss": 0.81753165, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83847392, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.36914062, + "step": 16376, + "time_per_iteration": 2.448657751083374 + }, + { + "auxiliary_loss_clip": 0.01049874, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.01137865, + "balance_loss_mlp": 1.01557243, + "epoch": 0.9846385089433338, + "flos": 28327164163200.0, + "grad_norm": 2.7224078595634276, + "language_loss": 0.63177055, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.65260458, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34375, + "step": 16377, + "time_per_iteration": 2.425400733947754 + }, + { + "auxiliary_loss_clip": 0.01052046, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.01286077, + "balance_loss_mlp": 1.01589584, + "epoch": 0.9846986321960018, + "flos": 24315608701440.0, + "grad_norm": 1.923162811279243, + "language_loss": 0.74360383, + "learning_rate": 2.445954472695133e-09, + "loss": 0.7644918, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.36132812, + "step": 16378, + "time_per_iteration": 2.4479565620422363 + }, + { + "auxiliary_loss_clip": 0.01050197, + "auxiliary_loss_mlp": 0.01037946, + "balance_loss_clip": 1.01533198, + "balance_loss_mlp": 1.01548088, + "epoch": 0.9847587554486698, + "flos": 27270762111360.0, + "grad_norm": 1.678696161280776, + "language_loss": 0.71934736, + "learning_rate": 2.426736789116868e-09, + "loss": 0.74022883, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34765625, + "step": 16379, + "time_per_iteration": 2.4347543716430664 + }, + { + "auxiliary_loss_clip": 0.01051446, + "auxiliary_loss_mlp": 0.01039855, + "balance_loss_clip": 1.01579881, + "balance_loss_mlp": 1.01512051, + "epoch": 0.9848188787013378, + "flos": 16541847774720.0, + "grad_norm": 1.8959567621396247, + "language_loss": 0.69682419, + "learning_rate": 2.407594853716999e-09, + "loss": 0.71773726, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36328125, + "step": 16380, + "time_per_iteration": 2.3840978145599365 + }, + { + "auxiliary_loss_clip": 0.01052682, + "auxiliary_loss_mlp": 0.01038774, + "balance_loss_clip": 1.01669669, + "balance_loss_mlp": 1.01606941, + "epoch": 0.9848790019540057, + "flos": 20192610579840.0, + "grad_norm": 2.008953450416004, + "language_loss": 0.79923964, + "learning_rate": 2.38852866722139e-09, + "loss": 0.82015419, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.3671875, + "step": 16381, + "time_per_iteration": 2.4361753463745117 + }, + { + "auxiliary_loss_clip": 0.01052848, + "auxiliary_loss_mlp": 0.01039896, + "balance_loss_clip": 1.01704383, + "balance_loss_mlp": 1.01651645, + "epoch": 0.9849391252066737, + "flos": 28258385051520.0, + "grad_norm": 1.4061687513289898, + "language_loss": 0.83037704, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.85130447, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 16382, + "time_per_iteration": 2.4688243865966797 + }, + { + "auxiliary_loss_clip": 0.01051979, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.0148133, + "balance_loss_mlp": 1.01506865, + "epoch": 0.9849992484593416, + "flos": 22453865251200.0, + "grad_norm": 2.0619820185821904, + "language_loss": 0.74925745, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.77016687, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.36914062, + "step": 16383, + "time_per_iteration": 2.3873274326324463 + }, + { + "auxiliary_loss_clip": 0.0105189, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.01075017, + "balance_loss_mlp": 1.01647317, + "epoch": 0.9850593717120096, + "flos": 34495723376640.0, + "grad_norm": 1.524843689590293, + "language_loss": 0.67417383, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.69503021, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 16384, + "time_per_iteration": 2.5061910152435303 + }, + { + "auxiliary_loss_clip": 0.01055787, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.01678395, + "balance_loss_mlp": 1.01828146, + "epoch": 0.9851194949646775, + "flos": 38835741709440.0, + "grad_norm": 1.7839418461030894, + "language_loss": 0.71569312, + "learning_rate": 2.313021424697359e-09, + "loss": 0.73667955, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 16385, + "time_per_iteration": 2.5588550567626953 + }, + { + "auxiliary_loss_clip": 0.01054014, + "auxiliary_loss_mlp": 0.01042238, + "balance_loss_clip": 1.01878941, + "balance_loss_mlp": 1.01832306, + "epoch": 0.9851796182173456, + "flos": 17711472965760.0, + "grad_norm": 1.860387565497663, + "language_loss": 0.82609993, + "learning_rate": 2.294333993509978e-09, + "loss": 0.84706247, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 16386, + "time_per_iteration": 2.3769240379333496 + }, + { + "auxiliary_loss_clip": 0.01053968, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.01698184, + "balance_loss_mlp": 1.01737571, + "epoch": 0.9852397414700135, + "flos": 27453078564480.0, + "grad_norm": 1.9674754153159077, + "language_loss": 0.69001901, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.71096158, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.36523438, + "step": 16387, + "time_per_iteration": 2.455188751220703 + }, + { + "auxiliary_loss_clip": 0.01048103, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.01252627, + "balance_loss_mlp": 1.01465118, + "epoch": 0.9852998647226815, + "flos": 18295709523840.0, + "grad_norm": 1.8928150182760208, + "language_loss": 0.75666195, + "learning_rate": 2.257186391438237e-09, + "loss": 0.77746862, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.33398438, + "step": 16388, + "time_per_iteration": 2.3488147258758545 + }, + { + "auxiliary_loss_clip": 0.01049533, + "auxiliary_loss_mlp": 0.01038254, + "balance_loss_clip": 1.01430464, + "balance_loss_mlp": 1.01448452, + "epoch": 0.9853599879753495, + "flos": 19641646414080.0, + "grad_norm": 1.8982439609514365, + "language_loss": 0.83575523, + "learning_rate": 2.238726221962528e-09, + "loss": 0.85663307, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.3515625, + "step": 16389, + "time_per_iteration": 2.3926444053649902 + }, + { + "auxiliary_loss_clip": 0.01050527, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.0091095, + "balance_loss_mlp": 1.01568162, + "epoch": 0.9854201112280174, + "flos": 23840824855680.0, + "grad_norm": 1.8106626462497077, + "language_loss": 0.68398798, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.70481348, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34765625, + "step": 16390, + "time_per_iteration": 2.405388355255127 + }, + { + "auxiliary_loss_clip": 0.01050998, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.01652217, + "balance_loss_mlp": 1.01557934, + "epoch": 0.9854802344806854, + "flos": 30079280344320.0, + "grad_norm": 1.5876857948526333, + "language_loss": 0.77615261, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79706013, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 16391, + "time_per_iteration": 3.7885658740997314 + }, + { + "auxiliary_loss_clip": 0.01047593, + "auxiliary_loss_mlp": 0.01035564, + "balance_loss_clip": 1.01595449, + "balance_loss_mlp": 1.01499617, + "epoch": 0.9855403577333534, + "flos": 21906357310080.0, + "grad_norm": 2.321728727887405, + "language_loss": 0.69515967, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.71599126, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.32421875, + "step": 16392, + "time_per_iteration": 2.398983955383301 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.01195979, + "balance_loss_mlp": 1.01611471, + "epoch": 0.9856004809860214, + "flos": 15412896184320.0, + "grad_norm": 1.8655368722873553, + "language_loss": 0.56875718, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58966696, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.37304688, + "step": 16393, + "time_per_iteration": 2.370502233505249 + }, + { + "auxiliary_loss_clip": 0.01055312, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.01433086, + "balance_loss_mlp": 1.01722121, + "epoch": 0.9856606042386893, + "flos": 13652610744960.0, + "grad_norm": 2.3633803376390476, + "language_loss": 0.80490965, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.82585037, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.38085938, + "step": 16394, + "time_per_iteration": 2.3431262969970703 + }, + { + "auxiliary_loss_clip": 0.01053286, + "auxiliary_loss_mlp": 0.01037655, + "balance_loss_clip": 1.01357448, + "balance_loss_mlp": 1.01579428, + "epoch": 0.9857207274913573, + "flos": 23477972428800.0, + "grad_norm": 1.4868486526828084, + "language_loss": 0.76620162, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78711104, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.375, + "step": 16395, + "time_per_iteration": 2.433015823364258 + }, + { + "auxiliary_loss_clip": 0.01049485, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.01479673, + "balance_loss_mlp": 1.01548529, + "epoch": 0.9857808507440252, + "flos": 21064531674240.0, + "grad_norm": 2.0375313154086925, + "language_loss": 0.76209235, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.78295791, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.33984375, + "step": 16396, + "time_per_iteration": 2.428804636001587 + }, + { + "auxiliary_loss_clip": 0.01049989, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.00926781, + "balance_loss_mlp": 1.01610541, + "epoch": 0.9858409739966932, + "flos": 25300194353280.0, + "grad_norm": 1.4852655977870333, + "language_loss": 0.72166562, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.74246228, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33984375, + "step": 16397, + "time_per_iteration": 2.418898582458496 + }, + { + "auxiliary_loss_clip": 0.01049122, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.00906897, + "balance_loss_mlp": 1.01634789, + "epoch": 0.9859010972493611, + "flos": 20557697333760.0, + "grad_norm": 1.6638806611965085, + "language_loss": 0.72773969, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.74852991, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.328125, + "step": 16398, + "time_per_iteration": 2.406045436859131 + }, + { + "auxiliary_loss_clip": 0.0105116, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.01323032, + "balance_loss_mlp": 1.01632845, + "epoch": 0.9859612205020292, + "flos": 24753803575680.0, + "grad_norm": 1.3566524389275658, + "language_loss": 0.74671513, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76758564, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34765625, + "step": 16399, + "time_per_iteration": 3.8361988067626953 + }, + { + "auxiliary_loss_clip": 0.01051484, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.01499045, + "balance_loss_mlp": 1.01630771, + "epoch": 0.9860213437546971, + "flos": 21104786338560.0, + "grad_norm": 2.1651239772926756, + "language_loss": 0.58111942, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.60201395, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 16400, + "time_per_iteration": 2.3698480129241943 + }, + { + "auxiliary_loss_clip": 0.01055265, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.0170095, + "balance_loss_mlp": 1.01719236, + "epoch": 0.9860814670073651, + "flos": 19135056453120.0, + "grad_norm": 2.197781445422773, + "language_loss": 0.81264442, + "learning_rate": 2.023113299582491e-09, + "loss": 0.83362329, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 16401, + "time_per_iteration": 2.3640732765197754 + }, + { + "auxiliary_loss_clip": 0.01051328, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.00979114, + "balance_loss_mlp": 1.01686144, + "epoch": 0.9861415902600331, + "flos": 17236130538240.0, + "grad_norm": 1.7605566684864031, + "language_loss": 0.79322815, + "learning_rate": 2.005638002662069e-09, + "loss": 0.81406844, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34375, + "step": 16402, + "time_per_iteration": 2.416285991668701 + }, + { + "auxiliary_loss_clip": 0.01053724, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.01717949, + "balance_loss_mlp": 1.01693904, + "epoch": 0.986201713512701, + "flos": 27781331967360.0, + "grad_norm": 2.1815008010941126, + "language_loss": 0.71726942, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.73822081, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.3671875, + "step": 16403, + "time_per_iteration": 2.4480297565460205 + }, + { + "auxiliary_loss_clip": 0.01049145, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.0139817, + "balance_loss_mlp": 1.0152204, + "epoch": 0.986261836765369, + "flos": 28729154090880.0, + "grad_norm": 1.6684205997386472, + "language_loss": 0.75671315, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.77756268, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.33984375, + "step": 16404, + "time_per_iteration": 3.792034387588501 + }, + { + "auxiliary_loss_clip": 0.01051799, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.01659822, + "balance_loss_mlp": 1.01649475, + "epoch": 0.986321960018037, + "flos": 34312045380480.0, + "grad_norm": 1.7615518714837703, + "language_loss": 0.71007299, + "learning_rate": 1.953666699415768e-09, + "loss": 0.73098087, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 16405, + "time_per_iteration": 2.4825515747070312 + }, + { + "auxiliary_loss_clip": 0.01050432, + "auxiliary_loss_mlp": 0.01042139, + "balance_loss_clip": 1.02046633, + "balance_loss_mlp": 1.01687133, + "epoch": 0.986382083270705, + "flos": 25188646959360.0, + "grad_norm": 1.7321461608844917, + "language_loss": 0.70779246, + "learning_rate": 1.93649446302846e-09, + "loss": 0.72871816, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.3359375, + "step": 16406, + "time_per_iteration": 2.39078950881958 + }, + { + "auxiliary_loss_clip": 0.01050277, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.01288557, + "balance_loss_mlp": 1.01596785, + "epoch": 0.9864422065233729, + "flos": 11025396535680.0, + "grad_norm": 2.8071377339465236, + "language_loss": 0.74977183, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.77063656, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.34375, + "step": 16407, + "time_per_iteration": 2.3261735439300537 + }, + { + "auxiliary_loss_clip": 0.01050107, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.0141952, + "balance_loss_mlp": 1.01539922, + "epoch": 0.9865023297760409, + "flos": 16544640683520.0, + "grad_norm": 1.8406105648914812, + "language_loss": 0.78635061, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.80721867, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34765625, + "step": 16408, + "time_per_iteration": 2.339364528656006 + }, + { + "auxiliary_loss_clip": 0.01053087, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.00822437, + "balance_loss_mlp": 1.01623058, + "epoch": 0.9865624530287088, + "flos": 18879178032000.0, + "grad_norm": 1.7591575208731958, + "language_loss": 0.68894213, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70981085, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.36914062, + "step": 16409, + "time_per_iteration": 2.377699136734009 + }, + { + "auxiliary_loss_clip": 0.01006955, + "auxiliary_loss_mlp": 0.01006433, + "balance_loss_clip": 1.00457323, + "balance_loss_mlp": 1.00073314, + "epoch": 0.9866225762813768, + "flos": 68884786949760.0, + "grad_norm": 0.810480237239613, + "language_loss": 0.61117274, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63130659, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.06225586, + "step": 16410, + "time_per_iteration": 3.0539097785949707 + }, + { + "auxiliary_loss_clip": 0.01052222, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.01188374, + "balance_loss_mlp": 1.01587903, + "epoch": 0.9866826995340447, + "flos": 29021830951680.0, + "grad_norm": 2.1481513751438226, + "language_loss": 0.67338413, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.69426012, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.36328125, + "step": 16411, + "time_per_iteration": 2.4786083698272705 + }, + { + "auxiliary_loss_clip": 0.01006738, + "auxiliary_loss_mlp": 0.01002858, + "balance_loss_clip": 1.00090253, + "balance_loss_mlp": 1.00045002, + "epoch": 0.9867428227867128, + "flos": 65373116601600.0, + "grad_norm": 0.7300893932992246, + "language_loss": 0.56260908, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58270502, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.0625, + "step": 16412, + "time_per_iteration": 3.172781229019165 + }, + { + "auxiliary_loss_clip": 0.0105272, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.01426077, + "balance_loss_mlp": 1.01605225, + "epoch": 0.9868029460393807, + "flos": 26505081884160.0, + "grad_norm": 2.0363060590549673, + "language_loss": 0.74241221, + "learning_rate": 1.818410313934926e-09, + "loss": 0.76333827, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 16413, + "time_per_iteration": 3.835279703140259 + }, + { + "auxiliary_loss_clip": 0.01050879, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.01054144, + "balance_loss_mlp": 1.01469946, + "epoch": 0.9868630692920487, + "flos": 22966145763840.0, + "grad_norm": 1.3808273630098753, + "language_loss": 0.72461587, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.74545777, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36132812, + "step": 16414, + "time_per_iteration": 2.4205312728881836 + }, + { + "auxiliary_loss_clip": 0.01050758, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.01159072, + "balance_loss_mlp": 1.0162971, + "epoch": 0.9869231925447167, + "flos": 19827663471360.0, + "grad_norm": 1.620470092236507, + "language_loss": 0.71556932, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.73641551, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34375, + "step": 16415, + "time_per_iteration": 2.38403058052063 + }, + { + "auxiliary_loss_clip": 0.0104774, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.01291561, + "balance_loss_mlp": 1.01476336, + "epoch": 0.9869833157973846, + "flos": 20194146679680.0, + "grad_norm": 1.5205793366655047, + "language_loss": 0.75940329, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.78020906, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.33007812, + "step": 16416, + "time_per_iteration": 2.367724657058716 + }, + { + "auxiliary_loss_clip": 0.01051062, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.01026976, + "balance_loss_mlp": 1.01593173, + "epoch": 0.9870434390500527, + "flos": 16098800221440.0, + "grad_norm": 2.088130091066895, + "language_loss": 0.71793807, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.7387892, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.3515625, + "step": 16417, + "time_per_iteration": 2.3567676544189453 + }, + { + "auxiliary_loss_clip": 0.0105368, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.01690197, + "balance_loss_mlp": 1.01687193, + "epoch": 0.9871035623027206, + "flos": 21759722133120.0, + "grad_norm": 1.517006056349825, + "language_loss": 0.71416926, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.7351445, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.3671875, + "step": 16418, + "time_per_iteration": 2.396503210067749 + }, + { + "auxiliary_loss_clip": 0.01006558, + "auxiliary_loss_mlp": 0.0100493, + "balance_loss_clip": 1.00301087, + "balance_loss_mlp": 1.00038457, + "epoch": 0.9871636855553886, + "flos": 70216024291200.0, + "grad_norm": 0.66152945441132, + "language_loss": 0.53756404, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55767888, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.06176758, + "step": 16419, + "time_per_iteration": 3.1348884105682373 + }, + { + "auxiliary_loss_clip": 0.01053539, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.01529634, + "balance_loss_mlp": 1.01606679, + "epoch": 0.9872238088080565, + "flos": 25044665045760.0, + "grad_norm": 1.5621829674368377, + "language_loss": 0.7906515, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.81158638, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 16420, + "time_per_iteration": 2.425593376159668 + }, + { + "auxiliary_loss_clip": 0.0105147, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.01631284, + "balance_loss_mlp": 1.01733971, + "epoch": 0.9872839320607245, + "flos": 19464776133120.0, + "grad_norm": 1.5180943146734398, + "language_loss": 0.71580184, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73669279, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33984375, + "step": 16421, + "time_per_iteration": 2.4043691158294678 + }, + { + "auxiliary_loss_clip": 0.01053324, + "auxiliary_loss_mlp": 0.01042799, + "balance_loss_clip": 1.01604855, + "balance_loss_mlp": 1.01658678, + "epoch": 0.9873440553133924, + "flos": 26941705747200.0, + "grad_norm": 2.0145977897968224, + "language_loss": 0.83416724, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.85512847, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.3671875, + "step": 16422, + "time_per_iteration": 2.433784008026123 + }, + { + "auxiliary_loss_clip": 0.01051235, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.01178527, + "balance_loss_mlp": 1.01664507, + "epoch": 0.9874041785660604, + "flos": 19061215194240.0, + "grad_norm": 1.6201090059473358, + "language_loss": 0.87122768, + "learning_rate": 1.656159280223779e-09, + "loss": 0.89207178, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.34570312, + "step": 16423, + "time_per_iteration": 2.382376194000244 + }, + { + "auxiliary_loss_clip": 0.01053515, + "auxiliary_loss_mlp": 0.01037484, + "balance_loss_clip": 1.01252198, + "balance_loss_mlp": 1.01698327, + "epoch": 0.9874643018187284, + "flos": 21104751427200.0, + "grad_norm": 1.9333850585285173, + "language_loss": 0.71762133, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.73853129, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36523438, + "step": 16424, + "time_per_iteration": 2.360119342803955 + }, + { + "auxiliary_loss_clip": 0.01050939, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.01186132, + "balance_loss_mlp": 1.01543021, + "epoch": 0.9875244250713964, + "flos": 24424886856960.0, + "grad_norm": 1.766349284181673, + "language_loss": 0.81861985, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.83947521, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 16425, + "time_per_iteration": 2.414161205291748 + }, + { + "auxiliary_loss_clip": 0.01052187, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.0131681, + "balance_loss_mlp": 1.01602817, + "epoch": 0.9875845483240643, + "flos": 25116481445760.0, + "grad_norm": 2.0360563338449875, + "language_loss": 0.81601882, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.83691531, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.36132812, + "step": 16426, + "time_per_iteration": 2.40673828125 + }, + { + "auxiliary_loss_clip": 0.01052019, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.0195297, + "balance_loss_mlp": 1.01637959, + "epoch": 0.9876446715767323, + "flos": 16580845630080.0, + "grad_norm": 1.8578627110663601, + "language_loss": 0.8602435, + "learning_rate": 1.593380599750338e-09, + "loss": 0.8812061, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.35546875, + "step": 16427, + "time_per_iteration": 2.377323627471924 + }, + { + "auxiliary_loss_clip": 0.01050914, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.01332259, + "balance_loss_mlp": 1.01648927, + "epoch": 0.9877047948294003, + "flos": 21615076903680.0, + "grad_norm": 3.55336755998467, + "language_loss": 0.71039045, + "learning_rate": 1.577875377599458e-09, + "loss": 0.73125184, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34375, + "step": 16428, + "time_per_iteration": 2.3823671340942383 + }, + { + "auxiliary_loss_clip": 0.01051118, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.01326823, + "balance_loss_mlp": 1.01647091, + "epoch": 0.9877649180820682, + "flos": 21177440611200.0, + "grad_norm": 1.958637142355758, + "language_loss": 0.81533808, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.8361944, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34570312, + "step": 16429, + "time_per_iteration": 2.3799517154693604 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.01578701, + "balance_loss_mlp": 1.01612222, + "epoch": 0.9878250413347363, + "flos": 39747673088640.0, + "grad_norm": 1.5189904413766122, + "language_loss": 0.63345683, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.65433913, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34570312, + "step": 16430, + "time_per_iteration": 2.526319742202759 + }, + { + "auxiliary_loss_clip": 0.01052748, + "auxiliary_loss_mlp": 0.01042089, + "balance_loss_clip": 1.01928413, + "balance_loss_mlp": 1.0172143, + "epoch": 0.9878851645874042, + "flos": 29424309638400.0, + "grad_norm": 1.390196703517565, + "language_loss": 0.73718691, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75813532, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35546875, + "step": 16431, + "time_per_iteration": 3.7655041217803955 + }, + { + "auxiliary_loss_clip": 0.01050171, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.01491237, + "balance_loss_mlp": 1.01535726, + "epoch": 0.9879452878400722, + "flos": 15805599690240.0, + "grad_norm": 2.096132272706326, + "language_loss": 0.82503998, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.84592676, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.34765625, + "step": 16432, + "time_per_iteration": 2.4247753620147705 + }, + { + "auxiliary_loss_clip": 0.0104869, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.01639152, + "balance_loss_mlp": 1.01463985, + "epoch": 0.9880054110927401, + "flos": 22232900056320.0, + "grad_norm": 1.5500754997832111, + "language_loss": 0.81132251, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.83218277, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33984375, + "step": 16433, + "time_per_iteration": 2.38690447807312 + }, + { + "auxiliary_loss_clip": 0.0105019, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.01297116, + "balance_loss_mlp": 1.01581526, + "epoch": 0.9880655343454081, + "flos": 28762670862720.0, + "grad_norm": 1.9474129145018453, + "language_loss": 0.6603868, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.68125731, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.34375, + "step": 16434, + "time_per_iteration": 2.438962697982788 + }, + { + "auxiliary_loss_clip": 0.01050995, + "auxiliary_loss_mlp": 0.01035064, + "balance_loss_clip": 1.01107895, + "balance_loss_mlp": 1.01474714, + "epoch": 0.988125657598076, + "flos": 32852012567040.0, + "grad_norm": 1.5829391262260213, + "language_loss": 0.70495975, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.7258203, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.36328125, + "step": 16435, + "time_per_iteration": 2.473853588104248 + }, + { + "auxiliary_loss_clip": 0.01052346, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.01576042, + "balance_loss_mlp": 1.01689148, + "epoch": 0.988185780850744, + "flos": 19389678065280.0, + "grad_norm": 1.7962635004058187, + "language_loss": 0.76872921, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.78965306, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35546875, + "step": 16436, + "time_per_iteration": 2.3992791175842285 + }, + { + "auxiliary_loss_clip": 0.01052012, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.01617277, + "balance_loss_mlp": 1.01640487, + "epoch": 0.988245904103412, + "flos": 22527322485120.0, + "grad_norm": 2.170621893126387, + "language_loss": 0.74950749, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.770419, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 16437, + "time_per_iteration": 2.366015672683716 + }, + { + "auxiliary_loss_clip": 0.01049125, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.01345158, + "balance_loss_mlp": 1.01564479, + "epoch": 0.98830602735608, + "flos": 28657896272640.0, + "grad_norm": 1.9331709521358371, + "language_loss": 0.61003041, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.63087583, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.33398438, + "step": 16438, + "time_per_iteration": 2.4445104598999023 + }, + { + "auxiliary_loss_clip": 0.01050808, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.01244712, + "balance_loss_mlp": 1.01554596, + "epoch": 0.9883661506087479, + "flos": 20994984512640.0, + "grad_norm": 1.8630177330407571, + "language_loss": 0.73682898, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.75768304, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.3515625, + "step": 16439, + "time_per_iteration": 3.796140432357788 + }, + { + "auxiliary_loss_clip": 0.01049352, + "auxiliary_loss_mlp": 0.01040712, + "balance_loss_clip": 1.01836002, + "balance_loss_mlp": 1.01506066, + "epoch": 0.9884262738614159, + "flos": 32704783896960.0, + "grad_norm": 1.5976280554722637, + "language_loss": 0.60882068, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62972128, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.34375, + "step": 16440, + "time_per_iteration": 2.4942867755889893 + }, + { + "auxiliary_loss_clip": 0.01051366, + "auxiliary_loss_mlp": 0.01035424, + "balance_loss_clip": 1.01198769, + "balance_loss_mlp": 1.01541674, + "epoch": 0.9884863971140839, + "flos": 17563790448000.0, + "grad_norm": 2.2635624710283815, + "language_loss": 0.77716428, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.79803216, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 16441, + "time_per_iteration": 2.3266947269439697 + }, + { + "auxiliary_loss_clip": 0.01052122, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.01482654, + "balance_loss_mlp": 1.01664472, + "epoch": 0.9885465203667518, + "flos": 40550919805440.0, + "grad_norm": 1.880305858835398, + "language_loss": 0.69860953, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.7195133, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 16442, + "time_per_iteration": 2.573410749435425 + }, + { + "auxiliary_loss_clip": 0.01051752, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.01067734, + "balance_loss_mlp": 1.01638424, + "epoch": 0.9886066436194199, + "flos": 13807135889280.0, + "grad_norm": 3.592945895111179, + "language_loss": 0.7580958, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.77894419, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 16443, + "time_per_iteration": 3.7365148067474365 + }, + { + "auxiliary_loss_clip": 0.01051772, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.01249087, + "balance_loss_mlp": 1.01602936, + "epoch": 0.9886667668720878, + "flos": 23324180423040.0, + "grad_norm": 1.9174728672332348, + "language_loss": 0.7486062, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.7694732, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35742188, + "step": 16444, + "time_per_iteration": 2.431607723236084 + }, + { + "auxiliary_loss_clip": 0.01051063, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.01473284, + "balance_loss_mlp": 1.0160687, + "epoch": 0.9887268901247558, + "flos": 22705065550080.0, + "grad_norm": 1.7151853740269392, + "language_loss": 0.69723791, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71811032, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34960938, + "step": 16445, + "time_per_iteration": 2.398573160171509 + }, + { + "auxiliary_loss_clip": 0.01053183, + "auxiliary_loss_mlp": 0.01036835, + "balance_loss_clip": 1.01331472, + "balance_loss_mlp": 1.01720035, + "epoch": 0.9887870133774237, + "flos": 13040478144000.0, + "grad_norm": 2.5497322415446084, + "language_loss": 0.62588871, + "learning_rate": 1.311740377491155e-09, + "loss": 0.6467889, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.359375, + "step": 16446, + "time_per_iteration": 2.379716396331787 + }, + { + "auxiliary_loss_clip": 0.01052261, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.01577199, + "balance_loss_mlp": 1.01642764, + "epoch": 0.9888471366300917, + "flos": 15157960369920.0, + "grad_norm": 5.694985151971942, + "language_loss": 0.72333562, + "learning_rate": 1.297675079582783e-09, + "loss": 0.74423599, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.359375, + "step": 16447, + "time_per_iteration": 2.3430516719818115 + }, + { + "auxiliary_loss_clip": 0.01050788, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.01270318, + "balance_loss_mlp": 1.0162226, + "epoch": 0.9889072598827596, + "flos": 25117633520640.0, + "grad_norm": 1.8761633392726447, + "language_loss": 0.85369349, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.87455285, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 16448, + "time_per_iteration": 2.419823169708252 + }, + { + "auxiliary_loss_clip": 0.01049279, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.01583493, + "balance_loss_mlp": 1.01491666, + "epoch": 0.9889673831354276, + "flos": 16727690275200.0, + "grad_norm": 1.8914100700921106, + "language_loss": 0.71703279, + "learning_rate": 1.26977185727406e-09, + "loss": 0.73789591, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 0.34375, + "step": 16449, + "time_per_iteration": 2.3459885120391846 + }, + { + "auxiliary_loss_clip": 0.01053033, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.01598048, + "balance_loss_mlp": 1.01595449, + "epoch": 0.9890275063880956, + "flos": 35583861720960.0, + "grad_norm": 2.3974074133582235, + "language_loss": 0.75395608, + "learning_rate": 1.25593393393153e-09, + "loss": 0.77486938, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.37109375, + "step": 16450, + "time_per_iteration": 2.5247578620910645 + }, + { + "auxiliary_loss_clip": 0.01052688, + "auxiliary_loss_mlp": 0.01040394, + "balance_loss_clip": 1.01630151, + "balance_loss_mlp": 1.01529872, + "epoch": 0.9890876296407636, + "flos": 18951378456960.0, + "grad_norm": 1.778131819077099, + "language_loss": 0.80461675, + "learning_rate": 1.242171803164549e-09, + "loss": 0.82554758, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.37304688, + "step": 16451, + "time_per_iteration": 3.8472650051116943 + }, + { + "auxiliary_loss_clip": 0.01052137, + "auxiliary_loss_mlp": 0.01040403, + "balance_loss_clip": 1.01755047, + "balance_loss_mlp": 1.01547623, + "epoch": 0.9891477528934315, + "flos": 23768170583040.0, + "grad_norm": 1.8999544910984703, + "language_loss": 0.71861279, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.73953819, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.3671875, + "step": 16452, + "time_per_iteration": 2.436432361602783 + }, + { + "auxiliary_loss_clip": 0.01049406, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.01334691, + "balance_loss_mlp": 1.01567316, + "epoch": 0.9892078761460995, + "flos": 20771994458880.0, + "grad_norm": 1.6804233244916746, + "language_loss": 0.74504727, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76588428, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.33789062, + "step": 16453, + "time_per_iteration": 2.3605263233184814 + }, + { + "auxiliary_loss_clip": 0.01050984, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.01752734, + "balance_loss_mlp": 1.01503813, + "epoch": 0.9892679993987675, + "flos": 23366704325760.0, + "grad_norm": 1.858412779935992, + "language_loss": 0.72369039, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.74460495, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.359375, + "step": 16454, + "time_per_iteration": 2.4079082012176514 + }, + { + "auxiliary_loss_clip": 0.01048714, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01160383, + "balance_loss_mlp": 1.01617825, + "epoch": 0.9893281226514354, + "flos": 22704472056960.0, + "grad_norm": 1.8728113833447253, + "language_loss": 0.76526284, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.78607583, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.32617188, + "step": 16455, + "time_per_iteration": 2.3817455768585205 + }, + { + "auxiliary_loss_clip": 0.01049193, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.01313996, + "balance_loss_mlp": 1.01489806, + "epoch": 0.9893882459041035, + "flos": 21795647788800.0, + "grad_norm": 1.7634346509215462, + "language_loss": 0.66370571, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.68455434, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 16456, + "time_per_iteration": 2.4328806400299072 + }, + { + "auxiliary_loss_clip": 0.01052067, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.01335847, + "balance_loss_mlp": 1.01657188, + "epoch": 0.9894483691567714, + "flos": 18112799577600.0, + "grad_norm": 2.0296371867812977, + "language_loss": 0.74619919, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76707625, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35546875, + "step": 16457, + "time_per_iteration": 2.3447089195251465 + }, + { + "auxiliary_loss_clip": 0.01051115, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.01318073, + "balance_loss_mlp": 1.0155673, + "epoch": 0.9895084924094394, + "flos": 31210291704960.0, + "grad_norm": 2.186533251603272, + "language_loss": 0.70660162, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.72747666, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 16458, + "time_per_iteration": 2.4939398765563965 + }, + { + "auxiliary_loss_clip": 0.01050806, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.01354909, + "balance_loss_mlp": 1.01630235, + "epoch": 0.9895686156621073, + "flos": 19677153133440.0, + "grad_norm": 4.801756309589411, + "language_loss": 0.79756331, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.8184334, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34375, + "step": 16459, + "time_per_iteration": 2.356660842895508 + }, + { + "auxiliary_loss_clip": 0.01052128, + "auxiliary_loss_mlp": 0.01037366, + "balance_loss_clip": 1.01375055, + "balance_loss_mlp": 1.01662445, + "epoch": 0.9896287389147753, + "flos": 23580687248640.0, + "grad_norm": 2.4303182584443306, + "language_loss": 0.72389829, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.74479324, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 16460, + "time_per_iteration": 2.4447414875030518 + }, + { + "auxiliary_loss_clip": 0.01052677, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.00868416, + "balance_loss_mlp": 1.01575971, + "epoch": 0.9896888621674432, + "flos": 29604077562240.0, + "grad_norm": 1.6303830324145399, + "language_loss": 0.88177305, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.90261871, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36914062, + "step": 16461, + "time_per_iteration": 2.432691812515259 + }, + { + "auxiliary_loss_clip": 0.01050974, + "auxiliary_loss_mlp": 0.01044886, + "balance_loss_clip": 1.0205797, + "balance_loss_mlp": 1.0157423, + "epoch": 0.9897489854201112, + "flos": 23693945299200.0, + "grad_norm": 1.7475699793329003, + "language_loss": 0.63910568, + "learning_rate": 1.09579082189315e-09, + "loss": 0.66006422, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.3515625, + "step": 16462, + "time_per_iteration": 2.4454967975616455 + }, + { + "auxiliary_loss_clip": 0.01053105, + "auxiliary_loss_mlp": 0.01042155, + "balance_loss_clip": 1.01956463, + "balance_loss_mlp": 1.01727307, + "epoch": 0.9898091086727792, + "flos": 13224295785600.0, + "grad_norm": 2.277039326084464, + "language_loss": 0.7406621, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.76161468, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35742188, + "step": 16463, + "time_per_iteration": 2.3818283081054688 + }, + { + "auxiliary_loss_clip": 0.01052175, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.01500976, + "balance_loss_mlp": 1.01684809, + "epoch": 0.9898692319254472, + "flos": 22929277501440.0, + "grad_norm": 1.693241997588554, + "language_loss": 0.71012038, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.73102033, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35351562, + "step": 16464, + "time_per_iteration": 2.427647590637207 + }, + { + "auxiliary_loss_clip": 0.01052074, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.01418269, + "balance_loss_mlp": 1.01575947, + "epoch": 0.9899293551781151, + "flos": 12457533306240.0, + "grad_norm": 3.419526649472281, + "language_loss": 0.74501026, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.76591426, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36328125, + "step": 16465, + "time_per_iteration": 2.3523447513580322 + }, + { + "auxiliary_loss_clip": 0.0104992, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.01618505, + "balance_loss_mlp": 1.01508725, + "epoch": 0.9899894784307831, + "flos": 26869889347200.0, + "grad_norm": 1.609535067901223, + "language_loss": 0.87546068, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.89633107, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.34765625, + "step": 16466, + "time_per_iteration": 2.4353411197662354 + }, + { + "auxiliary_loss_clip": 0.01050965, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.00978935, + "balance_loss_mlp": 1.01556659, + "epoch": 0.990049601683451, + "flos": 21541061088000.0, + "grad_norm": 1.9364921641085109, + "language_loss": 0.72647011, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.74729717, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.35546875, + "step": 16467, + "time_per_iteration": 2.3762786388397217 + }, + { + "auxiliary_loss_clip": 0.01049049, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.01667798, + "balance_loss_mlp": 1.01504052, + "epoch": 0.990109724936119, + "flos": 28770421184640.0, + "grad_norm": 1.2947714049917085, + "language_loss": 0.6578666, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67872572, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33984375, + "step": 16468, + "time_per_iteration": 2.4969394207000732 + }, + { + "auxiliary_loss_clip": 0.01054899, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.01436973, + "balance_loss_mlp": 1.01735473, + "epoch": 0.9901698481887871, + "flos": 29273101073280.0, + "grad_norm": 2.3372459801976695, + "language_loss": 0.63044477, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.65137267, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.375, + "step": 16469, + "time_per_iteration": 2.4165854454040527 + }, + { + "auxiliary_loss_clip": 0.01052665, + "auxiliary_loss_mlp": 0.01035869, + "balance_loss_clip": 1.01209807, + "balance_loss_mlp": 1.01613653, + "epoch": 0.990229971441455, + "flos": 15958169798400.0, + "grad_norm": 2.5588588558731615, + "language_loss": 0.73851955, + "learning_rate": 9.950925847685976e-10, + "loss": 0.75940484, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.36523438, + "step": 16470, + "time_per_iteration": 2.368504285812378 + }, + { + "auxiliary_loss_clip": 0.01006848, + "auxiliary_loss_mlp": 0.01003025, + "balance_loss_clip": 1.00090301, + "balance_loss_mlp": 1.00055552, + "epoch": 0.990290094694123, + "flos": 69776782076160.0, + "grad_norm": 0.6867722422435837, + "language_loss": 0.55574894, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57584763, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.06298828, + "step": 16471, + "time_per_iteration": 4.559534311294556 + }, + { + "auxiliary_loss_clip": 0.01051879, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.01474309, + "balance_loss_mlp": 1.01609254, + "epoch": 0.9903502179467909, + "flos": 16251544886400.0, + "grad_norm": 2.0758515267114626, + "language_loss": 0.85377383, + "learning_rate": 9.706760407131032e-10, + "loss": 0.8746807, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.35742188, + "step": 16472, + "time_per_iteration": 2.3839216232299805 + }, + { + "auxiliary_loss_clip": 0.01050878, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.01262021, + "balance_loss_mlp": 1.01583195, + "epoch": 0.9904103411994589, + "flos": 21687347151360.0, + "grad_norm": 1.8922013676686273, + "language_loss": 0.86685491, + "learning_rate": 9.585814735431075e-10, + "loss": 0.88772142, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34960938, + "step": 16473, + "time_per_iteration": 2.385730743408203 + }, + { + "auxiliary_loss_clip": 0.01049752, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.01541722, + "balance_loss_mlp": 1.01552463, + "epoch": 0.9904704644521268, + "flos": 25738249582080.0, + "grad_norm": 1.7016588467145164, + "language_loss": 0.85627317, + "learning_rate": 9.465627102240859e-10, + "loss": 0.87712675, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.34179688, + "step": 16474, + "time_per_iteration": 2.4347822666168213 + }, + { + "auxiliary_loss_clip": 0.01049308, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.01535273, + "balance_loss_mlp": 1.01435614, + "epoch": 0.9905305877047949, + "flos": 21907265005440.0, + "grad_norm": 5.923288563003934, + "language_loss": 0.77048028, + "learning_rate": 9.346197512116738e-10, + "loss": 0.79134715, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 0.34960938, + "step": 16475, + "time_per_iteration": 2.408653497695923 + }, + { + "auxiliary_loss_clip": 0.01050489, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.01579583, + "balance_loss_mlp": 1.01469254, + "epoch": 0.9905907109574628, + "flos": 21391493356800.0, + "grad_norm": 1.4463607383785344, + "language_loss": 0.76385832, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78474885, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35742188, + "step": 16476, + "time_per_iteration": 2.404799699783325 + }, + { + "auxiliary_loss_clip": 0.01054627, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.01063621, + "balance_loss_mlp": 1.01653099, + "epoch": 0.9906508342101308, + "flos": 20520584691840.0, + "grad_norm": 2.122970700054227, + "language_loss": 0.68313503, + "learning_rate": 9.109612479154538e-10, + "loss": 0.70404279, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.38085938, + "step": 16477, + "time_per_iteration": 2.3602728843688965 + }, + { + "auxiliary_loss_clip": 0.01055157, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_clip": 1.01843524, + "balance_loss_mlp": 1.01768017, + "epoch": 0.9907109574627987, + "flos": 21360141089280.0, + "grad_norm": 1.9755446152284668, + "language_loss": 0.73181057, + "learning_rate": 8.992457045289282e-10, + "loss": 0.75280333, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.375, + "step": 16478, + "time_per_iteration": 3.8066673278808594 + }, + { + "auxiliary_loss_clip": 0.01051753, + "auxiliary_loss_mlp": 0.01042124, + "balance_loss_clip": 1.01773357, + "balance_loss_mlp": 1.01542413, + "epoch": 0.9907710807154667, + "flos": 17337902751360.0, + "grad_norm": 2.2191726786891226, + "language_loss": 0.82901597, + "learning_rate": 8.876059672433545e-10, + "loss": 0.84995472, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.36328125, + "step": 16479, + "time_per_iteration": 2.324389934539795 + }, + { + "auxiliary_loss_clip": 0.01053318, + "auxiliary_loss_mlp": 0.01039353, + "balance_loss_clip": 1.0153563, + "balance_loss_mlp": 1.01672435, + "epoch": 0.9908312039681346, + "flos": 28620609073920.0, + "grad_norm": 1.6418495672889333, + "language_loss": 0.67532659, + "learning_rate": 8.760420364999355e-10, + "loss": 0.6962533, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.3671875, + "step": 16480, + "time_per_iteration": 2.4523398876190186 + }, + { + "auxiliary_loss_clip": 0.01051491, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.01284862, + "balance_loss_mlp": 1.01650321, + "epoch": 0.9908913272208026, + "flos": 35769250373760.0, + "grad_norm": 1.636763602068245, + "language_loss": 0.72981977, + "learning_rate": 8.645539127374313e-10, + "loss": 0.75066888, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34960938, + "step": 16481, + "time_per_iteration": 2.500537633895874 + }, + { + "auxiliary_loss_clip": 0.01050218, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.00928926, + "balance_loss_mlp": 1.01630259, + "epoch": 0.9909514504734707, + "flos": 19901155616640.0, + "grad_norm": 2.0327553229614135, + "language_loss": 0.79446554, + "learning_rate": 8.531415963912713e-10, + "loss": 0.81527138, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.33984375, + "step": 16482, + "time_per_iteration": 3.768002986907959 + }, + { + "auxiliary_loss_clip": 0.01053068, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.0161984, + "balance_loss_mlp": 1.01690149, + "epoch": 0.9910115737261386, + "flos": 20003940259200.0, + "grad_norm": 1.912557535839443, + "language_loss": 0.76740807, + "learning_rate": 8.418050878944427e-10, + "loss": 0.78833652, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.36132812, + "step": 16483, + "time_per_iteration": 2.365248918533325 + }, + { + "auxiliary_loss_clip": 0.01006592, + "auxiliary_loss_mlp": 0.0100484, + "balance_loss_clip": 1.00275421, + "balance_loss_mlp": 1.00050306, + "epoch": 0.9910716969788066, + "flos": 70685012851200.0, + "grad_norm": 0.6822996060653648, + "language_loss": 0.53698361, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55709791, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.06103516, + "step": 16484, + "time_per_iteration": 3.1693859100341797 + }, + { + "auxiliary_loss_clip": 0.01049217, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.01242721, + "balance_loss_mlp": 1.0159657, + "epoch": 0.9911318202314745, + "flos": 21432969918720.0, + "grad_norm": 1.6328124082951856, + "language_loss": 0.82757664, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84842217, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.33203125, + "step": 16485, + "time_per_iteration": 2.359832763671875 + }, + { + "auxiliary_loss_clip": 0.01051835, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.01691413, + "balance_loss_mlp": 1.01685619, + "epoch": 0.9911919434841425, + "flos": 19825848080640.0, + "grad_norm": 1.5937759566285437, + "language_loss": 0.82357979, + "learning_rate": 8.082504137836288e-10, + "loss": 0.84448993, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 16486, + "time_per_iteration": 2.4138617515563965 + }, + { + "auxiliary_loss_clip": 0.01051209, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.01357293, + "balance_loss_mlp": 1.01577759, + "epoch": 0.9912520667368104, + "flos": 41717752087680.0, + "grad_norm": 1.3739196914737983, + "language_loss": 0.6689117, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68978369, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.35546875, + "step": 16487, + "time_per_iteration": 2.5413870811462402 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.0103444, + "balance_loss_clip": 1.0137217, + "balance_loss_mlp": 1.01489711, + "epoch": 0.9913121899894785, + "flos": 23768519696640.0, + "grad_norm": 1.7318356897993288, + "language_loss": 0.77711767, + "learning_rate": 7.862596780936481e-10, + "loss": 0.79794967, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.33789062, + "step": 16488, + "time_per_iteration": 2.425225257873535 + }, + { + "auxiliary_loss_clip": 0.01053263, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.0106982, + "balance_loss_mlp": 1.0156213, + "epoch": 0.9913723132421464, + "flos": 23768519696640.0, + "grad_norm": 2.8472199324442946, + "language_loss": 0.69841266, + "learning_rate": 7.753780256190001e-10, + "loss": 0.71929085, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.375, + "step": 16489, + "time_per_iteration": 2.4532511234283447 + }, + { + "auxiliary_loss_clip": 0.01007206, + "auxiliary_loss_mlp": 0.01001766, + "balance_loss_clip": 0.99973929, + "balance_loss_mlp": 1.0008347, + "epoch": 0.9914324364948144, + "flos": 71264117439360.0, + "grad_norm": 0.6060145273869829, + "language_loss": 0.52697647, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54706609, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.06347656, + "step": 16490, + "time_per_iteration": 3.128629684448242 + }, + { + "auxiliary_loss_clip": 0.01054161, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.01164556, + "balance_loss_mlp": 1.01723766, + "epoch": 0.9914925597474823, + "flos": 23694329324160.0, + "grad_norm": 1.6064166689339243, + "language_loss": 0.76273918, + "learning_rate": 7.538421534734052e-10, + "loss": 0.78365022, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.36914062, + "step": 16491, + "time_per_iteration": 3.8346545696258545 + }, + { + "auxiliary_loss_clip": 0.0105455, + "auxiliary_loss_mlp": 0.0104023, + "balance_loss_clip": 1.0140276, + "balance_loss_mlp": 1.01691353, + "epoch": 0.9915526830001503, + "flos": 13433251472640.0, + "grad_norm": 2.3513792111782625, + "language_loss": 0.70770943, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72865725, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 16492, + "time_per_iteration": 2.3726956844329834 + }, + { + "auxiliary_loss_clip": 0.01050564, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.01415348, + "balance_loss_mlp": 1.01540971, + "epoch": 0.9916128062528182, + "flos": 20739909052800.0, + "grad_norm": 1.8662431848414098, + "language_loss": 0.70366824, + "learning_rate": 7.326095277837563e-10, + "loss": 0.72454727, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3515625, + "step": 16493, + "time_per_iteration": 2.36232852935791 + }, + { + "auxiliary_loss_clip": 0.0105211, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.01652884, + "balance_loss_mlp": 1.01565886, + "epoch": 0.9916729295054862, + "flos": 22486125214080.0, + "grad_norm": 1.7235124053253759, + "language_loss": 0.72396523, + "learning_rate": 7.221069333678276e-10, + "loss": 0.74488205, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.36328125, + "step": 16494, + "time_per_iteration": 2.421680450439453 + }, + { + "auxiliary_loss_clip": 0.01052284, + "auxiliary_loss_mlp": 0.0103842, + "balance_loss_clip": 1.0148766, + "balance_loss_mlp": 1.01681185, + "epoch": 0.9917330527581543, + "flos": 14791616807040.0, + "grad_norm": 2.093319702565956, + "language_loss": 0.69256794, + "learning_rate": 7.116801517701443e-10, + "loss": 0.71347499, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 16495, + "time_per_iteration": 2.3437657356262207 + }, + { + "auxiliary_loss_clip": 0.01006871, + "auxiliary_loss_mlp": 0.01002467, + "balance_loss_clip": 1.00035739, + "balance_loss_mlp": 1.00066948, + "epoch": 0.9917931760108222, + "flos": 59188602896640.0, + "grad_norm": 0.7196328161486152, + "language_loss": 0.53524882, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55534214, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.06201172, + "step": 16496, + "time_per_iteration": 3.1472251415252686 + }, + { + "auxiliary_loss_clip": 0.01051893, + "auxiliary_loss_mlp": 0.01041416, + "balance_loss_clip": 1.01640606, + "balance_loss_mlp": 1.01569521, + "epoch": 0.9918532992634902, + "flos": 26760401723520.0, + "grad_norm": 1.8162424490965101, + "language_loss": 0.72732371, + "learning_rate": 6.91054028607585e-10, + "loss": 0.74825674, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.36132812, + "step": 16497, + "time_per_iteration": 2.4181582927703857 + }, + { + "auxiliary_loss_clip": 0.01053786, + "auxiliary_loss_mlp": 0.01041035, + "balance_loss_clip": 1.01453519, + "balance_loss_mlp": 1.01542497, + "epoch": 0.9919134225161581, + "flos": 14974840955520.0, + "grad_norm": 2.0389644375831035, + "language_loss": 0.83352733, + "learning_rate": 6.808546878249721e-10, + "loss": 0.8544755, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.3828125, + "step": 16498, + "time_per_iteration": 2.3694000244140625 + }, + { + "auxiliary_loss_clip": 0.01053557, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.01428854, + "balance_loss_mlp": 1.01728725, + "epoch": 0.9919735457688261, + "flos": 27816978332160.0, + "grad_norm": 1.5693453934100117, + "language_loss": 0.68640089, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70729589, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.36328125, + "step": 16499, + "time_per_iteration": 2.4395530223846436 + }, + { + "auxiliary_loss_clip": 0.01053233, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.01423359, + "balance_loss_mlp": 1.01755714, + "epoch": 0.992033669021494, + "flos": 22561746952320.0, + "grad_norm": 1.8815836248100426, + "language_loss": 0.83313125, + "learning_rate": 6.606834497904223e-10, + "loss": 0.85403281, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 16500, + "time_per_iteration": 2.4384145736694336 + }, + { + "auxiliary_loss_clip": 0.0105403, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.01523626, + "balance_loss_mlp": 1.0169574, + "epoch": 0.9920937922741621, + "flos": 25373407207680.0, + "grad_norm": 1.6803088256279328, + "language_loss": 0.82835019, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84928048, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.37109375, + "step": 16501, + "time_per_iteration": 2.3931164741516113 + }, + { + "auxiliary_loss_clip": 0.01051608, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.01205015, + "balance_loss_mlp": 1.01614738, + "epoch": 0.99215391552683, + "flos": 22053376512000.0, + "grad_norm": 1.8510840806730784, + "language_loss": 0.78120786, + "learning_rate": 6.408154723420711e-10, + "loss": 0.80207372, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35351562, + "step": 16502, + "time_per_iteration": 2.4098494052886963 + }, + { + "auxiliary_loss_clip": 0.01053774, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.01149321, + "balance_loss_mlp": 1.01640487, + "epoch": 0.992214038779498, + "flos": 15413035829760.0, + "grad_norm": 2.2631846167393426, + "language_loss": 0.73021197, + "learning_rate": 6.309952072811597e-10, + "loss": 0.75111133, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.375, + "step": 16503, + "time_per_iteration": 2.397703170776367 + }, + { + "auxiliary_loss_clip": 0.01007131, + "auxiliary_loss_mlp": 0.01002899, + "balance_loss_clip": 1.00087214, + "balance_loss_mlp": 1.0008707, + "epoch": 0.9922741620321659, + "flos": 62011050762240.0, + "grad_norm": 0.6403820923587432, + "language_loss": 0.55178642, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57188672, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.0625, + "step": 16504, + "time_per_iteration": 3.1349101066589355 + }, + { + "auxiliary_loss_clip": 0.01049642, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.00945425, + "balance_loss_mlp": 1.01544094, + "epoch": 0.9923342852848339, + "flos": 17164523606400.0, + "grad_norm": 1.7338415560850131, + "language_loss": 0.70777673, + "learning_rate": 6.115821263481536e-10, + "loss": 0.7285732, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.34179688, + "step": 16505, + "time_per_iteration": 2.35383677482605 + }, + { + "auxiliary_loss_clip": 0.01052662, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.01175725, + "balance_loss_mlp": 1.01557446, + "epoch": 0.9923944085375018, + "flos": 23182153545600.0, + "grad_norm": 1.9792621296291446, + "language_loss": 0.66055727, + "learning_rate": 6.019893112119146e-10, + "loss": 0.68146271, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.37109375, + "step": 16506, + "time_per_iteration": 2.419633150100708 + }, + { + "auxiliary_loss_clip": 0.01051386, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.00857854, + "balance_loss_mlp": 1.01627064, + "epoch": 0.9924545317901698, + "flos": 20812807704960.0, + "grad_norm": 2.0808993957122333, + "language_loss": 0.6433537, + "learning_rate": 5.924723134487219e-10, + "loss": 0.66419911, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.3515625, + "step": 16507, + "time_per_iteration": 2.3738150596618652 + }, + { + "auxiliary_loss_clip": 0.01052517, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.01339436, + "balance_loss_mlp": 1.01653695, + "epoch": 0.9925146550428379, + "flos": 20082424728960.0, + "grad_norm": 1.8261062524598641, + "language_loss": 0.73778886, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75870091, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.359375, + "step": 16508, + "time_per_iteration": 2.4077346324920654 + }, + { + "auxiliary_loss_clip": 0.01052868, + "auxiliary_loss_mlp": 0.01037877, + "balance_loss_clip": 1.01320052, + "balance_loss_mlp": 1.01633239, + "epoch": 0.9925747782955058, + "flos": 24972534443520.0, + "grad_norm": 1.6369102504319144, + "language_loss": 0.71225953, + "learning_rate": 5.736657714818793e-10, + "loss": 0.73316693, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.36523438, + "step": 16509, + "time_per_iteration": 2.3955023288726807 + }, + { + "auxiliary_loss_clip": 0.01052625, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.01321578, + "balance_loss_mlp": 1.0162859, + "epoch": 0.9926349015481738, + "flos": 60470405752320.0, + "grad_norm": 1.873815565074535, + "language_loss": 0.6992147, + "learning_rate": 5.643762279912146e-10, + "loss": 0.72010058, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 16510, + "time_per_iteration": 2.762310028076172 + }, + { + "auxiliary_loss_clip": 0.01052829, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.01424503, + "balance_loss_mlp": 1.01684022, + "epoch": 0.9926950248008417, + "flos": 20740642191360.0, + "grad_norm": 2.7202154483712326, + "language_loss": 0.83179635, + "learning_rate": 5.551625032997886e-10, + "loss": 0.85270071, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.359375, + "step": 16511, + "time_per_iteration": 3.7451868057250977 + }, + { + "auxiliary_loss_clip": 0.01049959, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.01123214, + "balance_loss_mlp": 1.01458156, + "epoch": 0.9927551480535097, + "flos": 24351813648000.0, + "grad_norm": 1.915278254996496, + "language_loss": 0.9270786, + "learning_rate": 5.460245977570998e-10, + "loss": 0.94790423, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.35351562, + "step": 16512, + "time_per_iteration": 2.413013458251953 + }, + { + "auxiliary_loss_clip": 0.01006882, + "auxiliary_loss_mlp": 0.01005918, + "balance_loss_clip": 1.00385523, + "balance_loss_mlp": 1.00070262, + "epoch": 0.9928152713061776, + "flos": 71272042318080.0, + "grad_norm": 0.6986452908464177, + "language_loss": 0.55286467, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57299268, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.06176758, + "step": 16513, + "time_per_iteration": 3.1287012100219727 + }, + { + "auxiliary_loss_clip": 0.01049611, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.01091242, + "balance_loss_mlp": 1.01484776, + "epoch": 0.9928753945588457, + "flos": 57807440444160.0, + "grad_norm": 1.4500248961640303, + "language_loss": 0.66083801, + "learning_rate": 5.279762455006054e-10, + "loss": 0.681674, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34765625, + "step": 16514, + "time_per_iteration": 2.6887924671173096 + }, + { + "auxiliary_loss_clip": 0.01053682, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.01302004, + "balance_loss_mlp": 1.01661086, + "epoch": 0.9929355178115136, + "flos": 19568084446080.0, + "grad_norm": 1.7715384984141704, + "language_loss": 0.73813522, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75905418, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37109375, + "step": 16515, + "time_per_iteration": 2.362807035446167 + }, + { + "auxiliary_loss_clip": 0.01052162, + "auxiliary_loss_mlp": 0.01035642, + "balance_loss_clip": 1.01374364, + "balance_loss_mlp": 1.01695657, + "epoch": 0.9929956410641816, + "flos": 22963597234560.0, + "grad_norm": 1.533060671236971, + "language_loss": 0.78081566, + "learning_rate": 5.102311739593191e-10, + "loss": 0.80169368, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.3515625, + "step": 16516, + "time_per_iteration": 2.388577461242676 + }, + { + "auxiliary_loss_clip": 0.01050562, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.0102638, + "balance_loss_mlp": 1.01547718, + "epoch": 0.9930557643168495, + "flos": 22565272999680.0, + "grad_norm": 1.7367284201761441, + "language_loss": 0.7871142, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80793083, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 0.3515625, + "step": 16517, + "time_per_iteration": 3.8320353031158447 + }, + { + "auxiliary_loss_clip": 0.01054349, + "auxiliary_loss_mlp": 0.01039594, + "balance_loss_clip": 1.01358271, + "balance_loss_mlp": 1.01637697, + "epoch": 0.9931158875695175, + "flos": 17200100148480.0, + "grad_norm": 2.227524210168445, + "language_loss": 0.69218057, + "learning_rate": 4.927893858248655e-10, + "loss": 0.71311998, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.37890625, + "step": 16518, + "time_per_iteration": 2.350573778152466 + }, + { + "auxiliary_loss_clip": 0.01007158, + "auxiliary_loss_mlp": 0.010063, + "balance_loss_clip": 1.00420213, + "balance_loss_mlp": 1.00088692, + "epoch": 0.9931760108221854, + "flos": 63708214596480.0, + "grad_norm": 0.7381855297801038, + "language_loss": 0.5350194, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55515397, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.06298828, + "step": 16519, + "time_per_iteration": 2.8889756202697754 + }, + { + "auxiliary_loss_clip": 0.01050969, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.01591015, + "balance_loss_mlp": 1.01661777, + "epoch": 0.9932361340748534, + "flos": 15303897319680.0, + "grad_norm": 1.9714136539775473, + "language_loss": 0.61024296, + "learning_rate": 4.756508837426842e-10, + "loss": 0.63113695, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34375, + "step": 16520, + "time_per_iteration": 2.3462846279144287 + }, + { + "auxiliary_loss_clip": 0.01051025, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.01398325, + "balance_loss_mlp": 1.01605904, + "epoch": 0.9932962573275215, + "flos": 36063428423040.0, + "grad_norm": 1.7793011150696274, + "language_loss": 0.63288486, + "learning_rate": 4.671953657853223e-10, + "loss": 0.65376478, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.34960938, + "step": 16521, + "time_per_iteration": 2.519740581512451 + }, + { + "auxiliary_loss_clip": 0.01053087, + "auxiliary_loss_mlp": 0.01037745, + "balance_loss_clip": 1.01285362, + "balance_loss_mlp": 1.0160867, + "epoch": 0.9933563805801894, + "flos": 21469454156160.0, + "grad_norm": 1.6415532292762327, + "language_loss": 0.75530446, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.77621281, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.37109375, + "step": 16522, + "time_per_iteration": 3.7900197505950928 + }, + { + "auxiliary_loss_clip": 0.01052077, + "auxiliary_loss_mlp": 0.01034674, + "balance_loss_clip": 1.01244152, + "balance_loss_mlp": 1.01720369, + "epoch": 0.9934165038328574, + "flos": 23985435173760.0, + "grad_norm": 1.6468408584631735, + "language_loss": 0.74101126, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.76187873, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.34765625, + "step": 16523, + "time_per_iteration": 2.383441925048828 + }, + { + "auxiliary_loss_clip": 0.01051045, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.01181746, + "balance_loss_mlp": 1.01586974, + "epoch": 0.9934766270855253, + "flos": 21906741335040.0, + "grad_norm": 1.6471219832714696, + "language_loss": 0.71916431, + "learning_rate": 4.422837480875241e-10, + "loss": 0.74002278, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 16524, + "time_per_iteration": 2.3766725063323975 + }, + { + "auxiliary_loss_clip": 0.01050693, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.01368475, + "balance_loss_mlp": 1.01527691, + "epoch": 0.9935367503381933, + "flos": 17128179014400.0, + "grad_norm": 2.5444257779415147, + "language_loss": 0.80100977, + "learning_rate": 4.341315219624775e-10, + "loss": 0.82187682, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35351562, + "step": 16525, + "time_per_iteration": 2.3410911560058594 + }, + { + "auxiliary_loss_clip": 0.01050227, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.01258588, + "balance_loss_mlp": 1.01564264, + "epoch": 0.9935968735908612, + "flos": 22345145677440.0, + "grad_norm": 1.7386842694761222, + "language_loss": 0.76350033, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.78435278, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34570312, + "step": 16526, + "time_per_iteration": 2.3898701667785645 + }, + { + "auxiliary_loss_clip": 0.01049294, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.01374292, + "balance_loss_mlp": 1.01554036, + "epoch": 0.9936569968435293, + "flos": 29459257776000.0, + "grad_norm": 1.8364653836878828, + "language_loss": 0.7334305, + "learning_rate": 4.180545412333369e-10, + "loss": 0.75427026, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.33789062, + "step": 16527, + "time_per_iteration": 2.428284168243408 + }, + { + "auxiliary_loss_clip": 0.01052076, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.01626229, + "balance_loss_mlp": 1.01589131, + "epoch": 0.9937171200961972, + "flos": 16543139495040.0, + "grad_norm": 2.698635870111302, + "language_loss": 0.76916313, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.79007936, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 16528, + "time_per_iteration": 2.351348638534546 + }, + { + "auxiliary_loss_clip": 0.01051671, + "auxiliary_loss_mlp": 0.0104076, + "balance_loss_clip": 1.01670337, + "balance_loss_mlp": 1.01549339, + "epoch": 0.9937772433488652, + "flos": 24389100846720.0, + "grad_norm": 2.377174281750105, + "language_loss": 0.69990468, + "learning_rate": 4.022808578922898e-10, + "loss": 0.72082907, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.36132812, + "step": 16529, + "time_per_iteration": 2.4071598052978516 + }, + { + "auxiliary_loss_clip": 0.01054318, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.01295233, + "balance_loss_mlp": 1.01623476, + "epoch": 0.9938373666015331, + "flos": 15668984073600.0, + "grad_norm": 2.1380623992440584, + "language_loss": 0.66526085, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.68621022, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.38085938, + "step": 16530, + "time_per_iteration": 2.375223398208618 + }, + { + "auxiliary_loss_clip": 0.01050984, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01177335, + "balance_loss_mlp": 1.01514733, + "epoch": 0.9938974898542011, + "flos": 19495290528000.0, + "grad_norm": 2.680685666715921, + "language_loss": 0.72723246, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.74807423, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.359375, + "step": 16531, + "time_per_iteration": 3.8510658740997314 + }, + { + "auxiliary_loss_clip": 0.01051911, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.01345384, + "balance_loss_mlp": 1.0155586, + "epoch": 0.993957613106869, + "flos": 26905675357440.0, + "grad_norm": 1.366043657897605, + "language_loss": 0.75083601, + "learning_rate": 3.791890207045512e-10, + "loss": 0.77171838, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36328125, + "step": 16532, + "time_per_iteration": 2.4291305541992188 + }, + { + "auxiliary_loss_clip": 0.01045898, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01709771, + "balance_loss_mlp": 1.01398778, + "epoch": 0.994017736359537, + "flos": 14938705831680.0, + "grad_norm": 1.5833368361317355, + "language_loss": 0.71623987, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.7370733, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.3203125, + "step": 16533, + "time_per_iteration": 2.387833595275879 + }, + { + "auxiliary_loss_clip": 0.01053518, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.01168466, + "balance_loss_mlp": 1.01674867, + "epoch": 0.9940778596122051, + "flos": 15376970528640.0, + "grad_norm": 1.9607247985543856, + "language_loss": 0.85703009, + "learning_rate": 3.641735912007782e-10, + "loss": 0.87793696, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.3671875, + "step": 16534, + "time_per_iteration": 2.3824098110198975 + }, + { + "auxiliary_loss_clip": 0.01048818, + "auxiliary_loss_mlp": 0.01035094, + "balance_loss_clip": 1.01441085, + "balance_loss_mlp": 1.01525831, + "epoch": 0.994137982864873, + "flos": 25226946587520.0, + "grad_norm": 1.5772627743867174, + "language_loss": 0.67076039, + "learning_rate": 3.567796158934211e-10, + "loss": 0.69159949, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 0.3359375, + "step": 16535, + "time_per_iteration": 2.393564224243164 + }, + { + "auxiliary_loss_clip": 0.01050285, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.01151562, + "balance_loss_mlp": 1.0164336, + "epoch": 0.994198106117541, + "flos": 18441157714560.0, + "grad_norm": 1.5888027388980284, + "language_loss": 0.65913123, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.67996287, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33984375, + "step": 16536, + "time_per_iteration": 2.3527514934539795 + }, + { + "auxiliary_loss_clip": 0.01049952, + "auxiliary_loss_mlp": 0.01039914, + "balance_loss_clip": 1.01722884, + "balance_loss_mlp": 1.01564372, + "epoch": 0.9942582293702089, + "flos": 16653185700480.0, + "grad_norm": 1.709519813335413, + "language_loss": 0.80246598, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.82336462, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.34375, + "step": 16537, + "time_per_iteration": 2.335855484008789 + }, + { + "auxiliary_loss_clip": 0.01055468, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.0149684, + "balance_loss_mlp": 1.01710868, + "epoch": 0.9943183526228769, + "flos": 21943504863360.0, + "grad_norm": 1.5339870724846063, + "language_loss": 0.69705683, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71801257, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.3828125, + "step": 16538, + "time_per_iteration": 2.416221857070923 + }, + { + "auxiliary_loss_clip": 0.01049986, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01380515, + "balance_loss_mlp": 1.01508188, + "epoch": 0.9943784758755448, + "flos": 23841418348800.0, + "grad_norm": 1.9202060782181256, + "language_loss": 0.76796436, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.78882378, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34960938, + "step": 16539, + "time_per_iteration": 2.3893511295318604 + }, + { + "auxiliary_loss_clip": 0.0105223, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.01527894, + "balance_loss_mlp": 1.01609337, + "epoch": 0.9944385991282129, + "flos": 21468930485760.0, + "grad_norm": 2.1632463974003375, + "language_loss": 0.72516906, + "learning_rate": 3.209471449341361e-10, + "loss": 0.74608588, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.36132812, + "step": 16540, + "time_per_iteration": 2.3841147422790527 + }, + { + "auxiliary_loss_clip": 0.01049959, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.01135528, + "balance_loss_mlp": 1.01548886, + "epoch": 0.9944987223808808, + "flos": 22925995833600.0, + "grad_norm": 4.121484587426536, + "language_loss": 0.7662214, + "learning_rate": 3.140081337600353e-10, + "loss": 0.78704488, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34375, + "step": 16541, + "time_per_iteration": 2.3598291873931885 + }, + { + "auxiliary_loss_clip": 0.01050055, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.01488471, + "balance_loss_mlp": 1.01465034, + "epoch": 0.9945588456335488, + "flos": 22381909205760.0, + "grad_norm": 1.643422203558223, + "language_loss": 0.77610779, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.79698473, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.35351562, + "step": 16542, + "time_per_iteration": 2.3773975372314453 + }, + { + "auxiliary_loss_clip": 0.01053373, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.0111531, + "balance_loss_mlp": 1.01634741, + "epoch": 0.9946189688862167, + "flos": 21396450769920.0, + "grad_norm": 2.0748769930489823, + "language_loss": 0.76459378, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.78548217, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.37109375, + "step": 16543, + "time_per_iteration": 2.355687379837036 + }, + { + "auxiliary_loss_clip": 0.01054464, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.01364517, + "balance_loss_mlp": 1.01669121, + "epoch": 0.9946790921388847, + "flos": 12415882187520.0, + "grad_norm": 2.857682452209603, + "language_loss": 0.83903992, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.85997146, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.37695312, + "step": 16544, + "time_per_iteration": 2.3367371559143066 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.01233256, + "balance_loss_mlp": 1.01631284, + "epoch": 0.9947392153915526, + "flos": 19057409856000.0, + "grad_norm": 1.9325998597359795, + "language_loss": 0.80035365, + "learning_rate": 2.870103745831187e-10, + "loss": 0.82121837, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.3515625, + "step": 16545, + "time_per_iteration": 2.3446030616760254 + }, + { + "auxiliary_loss_clip": 0.01054609, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.01742232, + "balance_loss_mlp": 1.01758015, + "epoch": 0.9947993386442207, + "flos": 27307560551040.0, + "grad_norm": 2.2531104762781338, + "language_loss": 0.73899984, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.7599597, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 16546, + "time_per_iteration": 2.4292893409729004 + }, + { + "auxiliary_loss_clip": 0.01050112, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.01664197, + "balance_loss_mlp": 1.01595426, + "epoch": 0.9948594618968887, + "flos": 20805650876160.0, + "grad_norm": 2.8873900927491514, + "language_loss": 0.78471982, + "learning_rate": 2.739664698798716e-10, + "loss": 0.80559862, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34179688, + "step": 16547, + "time_per_iteration": 2.3404369354248047 + }, + { + "auxiliary_loss_clip": 0.0105054, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01548362, + "balance_loss_mlp": 1.01546764, + "epoch": 0.9949195851495566, + "flos": 23291885548800.0, + "grad_norm": 2.360478078893796, + "language_loss": 0.71825904, + "learning_rate": 2.67558262122769e-10, + "loss": 0.73914182, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.3515625, + "step": 16548, + "time_per_iteration": 2.369936466217041 + }, + { + "auxiliary_loss_clip": 0.01051257, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.0089643, + "balance_loss_mlp": 1.01596081, + "epoch": 0.9949797084022246, + "flos": 18514475303040.0, + "grad_norm": 4.988725612805417, + "language_loss": 0.7625221, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.78333497, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.35351562, + "step": 16549, + "time_per_iteration": 2.3468565940856934 + }, + { + "auxiliary_loss_clip": 0.01054454, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.01053452, + "balance_loss_mlp": 1.0167799, + "epoch": 0.9950398316548925, + "flos": 30406451495040.0, + "grad_norm": 1.602124964142184, + "language_loss": 0.75402999, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.77492988, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.375, + "step": 16550, + "time_per_iteration": 3.6663777828216553 + }, + { + "auxiliary_loss_clip": 0.01050649, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.01187634, + "balance_loss_mlp": 1.01520562, + "epoch": 0.9950999549075605, + "flos": 19899863896320.0, + "grad_norm": 2.0206212902985956, + "language_loss": 0.78524315, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80609417, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 16551, + "time_per_iteration": 2.3656516075134277 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.01584244, + "balance_loss_mlp": 1.01515305, + "epoch": 0.9951600781602284, + "flos": 17602578835200.0, + "grad_norm": 1.441477904336404, + "language_loss": 0.67355371, + "learning_rate": 2.426837340270271e-10, + "loss": 0.69438243, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.32617188, + "step": 16552, + "time_per_iteration": 2.3832452297210693 + }, + { + "auxiliary_loss_clip": 0.01051558, + "auxiliary_loss_mlp": 0.01034537, + "balance_loss_clip": 1.01158929, + "balance_loss_mlp": 1.01596928, + "epoch": 0.9952202014128965, + "flos": 28949421058560.0, + "grad_norm": 1.4204943972233643, + "language_loss": 0.81840599, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.8392669, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.35546875, + "step": 16553, + "time_per_iteration": 2.4452714920043945 + }, + { + "auxiliary_loss_clip": 0.01006723, + "auxiliary_loss_mlp": 0.01002789, + "balance_loss_clip": 1.00102425, + "balance_loss_mlp": 1.00049901, + "epoch": 0.9952803246655644, + "flos": 70806614716800.0, + "grad_norm": 0.7244857306798538, + "language_loss": 0.57359666, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59369177, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.06225586, + "step": 16554, + "time_per_iteration": 3.159752130508423 + }, + { + "auxiliary_loss_clip": 0.01051623, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.01381123, + "balance_loss_mlp": 1.0157423, + "epoch": 0.9953404479182324, + "flos": 21797986849920.0, + "grad_norm": 1.5055956709779863, + "language_loss": 0.7808187, + "learning_rate": 2.24824062597051e-10, + "loss": 0.80169082, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 0.359375, + "step": 16555, + "time_per_iteration": 2.4009222984313965 + }, + { + "auxiliary_loss_clip": 0.01051039, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.01739168, + "balance_loss_mlp": 1.01543474, + "epoch": 0.9954005711709003, + "flos": 21936522591360.0, + "grad_norm": 2.141277241643714, + "language_loss": 0.86823559, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88916296, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.35546875, + "step": 16556, + "time_per_iteration": 2.369748592376709 + }, + { + "auxiliary_loss_clip": 0.01050415, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.01341891, + "balance_loss_mlp": 1.01619124, + "epoch": 0.9954606944235683, + "flos": 19353542941440.0, + "grad_norm": 1.8794048945468074, + "language_loss": 0.7420122, + "learning_rate": 2.132967729762125e-10, + "loss": 0.76287919, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34179688, + "step": 16557, + "time_per_iteration": 3.8729825019836426 + }, + { + "auxiliary_loss_clip": 0.0105052, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.01511228, + "balance_loss_mlp": 1.01596987, + "epoch": 0.9955208176762362, + "flos": 30517300661760.0, + "grad_norm": 1.8951657730807943, + "language_loss": 0.77218401, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.79305959, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34570312, + "step": 16558, + "time_per_iteration": 2.435725450515747 + }, + { + "auxiliary_loss_clip": 0.01051108, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.01315093, + "balance_loss_mlp": 1.01517534, + "epoch": 0.9955809409289043, + "flos": 30006940273920.0, + "grad_norm": 2.4634520635712107, + "language_loss": 0.64392465, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.66480064, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.359375, + "step": 16559, + "time_per_iteration": 2.4451541900634766 + }, + { + "auxiliary_loss_clip": 0.01051186, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.01408982, + "balance_loss_mlp": 1.01681316, + "epoch": 0.9956410641815723, + "flos": 21542213162880.0, + "grad_norm": 2.0867080274402583, + "language_loss": 0.75309604, + "learning_rate": 1.965745799148433e-10, + "loss": 0.7739774, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.34375, + "step": 16560, + "time_per_iteration": 2.3613672256469727 + }, + { + "auxiliary_loss_clip": 0.010503, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.01334977, + "balance_loss_mlp": 1.01600516, + "epoch": 0.9957011874342402, + "flos": 21688394492160.0, + "grad_norm": 1.62969815390942, + "language_loss": 0.79620671, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81706357, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.34375, + "step": 16561, + "time_per_iteration": 2.3648927211761475 + }, + { + "auxiliary_loss_clip": 0.01048872, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.01473427, + "balance_loss_mlp": 1.01538503, + "epoch": 0.9957613106869082, + "flos": 17701977075840.0, + "grad_norm": 2.5459361186105607, + "language_loss": 0.66938078, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.69022262, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.33398438, + "step": 16562, + "time_per_iteration": 3.6739256381988525 + }, + { + "auxiliary_loss_clip": 0.01052778, + "auxiliary_loss_mlp": 0.01039769, + "balance_loss_clip": 1.01500964, + "balance_loss_mlp": 1.01577938, + "epoch": 0.9958214339395761, + "flos": 30554657683200.0, + "grad_norm": 2.0097908343019086, + "language_loss": 0.65354985, + "learning_rate": 1.805348815528962e-10, + "loss": 0.67447531, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.37109375, + "step": 16563, + "time_per_iteration": 2.440884590148926 + }, + { + "auxiliary_loss_clip": 0.01050573, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.00822711, + "balance_loss_mlp": 1.01598966, + "epoch": 0.9958815571922441, + "flos": 24168065829120.0, + "grad_norm": 2.557924873161957, + "language_loss": 0.65371037, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.67452836, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.34570312, + "step": 16564, + "time_per_iteration": 2.414936065673828 + }, + { + "auxiliary_loss_clip": 0.01051241, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.01244569, + "balance_loss_mlp": 1.01660132, + "epoch": 0.995941680444912, + "flos": 15486109038720.0, + "grad_norm": 1.8260159806340535, + "language_loss": 0.74973392, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.77060646, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.34570312, + "step": 16565, + "time_per_iteration": 2.3570871353149414 + }, + { + "auxiliary_loss_clip": 0.01051366, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.01240253, + "balance_loss_mlp": 1.01590562, + "epoch": 0.9960018036975801, + "flos": 18620332145280.0, + "grad_norm": 3.2311357440848334, + "language_loss": 0.80504149, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.82590282, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.35546875, + "step": 16566, + "time_per_iteration": 2.3560965061187744 + }, + { + "auxiliary_loss_clip": 0.01049053, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.01567817, + "balance_loss_mlp": 1.0147717, + "epoch": 0.996061926950248, + "flos": 20083088044800.0, + "grad_norm": 1.6823427014517887, + "language_loss": 0.72081131, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.74167585, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 0.34179688, + "step": 16567, + "time_per_iteration": 2.370073080062866 + }, + { + "auxiliary_loss_clip": 0.01053348, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.01300895, + "balance_loss_mlp": 1.01682663, + "epoch": 0.996122050202916, + "flos": 24346821323520.0, + "grad_norm": 2.084244954583539, + "language_loss": 0.80799186, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.82890701, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.36523438, + "step": 16568, + "time_per_iteration": 2.37907338142395 + }, + { + "auxiliary_loss_clip": 0.01049349, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.01019669, + "balance_loss_mlp": 1.01588547, + "epoch": 0.9961821734555839, + "flos": 24198370755840.0, + "grad_norm": 1.67551543137921, + "language_loss": 0.82949042, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.85028696, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 0.33398438, + "step": 16569, + "time_per_iteration": 2.406996250152588 + }, + { + "auxiliary_loss_clip": 0.01050595, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.0155654, + "balance_loss_mlp": 1.01628983, + "epoch": 0.9962422967082519, + "flos": 22632760391040.0, + "grad_norm": 2.064249725054532, + "language_loss": 0.7116192, + "learning_rate": 1.457630950747468e-10, + "loss": 0.73249388, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.34375, + "step": 16570, + "time_per_iteration": 3.8101067543029785 + }, + { + "auxiliary_loss_clip": 0.01050699, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.01125073, + "balance_loss_mlp": 1.01542306, + "epoch": 0.9963024199609198, + "flos": 26394826210560.0, + "grad_norm": 1.6069440295952848, + "language_loss": 0.76398087, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.78482974, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.3515625, + "step": 16571, + "time_per_iteration": 2.416417360305786 + }, + { + "auxiliary_loss_clip": 0.01051234, + "auxiliary_loss_mlp": 0.01038029, + "balance_loss_clip": 1.01363909, + "balance_loss_mlp": 1.01547384, + "epoch": 0.9963625432135879, + "flos": 16580601250560.0, + "grad_norm": 1.8231272132076841, + "language_loss": 0.81244022, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.83333278, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.35742188, + "step": 16572, + "time_per_iteration": 2.3412251472473145 + }, + { + "auxiliary_loss_clip": 0.0105151, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.01468444, + "balance_loss_mlp": 1.01622939, + "epoch": 0.9964226664662559, + "flos": 26467340837760.0, + "grad_norm": 1.76280921311369, + "language_loss": 0.71461475, + "learning_rate": 1.3199841727074e-10, + "loss": 0.7354961, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.35351562, + "step": 16573, + "time_per_iteration": 2.4155402183532715 + }, + { + "auxiliary_loss_clip": 0.01053322, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.01567352, + "balance_loss_mlp": 1.01603794, + "epoch": 0.9964827897189238, + "flos": 27447248367360.0, + "grad_norm": 1.7050903892801517, + "language_loss": 0.64442003, + "learning_rate": 1.275618614968721e-10, + "loss": 0.66536641, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.375, + "step": 16574, + "time_per_iteration": 2.4193148612976074 + }, + { + "auxiliary_loss_clip": 0.01054076, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.01491761, + "balance_loss_mlp": 1.01613212, + "epoch": 0.9965429129715918, + "flos": 11720971019520.0, + "grad_norm": 2.2581131711859244, + "language_loss": 0.78038186, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.80132252, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.37890625, + "step": 16575, + "time_per_iteration": 2.3530516624450684 + }, + { + "auxiliary_loss_clip": 0.01051399, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.01566517, + "balance_loss_mlp": 1.01540589, + "epoch": 0.9966030362242597, + "flos": 19754974287360.0, + "grad_norm": 2.21943445749183, + "language_loss": 0.71169198, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.73259389, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.359375, + "step": 16576, + "time_per_iteration": 2.3688220977783203 + }, + { + "auxiliary_loss_clip": 0.01049997, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.01190281, + "balance_loss_mlp": 1.01579452, + "epoch": 0.9966631594769277, + "flos": 23914037710080.0, + "grad_norm": 1.573029299593729, + "language_loss": 0.72724378, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.7480889, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.34179688, + "step": 16577, + "time_per_iteration": 2.401250123977661 + }, + { + "auxiliary_loss_clip": 0.01051835, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.01204014, + "balance_loss_mlp": 1.01564169, + "epoch": 0.9967232827295956, + "flos": 15558693488640.0, + "grad_norm": 1.9050211130784744, + "language_loss": 0.7937783, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.81464505, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.36132812, + "step": 16578, + "time_per_iteration": 2.3328428268432617 + }, + { + "auxiliary_loss_clip": 0.01052065, + "auxiliary_loss_mlp": 0.01036829, + "balance_loss_clip": 1.01301122, + "balance_loss_mlp": 1.0165, + "epoch": 0.9967834059822637, + "flos": 20812004743680.0, + "grad_norm": 2.1849923030623613, + "language_loss": 0.77196419, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.79285318, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.35546875, + "step": 16579, + "time_per_iteration": 2.380669355392456 + }, + { + "auxiliary_loss_clip": 0.0105391, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.01244652, + "balance_loss_mlp": 1.01685953, + "epoch": 0.9968435292349316, + "flos": 36717805635840.0, + "grad_norm": 3.057740186128159, + "language_loss": 0.70457727, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.7254957, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.37109375, + "step": 16580, + "time_per_iteration": 2.487881660461426 + }, + { + "auxiliary_loss_clip": 0.01051383, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.01711011, + "balance_loss_mlp": 1.01594841, + "epoch": 0.9969036524875996, + "flos": 26759703496320.0, + "grad_norm": 1.8887586160955068, + "language_loss": 0.81067926, + "learning_rate": 9.862937031113184e-11, + "loss": 0.83159065, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.35546875, + "step": 16581, + "time_per_iteration": 2.4047374725341797 + }, + { + "auxiliary_loss_clip": 0.01049719, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.00889266, + "balance_loss_mlp": 1.01550937, + "epoch": 0.9969637757402675, + "flos": 24826073823360.0, + "grad_norm": 1.6892049916756566, + "language_loss": 0.81922716, + "learning_rate": 9.479950191249031e-11, + "loss": 0.84002316, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 0.34179688, + "step": 16582, + "time_per_iteration": 2.4090566635131836 + }, + { + "auxiliary_loss_clip": 0.01049586, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.0143528, + "balance_loss_mlp": 1.0152775, + "epoch": 0.9970238989929355, + "flos": 23037647961600.0, + "grad_norm": 1.7155947294782368, + "language_loss": 0.61321664, + "learning_rate": 9.104547011951069e-11, + "loss": 0.6340754, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 0.34375, + "step": 16583, + "time_per_iteration": 2.3857479095458984 + }, + { + "auxiliary_loss_clip": 0.01051965, + "auxiliary_loss_mlp": 0.01040658, + "balance_loss_clip": 1.01769781, + "balance_loss_mlp": 1.01640689, + "epoch": 0.9970840222456034, + "flos": 25297715646720.0, + "grad_norm": 1.6721892728611192, + "language_loss": 0.79229361, + "learning_rate": 8.736727507452357e-11, + "loss": 0.81321979, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.35546875, + "step": 16584, + "time_per_iteration": 2.426241159439087 + }, + { + "auxiliary_loss_clip": 0.01048554, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.01616728, + "balance_loss_mlp": 1.01526308, + "epoch": 0.9971441454982715, + "flos": 21614553233280.0, + "grad_norm": 1.6406007244497745, + "language_loss": 0.70220906, + "learning_rate": 8.376491691697297e-11, + "loss": 0.7230683, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.33203125, + "step": 16585, + "time_per_iteration": 2.385852575302124 + }, + { + "auxiliary_loss_clip": 0.01050716, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.01606929, + "balance_loss_mlp": 1.0156883, + "epoch": 0.9972042687509394, + "flos": 14974736221440.0, + "grad_norm": 2.981933283960767, + "language_loss": 0.8215698, + "learning_rate": 8.023839578363834e-11, + "loss": 0.84247303, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.34960938, + "step": 16586, + "time_per_iteration": 2.342684507369995 + }, + { + "auxiliary_loss_clip": 0.01051681, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.01157904, + "balance_loss_mlp": 1.01495337, + "epoch": 0.9972643920036074, + "flos": 25805632239360.0, + "grad_norm": 1.7308406418858153, + "language_loss": 0.78701735, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80788136, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.3671875, + "step": 16587, + "time_per_iteration": 2.4000422954559326 + }, + { + "auxiliary_loss_clip": 0.01055086, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.01711631, + "balance_loss_mlp": 1.01769638, + "epoch": 0.9973245152562754, + "flos": 23325262675200.0, + "grad_norm": 1.8544809896588617, + "language_loss": 0.7361927, + "learning_rate": 7.341286512074773e-11, + "loss": 0.75713551, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.375, + "step": 16588, + "time_per_iteration": 2.3835129737854004 + }, + { + "auxiliary_loss_clip": 0.01053534, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.01198614, + "balance_loss_mlp": 1.01553798, + "epoch": 0.9973846385089433, + "flos": 12165415027200.0, + "grad_norm": 2.625649474134828, + "language_loss": 0.84007698, + "learning_rate": 7.011385585031781e-11, + "loss": 0.86096543, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.37890625, + "step": 16589, + "time_per_iteration": 3.5893871784210205 + }, + { + "auxiliary_loss_clip": 0.0105344, + "auxiliary_loss_mlp": 0.01046173, + "balance_loss_clip": 1.01951754, + "balance_loss_mlp": 1.01655829, + "epoch": 0.9974447617616113, + "flos": 20044194923520.0, + "grad_norm": 2.4441203599462313, + "language_loss": 0.72375965, + "learning_rate": 6.689068412168986e-11, + "loss": 0.74475574, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.36914062, + "step": 16590, + "time_per_iteration": 2.3636531829833984 + }, + { + "auxiliary_loss_clip": 0.0105325, + "auxiliary_loss_mlp": 0.01039692, + "balance_loss_clip": 1.01598096, + "balance_loss_mlp": 1.01614296, + "epoch": 0.9975048850142793, + "flos": 32013259130880.0, + "grad_norm": 2.1832074471126326, + "language_loss": 0.65079105, + "learning_rate": 6.374335005676634e-11, + "loss": 0.67172045, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.37109375, + "step": 16591, + "time_per_iteration": 2.4642603397369385 + }, + { + "auxiliary_loss_clip": 0.01050845, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.0136466, + "balance_loss_mlp": 1.01545966, + "epoch": 0.9975650082669473, + "flos": 36932416963200.0, + "grad_norm": 1.6249904975664007, + "language_loss": 0.74463296, + "learning_rate": 6.067185377522933e-11, + "loss": 0.7654984, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.35351562, + "step": 16592, + "time_per_iteration": 2.482407331466675 + }, + { + "auxiliary_loss_clip": 0.01050862, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.01432872, + "balance_loss_mlp": 1.01500273, + "epoch": 0.9976251315196152, + "flos": 16471183449600.0, + "grad_norm": 1.6452748713975907, + "language_loss": 0.85781348, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87868762, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.359375, + "step": 16593, + "time_per_iteration": 2.370751142501831 + }, + { + "auxiliary_loss_clip": 0.01048283, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.01060545, + "balance_loss_mlp": 1.01561582, + "epoch": 0.9976852547722832, + "flos": 19645800865920.0, + "grad_norm": 1.7692448035259756, + "language_loss": 0.70439672, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.72519076, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.328125, + "step": 16594, + "time_per_iteration": 2.3823812007904053 + }, + { + "auxiliary_loss_clip": 0.01052133, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.01503527, + "balance_loss_mlp": 1.01567554, + "epoch": 0.9977453780249511, + "flos": 20447406748800.0, + "grad_norm": 2.2219170678976927, + "language_loss": 0.74281514, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.76371133, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.36523438, + "step": 16595, + "time_per_iteration": 2.3725314140319824 + }, + { + "auxiliary_loss_clip": 0.01006767, + "auxiliary_loss_mlp": 0.01003707, + "balance_loss_clip": 1.00170457, + "balance_loss_mlp": 1.00055349, + "epoch": 0.9978055012776191, + "flos": 65452266253440.0, + "grad_norm": 0.7895314572637466, + "language_loss": 0.60387748, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62398219, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.06201172, + "step": 16596, + "time_per_iteration": 2.887441873550415 + }, + { + "auxiliary_loss_clip": 0.01051879, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.01775753, + "balance_loss_mlp": 1.01624358, + "epoch": 0.997865624530287, + "flos": 20630456340480.0, + "grad_norm": 1.8729761914053906, + "language_loss": 0.78548157, + "learning_rate": 4.645194309227385e-11, + "loss": 0.80641234, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 0.35546875, + "step": 16597, + "time_per_iteration": 3.82503080368042 + }, + { + "auxiliary_loss_clip": 0.01053683, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.01906085, + "balance_loss_mlp": 1.01612425, + "epoch": 0.9979257477829551, + "flos": 29385835453440.0, + "grad_norm": 1.6980918599394474, + "language_loss": 0.83088642, + "learning_rate": 4.383547585562475e-11, + "loss": 0.85185254, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.375, + "step": 16598, + "time_per_iteration": 2.4146275520324707 + }, + { + "auxiliary_loss_clip": 0.01053124, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.01249039, + "balance_loss_mlp": 1.01565742, + "epoch": 0.997985871035623, + "flos": 22634052111360.0, + "grad_norm": 2.3621204448696114, + "language_loss": 0.65646046, + "learning_rate": 4.129484715709175e-11, + "loss": 0.67737937, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.375, + "step": 16599, + "time_per_iteration": 2.399435043334961 + }, + { + "auxiliary_loss_clip": 0.01006765, + "auxiliary_loss_mlp": 0.01003489, + "balance_loss_clip": 1.00153399, + "balance_loss_mlp": 1.00048113, + "epoch": 0.998045994288291, + "flos": 61804226534400.0, + "grad_norm": 0.8540147572191198, + "language_loss": 0.62388647, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64398903, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.06298828, + "step": 16600, + "time_per_iteration": 2.9344897270202637 + }, + { + "auxiliary_loss_clip": 0.01051295, + "auxiliary_loss_mlp": 0.01037152, + "balance_loss_clip": 1.01606369, + "balance_loss_mlp": 1.01672804, + "epoch": 0.998106117540959, + "flos": 19244509165440.0, + "grad_norm": 1.5785463759568659, + "language_loss": 0.79639083, + "learning_rate": 3.644110575717896e-11, + "loss": 0.81727529, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.34570312, + "step": 16601, + "time_per_iteration": 3.6506690979003906 + }, + { + "auxiliary_loss_clip": 0.01052655, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.01579106, + "balance_loss_mlp": 1.01535285, + "epoch": 0.9981662407936269, + "flos": 21105135452160.0, + "grad_norm": 2.7323943294607167, + "language_loss": 0.83518046, + "learning_rate": 3.412799323987414e-11, + "loss": 0.85609567, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.37304688, + "step": 16602, + "time_per_iteration": 2.36869215965271 + }, + { + "auxiliary_loss_clip": 0.01053094, + "auxiliary_loss_mlp": 0.0104078, + "balance_loss_clip": 1.01764202, + "balance_loss_mlp": 1.01698816, + "epoch": 0.998226364046295, + "flos": 24315678524160.0, + "grad_norm": 2.148807205422047, + "language_loss": 0.64074594, + "learning_rate": 3.189071962883538e-11, + "loss": 0.66168469, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.36132812, + "step": 16603, + "time_per_iteration": 2.422283411026001 + }, + { + "auxiliary_loss_clip": 0.01051161, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.00999415, + "balance_loss_mlp": 1.01442075, + "epoch": 0.9982864872989629, + "flos": 23835413594880.0, + "grad_norm": 1.7758104468031144, + "language_loss": 0.71897662, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73983592, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.3671875, + "step": 16604, + "time_per_iteration": 2.4389402866363525 + }, + { + "auxiliary_loss_clip": 0.01049571, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.01082516, + "balance_loss_mlp": 1.01469016, + "epoch": 0.9983466105516309, + "flos": 18332123938560.0, + "grad_norm": 1.60166590914381, + "language_loss": 0.65852296, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.67935163, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.34960938, + "step": 16605, + "time_per_iteration": 2.3867013454437256 + }, + { + "auxiliary_loss_clip": 0.0105063, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01119995, + "balance_loss_mlp": 1.01707256, + "epoch": 0.9984067338042988, + "flos": 17235851247360.0, + "grad_norm": 1.8891311316556585, + "language_loss": 0.71784663, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73867834, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 0.3359375, + "step": 16606, + "time_per_iteration": 2.3932013511657715 + }, + { + "auxiliary_loss_clip": 0.01052363, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.01388836, + "balance_loss_mlp": 1.01659214, + "epoch": 0.9984668570569668, + "flos": 20666835843840.0, + "grad_norm": 1.9181256287367567, + "language_loss": 0.83503181, + "learning_rate": 2.370001590090709e-11, + "loss": 0.85591704, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.35742188, + "step": 16607, + "time_per_iteration": 2.3787946701049805 + }, + { + "auxiliary_loss_clip": 0.01051762, + "auxiliary_loss_mlp": 0.01038196, + "balance_loss_clip": 1.01409149, + "balance_loss_mlp": 1.01468182, + "epoch": 0.9985269803096347, + "flos": 30261247683840.0, + "grad_norm": 1.740544355641763, + "language_loss": 0.67780882, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69870836, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.37109375, + "step": 16608, + "time_per_iteration": 2.4853529930114746 + }, + { + "auxiliary_loss_clip": 0.01052376, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.01106322, + "balance_loss_mlp": 1.01666641, + "epoch": 0.9985871035623027, + "flos": 10560213313920.0, + "grad_norm": 3.001978947404949, + "language_loss": 0.81982499, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.84068692, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35742188, + "step": 16609, + "time_per_iteration": 3.8795828819274902 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01049954, + "balance_loss_clip": 1.02661312, + "balance_loss_mlp": 1.01489878, + "epoch": 0.9986472268149706, + "flos": 16872510061440.0, + "grad_norm": 1.6257287334137476, + "language_loss": 0.6403774, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.66137981, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 0.35351562, + "step": 16610, + "time_per_iteration": 2.3731353282928467 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.01036106, + "balance_loss_clip": 1.01456451, + "balance_loss_mlp": 1.01536846, + "epoch": 0.9987073500676387, + "flos": 22053446334720.0, + "grad_norm": 3.9446694888340867, + "language_loss": 0.68684506, + "learning_rate": 1.672274094288717e-11, + "loss": 0.70771313, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 0.35351562, + "step": 16611, + "time_per_iteration": 2.3932766914367676 + }, + { + "auxiliary_loss_clip": 0.01051885, + "auxiliary_loss_mlp": 0.01039526, + "balance_loss_clip": 1.01412213, + "balance_loss_mlp": 1.01600552, + "epoch": 0.9987674733203066, + "flos": 30481549562880.0, + "grad_norm": 1.5134086601207857, + "language_loss": 0.70626664, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.72718072, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.359375, + "step": 16612, + "time_per_iteration": 2.4830305576324463 + }, + { + "auxiliary_loss_clip": 0.01048354, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.0156045, + "balance_loss_mlp": 1.01522839, + "epoch": 0.9988275965729746, + "flos": 27744009857280.0, + "grad_norm": 1.5752497637223863, + "language_loss": 0.75244451, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.77330542, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 0.33203125, + "step": 16613, + "time_per_iteration": 2.446263074874878 + }, + { + "auxiliary_loss_clip": 0.01052605, + "auxiliary_loss_mlp": 0.01039184, + "balance_loss_clip": 1.0143764, + "balance_loss_mlp": 1.01666212, + "epoch": 0.9988877198256426, + "flos": 17523396138240.0, + "grad_norm": 2.138552975294756, + "language_loss": 0.74666679, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.76758468, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.359375, + "step": 16614, + "time_per_iteration": 2.3415441513061523 + }, + { + "auxiliary_loss_clip": 0.01051252, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.01466441, + "balance_loss_mlp": 1.01571035, + "epoch": 0.9989478430783105, + "flos": 20995438360320.0, + "grad_norm": 1.605833188603838, + "language_loss": 0.73889935, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75978398, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.35546875, + "step": 16615, + "time_per_iteration": 2.389955520629883 + }, + { + "auxiliary_loss_clip": 0.01053131, + "auxiliary_loss_mlp": 0.010398, + "balance_loss_clip": 1.01703095, + "balance_loss_mlp": 1.01675344, + "epoch": 0.9990079663309785, + "flos": 13369778887680.0, + "grad_norm": 2.1459221654044325, + "language_loss": 0.7934528, + "learning_rate": 9.70753783247069e-12, + "loss": 0.81438208, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 0.36328125, + "step": 16616, + "time_per_iteration": 2.3717541694641113 + }, + { + "auxiliary_loss_clip": 0.01051345, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.01318955, + "balance_loss_mlp": 1.0167439, + "epoch": 0.9990680895836465, + "flos": 17309308481280.0, + "grad_norm": 2.46601451547681, + "language_loss": 0.83377492, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85464215, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 0.34570312, + "step": 16617, + "time_per_iteration": 2.3321564197540283 + }, + { + "auxiliary_loss_clip": 0.01049965, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.00888789, + "balance_loss_mlp": 1.01516509, + "epoch": 0.9991282128363145, + "flos": 24206819304960.0, + "grad_norm": 1.5024403267944428, + "language_loss": 0.79315591, + "learning_rate": 7.43233506206309e-12, + "loss": 0.81397545, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 16618, + "time_per_iteration": 2.4143178462982178 + }, + { + "auxiliary_loss_clip": 0.01050796, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.01604092, + "balance_loss_mlp": 1.01518524, + "epoch": 0.9991883360889824, + "flos": 21174333500160.0, + "grad_norm": 1.6461092931747305, + "language_loss": 0.75733626, + "learning_rate": 6.408493534060255e-12, + "loss": 0.77823389, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.35546875, + "step": 16619, + "time_per_iteration": 2.356168508529663 + }, + { + "auxiliary_loss_clip": 0.01048561, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.01345778, + "balance_loss_mlp": 1.01525736, + "epoch": 0.9992484593416504, + "flos": 19900143187200.0, + "grad_norm": 2.5299689132960475, + "language_loss": 0.87344885, + "learning_rate": 5.460491963260594e-12, + "loss": 0.89427286, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.33203125, + "step": 16620, + "time_per_iteration": 2.3617000579833984 + }, + { + "auxiliary_loss_clip": 0.01048509, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.01145434, + "balance_loss_mlp": 1.0148344, + "epoch": 0.9993085825943183, + "flos": 24856832597760.0, + "grad_norm": 2.303121441465776, + "language_loss": 0.73033917, + "learning_rate": 4.58833038607942e-12, + "loss": 0.75114775, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 0.3359375, + "step": 16621, + "time_per_iteration": 2.3869481086730957 + }, + { + "auxiliary_loss_clip": 0.01006963, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00126839, + "balance_loss_mlp": 1.00072789, + "epoch": 0.9993687058469863, + "flos": 71280700335360.0, + "grad_norm": 0.7382641114313386, + "language_loss": 0.56558859, + "learning_rate": 3.79200883515729e-12, + "loss": 0.5856899, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.0625, + "step": 16622, + "time_per_iteration": 3.2371137142181396 + }, + { + "auxiliary_loss_clip": 0.01051269, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.00979447, + "balance_loss_mlp": 1.01572907, + "epoch": 0.9994288290996542, + "flos": 12198861976320.0, + "grad_norm": 1.7898881538013114, + "language_loss": 0.72773027, + "learning_rate": 3.071527340914315e-12, + "loss": 0.74857301, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.35546875, + "step": 16623, + "time_per_iteration": 2.559809684753418 + }, + { + "auxiliary_loss_clip": 0.01051026, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.00904608, + "balance_loss_mlp": 1.01600718, + "epoch": 0.9994889523523223, + "flos": 17889111296640.0, + "grad_norm": 1.8956237333838624, + "language_loss": 0.75629091, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.77712911, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.34960938, + "step": 16624, + "time_per_iteration": 2.4585185050964355 + }, + { + "auxiliary_loss_clip": 0.01050857, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.01013291, + "balance_loss_mlp": 1.01548636, + "epoch": 0.9995490756049902, + "flos": 26577666334080.0, + "grad_norm": 1.6150226341336997, + "language_loss": 0.74553597, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.76638091, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.35351562, + "step": 16625, + "time_per_iteration": 2.498415231704712 + }, + { + "auxiliary_loss_clip": 0.01050294, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.01225019, + "balance_loss_mlp": 1.01582503, + "epoch": 0.9996091988576582, + "flos": 22198964348160.0, + "grad_norm": 2.1528677229734066, + "language_loss": 0.77779073, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.79863077, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 0.34570312, + "step": 16626, + "time_per_iteration": 2.484957456588745 + }, + { + "auxiliary_loss_clip": 0.01052587, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.01540959, + "balance_loss_mlp": 1.01794553, + "epoch": 0.9996693221103262, + "flos": 27372185210880.0, + "grad_norm": 1.9442722822712908, + "language_loss": 0.8299917, + "learning_rate": 9.480024334429515e-13, + "loss": 0.85090303, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.34765625, + "step": 16627, + "time_per_iteration": 2.4946837425231934 + }, + { + "auxiliary_loss_clip": 0.01054537, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.01183534, + "balance_loss_mlp": 1.01711273, + "epoch": 0.9997294453629941, + "flos": 26869191120000.0, + "grad_norm": 1.8155562901067694, + "language_loss": 0.71910471, + "learning_rate": 6.067215747584952e-13, + "loss": 0.74001628, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.375, + "step": 16628, + "time_per_iteration": 2.5552148818969727 + }, + { + "auxiliary_loss_clip": 0.01051624, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.01417601, + "balance_loss_mlp": 1.01620984, + "epoch": 0.9997895686156621, + "flos": 23475877747200.0, + "grad_norm": 1.5155299183913533, + "language_loss": 0.76431108, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.78520536, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.35546875, + "step": 16629, + "time_per_iteration": 3.7055492401123047 + }, + { + "auxiliary_loss_clip": 0.01053934, + "auxiliary_loss_mlp": 0.01040972, + "balance_loss_clip": 1.01703477, + "balance_loss_mlp": 1.01638842, + "epoch": 0.9998496918683301, + "flos": 20224067581440.0, + "grad_norm": 1.6683973943061003, + "language_loss": 0.6142379, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.63518697, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.375, + "step": 16630, + "time_per_iteration": 2.4113969802856445 + }, + { + "auxiliary_loss_clip": 0.01053071, + "auxiliary_loss_mlp": 0.01039839, + "balance_loss_clip": 1.01722455, + "balance_loss_mlp": 1.01667798, + "epoch": 0.9999098151209981, + "flos": 21651840432000.0, + "grad_norm": 2.0743731405423356, + "language_loss": 0.61565042, + "learning_rate": 3.792010017100722e-14, + "loss": 0.63657951, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.36523438, + "step": 16631, + "time_per_iteration": 2.3413243293762207 + }, + { + "auxiliary_loss_clip": 0.01049699, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.01523566, + "balance_loss_mlp": 1.01644707, + "epoch": 0.999969938373666, + "flos": 11543856359040.0, + "grad_norm": 1.941040473940969, + "language_loss": 0.73196745, + "learning_rate": 0.0, + "loss": 0.7528308, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 0.33203125, + "step": 16632, + "time_per_iteration": 2.3182687759399414 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3991449643649597e+18, + "train_loss": 0.7862426474342815, + "train_runtime": 44270.1087, + "train_samples_per_second": 15.028, + "train_steps_per_second": 0.376 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3991449643649597e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}